---
title: "Regression Analysis - Assumption Checking"
subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean"
author: "Irina"
date: today
format:
  html:
    theme: cosmo
    toc: true
    toc-depth: 3
    code-fold: false
    code-tools: true
    fig-width: 8
    fig-height: 6
execute:
  echo: true
  warning: false
  message: false
  eval: true
  output: true
  results: 'markup'
  cache: false
---

## Setup and Data Preparation

```{r setup}
#| label: setup
#| echo: true
#| output: true

# Load required libraries
library(dplyr)      # Must load first for %>% operator
library(car)
library(performance)
library(see)
library(ggplot2)
library(gridExtra)
library(lmtest)  # For bptest and durbinWatsonTest

# Set options
options(scipen = 999)

# Set working directory and load data
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
data <- read.csv("ehi1.csv")
```

```{r data-prep}
#| label: data-prep
#| echo: true
#| output: true

# Check for missing values
missing_summary <- data %>%
  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
  summarise_all(~sum(is.na(.)))

print("Missing values check:")
print(missing_summary)

# Remove rows with missing values
data_clean <- data %>%
  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
  filter(complete.cases(.))

print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))

# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)

# Check demo_edu levels and create dummy variables
print("Education levels:")
edu_table <- table(data_clean$demo_edu)
print(edu_table)
print(paste("Number of education levels:", length(unique(data_clean$demo_edu))))

# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables)
# Using High School as reference category (excluded dummy)
data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)

# Check what other education levels exist and create additional dummies
edu_levels <- unique(data_clean$demo_edu)
print("All education levels found:")
for(i in 1:length(edu_levels)) {
  print(paste(i, ":", edu_levels[i]))
}

# Create additional dummy variables for other education levels
# (You'll need to adjust these based on your actual data)
# Example for additional levels - adjust names and conditions as needed:
# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)

# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)

# Verify dummy coding
print("Sex recoding (0=Female, 1=Male):")
print(table(data_clean$demo_sex_numeric))
print("Education dummy variables (k-1 coding with High School as reference):")
print("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"))
print("College:", sum(data_clean$edu_college))
print("Undergraduate:", sum(data_clean$edu_undergrad))
print("Graduate:", sum(data_clean$edu_grad))
```

## Regression Models

```{r models}
#| label: models
#| echo: true
#| output: true

# Define the 6 regression models
models <- list()

# Model 1: demo_sex → eohiDGEN_mean
models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)

# Model 2: demo_age → eohiDGEN_mean  
models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean)

# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)

# Model 4: demo_sex → ehi_global_mean
models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)

# Model 5: demo_age → ehi_global_mean
models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean)

# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
```

## Assumption Checking Functions

```{r functions}
#| label: functions
#| echo: true

# Function to check linearity assumption
check_linearity <- function(model, model_name) {
  print(paste("=== LINEARITY CHECK:", model_name, "==="))
  
  # Residuals vs Fitted plot
  plot(model, which = 1, main = paste("Linearity:", model_name))
  
  # Component + residual plot (partial residual plot)
  crPlots(model, main = paste("Component+Residual Plot:", model_name))
  
  return(NULL)
}

# Function to check normality of residuals
check_normality <- function(model, model_name) {
  print(paste("=== NORMALITY CHECK:", model_name, "==="))
  
  # Q-Q plot
  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
  
  # Shapiro-Wilk test
  residuals <- residuals(model)
  shapiro_test <- shapiro.test(residuals)
  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
  
  # Kolmogorov-Smirnov test
  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
  
  # Histogram of residuals
  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
    ggtitle(paste("Residuals Histogram:", model_name)) +
    theme_minimal()
  print(hist_plot)
  
  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
}

# Function to check homoscedasticity (constant variance)
check_homoscedasticity <- function(model, model_name) {
  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
  
  # Scale-Location plot
  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
  
  # Breusch-Pagan test
  bp_test <- bptest(model)
  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
  
  # White test (if available)
  tryCatch({
    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
  }, error = function(e) {
    print("White test not available for this model")
  })
  
  return(list(bp_p = bp_test$p.value))
}

# Function to check independence (no autocorrelation)
check_independence <- function(model, model_name) {
  print(paste("=== INDEPENDENCE CHECK:", model_name, "==="))
  
  # Durbin-Watson test
  dw_test <- durbinWatsonTest(model)
  print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
  print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
  
  # Residuals vs Order plot
  residuals_vs_order <- ggplot(data.frame(
    residuals = residuals(model),
    order = seq_along(residuals(model))
  ), aes(x = order, y = residuals)) +
    geom_point(color = "black") +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    ggtitle(paste("Residuals vs Order:", model_name)) +
    theme_minimal()
  print(residuals_vs_order)
  
  return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
}

# Function to check for influential observations
check_influence <- function(model, model_name) {
  print(paste("=== INFLUENCE CHECK:", model_name, "==="))
  
  # Cook's Distance plot
  plot(model, which = 4, main = paste("Cook's Distance:", model_name))
  
  # Calculate influence measures
  cooks_d <- cooks.distance(model)
  leverage <- hatvalues(model)
  dffits_val <- dffits(model)
  
  # Identify influential observations
  cooks_threshold <- 4/length(cooks_d)  # Cook's D threshold
  leverage_threshold <- 2 * (length(coef(model))/nobs(model))  # Leverage threshold
  dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model))  # DFFITS threshold
  
  influential_cooks <- which(cooks_d > cooks_threshold)
  influential_leverage <- which(leverage > leverage_threshold)
  influential_dffits <- which(abs(dffits_val) > dffits_threshold)
  
  print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5)))
  print(paste("Influential observations (Cook's D):", length(influential_cooks)))
  print(paste("Leverage threshold:", format(leverage_threshold, digits = 5)))
  print(paste("High leverage observations:", length(influential_leverage)))
  print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5)))
  print(paste("Influential observations (DFFITS):", length(influential_dffits)))
  
  if (length(influential_cooks) > 0) {
    print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", ")))
  }
  if (length(influential_leverage) > 0) {
    print(paste("High leverage cases:", paste(influential_leverage, collapse = ", ")))
  }
  if (length(influential_dffits) > 0) {
    print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", ")))
  }
  
  return(list(influential_cooks = influential_cooks,
              influential_leverage = influential_leverage,
              influential_dffits = influential_dffits))
}

# Function to get comprehensive model summary
get_model_summary <- function(model, model_name) {
  print(paste("=== MODEL SUMMARY:", model_name, "==="))
  
  # Basic model summary
  summary_model <- summary(model)
  print(summary_model)
  
  # R-squared and adjusted R-squared
  print(paste("R-squared:", format(summary_model$r.squared, digits = 5)))
  print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5)))
  
  # AIC and BIC
  aic_val <- AIC(model)
  bic_val <- BIC(model)
  print(paste("AIC:", format(aic_val, digits = 5)))
  print(paste("BIC:", format(bic_val, digits = 5)))
  
  return(list(summary = summary_model, r_squared = summary_model$r.squared, 
              adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val))
}
```

## Model 1: Sex → EOHI-DGEN Mean

```{r model1}
#| label: model1
#| echo: true

model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
```

## Model 2: Age → EOHI-DGEN Mean

```{r model2}
#| label: model2
#| echo: true

model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN")
model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN")
model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN")
model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN")
model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN")
model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN")
```

## Model 3: Education → EOHI-DGEN Mean

```{r model3}
#| label: model3
#| echo: true

model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN")
model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN")
model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
```

## Model 4: Sex → EHI-Global Mean

```{r model4}
#| label: model4
#| echo: true

model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global")
model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global")
model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global")
model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global")
model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global")
model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global")
```

## Model 5: Age → EHI-Global Mean

```{r model5}
#| label: model5
#| echo: true

model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global")
model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global")
model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global")
model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global")
model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global")
model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global")
```

## Model 6: Education → EHI-Global Mean

```{r model6}
#| label: model6
#| echo: true

model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global")
model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global")
model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global")
model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global")
model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global")
model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global")
```

## Summary Tables

### Assumption Violation Summary

```{r violation-summary}
#| label: violation-summary
#| echo: true

# Create summary table
violation_summary <- data.frame(
  Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN",
            "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"),
  Normality = c(
    ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
    ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
    ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
    ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
    ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
    ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK")
  ),
  Homoscedasticity = c(
    ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"),
    ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"),
    ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"),
    ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"),
    ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"),
    ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK")
  ),
  Independence = c(
    ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"),
    ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"),
    ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"),
    ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"),
    ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"),
    ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK")
  ),
  Influential_Obs = c(
    ifelse(length(model1_influence$influential_cooks) > 0, 
           paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"),
    ifelse(length(model2_influence$influential_cooks) > 0, 
           paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"),
    ifelse(length(model3_influence$influential_cooks) > 0, 
           paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"),
    ifelse(length(model4_influence$influential_cooks) > 0, 
           paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"),
    ifelse(length(model5_influence$influential_cooks) > 0, 
           paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"),
    ifelse(length(model6_influence$influential_cooks) > 0, 
           paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO")
  ),
  stringsAsFactors = FALSE
)

print(violation_summary)
```

### Model Comparison Summary

```{r comparison-summary}
#| label: comparison-summary
#| echo: true

# Create model comparison table
comparison_table <- data.frame(
  Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN",
            "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"),
  R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
                model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
          model4_summary$aic, model5_summary$aic, model6_summary$aic),
  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
          model4_summary$bic, model5_summary$bic, model6_summary$bic),
  Significant = c(
    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
  ),
  stringsAsFactors = FALSE
)

print(comparison_table)
```

## Recommendations

### For Assumption Violations:

**1. Normality Violations:**
- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)
- Alternative: Use robust regression methods or bootstrapping

**2. Homoscedasticity Violations:**
- If violated: Use weighted least squares or robust standard errors
- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors

**3. Independence Violations:**
- If violated: Check for clustering or repeated measures structure
- Alternative: Use mixed-effects models or clustered standard errors

**4. Influential Observations:**
- If present: Examine these cases for data entry errors
- Consider: Running analysis with and without influential cases
- Alternative: Use robust regression methods

**5. Linearity Violations:**
- If violated: Add polynomial terms or use splines
- Alternative: Transform predictors or use non-parametric methods

---

*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*