# Regression Analysis - Assumption Checking
# IVs: demo_sex, demo_age, demo_edu
# DVs: eohiDGEN_mean, ehi_global_mean
# Total: 6 regression models

options(scipen = 999)

# Load required libraries
library(car)
library(performance)
library(see)
library(ggplot2)
library(gridExtra)
library(dplyr)

setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
data <- read.csv("ehi1.csv")

# Check data structure
cat("Data dimensions:", dim(data), "\n")
cat("Variables of interest:\n")
cat("IVs: demo_sex, demo_age, demo_edu\n")
cat("DVs: eohiDGEN_mean, ehi_global_mean\n\n")

# Check for missing values
cat("Missing values check:\n")
missing_summary <- data %>%
  select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
  summarise_all(~sum(is.na(.)))
print(missing_summary)

# Remove rows with missing values
data_clean <- data %>%
  select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
  filter(complete.cases(.))

cat("\nClean data dimensions:", dim(data_clean), "\n")

# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)

# Check demo_edu levels and recode if needed
cat("\nEducation levels:\n")
print(table(data_clean$demo_edu))

# Recode education as ordinal (assuming higher values = more education)
edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", 
                "University - Undergraduate", "University - Graduate")
data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels)

# Verify recoding
cat("\nSex recoding (0=Female, 1=Male):\n")
print(table(data_clean$demo_sex_numeric))
cat("\nEducation recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):\n")
print(table(data_clean$demo_edu_numeric))

# =============================================================================
# REGRESSION MODELS
# =============================================================================

# Define the 6 regression models
models <- list()

# Model 1: demo_sex → eohiDGEN_mean
models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)

# Model 2: demo_age → eohiDGEN_mean  
models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean)

# Model 3: demo_edu → eohiDGEN_mean
models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean)

# Model 4: demo_sex → ehi_global_mean
models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)

# Model 5: demo_age → ehi_global_mean
models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean)

# Model 6: demo_edu → ehi_global_mean
models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean)

# =============================================================================
# ASSUMPTION CHECKING FUNCTIONS
# =============================================================================

# Function to check linearity assumption
check_linearity <- function(model, model_name) {
  cat("\n=== LINEARITY CHECK:", model_name, "===\n")
  
  # Residuals vs Fitted plot
  residuals_vs_fitted <- plot(model, which = 1, main = paste("Linearity:", model_name))
  
  # Component + residual plot (partial residual plot)
  cr_plot <- crPlots(model, main = paste("Component+Residual Plot:", model_name))
  
  # Return plots for later use
  return(list(residuals_vs_fitted = residuals_vs_fitted, cr_plot = cr_plot))
}

# Function to check normality of residuals
check_normality <- function(model, model_name) {
  cat("\n=== NORMALITY CHECK:", model_name, "===\n")
  
  # Q-Q plot
  qq_plot <- plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
  
  # Shapiro-Wilk test
  residuals <- residuals(model)
  shapiro_test <- shapiro.test(residuals)
  cat("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5), "\n")
  
  # Kolmogorov-Smirnov test
  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
  cat("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5), "\n")
  
  # Histogram of residuals
  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
    ggtitle(paste("Residuals Histogram:", model_name)) +
    theme_minimal()
  
  return(list(qq_plot = qq_plot, hist_plot = hist_plot, 
              shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
}

# Function to check homoscedasticity (constant variance)
check_homoscedasticity <- function(model, model_name) {
  cat("\n=== HOMOSCEDASTICITY CHECK:", model_name, "===\n")
  
  # Scale-Location plot
  scale_location_plot <- plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
  
  # Breusch-Pagan test
  bp_test <- bptest(model)
  cat("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5), "\n")
  
  # White test (if available)
  tryCatch({
    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
    cat("White test p-value:", format(white_test$p.value, digits = 5), "\n")
  }, error = function(e) {
    cat("White test not available for this model\n")
  })
  
  return(list(scale_location_plot = scale_location_plot, bp_p = bp_test$p.value))
}

# Function to check independence (no autocorrelation)
check_independence <- function(model, model_name) {
  cat("\n=== INDEPENDENCE CHECK:", model_name, "===\n")
  
  # Durbin-Watson test
  dw_test <- durbinWatsonTest(model)
  cat("Durbin-Watson statistic:", format(dw_test$dw, digits = 5), "\n")
  cat("Durbin-Watson p-value:", format(dw_test$p, digits = 5), "\n")
  
  # Residuals vs Order plot
  residuals_vs_order <- ggplot(data.frame(
    residuals = residuals(model),
    order = 1:length(residuals(model))
  ), aes(x = order, y = residuals)) +
    geom_point(color = "black") +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    ggtitle(paste("Residuals vs Order:", model_name)) +
    theme_minimal()
  
  return(list(residuals_vs_order = residuals_vs_order, dw_stat = dw_test$dw, dw_p = dw_test$p))
}

# Function to check for influential observations
check_influence <- function(model, model_name) {
  cat("\n=== INFLUENCE CHECK:", model_name, "===\n")
  
  # Cook's Distance plot
  cooks_plot <- plot(model, which = 4, main = paste("Cook's Distance:", model_name))
  
  # Calculate influence measures
  influence_measures <- influence.measures(model)
  cooks_d <- cooks.distance(model)
  leverage <- hatvalues(model)
  dffits_val <- dffits(model)
  
  # Identify influential observations
  cooks_threshold <- 4/length(cooks_d)  # Cook's D threshold
  leverage_threshold <- 2 * (length(coef(model))/nobs(model))  # Leverage threshold
  dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model))  # DFFITS threshold
  
  influential_cooks <- which(cooks_d > cooks_threshold)
  influential_leverage <- which(leverage > leverage_threshold)
  influential_dffits <- which(abs(dffits_val) > dffits_threshold)
  
  cat("Cook's Distance threshold:", format(cooks_threshold, digits = 5), "\n")
  cat("Influential observations (Cook's D):", length(influential_cooks), "\n")
  cat("Leverage threshold:", format(leverage_threshold, digits = 5), "\n")
  cat("High leverage observations:", length(influential_leverage), "\n")
  cat("DFFITS threshold:", format(dffits_threshold, digits = 5), "\n")
  cat("Influential observations (DFFITS):", length(influential_dffits), "\n")
  
  if (length(influential_cooks) > 0) {
    cat("Cook's D influential cases:", influential_cooks, "\n")
  }
  if (length(influential_leverage) > 0) {
    cat("High leverage cases:", influential_leverage, "\n")
  }
  if (length(influential_dffits) > 0) {
    cat("DFFITS influential cases:", influential_dffits, "\n")
  }
  
  return(list(cooks_plot = cooks_plot, 
              influential_cooks = influential_cooks,
              influential_leverage = influential_leverage,
              influential_dffits = influential_dffits))
}

# Function to get comprehensive model summary
get_model_summary <- function(model, model_name) {
  cat("\n=== MODEL SUMMARY:", model_name, "===\n")
  
  # Basic model summary
  summary_model <- summary(model)
  print(summary_model)
  
  # R-squared and adjusted R-squared
  cat("\nR-squared:", format(summary_model$r.squared, digits = 5), "\n")
  cat("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5), "\n")
  
  # AIC and BIC
  aic_val <- AIC(model)
  bic_val <- BIC(model)
  cat("AIC:", format(aic_val, digits = 5), "\n")
  cat("BIC:", format(bic_val, digits = 5), "\n")
  
  return(list(summary = summary_model, r_squared = summary_model$r.squared, 
              adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val))
}