# Regression Analysis - Assumption Checking # IVs: demo_sex, demo_age, demo_edu # DVs: eohiDGEN_mean, ehi_global_mean options(scipen = 999) library(car) library(performance) library(see) library(ggplot2) library(gridExtra) library(dplyr) library(lmtest) # For bptest and durbinWatsonTest setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") data <- read.csv("ehi1.csv") # Check for missing values missing_summary <- data %>% select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% summarise_all(~sum(is.na(.))) print("Missing values check:") print(missing_summary) # Remove rows with missing values data_clean <- data %>% select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% filter(complete.cases(.)) print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) # Recode demo_sex as numeric for regression (0 = Female, 1 = Male) data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) # Check demo_edu levels and recode if needed print("Education levels:") print(table(data_clean$demo_edu)) # Recode education as ordinal (assuming higher values = more education) edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", "University - Undergraduate", "University - Graduate") data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) # Verify recoding print("Sex recoding (0=Female, 1=Male):") print(table(data_clean$demo_sex_numeric)) print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") print(table(data_clean$demo_edu_numeric)) # ============================================================================= # REGRESSION MODELS # ============================================================================= # Define the 6 regression models models <- list() # Model 1: demo_sex → eohiDGEN_mean models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) # Model 2: demo_age → eohiDGEN_mean models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) # Model 3: demo_edu → eohiDGEN_mean models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) # Model 4: demo_sex → ehi_global_mean models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) # Model 5: demo_age → ehi_global_mean models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) # Model 6: demo_edu → ehi_global_mean models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) # ============================================================================= # ASSUMPTION CHECKING FUNCTIONS # ============================================================================= # Function to check linearity assumption check_linearity <- function(model, model_name) { cat("\n=== LINEARITY CHECK:", model_name, "===\n") # Residuals vs Fitted plot residuals_vs_fitted <- plot(model, which = 1, main = paste("Linearity:", model_name)) # Component + residual plot (partial residual plot) cr_plot <- crPlots(model, main = paste("Component+Residual Plot:", model_name)) # Return plots for later use return(list(residuals_vs_fitted = residuals_vs_fitted, cr_plot = cr_plot)) } # Function to check normality of residuals check_normality <- function(model, model_name) { cat("\n=== NORMALITY CHECK:", model_name, "===\n") # Q-Q plot qq_plot <- plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) # Shapiro-Wilk test residuals <- residuals(model) shapiro_test <- shapiro.test(residuals) cat("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5), "\n") # Kolmogorov-Smirnov test ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) cat("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5), "\n") # Histogram of residuals hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + geom_histogram(bins = 30, fill = "lightblue", color = "black") + ggtitle(paste("Residuals Histogram:", model_name)) + theme_minimal() return(list(qq_plot = qq_plot, hist_plot = hist_plot, shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) } # Function to check homoscedasticity (constant variance) check_homoscedasticity <- function(model, model_name) { cat("\n=== HOMOSCEDASTICITY CHECK:", model_name, "===\n") # Scale-Location plot scale_location_plot <- plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) # Breusch-Pagan test bp_test <- bptest(model) cat("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5), "\n") # White test (if available) tryCatch({ white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) cat("White test p-value:", format(white_test$p.value, digits = 5), "\n") }, error = function(e) { cat("White test not available for this model\n") }) return(list(scale_location_plot = scale_location_plot, bp_p = bp_test$p.value)) } # Function to check independence (no autocorrelation) check_independence <- function(model, model_name) { cat("\n=== INDEPENDENCE CHECK:", model_name, "===\n") # Durbin-Watson test dw_test <- durbinWatsonTest(model) cat("Durbin-Watson statistic:", format(dw_test$dw, digits = 5), "\n") cat("Durbin-Watson p-value:", format(dw_test$p, digits = 5), "\n") # Residuals vs Order plot residuals_vs_order <- ggplot(data.frame( residuals = residuals(model), order = seq_along(residuals(model)) ), aes(x = order, y = residuals)) + geom_point(color = "black") + geom_hline(yintercept = 0, linetype = "dashed", color = "red") + ggtitle(paste("Residuals vs Order:", model_name)) + theme_minimal() return(list(residuals_vs_order = residuals_vs_order, dw_stat = dw_test$dw, dw_p = dw_test$p)) } # Function to check for influential observations check_influence <- function(model, model_name) { cat("\n=== INFLUENCE CHECK:", model_name, "===\n") # Cook's Distance plot cooks_plot <- plot(model, which = 4, main = paste("Cook's Distance:", model_name)) # Calculate influence measures cooks_d <- cooks.distance(model) leverage <- hatvalues(model) dffits_val <- dffits(model) # Identify influential observations cooks_threshold <- 4/length(cooks_d) # Cook's D threshold leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold influential_cooks <- which(cooks_d > cooks_threshold) influential_leverage <- which(leverage > leverage_threshold) influential_dffits <- which(abs(dffits_val) > dffits_threshold) cat("Cook's Distance threshold:", format(cooks_threshold, digits = 5), "\n") cat("Influential observations (Cook's D):", length(influential_cooks), "\n") cat("Leverage threshold:", format(leverage_threshold, digits = 5), "\n") cat("High leverage observations:", length(influential_leverage), "\n") cat("DFFITS threshold:", format(dffits_threshold, digits = 5), "\n") cat("Influential observations (DFFITS):", length(influential_dffits), "\n") if (length(influential_cooks) > 0) { cat("Cook's D influential cases:", influential_cooks, "\n") } if (length(influential_leverage) > 0) { cat("High leverage cases:", influential_leverage, "\n") } if (length(influential_dffits) > 0) { cat("DFFITS influential cases:", influential_dffits, "\n") } return(list(cooks_plot = cooks_plot, influential_cooks = influential_cooks, influential_leverage = influential_leverage, influential_dffits = influential_dffits)) } # Function to get comprehensive model summary get_model_summary <- function(model, model_name) { cat("\n=== MODEL SUMMARY:", model_name, "===\n") # Basic model summary summary_model <- summary(model) print(summary_model) # R-squared and adjusted R-squared cat("\nR-squared:", format(summary_model$r.squared, digits = 5), "\n") cat("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5), "\n") # AIC and BIC aic_val <- AIC(model) bic_val <- BIC(model) cat("AIC:", format(aic_val, digits = 5), "\n") cat("BIC:", format(bic_val, digits = 5), "\n") return(list(summary = summary_model, r_squared = summary_model$r.squared, adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) } # ============================================================================= # RUN ASSUMPTION CHECKS FOR ALL MODELS # ============================================================================= # Create results storage assumption_results <- list() model_summaries <- list() # Model names for reference model_names <- c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global") cat("\n", rep("=", 80), "\n") cat("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS\n") cat(rep("=", 80), "\n") # Run assumption checks for each model for (i in seq_along(models)) { model_name <- model_names[i] model <- models[[i]] cat("\n", rep("-", 60), "\n") cat("ANALYZING MODEL", i, ":", model_name, "\n") cat(rep("-", 60), "\n") # Store results assumption_results[[i]] <- list() assumption_results[[i]]$model_name <- model_name # 1. Model Summary model_summaries[[i]] <- get_model_summary(model, model_name) assumption_results[[i]]$summary <- model_summaries[[i]] # 2. Linearity Check assumption_results[[i]]$linearity <- check_linearity(model, model_name) # 3. Normality Check assumption_results[[i]]$normality <- check_normality(model, model_name) # 4. Homoscedasticity Check assumption_results[[i]]$homoscedasticity <- check_homoscedasticity(model, model_name) # 5. Independence Check assumption_results[[i]]$independence <- check_independence(model, model_name) # 6. Influence Check assumption_results[[i]]$influence <- check_influence(model, model_name) } # ============================================================================= # SUMMARY TABLE OF ASSUMPTION VIOLATIONS # ============================================================================= cat("\n", rep("=", 80), "\n") cat("ASSUMPTION VIOLATION SUMMARY\n") cat(rep("=", 80), "\n") # Create summary table violation_summary <- data.frame( Model = character(), Linearity = character(), Normality = character(), Homoscedasticity = character(), Independence = character(), Influential_Obs = character(), stringsAsFactors = FALSE ) # Populate summary table for (i in seq_along(models)) { model_name <- model_names[i] # Check normality (Shapiro-Wilk p < 0.05 indicates violation) normality_violation <- ifelse(assumption_results[[i]]$normality$shapiro_p < 0.05, "VIOLATED", "OK") # Check homoscedasticity (Breusch-Pagan p < 0.05 indicates violation) homosced_violation <- ifelse(assumption_results[[i]]$homoscedasticity$bp_p < 0.05, "VIOLATED", "OK") # Check independence (Durbin-Watson p < 0.05 indicates violation) independence_violation <- ifelse(assumption_results[[i]]$independence$dw_p < 0.05, "VIOLATED", "OK") # Check for influential observations influential_count <- length(assumption_results[[i]]$influence$influential_cooks) influential_status <- ifelse(influential_count > 0, paste("YES (", influential_count, ")", sep = ""), "NO") # Linearity is assessed visually, so we'll mark as "CHECK VISUALLY" linearity_status <- "CHECK VISUALLY" violation_summary <- rbind(violation_summary, data.frame( Model = model_name, Linearity = linearity_status, Normality = normality_violation, Homoscedasticity = homosced_violation, Independence = independence_violation, Influential_Obs = influential_status )) } print(violation_summary) # ============================================================================= # MODEL COMPARISON TABLE # ============================================================================= cat("\n", rep("=", 80), "\n") cat("MODEL COMPARISON SUMMARY\n") cat(rep("=", 80), "\n") # Create model comparison table comparison_table <- data.frame( Model = model_names, R_Squared = numeric(length(models)), Adj_R_Squared = numeric(length(models)), AIC = numeric(length(models)), BIC = numeric(length(models)), Significant = character(length(models)), stringsAsFactors = FALSE ) for (i in seq_along(models)) { summary_model <- model_summaries[[i]]$summary comparison_table$R_Squared[i] <- summary_model$r.squared comparison_table$Adj_R_Squared[i] <- summary_model$adj.r.squared comparison_table$AIC[i] <- model_summaries[[i]]$aic comparison_table$BIC[i] <- model_summaries[[i]]$bic # Check if predictor is significant (p < 0.05) p_value <- summary_model$coefficients[2, 4] comparison_table$Significant[i] <- ifelse(p_value < 0.05, "YES", "NO") } print(comparison_table) # ============================================================================= # RECOMMENDATIONS # ============================================================================= cat("\n", rep("=", 80), "\n") cat("RECOMMENDATIONS FOR ASSUMPTION VIOLATIONS\n") cat(rep("=", 80), "\n") cat("\n1. NORMALITY VIOLATIONS:\n") cat(" - If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)\n") cat(" - Alternative: Use robust regression methods or bootstrapping\n") cat("\n2. HOMOSCEDASTICITY VIOLATIONS:\n") cat(" - If violated: Use weighted least squares or robust standard errors\n") cat(" - Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors\n") cat("\n3. INDEPENDENCE VIOLATIONS:\n") cat(" - If violated: Check for clustering or repeated measures structure\n") cat(" - Alternative: Use mixed-effects models or clustered standard errors\n") cat("\n4. INFLUENTIAL OBSERVATIONS:\n") cat(" - If present: Examine these cases for data entry errors\n") cat(" - Consider: Running analysis with and without influential cases\n") cat(" - Alternative: Use robust regression methods\n") cat("\n5. LINEARITY VIOLATIONS:\n") cat(" - If violated: Add polynomial terms or use splines\n") cat(" - Alternative: Transform predictors or use non-parametric methods\n") cat("\n", rep("=", 80), "\n") cat("ANALYSIS COMPLETE\n") cat(rep("=", 80), "\n")