# Regression Analysis - Assumption Checking # IVs: demo_sex, demo_age, demo_edu # DVs: eohiDGEN_mean, ehi_global_mean options(scipen = 999) library(car) library(performance) #library(see) library(ggplot2) library(gridExtra) library(dplyr) library(lmtest) # For bptest and durbinWatsonTest setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") data <- read.csv("ehi1.csv") # Check for missing values missing_summary <- data %>% select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% summarise_all(~sum(is.na(.))) print(missing_summary) # Remove rows with missing values data_clean <- data %>% select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% filter(complete.cases(.)) print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) # Recode demo_sex as numeric for regression (0 = Female, 1 = Male) data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) # Check demo_edu levels and recode if needed print(table(data_clean$demo_edu)) # Recode education as ordinal (assuming higher values = more education) edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", "University - Undergraduate", "University - Graduate") data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) # Center the age variable for regression analysis data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] # Verify recoding print(table(data_clean$demo_sex_numeric)) print(table(data_clean$demo_edu_numeric)) print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) # ============================================================================= # REGRESSION MODELS # ============================================================================= # Define the 6 regression models models <- list() # Model 1: demo_sex → eohiDGEN_mean models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) # Model 2: demo_age_1 → eohiDGEN_mean (centered) models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) # Model 3: demo_edu → eohiDGEN_mean models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) # Model 4: demo_sex → ehi_global_mean models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) # Model 5: demo_age_1 → ehi_global_mean (centered) models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) # Model 6: demo_edu → ehi_global_mean models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) # ============================================================================= # ASSUMPTION CHECKING FUNCTIONS # ============================================================================= # Function to check linearity assumption check_linearity <- function(model, model_name) { # Residuals vs Fitted plot plot(model, which = 1, main = paste("Linearity:", model_name)) # Component + residual plot (partial residual plot) crPlots(model, main = paste("Component+Residual Plot:", model_name)) return(NULL) } # Function to check normality of residuals check_normality <- function(model, model_name) { # Q-Q plot plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) # Shapiro-Wilk test residuals <- residuals(model) shapiro_test <- shapiro.test(residuals) print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) # Kolmogorov-Smirnov test ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) # Histogram of residuals hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + geom_histogram(bins = 30, fill = "lightblue", color = "black") + ggtitle(paste("Residuals Histogram:", model_name)) + theme_minimal() print(hist_plot) return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) } # Function to check homoscedasticity (constant variance) check_homoscedasticity <- function(model, model_name) { # Scale-Location plot plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) # Breusch-Pagan test bp_test <- bptest(model) print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) # White test (if available) tryCatch({ white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) print(paste("White test p-value:", format(white_test$p.value, digits = 5))) }, error = function(e) { print("White test not available for this model") }) return(list(bp_p = bp_test$p.value)) } # Function to check independence (no autocorrelation) check_independence <- function(model, model_name) { # Durbin-Watson test dw_test <- durbinWatsonTest(model) print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) # Residuals vs Order plot residuals_vs_order <- ggplot(data.frame( residuals = residuals(model), order = seq_along(residuals(model)) ), aes(x = order, y = residuals)) + geom_point(color = "black") + geom_hline(yintercept = 0, linetype = "dashed", color = "red") + ggtitle(paste("Residuals vs Order:", model_name)) + theme_minimal() print(residuals_vs_order) return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) } # Function to check for influential observations check_influence <- function(model, model_name) { # Cook's Distance plot plot(model, which = 4, main = paste("Cook's Distance:", model_name)) # Calculate influence measures cooks_d <- cooks.distance(model) leverage <- hatvalues(model) dffits_val <- dffits(model) # Identify influential observations cooks_threshold <- 4/length(cooks_d) # Cook's D threshold leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold influential_cooks <- which(cooks_d > cooks_threshold) influential_leverage <- which(leverage > leverage_threshold) influential_dffits <- which(abs(dffits_val) > dffits_threshold) print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) print(paste("Influential observations (Cook's D):", length(influential_cooks))) print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) print(paste("High leverage observations:", length(influential_leverage))) print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) print(paste("Influential observations (DFFITS):", length(influential_dffits))) if (length(influential_cooks) > 0) { print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) } if (length(influential_leverage) > 0) { print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) } if (length(influential_dffits) > 0) { print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) } return(list(influential_cooks = influential_cooks, influential_leverage = influential_leverage, influential_dffits = influential_dffits)) } # Function to get comprehensive model summary get_model_summary <- function(model, model_name) { # Basic model summary summary_model <- summary(model) print(summary_model) # R-squared and adjusted R-squared print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) # AIC and BIC aic_val <- AIC(model) bic_val <- BIC(model) print(paste("AIC:", format(aic_val, digits = 5))) print(paste("BIC:", format(bic_val, digits = 5))) return(list(summary = summary_model, r_squared = summary_model$r.squared, adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) } # ============================================================================= # RUN ASSUMPTION CHECKS - MODEL BY MODEL # ============================================================================= # Model 1: Sex → EOHI-DGEN Mean model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") # Model 2: Age (centered) → EOHI-DGEN Mean print("\n=============================================") print("MODEL 2: Age (centered) → EOHI-DGEN Mean") print("=============================================") model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") # Model 3: Education → EOHI-DGEN Mean print("\n=============================================") print("MODEL 3: Education → EOHI-DGEN Mean") print("=============================================") model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") # Model 4: Sex → EHI-Global Mean print("\n=============================================") print("MODEL 4: Sex → EHI-Global Mean") print("=============================================") model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") # Model 5: Age (centered) → EHI-Global Mean print("\n=============================================") print("MODEL 5: Age (centered) → EHI-Global Mean") print("=============================================") model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") # Model 6: Education → EHI-Global Mean print("\n=============================================") print("MODEL 6: Education → EHI-Global Mean") print("=============================================") model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") # ============================================================================= # SUMMARY TABLE OF ASSUMPTION VIOLATIONS # ============================================================================= print("\n=============================================") print("ASSUMPTION VIOLATION SUMMARY") print("=============================================") # Create summary table violation_summary <- data.frame( Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), Normality = c( ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") ), Homoscedasticity = c( ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") ), Independence = c( ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") ), Influential_Obs = c( ifelse(length(model1_influence$influential_cooks) > 0, paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), ifelse(length(model2_influence$influential_cooks) > 0, paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), ifelse(length(model3_influence$influential_cooks) > 0, paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), ifelse(length(model4_influence$influential_cooks) > 0, paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), ifelse(length(model5_influence$influential_cooks) > 0, paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), ifelse(length(model6_influence$influential_cooks) > 0, paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") ), stringsAsFactors = FALSE ) print(violation_summary) # ============================================================================= # MODEL COMPARISON TABLE # ============================================================================= print("\n=============================================") print("MODEL COMPARISON SUMMARY") print("=============================================") # Create model comparison table comparison_table <- data.frame( Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, model4_summary$aic, model5_summary$aic, model6_summary$aic), BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, model4_summary$bic, model5_summary$bic, model6_summary$bic), Significant = c( ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") ), stringsAsFactors = FALSE ) print(comparison_table) print("\n=============================================") print("ANALYSIS COMPLETE") print("=============================================")