# Regression Analysis - Assumption Checking # IVs: demo_sex, demo_age, demo_edu # DVs: eohiDGEN_mean, ehi_global_mean # Total: 6 regression models options(scipen = 999) # Load required libraries library(car) library(performance) library(see) library(ggplot2) library(gridExtra) library(dplyr) setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") data <- read.csv("ehi1.csv") # Check data structure cat("Data dimensions:", dim(data), "\n") cat("Variables of interest:\n") cat("IVs: demo_sex, demo_age, demo_edu\n") cat("DVs: eohiDGEN_mean, ehi_global_mean\n\n") # Check for missing values cat("Missing values check:\n") missing_summary <- data %>% select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% summarise_all(~sum(is.na(.))) print(missing_summary) # Remove rows with missing values data_clean <- data %>% select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% filter(complete.cases(.)) cat("\nClean data dimensions:", dim(data_clean), "\n") # Recode demo_sex as numeric for regression (0 = Female, 1 = Male) data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) # Check demo_edu levels and recode if needed cat("\nEducation levels:\n") print(table(data_clean$demo_edu)) # Recode education as ordinal (assuming higher values = more education) edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", "University - Undergraduate", "University - Graduate") data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) # Verify recoding cat("\nSex recoding (0=Female, 1=Male):\n") print(table(data_clean$demo_sex_numeric)) cat("\nEducation recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):\n") print(table(data_clean$demo_edu_numeric)) # ============================================================================= # REGRESSION MODELS # ============================================================================= # Define the 6 regression models models <- list() # Model 1: demo_sex → eohiDGEN_mean models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) # Model 2: demo_age → eohiDGEN_mean models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) # Model 3: demo_edu → eohiDGEN_mean models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) # Model 4: demo_sex → ehi_global_mean models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) # Model 5: demo_age → ehi_global_mean models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) # Model 6: demo_edu → ehi_global_mean models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) # ============================================================================= # ASSUMPTION CHECKING FUNCTIONS # ============================================================================= # Function to check linearity assumption check_linearity <- function(model, model_name) { cat("\n=== LINEARITY CHECK:", model_name, "===\n") # Residuals vs Fitted plot residuals_vs_fitted <- plot(model, which = 1, main = paste("Linearity:", model_name)) # Component + residual plot (partial residual plot) cr_plot <- crPlots(model, main = paste("Component+Residual Plot:", model_name)) # Return plots for later use return(list(residuals_vs_fitted = residuals_vs_fitted, cr_plot = cr_plot)) } # Function to check normality of residuals check_normality <- function(model, model_name) { cat("\n=== NORMALITY CHECK:", model_name, "===\n") # Q-Q plot qq_plot <- plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) # Shapiro-Wilk test residuals <- residuals(model) shapiro_test <- shapiro.test(residuals) cat("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5), "\n") # Kolmogorov-Smirnov test ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) cat("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5), "\n") # Histogram of residuals hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + geom_histogram(bins = 30, fill = "lightblue", color = "black") + ggtitle(paste("Residuals Histogram:", model_name)) + theme_minimal() return(list(qq_plot = qq_plot, hist_plot = hist_plot, shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) } # Function to check homoscedasticity (constant variance) check_homoscedasticity <- function(model, model_name) { cat("\n=== HOMOSCEDASTICITY CHECK:", model_name, "===\n") # Scale-Location plot scale_location_plot <- plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) # Breusch-Pagan test bp_test <- bptest(model) cat("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5), "\n") # White test (if available) tryCatch({ white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) cat("White test p-value:", format(white_test$p.value, digits = 5), "\n") }, error = function(e) { cat("White test not available for this model\n") }) return(list(scale_location_plot = scale_location_plot, bp_p = bp_test$p.value)) } # Function to check independence (no autocorrelation) check_independence <- function(model, model_name) { cat("\n=== INDEPENDENCE CHECK:", model_name, "===\n") # Durbin-Watson test dw_test <- durbinWatsonTest(model) cat("Durbin-Watson statistic:", format(dw_test$dw, digits = 5), "\n") cat("Durbin-Watson p-value:", format(dw_test$p, digits = 5), "\n") # Residuals vs Order plot residuals_vs_order <- ggplot(data.frame( residuals = residuals(model), order = 1:length(residuals(model)) ), aes(x = order, y = residuals)) + geom_point(color = "black") + geom_hline(yintercept = 0, linetype = "dashed", color = "red") + ggtitle(paste("Residuals vs Order:", model_name)) + theme_minimal() return(list(residuals_vs_order = residuals_vs_order, dw_stat = dw_test$dw, dw_p = dw_test$p)) } # Function to check for influential observations check_influence <- function(model, model_name) { cat("\n=== INFLUENCE CHECK:", model_name, "===\n") # Cook's Distance plot cooks_plot <- plot(model, which = 4, main = paste("Cook's Distance:", model_name)) # Calculate influence measures influence_measures <- influence.measures(model) cooks_d <- cooks.distance(model) leverage <- hatvalues(model) dffits_val <- dffits(model) # Identify influential observations cooks_threshold <- 4/length(cooks_d) # Cook's D threshold leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold influential_cooks <- which(cooks_d > cooks_threshold) influential_leverage <- which(leverage > leverage_threshold) influential_dffits <- which(abs(dffits_val) > dffits_threshold) cat("Cook's Distance threshold:", format(cooks_threshold, digits = 5), "\n") cat("Influential observations (Cook's D):", length(influential_cooks), "\n") cat("Leverage threshold:", format(leverage_threshold, digits = 5), "\n") cat("High leverage observations:", length(influential_leverage), "\n") cat("DFFITS threshold:", format(dffits_threshold, digits = 5), "\n") cat("Influential observations (DFFITS):", length(influential_dffits), "\n") if (length(influential_cooks) > 0) { cat("Cook's D influential cases:", influential_cooks, "\n") } if (length(influential_leverage) > 0) { cat("High leverage cases:", influential_leverage, "\n") } if (length(influential_dffits) > 0) { cat("DFFITS influential cases:", influential_dffits, "\n") } return(list(cooks_plot = cooks_plot, influential_cooks = influential_cooks, influential_leverage = influential_leverage, influential_dffits = influential_dffits)) } # Function to get comprehensive model summary get_model_summary <- function(model, model_name) { cat("\n=== MODEL SUMMARY:", model_name, "===\n") # Basic model summary summary_model <- summary(model) print(summary_model) # R-squared and adjusted R-squared cat("\nR-squared:", format(summary_model$r.squared, digits = 5), "\n") cat("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5), "\n") # AIC and BIC aic_val <- AIC(model) bic_val <- BIC(model) cat("AIC:", format(aic_val, digits = 5), "\n") cat("BIC:", format(bic_val, digits = 5), "\n") return(list(summary = summary_model, r_squared = summary_model$r.squared, adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) }