--- title: "Regression Analysis - Assumption Checking" subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" author: "Irina" date: today format: html: theme: cosmo toc: true toc-depth: 3 code-fold: false code-tools: true fig-width: 8 fig-height: 6 execute: echo: true warning: false message: false eval: true output: true results: 'markup' cache: false --- ## Setup and Data Preparation ```{r setup} #| label: setup #| echo: true #| output: true # Load required libraries library(dplyr) # Must load first for %>% operator library(car) library(performance) library(see) library(ggplot2) library(gridExtra) library(lmtest) # For bptest and durbinWatsonTest # Set options options(scipen = 999) # Set working directory and load data setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") data <- read.csv("ehi1.csv") ``` ```{r data-prep} #| label: data-prep #| echo: true #| output: true # Check for missing values missing_summary <- data %>% select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% summarise_all(~sum(is.na(.))) print("Missing values check:") print(missing_summary) # Remove rows with missing values data_clean <- data %>% select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% filter(complete.cases(.)) print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) # Recode demo_sex as numeric for regression (0 = Female, 1 = Male) data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) # Check demo_edu levels and create dummy variables print("Education levels:") edu_table <- table(data_clean$demo_edu) print(edu_table) print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) # Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) # Using High School as reference category (excluded dummy) data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) # Check what other education levels exist and create additional dummies edu_levels <- unique(data_clean$demo_edu) print("All education levels found:") for(i in 1:length(edu_levels)) { print(paste(i, ":", edu_levels[i])) } # Create additional dummy variables for other education levels # (You'll need to adjust these based on your actual data) # Example for additional levels - adjust names and conditions as needed: # data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) # data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) # data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) # Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) # Verify dummy coding print("Sex recoding (0=Female, 1=Male):") print(table(data_clean$demo_sex_numeric)) print("Education dummy variables (k-1 coding with High School as reference):") print("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")) print("College:", sum(data_clean$edu_college)) print("Undergraduate:", sum(data_clean$edu_undergrad)) print("Graduate:", sum(data_clean$edu_grad)) ``` ## Regression Models ```{r models} #| label: models #| echo: true #| output: true # Define the 6 regression models models <- list() # Model 1: demo_sex → eohiDGEN_mean models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) # Model 2: demo_age → eohiDGEN_mean models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) # Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference) models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) # Model 4: demo_sex → ehi_global_mean models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) # Model 5: demo_age → ehi_global_mean models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) # Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference) models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) ``` ## Assumption Checking Functions ```{r functions} #| label: functions #| echo: true # Function to check linearity assumption check_linearity <- function(model, model_name) { print(paste("=== LINEARITY CHECK:", model_name, "===")) # Residuals vs Fitted plot plot(model, which = 1, main = paste("Linearity:", model_name)) # Component + residual plot (partial residual plot) crPlots(model, main = paste("Component+Residual Plot:", model_name)) return(NULL) } # Function to check normality of residuals check_normality <- function(model, model_name) { print(paste("=== NORMALITY CHECK:", model_name, "===")) # Q-Q plot plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) # Shapiro-Wilk test residuals <- residuals(model) shapiro_test <- shapiro.test(residuals) print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) # Kolmogorov-Smirnov test ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) # Histogram of residuals hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + geom_histogram(bins = 30, fill = "lightblue", color = "black") + ggtitle(paste("Residuals Histogram:", model_name)) + theme_minimal() print(hist_plot) return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) } # Function to check homoscedasticity (constant variance) check_homoscedasticity <- function(model, model_name) { print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) # Scale-Location plot plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) # Breusch-Pagan test bp_test <- bptest(model) print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) # White test (if available) tryCatch({ white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) print(paste("White test p-value:", format(white_test$p.value, digits = 5))) }, error = function(e) { print("White test not available for this model") }) return(list(bp_p = bp_test$p.value)) } # Function to check independence (no autocorrelation) check_independence <- function(model, model_name) { print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) # Durbin-Watson test dw_test <- durbinWatsonTest(model) print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) # Residuals vs Order plot residuals_vs_order <- ggplot(data.frame( residuals = residuals(model), order = seq_along(residuals(model)) ), aes(x = order, y = residuals)) + geom_point(color = "black") + geom_hline(yintercept = 0, linetype = "dashed", color = "red") + ggtitle(paste("Residuals vs Order:", model_name)) + theme_minimal() print(residuals_vs_order) return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) } # Function to check for influential observations check_influence <- function(model, model_name) { print(paste("=== INFLUENCE CHECK:", model_name, "===")) # Cook's Distance plot plot(model, which = 4, main = paste("Cook's Distance:", model_name)) # Calculate influence measures cooks_d <- cooks.distance(model) leverage <- hatvalues(model) dffits_val <- dffits(model) # Identify influential observations cooks_threshold <- 4/length(cooks_d) # Cook's D threshold leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold influential_cooks <- which(cooks_d > cooks_threshold) influential_leverage <- which(leverage > leverage_threshold) influential_dffits <- which(abs(dffits_val) > dffits_threshold) print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) print(paste("Influential observations (Cook's D):", length(influential_cooks))) print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) print(paste("High leverage observations:", length(influential_leverage))) print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) print(paste("Influential observations (DFFITS):", length(influential_dffits))) if (length(influential_cooks) > 0) { print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) } if (length(influential_leverage) > 0) { print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) } if (length(influential_dffits) > 0) { print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) } return(list(influential_cooks = influential_cooks, influential_leverage = influential_leverage, influential_dffits = influential_dffits)) } # Function to get comprehensive model summary get_model_summary <- function(model, model_name) { print(paste("=== MODEL SUMMARY:", model_name, "===")) # Basic model summary summary_model <- summary(model) print(summary_model) # R-squared and adjusted R-squared print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) # AIC and BIC aic_val <- AIC(model) bic_val <- BIC(model) print(paste("AIC:", format(aic_val, digits = 5))) print(paste("BIC:", format(bic_val, digits = 5))) return(list(summary = summary_model, r_squared = summary_model$r.squared, adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) } ``` ## Model 1: Sex → EOHI-DGEN Mean ```{r model1} #| label: model1 #| echo: true model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") ``` ## Model 2: Age → EOHI-DGEN Mean ```{r model2} #| label: model2 #| echo: true model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") ``` ## Model 3: Education → EOHI-DGEN Mean ```{r model3} #| label: model3 #| echo: true model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") ``` ## Model 4: Sex → EHI-Global Mean ```{r model4} #| label: model4 #| echo: true model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") ``` ## Model 5: Age → EHI-Global Mean ```{r model5} #| label: model5 #| echo: true model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") ``` ## Model 6: Education → EHI-Global Mean ```{r model6} #| label: model6 #| echo: true model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") ``` ## Summary Tables ### Assumption Violation Summary ```{r violation-summary} #| label: violation-summary #| echo: true # Create summary table violation_summary <- data.frame( Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), Normality = c( ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") ), Homoscedasticity = c( ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") ), Independence = c( ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") ), Influential_Obs = c( ifelse(length(model1_influence$influential_cooks) > 0, paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), ifelse(length(model2_influence$influential_cooks) > 0, paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), ifelse(length(model3_influence$influential_cooks) > 0, paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), ifelse(length(model4_influence$influential_cooks) > 0, paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), ifelse(length(model5_influence$influential_cooks) > 0, paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), ifelse(length(model6_influence$influential_cooks) > 0, paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") ), stringsAsFactors = FALSE ) print(violation_summary) ``` ### Model Comparison Summary ```{r comparison-summary} #| label: comparison-summary #| echo: true # Create model comparison table comparison_table <- data.frame( Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, model4_summary$aic, model5_summary$aic, model6_summary$aic), BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, model4_summary$bic, model5_summary$bic, model6_summary$bic), Significant = c( ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") ), stringsAsFactors = FALSE ) print(comparison_table) ``` ## Recommendations ### For Assumption Violations: **1. Normality Violations:** - If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) - Alternative: Use robust regression methods or bootstrapping **2. Homoscedasticity Violations:** - If violated: Use weighted least squares or robust standard errors - Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors **3. Independence Violations:** - If violated: Check for clustering or repeated measures structure - Alternative: Use mixed-effects models or clustered standard errors **4. Influential Observations:** - If present: Examine these cases for data entry errors - Consider: Running analysis with and without influential cases - Alternative: Use robust regression methods **5. Linearity Violations:** - If violated: Add polynomial terms or use splines - Alternative: Transform predictors or use non-parametric methods --- *Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*