options(scipen = 999) library(dplyr) library(car) library(lmtest) library(stargazer) library(sandwich) library(lmtest) setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") df <- read.csv("ehi1.csv") data <- df %>% select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% mutate(demo_edu = as.factor(demo_edu)) # examine data object str(data) colSums(is.na(data)) sapply(data, class) levels(data$demo_edu) data$demo_edu <- factor(data$demo_edu, levels = c( "High School (or equivalent)", "Trade School (non-military)", "College Diploma/Certificate", "University - Undergraduate", "University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)" )) levels(data$demo_edu) # Create dummy variables dummy_vars <- model.matrix(~ demo_edu - 1, data = data) dummy_df <- as.data.frame(dummy_vars) # Rename columns with meaningful names (excluding reference level) colnames(dummy_df) <- c( "edu_highschool", # reference level (will be dropped) "edu_trade", "edu_college", "edu_uni_undergrad", "edu_uni_masters", "edu_uni_phd", "edu_prof" ) # Add to your data data <- cbind(data, dummy_df) data <- data %>% select(-starts_with("edu_highschool")) #### MODEL 1 - DGEN #### model_DGEN <- lm(eohiDGEN_mean ~ edu_trade + edu_college + edu_uni_undergrad + edu_uni_masters + edu_uni_phd + edu_prof, data = data) # Model 1 diagnostics par(mfrow = c(2, 2)) plot(model_DGEN, which = 1) # Residuals vs Fitted plot(model_DGEN, which = 2) # Normal Q-Q, normality hist(residuals(model_DGEN), main = "Histogram of Residuals", xlab = "Residuals") shapiro.test(residuals(model_DGEN)) plot(model_DGEN, which = 3) # Scale-Location plot(model_DGEN, which = 4) # Cook's Distance # Model 1 specific tests vif(model_DGEN) # Multicollinearity dwtest(model_DGEN) # Independence outlierTest(model_DGEN) # Outliers # Look at the specific influential cases data[c(670, 388, 760), ] # 6 outliers: 670, 388, 760, 258, 873, 1030; acknoledge their presence but also they represent ~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable. # heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. # normality violated but sample size is robust to violation # no multicollinearity # no autocorrelation (samples are independent) #results print(summary(model_DGEN)) print(AIC(model_DGEN)) # Create a nice formatted table stargazer(model_DGEN, type = "text", title = "Regression Results: Education and EOHI-DGEN", dep.var.labels = "EOHI-DGEN Mean", covariate.labels = c("Trade School", "College", "University Undergrad", "University Masters", "University PhD", "Professional Degree"), report = "vcsp", add.lines = list(c("AIC", round(AIC(model_DGEN), 2)))) # Use robust standard errors (doesn't change coefficients, just SEs) modelDGEN_robust <- coeftest(model_DGEN, vcov = vcovHC(model_DGEN, type = "HC3")) stargazer(modelDGEN_robust, type = "text", title = "Regression Results: Education and EOHI-DGEN", dep.var.labels = "EOHI-DGEN Mean", covariate.labels = c("Trade School", "College", "University Undergrad", "University Masters", "University PhD", "Professional Degree"), report = "vcsp") #### MODEL 2 - DOMAIN #### model_domain <- lm(ehi_global_mean ~ edu_trade + edu_college + edu_uni_undergrad + edu_uni_masters + edu_uni_phd + edu_prof, data = data) # ASSUMPTION CHECKING FOR MODEL 2 (model_domain) plot(model_domain, which = 1) # Residuals vs Fitted plot(model_domain, which = 2) # Normal Q-Q, normality hist(residuals(model_domain), main = "Histogram of Residuals", xlab = "Residuals") shapiro.test(residuals(model_domain)) plot(model_domain, which = 3) # Scale-Location plot(model_domain, which = 4) # Cook's Distance # Model 2 specific tests vif(model_domain) # Multicollinearity dwtest(model_domain) # Independence outlierTest(model_domain) # Outliers # Check if the autocorrelation is real or artifactual # Plot residuals against observation order plot(residuals(model_domain), type = "l") abline(h = 0, col = "red") # 6 outliers: acknoledge their presence but also they represent ~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable. # heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. # normality violated but sample size is robust to violation # no multicollinearity # auto correlation is significant, may be due to aggregated measure of multiple repeated measures # Reset plotting to 1x1 # par(mfrow = c(1, 1)) print(summary(model_domain)) print(AIC(model_domain)) stargazer(model_domain, type = "text", title = "Regression Results: Education and EHI Domain", dep.var.labels = "EHI Domain Mean", covariate.labels = c("Trade School", "College", "University Undergrad", "University Masters", "University PhD", "Professional Degree"), report = "vcsp", # This shows coefficients, SEs, and p-values add.lines = list(c("AIC", round(AIC(model_domain), 2)))) # Use robust standard errors (doesn't change coefficients, just SEs) modelDOMAIN_robust <- coeftest(model_domain, vcov = vcovHC(model_domain, type = "HC3")) stargazer(modelDOMAIN_robust, type = "text", title = "Regression Results: Education and EHI Domain", dep.var.labels = "EHI Domain Mean", covariate.labels = c("Trade School", "College", "University Undergrad", "University Masters", "University PhD", "Professional Degree"), report = "vcsp") #### PLOTS #### library(ggplot2) library(dplyr) # Calculate means and confidence intervals for EOHI-DGEN edu_summary_DGEN <- data %>% group_by(demo_edu) %>% summarise( mean_DGEN = mean(eohiDGEN_mean, na.rm = TRUE), n = n(), se_DGEN = sd(eohiDGEN_mean, na.rm = TRUE) / sqrt(n()), ci_lower_DGEN = mean_DGEN - 1.96 * se_DGEN, ci_upper_DGEN = mean_DGEN + 1.96 * se_DGEN ) # Calculate means and confidence intervals for EHI Domain edu_summary_domain <- data %>% group_by(demo_edu) %>% summarise( mean_domain = mean(ehi_global_mean, na.rm = TRUE), n = n(), se_domain = sd(ehi_global_mean, na.rm = TRUE) / sqrt(n()), ci_lower_domain = mean_domain - 1.96 * se_domain, ci_upper_domain = mean_domain + 1.96 * se_domain ) # Plot 1: EOHI-DGEN means with confidence intervals p1 <- ggplot(edu_summary_DGEN, aes(x = demo_edu, y = mean_DGEN)) + geom_point(size = 3, color = "steelblue") + geom_errorbar(aes(ymin = ci_lower_DGEN, ymax = ci_upper_DGEN), width = 0.1, color = "steelblue", linewidth = 1) + labs( title = "Mean EOHI-DGEN by Education Level", subtitle = "Error bars show 95% confidence intervals", x = "Education Level", y = "Mean EOHI-DGEN Score" ) + theme_minimal() + theme( axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(size = 14, face = "bold"), plot.subtitle = element_text(size = 10, color = "gray60") ) # Plot 2: EHI Domain means with confidence intervals p2 <- ggplot(edu_summary_domain, aes(x = demo_edu, y = mean_domain)) + geom_point(size = 3, color = "darkgreen") + geom_errorbar(aes(ymin = ci_lower_domain, ymax = ci_upper_domain), width = 0.1, color = "darkgreen", linewidth = 1) + labs( title = "Mean EHI Domain by Education Level", subtitle = "Error bars show 95% confidence intervals", x = "Education Level", y = "Mean EHI Domain Score" ) + theme_minimal() + theme( axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(size = 14, face = "bold"), plot.subtitle = element_text(size = 10, color = "gray60") ) # Display the plots print(p1) print(p2) # Save the plots ggsave("education_DGEN_means.png", p1, width = 10, height = 6, dpi = 300) ggsave("education_domain_means.png", p2, width = 10, height = 6, dpi = 300)