# Education as a predictor of EOHI scores.
#
# Reads ehi1.csv, dummy-codes education (reference level: high school), and
# fits one OLS model per outcome (eohiDGEN_mean, ehi_global_mean). Each model
# is followed by assumption diagnostics, a stargazer table, and a second
# table using HC3 heteroscedasticity-robust standard errors.

options(scipen = 999)  # strongly prefer fixed over scientific notation

library(dplyr)
library(car)
library(lmtest)
library(stargazer)
library(sandwich)

# NOTE(review): machine-specific absolute path; the script only runs where
# this directory exists. read.csv() below depends on it.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")

df <- read.csv("ehi1.csv")

data <- df %>%
  select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>%
  mutate(demo_edu = as.factor(demo_edu))

# examine data object
str(data)
colSums(is.na(data))
sapply(data, class)
levels(data$demo_edu)

# Order the education levels explicitly; the first level (high school) is
# the reference category.
data$demo_edu <- factor(data$demo_edu, levels = c(
  "High School (or equivalent)",
  "Trade School (non-military)",
  "College Diploma/Certificate",
  "University - Undergraduate",
  "University - Graduate (Masters)",
  "University - PhD",
  "Professional Degree (ex. JD/MD)"
))
levels(data$demo_edu)

# factor() silently turns any label that is not listed in `levels` into NA,
# and model.matrix() (under the default na.action) drops NA rows, which would
# leave the dummy columns with fewer rows than `data` and make cbind() fail
# with an obscure row-count error. Fail here with an explicit message instead.
n_edu_missing <- sum(is.na(data$demo_edu))
if (n_edu_missing > 0) {
  stop(
    n_edu_missing,
    " row(s) have a missing or unrecognised demo_edu value; ",
    "fix the labels or drop these rows before dummy coding.",
    call. = FALSE
  )
}

# Create dummy variables (one indicator column per education level)
dummy_vars <- model.matrix(~ demo_edu - 1, data = data)
dummy_df <- as.data.frame(dummy_vars)

# Rename columns with meaningful names, in the same order as the factor levels
colnames(dummy_df) <- c(
  "edu_highschool",  # reference level (will be dropped)
  "edu_trade",
  "edu_college",
  "edu_uni_undergrad",
  "edu_uni_masters",
  "edu_uni_phd",
  "edu_prof"
)

# Add to your data, then drop the reference-level indicator
data <- cbind(data, dummy_df)
data <- data %>%
  select(-starts_with("edu_highschool"))

#### MODEL 1 - DGEN ####

model_DGEN <- lm(
  eohiDGEN_mean ~ edu_trade + edu_college + edu_uni_undergrad +
    edu_uni_masters + edu_uni_phd + edu_prof,
  data = data
)

# Model 1 diagnostics
par(mfrow = c(2, 2))
plot(model_DGEN, which = 1)  # Residuals vs Fitted
plot(model_DGEN, which = 2)  # Normal Q-Q, normality
hist(residuals(model_DGEN), main = "Histogram of Residuals", xlab = "Residuals")
shapiro.test(residuals(model_DGEN))
plot(model_DGEN, which = 3)  # Scale-Location
plot(model_DGEN, which = 4)  # Cook's Distance

# Model 1 specific tests
vif(model_DGEN)          # Multicollinearity
dwtest(model_DGEN)       # Independence
outlierTest(model_DGEN)  # Outliers

# Look at the specific influential cases
data[c(670, 388, 760), ]

# 6 outliers: 670, 388, 760, 258, 873, 1030; acknowledge their presence, but
#   they represent ~0.58% of the total sample size, which is well below the 5%
#   of outliers that would be considered acceptable.
# heteroscedasticity: may be due to binary vars creating discrete clusters,
#   or due to real heteroscedasticity.
# normality violated but sample size is robust to violation
# no multicollinearity
# no autocorrelation (samples are independent)

# results
summary(model_DGEN)

# Create a nice formatted table.
# `add.lines` must be an argument of stargazer(); a stray ")" after `report`
# previously closed the call early and made the script unparseable.
stargazer(
  model_DGEN,
  type = "text",
  title = "Regression Results: Education and EOHI-DGEN",
  dep.var.labels = "EOHI-DGEN Mean",
  covariate.labels = c(
    "Trade School", "College", "University Undergrad",
    "University Masters", "University PhD", "Professional Degree"
  ),
  report = "vcsp",  # coefficients, SEs, and p-values
  add.lines = list(c("AIC", round(AIC(model_DGEN), 2)))
)

# Use robust standard errors (doesn't change coefficients, just SEs)
modelDGEN_robust <- coeftest(
  model_DGEN,
  vcov = vcovHC(model_DGEN, type = "HC3")
)

stargazer(
  modelDGEN_robust,
  type = "text",
  title = "Regression Results: Education and EOHI-DGEN",
  dep.var.labels = "EOHI-DGEN Mean",
  covariate.labels = c(
    "Trade School", "College", "University Undergrad",
    "University Masters", "University PhD", "Professional Degree"
  ),
  report = "vcsp"
)

#### MODEL 2 - DOMAIN ####

model_domain <- lm(
  ehi_global_mean ~ edu_trade + edu_college + edu_uni_undergrad +
    edu_uni_masters + edu_uni_phd + edu_prof,
  data = data
)

# ASSUMPTION CHECKING FOR MODEL 2 (model_domain)
plot(model_domain, which = 1)  # Residuals vs Fitted
plot(model_domain, which = 2)  # Normal Q-Q, normality
hist(residuals(model_domain), main = "Histogram of Residuals", xlab = "Residuals")
shapiro.test(residuals(model_domain))
plot(model_domain, which = 3)  # Scale-Location
plot(model_domain, which = 4)  # Cook's Distance

# Model 2 specific tests
vif(model_domain)          # Multicollinearity
dwtest(model_domain)       # Independence
outlierTest(model_domain)  # Outliers

# Check if the autocorrelation is real or artifactual
# Plot residuals against observation order
plot(residuals(model_domain), type = "l")
abline(h = 0, col = "red")

# 6 outliers: acknowledge their presence, but they represent ~0.58% of the
#   total sample size, which is well below the 5% of outliers that would be
#   considered acceptable.
# heteroscedasticity: may be due to binary vars creating discrete clusters,
#   or due to real heteroscedasticity.
# normality violated but sample size is robust to violation
# no multicollinearity
# autocorrelation is significant, may be due to aggregated measure of
#   multiple repeated measures

# Reset plotting to 1x1
# par(mfrow = c(1, 1))

summary(model_domain)

# The title previously repeated the Model 1 wording ("EOHI-DGEN") although
# this table reports the domain outcome, and the AIC row carried two values
# for a one-model table. The row now reports only this model's AIC.
# NOTE(review): confirm the preferred title wording for the domain outcome.
stargazer(
  model_domain,
  type = "text",
  title = "Regression Results: Education and EHI Domain",
  dep.var.labels = "EHI Domain Mean",
  covariate.labels = c(
    "Trade School", "College", "University Undergrad",
    "University Masters", "University PhD", "Professional Degree"
  ),
  report = "vcsp",  # This shows coefficients, SEs, and p-values
  add.lines = list(c("AIC", round(AIC(model_domain), 2)))
)

# Use robust standard errors (doesn't change coefficients, just SEs)
modelDOMAIN_robust <- coeftest(
  model_domain,
  vcov = vcovHC(model_domain, type = "HC3")
)

stargazer(
  modelDOMAIN_robust,
  type = "text",
  title = "Regression Results: Education and EHI Domain",
  dep.var.labels = "EHI Domain Mean",
  covariate.labels = c(
    "Trade School", "College", "University Undergrad",
    "University Masters", "University PhD", "Professional Degree"
  ),
  report = "vcsp"
)