232 lines
8.3 KiB
R
232 lines
8.3 KiB
R
options(scipen = 999)
|
|
|
|
library(dplyr)
|
|
library(car)
|
|
library(lmtest)
|
|
library(stargazer)
|
|
library(sandwich)
|
|
library(lmtest)
|
|
|
|
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
|
|
|
|
df <- read.csv("ehi1.csv")
|
|
|
|
data <- df %>%
|
|
select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>%
|
|
mutate(demo_edu = as.factor(demo_edu))
|
|
|
|
# examine data object
|
|
str(data)
|
|
colSums(is.na(data))
|
|
sapply(data, class)
|
|
levels(data$demo_edu)
|
|
|
|
data$demo_edu <- factor(data$demo_edu, levels = c(
|
|
"High School (or equivalent)",
|
|
"Trade School (non-military)",
|
|
"College Diploma/Certificate",
|
|
"University - Undergraduate",
|
|
"University - Graduate (Masters)",
|
|
"University - PhD",
|
|
"Professional Degree (ex. JD/MD)"
|
|
))
|
|
|
|
levels(data$demo_edu)
|
|
# Create dummy variables
|
|
dummy_vars <- model.matrix(~ demo_edu - 1, data = data)
|
|
dummy_df <- as.data.frame(dummy_vars)
|
|
|
|
# Rename columns with meaningful names (excluding reference level)
|
|
colnames(dummy_df) <- c(
|
|
"edu_highschool", # reference level (will be dropped)
|
|
"edu_trade",
|
|
"edu_college",
|
|
"edu_uni_undergrad",
|
|
"edu_uni_masters",
|
|
"edu_uni_phd",
|
|
"edu_prof"
|
|
)
|
|
# Add to your data
|
|
data <- cbind(data, dummy_df)
|
|
|
|
data <- data %>% select(-starts_with("edu_highschool"))
|
|
|
|
#### MODEL 1 - DGEN ####
|
|
|
|
model_DGEN <- lm(eohiDGEN_mean ~ edu_trade + edu_college + edu_uni_undergrad +
|
|
edu_uni_masters + edu_uni_phd + edu_prof, data = data)
|
|
|
|
# Model 1 diagnostics
|
|
par(mfrow = c(2, 2))
|
|
plot(model_DGEN, which = 1) # Residuals vs Fitted
|
|
plot(model_DGEN, which = 2) # Normal Q-Q, normality
|
|
hist(residuals(model_DGEN), main = "Histogram of Residuals", xlab = "Residuals")
|
|
shapiro.test(residuals(model_DGEN))
|
|
|
|
plot(model_DGEN, which = 3) # Scale-Location
|
|
plot(model_DGEN, which = 4) # Cook's Distance
|
|
|
|
# Model 1 specific tests
|
|
vif(model_DGEN) # Multicollinearity
|
|
dwtest(model_DGEN) # Independence
|
|
outlierTest(model_DGEN) # Outliers
|
|
|
|
# Look at the specific influential cases
|
|
data[c(670, 388, 760), ]
|
|
|
|
# 6 outliers: 670, 388, 760, 258, 873, 1030; acknoledge their presence but also they represent ~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable.
|
|
# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity.
|
|
# normality violated but sample size is robust to violation
|
|
# no multicollinearity
|
|
# no autocorrelation (samples are independent)
|
|
|
|
#results
|
|
print(summary(model_DGEN))
|
|
print(AIC(model_DGEN))
|
|
|
|
# Create a nice formatted table
|
|
stargazer(model_DGEN, type = "text",
|
|
title = "Regression Results: Education and EOHI-DGEN",
|
|
dep.var.labels = "EOHI-DGEN Mean",
|
|
covariate.labels = c("Trade School", "College", "University Undergrad",
|
|
"University Masters", "University PhD", "Professional Degree"),
|
|
report = "vcsp",
|
|
add.lines = list(c("AIC", round(AIC(model_DGEN), 2))))
|
|
|
|
# Use robust standard errors (doesn't change coefficients, just SEs)
|
|
modelDGEN_robust <- coeftest(model_DGEN, vcov = vcovHC(model_DGEN, type = "HC3"))
|
|
|
|
stargazer(modelDGEN_robust, type = "text",
|
|
title = "Regression Results: Education and EOHI-DGEN",
|
|
dep.var.labels = "EOHI-DGEN Mean",
|
|
covariate.labels = c("Trade School", "College", "University Undergrad",
|
|
"University Masters", "University PhD", "Professional Degree"),
|
|
report = "vcsp")
|
|
|
|
|
|
#### MODEL 2 - DOMAIN ####
|
|
|
|
model_domain <- lm(ehi_global_mean ~ edu_trade + edu_college + edu_uni_undergrad +
|
|
edu_uni_masters + edu_uni_phd + edu_prof, data = data)
|
|
|
|
# ASSUMPTION CHECKING FOR MODEL 2 (model_domain)
|
|
plot(model_domain, which = 1) # Residuals vs Fitted
|
|
|
|
plot(model_domain, which = 2) # Normal Q-Q, normality
|
|
hist(residuals(model_domain), main = "Histogram of Residuals", xlab = "Residuals")
|
|
shapiro.test(residuals(model_domain))
|
|
|
|
plot(model_domain, which = 3) # Scale-Location
|
|
plot(model_domain, which = 4) # Cook's Distance
|
|
|
|
# Model 2 specific tests
|
|
vif(model_domain) # Multicollinearity
|
|
dwtest(model_domain) # Independence
|
|
outlierTest(model_domain) # Outliers
|
|
|
|
# Check if the autocorrelation is real or artifactual
|
|
# Plot residuals against observation order
|
|
plot(residuals(model_domain), type = "l")
|
|
abline(h = 0, col = "red")
|
|
|
|
|
|
# 6 outliers: acknoledge their presence but also they represent ~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable.
|
|
# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity.
|
|
# normality violated but sample size is robust to violation
|
|
# no multicollinearity
|
|
# auto correlation is significant, may be due to aggregated measure of multiple repeated measures
|
|
|
|
# Reset plotting to 1x1
|
|
# par(mfrow = c(1, 1))
|
|
|
|
print(summary(model_domain))
|
|
print(AIC(model_domain))
|
|
|
|
stargazer(model_domain, type = "text",
|
|
title = "Regression Results: Education and EHI Domain",
|
|
dep.var.labels = "EHI Domain Mean",
|
|
covariate.labels = c("Trade School", "College", "University Undergrad",
|
|
"University Masters", "University PhD", "Professional Degree"),
|
|
report = "vcsp", # This shows coefficients, SEs, and p-values
|
|
add.lines = list(c("AIC", round(AIC(model_domain), 2))))
|
|
|
|
# Use robust standard errors (doesn't change coefficients, just SEs)
|
|
modelDOMAIN_robust <- coeftest(model_domain, vcov = vcovHC(model_domain, type = "HC3"))
|
|
|
|
stargazer(modelDOMAIN_robust, type = "text",
|
|
title = "Regression Results: Education and EHI Domain",
|
|
dep.var.labels = "EHI Domain Mean",
|
|
covariate.labels = c("Trade School", "College", "University Undergrad",
|
|
"University Masters", "University PhD", "Professional Degree"),
|
|
report = "vcsp")
|
|
|
|
#### PLOTS ####
|
|
|
|
library(ggplot2)
|
|
library(dplyr)
|
|
|
|
# Calculate means and confidence intervals for EOHI-DGEN
|
|
edu_summary_DGEN <- data %>%
|
|
group_by(demo_edu) %>%
|
|
summarise(
|
|
mean_DGEN = mean(eohiDGEN_mean, na.rm = TRUE),
|
|
n = n(),
|
|
se_DGEN = sd(eohiDGEN_mean, na.rm = TRUE) / sqrt(n()),
|
|
ci_lower_DGEN = mean_DGEN - 1.96 * se_DGEN,
|
|
ci_upper_DGEN = mean_DGEN + 1.96 * se_DGEN
|
|
)
|
|
|
|
# Calculate means and confidence intervals for EHI Domain
|
|
edu_summary_domain <- data %>%
|
|
group_by(demo_edu) %>%
|
|
summarise(
|
|
mean_domain = mean(ehi_global_mean, na.rm = TRUE),
|
|
n = n(),
|
|
se_domain = sd(ehi_global_mean, na.rm = TRUE) / sqrt(n()),
|
|
ci_lower_domain = mean_domain - 1.96 * se_domain,
|
|
ci_upper_domain = mean_domain + 1.96 * se_domain
|
|
)
|
|
|
|
# Plot 1: EOHI-DGEN means with confidence intervals
|
|
p1 <- ggplot(edu_summary_DGEN, aes(x = demo_edu, y = mean_DGEN)) +
|
|
geom_point(size = 3, color = "steelblue") +
|
|
geom_errorbar(aes(ymin = ci_lower_DGEN, ymax = ci_upper_DGEN),
|
|
width = 0.1, color = "steelblue", linewidth = 1) +
|
|
labs(
|
|
title = "Mean EOHI-DGEN by Education Level",
|
|
subtitle = "Error bars show 95% confidence intervals",
|
|
x = "Education Level",
|
|
y = "Mean EOHI-DGEN Score"
|
|
) +
|
|
theme_minimal() +
|
|
theme(
|
|
axis.text.x = element_text(angle = 45, hjust = 1),
|
|
plot.title = element_text(size = 14, face = "bold"),
|
|
plot.subtitle = element_text(size = 10, color = "gray60")
|
|
)
|
|
|
|
# Plot 2: EHI Domain means with confidence intervals
|
|
p2 <- ggplot(edu_summary_domain, aes(x = demo_edu, y = mean_domain)) +
|
|
geom_point(size = 3, color = "darkgreen") +
|
|
geom_errorbar(aes(ymin = ci_lower_domain, ymax = ci_upper_domain),
|
|
width = 0.1, color = "darkgreen", linewidth = 1) +
|
|
labs(
|
|
title = "Mean EHI Domain by Education Level",
|
|
subtitle = "Error bars show 95% confidence intervals",
|
|
x = "Education Level",
|
|
y = "Mean EHI Domain Score"
|
|
) +
|
|
theme_minimal() +
|
|
theme(
|
|
axis.text.x = element_text(angle = 45, hjust = 1),
|
|
plot.title = element_text(size = 14, face = "bold"),
|
|
plot.subtitle = element_text(size = 10, color = "gray60")
|
|
)
|
|
|
|
# Display the plots
|
|
print(p1)
|
|
print(p2)
|
|
|
|
# Save the plots
|
|
ggsave("education_DGEN_means.png", p1, width = 10, height = 6, dpi = 300)
|
|
ggsave("education_domain_means.png", p2, width = 10, height = 6, dpi = 300) |