# Strongly prefer fixed over scientific notation when printing numbers
# (affects how coefficients and p-values are displayed in the output below).
options(scipen = 999)

library(dplyr)
library(car)
library(lmtest)
library(stargazer)
library(sandwich)
library(lmtest)

#### DATA PREPARATION ####

# NOTE(review): hard-coded, machine-specific working directory; the script
# only runs as-is on a machine where this path exists.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")

df <- read.csv("ehi1.csv")

# Keep the two outcome measures and the education variable.
data <- df %>%
  select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>%
  mutate(demo_edu = as.factor(demo_edu))

# examine data object
str(data)
colSums(is.na(data))
sapply(data, class)
levels(data$demo_edu)

# Re-level education so that "High School (or equivalent)" is the first
# (reference) level. factor() silently converts any label that is not listed
# here into NA, so compare the NA count before and after and warn about it.
edu_levels <- c(
  "High School (or equivalent)",
  "Trade School (non-military)",
  "College Diploma/Certificate",
  "University - Undergraduate",
  "University - Graduate (Masters)",
  "University - PhD",
  "Professional Degree (ex. JD/MD)"
)
n_missing_before <- sum(is.na(data$demo_edu))
data$demo_edu <- factor(data$demo_edu, levels = edu_levels)
n_unmatched <- sum(is.na(data$demo_edu)) - n_missing_before
if (n_unmatched > 0) {
  warning(
    n_unmatched, " row(s) have a demo_edu label that is not in the expected ",
    "level list and were set to NA",
    call. = FALSE
  )
}

levels(data$demo_edu)

# Create dummy variables: one 0/1 column per education level.
# With the default na.action, model.matrix() drops rows whose demo_edu is
# missing, which would make the cbind() below fail on a row-count mismatch.
# Building the model frame with na.pass keeps every row; a missing education
# value becomes NA in every dummy column.
edu_frame <- model.frame(~ demo_edu, data = data, na.action = na.pass)
dummy_df <- as.data.frame(model.matrix(~ demo_edu - 1, data = edu_frame))
stopifnot(nrow(dummy_df) == nrow(data))

# Rename columns with meaningful names, in the same order as the factor levels
colnames(dummy_df) <- c(
  "edu_highschool", # reference level (dropped below)
  "edu_trade",
  "edu_college",
  "edu_uni_undergrad",
  "edu_uni_masters",
  "edu_uni_phd",
  "edu_prof"
)

# Add to your data
data <- cbind(data, dummy_df)

# Drop the reference-level dummy so the remaining six can be entered together
# in a model with an intercept.
data <- data %>% select(-edu_highschool)

#### MODEL 1 - DGEN ####

# Regress the EOHI-DGEN mean on the six education dummies; the high-school
# dummy is omitted, so high school is the reference category.
model_DGEN <- lm(
  eohiDGEN_mean ~ edu_trade + edu_college + edu_uni_undergrad +
    edu_uni_masters + edu_uni_phd + edu_prof,
  data = data
)

# Model 1 diagnostics
par(mfrow = c(2, 2))
plot(model_DGEN, which = 1)  # Residuals vs Fitted
plot(model_DGEN, which = 2)  # Normal Q-Q, normality
hist(residuals(model_DGEN), main = "Histogram of Residuals", xlab = "Residuals")
shapiro.test(residuals(model_DGEN))

plot(model_DGEN, which = 3)  # Scale-Location
plot(model_DGEN, which = 4)  # Cook's Distance

# Model 1 specific tests
vif(model_DGEN)              # Multicollinearity
dwtest(model_DGEN)           # Independence
outlierTest(model_DGEN)      # Outliers

# Look at the specific influential cases
# NOTE(review): row numbers are hard-coded from an earlier run; re-check them
# against the outlierTest() output if the input data changes.
data[c(670, 388, 760), ]

# 6 outliers: 670, 388, 760, 258, 873, 1030; acknowledge their presence, but
#   they represent ~0.58% of the total sample size, which is well below the 5%
#   of outliers that would be considered acceptable.
# heteroscedasticity: may be d/t binary vars creating discrete clusters, or d/t
#   real heteroscedasticity.
# normality violated but sample size is robust to violation
# no multicollinearity
# no autocorrelation (samples are independent)

# results
summary(model_DGEN)

# Row labels shared by both tables, in the order of the model terms
edu_labels <- c(
  "Trade School", "College", "University Undergrad",
  "University Masters", "University PhD", "Professional Degree"
)

# Create a nice formatted table.
# The original call closed its parenthesis after `report`, leaving `add.lines`
# outside the call (a syntax error); all arguments now sit inside one call.
stargazer(
  model_DGEN,
  type = "text",
  title = "Regression Results: Education and EOHI-DGEN",
  dep.var.labels = "EOHI-DGEN Mean",
  covariate.labels = edu_labels,
  report = "vcsp", # variable names, coefficients, SEs, p-values
  add.lines = list(c("AIC", round(AIC(model_DGEN), 2)))
)

# Use robust standard errors (doesn't change coefficients, just SEs)
modelDGEN_robust <- coeftest(
  model_DGEN,
  vcov = vcovHC(model_DGEN, type = "HC3")
)

stargazer(
  modelDGEN_robust,
  type = "text",
  title = "Regression Results: Education and EOHI-DGEN",
  dep.var.labels = "EOHI-DGEN Mean",
  covariate.labels = edu_labels,
  report = "vcsp"
)
          

#### MODEL 2 - DOMAIN ####

# Regress the EHI global (domain) mean on the six education dummies; the
# high-school dummy is omitted, so high school is the reference category.
model_domain <- lm(
  ehi_global_mean ~ edu_trade + edu_college + edu_uni_undergrad +
    edu_uni_masters + edu_uni_phd + edu_prof,
  data = data
)

# ASSUMPTION CHECKING FOR MODEL 2 (model_domain)
plot(model_domain, which = 1)  # Residuals vs Fitted

plot(model_domain, which = 2)  # Normal Q-Q, normality
hist(
  residuals(model_domain),
  main = "Histogram of Residuals",
  xlab = "Residuals"
)
shapiro.test(residuals(model_domain))

plot(model_domain, which = 3)  # Scale-Location
plot(model_domain, which = 4)  # Cook's Distance

# Model 2 specific tests
vif(model_domain)              # Multicollinearity
dwtest(model_domain)           # Independence
outlierTest(model_domain)      # Outliers

# Check if the autocorrelation is real or artifactual
# Plot residuals against observation order
plot(residuals(model_domain), type = "l")
abline(h = 0, col = "red")


# 6 outliers: acknowledge their presence, but they represent ~0.58% of the
#   total sample size, which is well below the 5% of outliers that would be
#   considered acceptable.
# heteroscedasticity: may be d/t binary vars creating discrete clusters, or d/t
#   real heteroscedasticity.
# normality violated but sample size is robust to violation
# no multicollinearity
# autocorrelation is significant, may be due to aggregated measure of multiple
#   repeated measures

# Reset plotting to 1x1
# par(mfrow = c(1, 1))

summary(model_domain)

# Row labels shared by both tables, in the order of the model terms
edu_labels <- c(
  "Trade School", "College", "University Undergrad",
  "University Masters", "University PhD", "Professional Degree"
)

# Formatted table for the domain model. Fixes relative to the original call:
#  * `add.lines` sat outside the call after a stray ")" (a syntax error);
#  * the AIC row carried two values (including model_DGEN's) although the
#    table has a single model column;
#  * the title named EOHI-DGEN, the outcome of the other model.
stargazer(
  model_domain,
  type = "text",
  title = "Regression Results: Education and EHI-Domain",
  dep.var.labels = "EHI Domain Mean",
  covariate.labels = edu_labels,
  report = "vcsp", # This shows coefficients, SEs, and p-values
  add.lines = list(c("AIC", round(AIC(model_domain), 2)))
)

# Use robust standard errors (doesn't change coefficients, just SEs)
modelDOMAIN_robust <- coeftest(
  model_domain,
  vcov = vcovHC(model_domain, type = "HC3")
)

stargazer(
  modelDOMAIN_robust,
  type = "text",
  title = "Regression Results: Education and EHI-Domain",
  dep.var.labels = "EHI Domain Mean",
  covariate.labels = edu_labels,
  report = "vcsp"
)