# Regression of eohiDGEN_mean and ehi_global_mean on age and sex.
#
# Steps: load ehi1.csv, keep the four analysis columns, drop respondents who
# answered "Prefer not to say" for sex, build a sex dummy and a mean-centered
# age variable, print descriptives, then fit and diagnose three pairs of
# linear models (age only, sex only, age x sex interaction).

# Print numbers in fixed rather than scientific notation.
options(scipen = 999)

library(dplyr)
library(car)
library(lmtest)
library(stargazer)
library(sandwich)

# NOTE(review): hard-coded, machine-specific working directory. It is kept
# because code outside this section may rely on it for relative paths;
# consider a project-relative path instead.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")

df <- read.csv("ehi1.csv")

# filter() keeps only rows where the condition is TRUE, so rows with a
# missing demo_sex are dropped here as well as "Prefer not to say".
data <- df %>%
  select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>%
  filter(demo_sex != "Prefer not to say")

# Structure, missing-value counts and column classes of the analysis data.
# print() is explicit so the output also appears when the file is source()d.
str(data)
print(colSums(is.na(data)))
print(sapply(data, class))

# Create dummy variable for sex (0 = Male, 1 = Female).
# Any other label becomes NA rather than being silently coded as Male;
# lm() drops rows with NA predictors under the default na.action.
data$sex_dummy <- ifelse(
  data$demo_sex == "Female",
  1,
  ifelse(data$demo_sex == "Male", 0, NA_real_)
)

unexpected_sex <- setdiff(
  unique(as.character(data$demo_sex)),
  c("Male", "Female", NA)
)
if (length(unexpected_sex) > 0) {
  warning(
    "demo_sex values coded as NA in sex_dummy: ",
    paste(unexpected_sex, collapse = ", "),
    call. = FALSE
  )
}

# Verify the dummy coding (NA column shown only if some rows are uncoded).
print(table(data$demo_sex, data$sex_dummy, useNA = "ifany"))

#### DESCRIPTIVES ####

# Descriptives for age
print(summary(data$demo_age_1))
print(sd(data$demo_age_1, na.rm = TRUE))

# Center demo_age_1 (subtract the mean)
data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)

# Verify the centering
print(summary(data$age_centered))

# Descriptives for sex (frequency table and proportions)
print(table(data$demo_sex))
print(prop.table(table(data$demo_sex)))

# Descriptives for sex dummy variable
print(table(data$sex_dummy))

#### REGRESSION MODELS ####
# Each model is followed by the same checks: the four lm diagnostic plots in
# a 2 x 2 grid, a Shapiro-Wilk test on the residuals, the model summary and
# the AIC. The interaction models additionally report VIFs.

# MODEL 1: Age only - EOHI
age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data)
par(mfrow = c(2, 2))
plot(age_DGEN)
print(shapiro.test(residuals(age_DGEN)))
print(summary(age_DGEN))
print(AIC(age_DGEN))

# MODEL 1: Age only - EHI
age_domain <- lm(ehi_global_mean ~ age_centered, data = data)
par(mfrow = c(2, 2))
plot(age_domain)
print(shapiro.test(residuals(age_domain)))
print(summary(age_domain))
print(AIC(age_domain))

# MODEL 2: Sex only - EOHI
sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data)
par(mfrow = c(2, 2))
plot(sex_DGEN)
print(shapiro.test(residuals(sex_DGEN)))
print(summary(sex_DGEN))
print(AIC(sex_DGEN))

# Interpretation notes for the sex_DGEN diagnostic plots:
# P1 (residuals vs fitted) + P3 (scale-location): test for homoscedasticity.
#   A relatively flat red line and relatively evenly scattered points
#   indicate homoscedasticity. This assumption is met.
# P2 (Q-Q plot): test for normality. Points scattered around a relatively
#   straight line indicate normality. This assumption is violated, but the
#   analysis is robust to this with a large sample.
# P4 (residuals vs leverage): test for outliers. High-leverage points are
#   potential outliers; the threshold used is leverage > 2p/n.
#   p = parameters; for this model p = 2 (intercept + sex_dummy).
#   n = 1061 (after removing "Prefer not to say").
#   Threshold = 2 * 2 / 1061 = 0.00377. Maximum leverage in the plot is
#   ~0.002, therefore no points have concerning leverage.
# Across the plots there are 3 flagged outliers: 258, 670, 872. This is
#   0.28% of the data (much less than the acceptable threshold of 5%),
#   therefore the analysis can proceed.

# MODEL 2: Sex only - EHI
sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data)
par(mfrow = c(2, 2))
plot(sex_domain)
print(shapiro.test(residuals(sex_domain)))
print(summary(sex_domain))
print(AIC(sex_domain))

# MODEL 3: Age + Sex + Interaction - EOHI
interaction_DGEN <- lm(
  eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy,
  data = data
)
par(mfrow = c(2, 2))
plot(interaction_DGEN)
print(shapiro.test(residuals(interaction_DGEN)))
print(vif(interaction_DGEN))
print(summary(interaction_DGEN))
print(AIC(interaction_DGEN))

# MODEL 3: Age + Sex + Interaction - EHI
interaction_domain <- lm(
  ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy,
  data = data
)
par(mfrow = c(2, 2))
plot(interaction_domain)
print(shapiro.test(residuals(interaction_domain)))
print(vif(interaction_domain))
print(summary(interaction_domain))
print(AIC(interaction_domain))