options(scipen = 999) library(dplyr) library(car) library(lmtest) library(stargazer) library(sandwich) library(lmtest) setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") df <- read.csv("ehi1.csv") data <- df %>% select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>% filter(demo_sex != "Prefer not to say") str(data) colSums(is.na(data)) sapply(data, class) # Create dummy variable for sex (0 = Male, 1 = Female) data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) # Verify the dummy coding print(table(data$demo_sex, data$sex_dummy)) #descriptives # Descriptives for age print(summary(data$demo_age_1)) print(sd(data$demo_age_1, na.rm = TRUE)) # Center demo_age_1 (subtract the mean) data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) # Verify the centering print(summary(data$age_centered)) # Descriptives for sex (frequency table) print(table(data$demo_sex)) print(prop.table(table(data$demo_sex))) # Descriptives for sex dummy variable print(table(data$sex_dummy)) #### REGRESSION MODELS #### # MODEL 1: Age only - EOHI age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data) par(mfrow = c(2, 2)) plot(age_DGEN) print(shapiro.test(residuals(age_DGEN))) print(summary(age_DGEN)) print(AIC(age_DGEN)) # MODEL 1: Age only - EHI age_domain <- lm(ehi_global_mean ~ age_centered, data = data) par(mfrow = c(2, 2)) plot(age_domain) print(shapiro.test(residuals(age_domain))) print(summary(age_domain)) print(AIC(age_domain)) # MODEL 2: Sex only - EOHI sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data) par(mfrow = c(2, 2)) plot(sex_DGEN) print(shapiro.test(residuals(sex_DGEN))) print(summary(sex_DGEN)) print(AIC(sex_DGEN)) # P1 (res vs fitted) + P3 (scale location): test for homoscedasticity. relatively flat red line = homoscedasticity. relatively scattered points = homoscedasticity. this assumption is met. # P2 (qq plot): test for normality. points scattered around a relatively straight line = normality. this assumption is violated but large sample is robust. # P4 (residuals vs leverage): test for outliers. high leverage points = outliers. leverage > 2p/n. # p = parameters; for this model p = 2 (intercept + sex_dummy). n = 1061 (removed prefer not to say). threshold = 2*2/1061 = 0.00377. maximum leverage in plot is ~ 0.002 therefore no points have concerning leverage. # across the plots, there are 3 outliers: 258, 670, 872. this represents 0.28% of the data (much less than the acceptable threshold of 5%). therefore, analysis can proceed. # MODEL 2: Sex only - EHI sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data) par(mfrow = c(2, 2)) plot(sex_domain) print(shapiro.test(residuals(sex_domain))) print(summary(sex_domain)) print(AIC(sex_domain)) # MODEL 3: Age + Sex + Interaction - EOHI interaction_DGEN <- lm(eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) par(mfrow = c(2, 2)) plot(interaction_DGEN) print(shapiro.test(residuals(interaction_DGEN))) vif_DGEN <- vif(interaction_DGEN) print(vif_DGEN) print(summary(interaction_DGEN)) print(AIC(interaction_DGEN)) # MODEL 3: Age + Sex + Interaction - EHI interaction_domain <- lm(ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) par(mfrow = c(2, 2)) plot(interaction_domain) print(shapiro.test(residuals(interaction_domain))) vif_domain <- vif(interaction_domain) print(vif_domain) print(summary(interaction_domain)) print(AIC(interaction_domain)) #### troubleshooting #### # Clear any existing plots # dev.off() #### PLOTS #### # Create visual figures for age models library(ggplot2) # Figure 1: age_DGEN model p1 <- ggplot(data, aes(x = age_centered, y = eohiDGEN_mean)) + geom_point(alpha = 0.6, color = "steelblue") + geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) + labs( title = "Age and EOHI-DGEN Relationship", x = "Age (centered)", y = "EOHI-DGEN Mean", subtitle = paste("R² =", round(summary(age_DGEN)$r.squared, 3), ", p < 0.001") ) + theme_minimal() + theme( plot.title = element_text(size = 14, face = "bold"), axis.title = element_text(size = 12), plot.subtitle = element_text(size = 10, color = "gray60") ) # Figure 2: age_domain model p2 <- ggplot(data, aes(x = age_centered, y = ehi_global_mean)) + geom_point(alpha = 0.6, color = "darkgreen") + geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) + labs( title = "Age and EHI Domain Relationship", x = "Age (centered)", y = "EHI Domain Mean", subtitle = paste("R² =", round(summary(age_domain)$r.squared, 3), ", p < 0.001") ) + theme_minimal() + theme( plot.title = element_text(size = 14, face = "bold"), axis.title = element_text(size = 12), plot.subtitle = element_text(size = 10, color = "gray60") ) # Save the plots ggsave("age_DGEN_plot.png", p1, width = 8, height = 6, dpi = 300) ggsave("age_domain_plot.png", p2, width = 8, height = 6, dpi = 300) # Display the plots print(p1) print(p2) #### HTML file #### # Create comprehensive HTML report grouped by model library(htmltools) # Start HTML document html_content <- htmltools::div( htmltools::h1("Regression Analysis: Age and Sex Effects on EOHI-DGEN and EHI Domain"), # EOHI-DGEN Models Section htmltools::h2("EOHI-DGEN Models"), # Age Model htmltools::h3("1. Age Model (age_DGEN)"), htmltools::div( style = "margin-bottom: 30px;", htmltools::h4("Model Results, AIC, and VIF"), htmltools::HTML( stargazer( age_DGEN, type = "html", title = "Age Model: EOHI-DGEN", dep.var.labels = "EOHI-DGEN Mean", covariate.labels = c("Age (centered)"), report = "vcsp*", add.lines = list( c("AIC", round(AIC(age_DGEN), 2)), c("Max VIF", "N/A") ) ) ), htmltools::h4("Assumption Diagnostic Plots"), htmltools::img(src = "age_DGEN_assumptions.png", style = "width: 100%; max-width: 800px;"), htmltools::h4("Model Visualization"), htmltools::img(src = "age_DGEN_plot.png", style = "width: 100%; max-width: 800px;") ), # Sex Model htmltools::h3("2. Sex Model (sex_DGEN)"), htmltools::div( style = "margin-bottom: 30px;", htmltools::h4("Model Results, AIC, and VIF"), htmltools::HTML( stargazer( sex_DGEN, type = "html", title = "Sex Model: EOHI-DGEN", dep.var.labels = "EOHI-DGEN Mean", covariate.labels = c("Sex (dummy)"), report = "vcsp*", add.lines = list( c("AIC", round(AIC(sex_DGEN), 2)), c("Max VIF", "N/A") ) ) ), htmltools::h4("Assumption Diagnostic Plots"), htmltools::img(src = "sex_DGEN_assumptions.png", style = "width: 100%; max-width: 800px;") ), # Interaction Model htmltools::h3("3. Interaction Model (interaction_DGEN)"), htmltools::div( style = "margin-bottom: 30px;", htmltools::h4("Model Results, AIC, and VIF"), htmltools::HTML( stargazer( interaction_DGEN, type = "html", title = "Interaction Model: EOHI-DGEN", dep.var.labels = "EOHI-DGEN Mean", covariate.labels = c("Age (centered)", "Sex (dummy)", "Age x Sex"), report = "vcsp*", add.lines = list( c("AIC", round(AIC(interaction_DGEN), 2)), c("Max VIF", max_vif_DGEN) ) ) ), htmltools::h4("Assumption Diagnostic Plots"), htmltools::img(src = "interaction_DGEN_assumptions.png", style = "width: 100%; max-width: 800px;") ), # EHI Domain Models Section htmltools::h2("EHI Domain Models"), # Age Model htmltools::h3("1. Age Model (age_domain)"), htmltools::div( style = "margin-bottom: 30px;", htmltools::h4("Model Results, AIC, and VIF"), htmltools::HTML( stargazer( age_domain, type = "html", title = "Age Model: EHI Domain", dep.var.labels = "EHI Domain Mean", covariate.labels = c("Age (centered)"), report = "vcsp*", add.lines = list( c("AIC", round(AIC(age_domain), 2)), c("Max VIF", "N/A") ) ) ), htmltools::h4("Assumption Diagnostic Plots"), htmltools::img(src = "age_domain_assumptions.png", style = "width: 100%; max-width: 800px;"), htmltools::h4("Model Visualization"), htmltools::img(src = "age_domain_plot.png", style = "width: 100%; max-width: 800px;") ), # Sex Model htmltools::h3("2. Sex Model (sex_domain)"), htmltools::div( style = "margin-bottom: 30px;", htmltools::h4("Model Results, AIC, and VIF"), htmltools::HTML( stargazer( sex_domain, type = "html", title = "Sex Model: EHI Domain", dep.var.labels = "EHI Domain Mean", covariate.labels = c("Sex (dummy)"), report = "vcsp*", add.lines = list( c("AIC", round(AIC(sex_domain), 2)), c("Max VIF", "N/A") ) ) ), htmltools::h4("Assumption Diagnostic Plots"), htmltools::img(src = "sex_domain_assumptions.png", style = "width: 100%; max-width: 800px;") ), # Interaction Model htmltools::h3("3. Interaction Model (interaction_domain)"), htmltools::div( style = "margin-bottom: 30px;", htmltools::h4("Model Results, AIC, and VIF"), htmltools::HTML( stargazer( interaction_domain, type = "html", title = "Interaction Model: EHI Domain", dep.var.labels = "EHI Domain Mean", covariate.labels = c("Age (centered)", "Sex (dummy)", "Age x Sex"), report = "vcsp*", add.lines = list( c("AIC", round(AIC(interaction_domain), 2)), c("Max VIF", max_vif_domain) ) ) ), htmltools::h4("Assumption Diagnostic Plots"), htmltools::img(src = "interaction_domain_assumptions.png", style = "width: 100%; max-width: 800px;") ) ) # Save HTML content htmltools::save_html(html_content, "regression_analysis_report.html") print("HTML report created: regression_analysis_report.html")