eohi/eohi1/regression e1 - ehi x sex x age.r
2025-12-23 15:47:09 -05:00

320 lines
10 KiB
R

options(scipen = 999)
library(dplyr)
library(car)
library(lmtest)
library(stargazer)
library(sandwich)
library(lmtest)
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
df <- read.csv("ehi1.csv")
data <- df %>%
select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>%
filter(demo_sex != "Prefer not to say")
str(data)
colSums(is.na(data))
sapply(data, class)
# Create dummy variable for sex (0 = Male, 1 = Female)
data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0)
# Verify the dummy coding
print(table(data$demo_sex, data$sex_dummy))
#descriptives
# Descriptives for age
print(summary(data$demo_age_1))
print(sd(data$demo_age_1, na.rm = TRUE))
# Center demo_age_1 (subtract the mean)
data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)
# Verify the centering
print(summary(data$age_centered))
# Descriptives for sex (frequency table)
print(table(data$demo_sex))
print(prop.table(table(data$demo_sex)))
# Descriptives for sex dummy variable
print(table(data$sex_dummy))
#### REGRESSION MODELS ####
# MODEL 1: Age only - EOHI
age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data)
par(mfrow = c(2, 2))
plot(age_DGEN)
print(shapiro.test(residuals(age_DGEN)))
print(summary(age_DGEN))
print(AIC(age_DGEN))
# MODEL 1: Age only - EHI
age_domain <- lm(ehi_global_mean ~ age_centered, data = data)
par(mfrow = c(2, 2))
plot(age_domain)
print(shapiro.test(residuals(age_domain)))
print(summary(age_domain))
print(AIC(age_domain))
# MODEL 2: Sex only - EOHI
sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data)
par(mfrow = c(2, 2))
plot(sex_DGEN)
print(shapiro.test(residuals(sex_DGEN)))
print(summary(sex_DGEN))
print(AIC(sex_DGEN))
# P1 (res vs fitted) + P3 (scale location): test for homoscedasticity. relatively flat red line = homoscedasticity. relatively scattered points = homoscedasticity. this assumption is met.
# P2 (qq plot): test for normality. points scattered around a relatively straight line = normality. this assumption is violated but large sample is robust.
# P4 (residuals vs leverage): test for outliers. high leverage points = outliers. leverage > 2p/n.
# p = parameters; for this model p = 2 (intercept + sex_dummy). n = 1061 (removed prefer not to say). threshold = 2*2/1061 = 0.00377. maximum leverage in plot is ~ 0.002 therefore no points have concerning leverage.
# across the plots, there are 3 outliers: 258, 670, 872. this represents 0.28% of the data (much less than the acceptable threshold of 5%). therefore, analysis can proceed.
# MODEL 2: Sex only - EHI
sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data)
par(mfrow = c(2, 2))
plot(sex_domain)
print(shapiro.test(residuals(sex_domain)))
print(summary(sex_domain))
print(AIC(sex_domain))
# MODEL 3: Age + Sex + Interaction - EOHI
interaction_DGEN <- lm(eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data)
par(mfrow = c(2, 2))
plot(interaction_DGEN)
print(shapiro.test(residuals(interaction_DGEN)))
vif_DGEN <- vif(interaction_DGEN)
print(vif_DGEN)
print(summary(interaction_DGEN))
print(AIC(interaction_DGEN))
# MODEL 3: Age + Sex + Interaction - EHI
interaction_domain <- lm(ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data)
par(mfrow = c(2, 2))
plot(interaction_domain)
print(shapiro.test(residuals(interaction_domain)))
vif_domain <- vif(interaction_domain)
print(vif_domain)
print(summary(interaction_domain))
print(AIC(interaction_domain))
#### troubleshooting ####
# Clear any existing plots
# dev.off()
#### PLOTS ####
# Create visual figures for age models
library(ggplot2)
# Figure 1: age_DGEN model
p1 <- ggplot(data, aes(x = age_centered, y = eohiDGEN_mean)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) +
labs(
title = "Age and EOHI-DGEN Relationship",
x = "Age (centered)",
y = "EOHI-DGEN Mean",
subtitle = paste("R² =", round(summary(age_DGEN)$r.squared, 3),
", p < 0.001")
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
plot.subtitle = element_text(size = 10, color = "gray60")
)
# Figure 2: age_domain model
p2 <- ggplot(data, aes(x = age_centered, y = ehi_global_mean)) +
geom_point(alpha = 0.6, color = "darkgreen") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) +
labs(
title = "Age and EHI Domain Relationship",
x = "Age (centered)",
y = "EHI Domain Mean",
subtitle = paste("R² =", round(summary(age_domain)$r.squared, 3),
", p < 0.001")
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
plot.subtitle = element_text(size = 10, color = "gray60")
)
# Save the plots
ggsave("age_DGEN_plot.png", p1, width = 8, height = 6, dpi = 300)
ggsave("age_domain_plot.png", p2, width = 8, height = 6, dpi = 300)
# Display the plots
print(p1)
print(p2)
#### HTML file ####
# Create comprehensive HTML report grouped by model
library(htmltools)
# Start HTML document
html_content <- htmltools::div(
htmltools::h1("Regression Analysis: Age and Sex Effects on EOHI-DGEN and EHI Domain"),
# EOHI-DGEN Models Section
htmltools::h2("EOHI-DGEN Models"),
# Age Model
htmltools::h3("1. Age Model (age_DGEN)"),
htmltools::div(
style = "margin-bottom: 30px;",
htmltools::h4("Model Results, AIC, and VIF"),
htmltools::HTML(
stargazer(
age_DGEN,
type = "html",
title = "Age Model: EOHI-DGEN",
dep.var.labels = "EOHI-DGEN Mean",
covariate.labels = c("Age (centered)"),
report = "vcsp*",
add.lines = list(
c("AIC", round(AIC(age_DGEN), 2)),
c("Max VIF", "N/A")
)
)
),
htmltools::h4("Assumption Diagnostic Plots"),
htmltools::img(src = "age_DGEN_assumptions.png", style = "width: 100%; max-width: 800px;"),
htmltools::h4("Model Visualization"),
htmltools::img(src = "age_DGEN_plot.png", style = "width: 100%; max-width: 800px;")
),
# Sex Model
htmltools::h3("2. Sex Model (sex_DGEN)"),
htmltools::div(
style = "margin-bottom: 30px;",
htmltools::h4("Model Results, AIC, and VIF"),
htmltools::HTML(
stargazer(
sex_DGEN,
type = "html",
title = "Sex Model: EOHI-DGEN",
dep.var.labels = "EOHI-DGEN Mean",
covariate.labels = c("Sex (dummy)"),
report = "vcsp*",
add.lines = list(
c("AIC", round(AIC(sex_DGEN), 2)),
c("Max VIF", "N/A")
)
)
),
htmltools::h4("Assumption Diagnostic Plots"),
htmltools::img(src = "sex_DGEN_assumptions.png", style = "width: 100%; max-width: 800px;")
),
# Interaction Model
htmltools::h3("3. Interaction Model (interaction_DGEN)"),
htmltools::div(
style = "margin-bottom: 30px;",
htmltools::h4("Model Results, AIC, and VIF"),
htmltools::HTML(
stargazer(
interaction_DGEN,
type = "html",
title = "Interaction Model: EOHI-DGEN",
dep.var.labels = "EOHI-DGEN Mean",
covariate.labels = c("Age (centered)", "Sex (dummy)", "Age x Sex"),
report = "vcsp*",
add.lines = list(
c("AIC", round(AIC(interaction_DGEN), 2)),
c("Max VIF", max_vif_DGEN)
)
)
),
htmltools::h4("Assumption Diagnostic Plots"),
htmltools::img(src = "interaction_DGEN_assumptions.png", style = "width: 100%; max-width: 800px;")
),
# EHI Domain Models Section
htmltools::h2("EHI Domain Models"),
# Age Model
htmltools::h3("1. Age Model (age_domain)"),
htmltools::div(
style = "margin-bottom: 30px;",
htmltools::h4("Model Results, AIC, and VIF"),
htmltools::HTML(
stargazer(
age_domain,
type = "html",
title = "Age Model: EHI Domain",
dep.var.labels = "EHI Domain Mean",
covariate.labels = c("Age (centered)"),
report = "vcsp*",
add.lines = list(
c("AIC", round(AIC(age_domain), 2)),
c("Max VIF", "N/A")
)
)
),
htmltools::h4("Assumption Diagnostic Plots"),
htmltools::img(src = "age_domain_assumptions.png", style = "width: 100%; max-width: 800px;"),
htmltools::h4("Model Visualization"),
htmltools::img(src = "age_domain_plot.png", style = "width: 100%; max-width: 800px;")
),
# Sex Model
htmltools::h3("2. Sex Model (sex_domain)"),
htmltools::div(
style = "margin-bottom: 30px;",
htmltools::h4("Model Results, AIC, and VIF"),
htmltools::HTML(
stargazer(
sex_domain,
type = "html",
title = "Sex Model: EHI Domain",
dep.var.labels = "EHI Domain Mean",
covariate.labels = c("Sex (dummy)"),
report = "vcsp*",
add.lines = list(
c("AIC", round(AIC(sex_domain), 2)),
c("Max VIF", "N/A")
)
)
),
htmltools::h4("Assumption Diagnostic Plots"),
htmltools::img(src = "sex_domain_assumptions.png", style = "width: 100%; max-width: 800px;")
),
# Interaction Model
htmltools::h3("3. Interaction Model (interaction_domain)"),
htmltools::div(
style = "margin-bottom: 30px;",
htmltools::h4("Model Results, AIC, and VIF"),
htmltools::HTML(
stargazer(
interaction_domain,
type = "html",
title = "Interaction Model: EHI Domain",
dep.var.labels = "EHI Domain Mean",
covariate.labels = c("Age (centered)", "Sex (dummy)", "Age x Sex"),
report = "vcsp*",
add.lines = list(
c("AIC", round(AIC(interaction_domain), 2)),
c("Max VIF", max_vif_domain)
)
)
),
htmltools::h4("Assumption Diagnostic Plots"),
htmltools::img(src = "interaction_domain_assumptions.png", style = "width: 100%; max-width: 800px;")
)
)
# Save HTML content
htmltools::save_html(html_content, "regression_analysis_report.html")
print("HTML report created: regression_analysis_report.html")