101 lines
3.4 KiB
R
101 lines
3.4 KiB
R
# Apply a large penalty against scientific notation so numeric output
# (coefficients, p-values) is printed in fixed notation.
options(list(scipen = 999))
|
|
|
|
library(dplyr)
|
|
library(car)
|
|
library(lmtest)
|
|
library(stargazer)
|
|
library(sandwich)
|
|
library(lmtest)
|
|
|
|
# Load the input CSV into `df`.
# The file is addressed by its full path instead of calling setwd(), so
# running this script no longer changes the session's working directory.
# NOTE(review): the directory is machine-specific; point `data_dir` at the
# folder holding ehi1.csv when running on another machine.
data_dir <- "C:/Users/irina/Documents/DND/EOHI/eohi1"
df <- read.csv(file.path(data_dir, "ehi1.csv"))
|
|
|
|
# Build the analysis data frame:
#   1. drop respondents whose demo_sex is "Prefer not to say"
#      (rows where demo_sex is missing are dropped by filter() as well,
#      because the comparison evaluates to NA);
#   2. keep only the two outcome columns and the two demographic columns.
data <- df %>%
  filter(demo_sex != "Prefer not to say") %>%
  select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1)
|
|
|
|
# Quick structural checks on the analysis data frame.
# str() writes its report to the console on its own.
str(data)

# The next two results are wrapped in print(), like the rest of this script,
# so they are also shown when the file is run through source(), which does
# not auto-print top-level values by default.
print(colSums(is.na(data)))  # number of missing values per column
print(sapply(data, class))   # class of each column
|
|
|
|
# Binary indicator for sex, used as a regression predictor:
# 1 where demo_sex is "Female", 0 for every other non-missing value
# (the script treats 0 as Male); a missing demo_sex stays NA.
data[["sex_dummy"]] <- as.numeric(data[["demo_sex"]] == "Female")

# Cross-tabulate the original labels against the indicator to confirm
# that each label landed in the intended 0/1 category.
print(table(data[["demo_sex"]], data[["sex_dummy"]]))
|
|
|
|
#### DESCRIPTIVES ####

# Age: summary() of the age column, followed by its standard deviation
# (missing ages are excluded from the SD).
print(summary(data[["demo_age_1"]]))
print(sd(data[["demo_age_1"]], na.rm = TRUE))
|
|
|
|
# Mean-center age: subtract the mean of the non-missing ages from every
# value, so that age_centered equals 0 at the average age.
data[["age_centered"]] <- data[["demo_age_1"]] -
  mean(data[["demo_age_1"]], na.rm = TRUE)

# Check the result: the mean of the centered variable should be ~0.
print(summary(data[["age_centered"]]))
|
|
|
|
# Sex: counts per response label, then the same counts as proportions.
sex_freq <- table(data[["demo_sex"]])
print(sex_freq)
print(sex_freq / sum(sex_freq))

# Counts for the 0/1 indicator derived from demo_sex.
print(table(data[["sex_dummy"]]))
|
|
|
|
#### REGRESSION MODELS ####

# MODEL 1: Age only - EOHI
# Regress eohiDGEN_mean on age_centered.
age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data)

# Draw the lm diagnostic plots on one 2 x 2 page, then restore the previous
# graphics settings so later plots are not forced into the grid.
old_par <- par(mfrow = c(2, 2))
plot(age_DGEN)
par(old_par)

# Shapiro-Wilk test on the residuals, model summary, and AIC.
# The expressions are kept as written because the printed output
# echoes them (e.g. the "data:" line of the Shapiro-Wilk test).
print(shapiro.test(residuals(age_DGEN)))
print(summary(age_DGEN))
print(AIC(age_DGEN))
|
|
|
|
# MODEL 1: Age only - EHI
# Regress ehi_global_mean on age_centered.
age_domain <- lm(ehi_global_mean ~ age_centered, data = data)

# Draw the lm diagnostic plots on one 2 x 2 page, then restore the previous
# graphics settings so later plots are not forced into the grid.
old_par <- par(mfrow = c(2, 2))
plot(age_domain)
par(old_par)

# Shapiro-Wilk test on the residuals, model summary, and AIC.
print(shapiro.test(residuals(age_domain)))
print(summary(age_domain))
print(AIC(age_domain))
|
|
|
|
# MODEL 2: Sex only - EOHI
# Regress eohiDGEN_mean on the 0/1 sex indicator.
sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data)

# Draw the lm diagnostic plots on one 2 x 2 page, then restore the previous
# graphics settings so later plots are not forced into the grid.
old_par <- par(mfrow = c(2, 2))
plot(sex_DGEN)
par(old_par)

# Shapiro-Wilk test on the residuals, model summary, and AIC.
print(shapiro.test(residuals(sex_DGEN)))
print(summary(sex_DGEN))
print(AIC(sex_DGEN))
|
|
# Diagnostic notes for the sex-only EOHI model (sex_DGEN):
# Plot 1 (residuals vs fitted) + Plot 3 (scale-location): check for
#   homoscedasticity. A relatively flat red line and relatively scattered
#   points indicate homoscedasticity. This assumption is met.
# Plot 2 (Q-Q plot): check for normality. Points scattered around a relatively
#   straight line indicate normality. This assumption is violated, but the
#   analysis is robust to the violation given the large sample.
# Plot 4 (residuals vs leverage): check for outliers. High-leverage points are
#   treated as outliers; the cut-off is leverage > 2p/n.
#   p = number of parameters; for this model p = 2 (intercept + sex_dummy).
#   n = 1061 (after removing "Prefer not to say").
#   Threshold = 2*2/1061 = 0.00377. The maximum leverage in the plot is
#   ~0.002, so no points have concerning leverage.
# Across the plots there are 3 outliers: 258, 670, 872. This represents 0.28%
#   of the data (much less than the acceptable threshold of 5%), so the
#   analysis can proceed.
|
|
|
|
# MODEL 2: Sex only - EHI
# Regress ehi_global_mean on the 0/1 sex indicator.
sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data)

# Draw the lm diagnostic plots on one 2 x 2 page, then restore the previous
# graphics settings so later plots are not forced into the grid.
old_par <- par(mfrow = c(2, 2))
plot(sex_domain)
par(old_par)

# Shapiro-Wilk test on the residuals, model summary, and AIC.
print(shapiro.test(residuals(sex_domain)))
print(summary(sex_domain))
print(AIC(sex_domain))
|
|
|
|
# MODEL 3: Age + Sex + Interaction - EOHI
# Regress eohiDGEN_mean on age_centered, the 0/1 sex indicator, and their
# interaction.
interaction_DGEN <- lm(eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data)

# Draw the lm diagnostic plots on one 2 x 2 page, then restore the previous
# graphics settings so later plots are not forced into the grid.
old_par <- par(mfrow = c(2, 2))
plot(interaction_DGEN)
par(old_par)

# Shapiro-Wilk test on the residuals.
print(shapiro.test(residuals(interaction_DGEN)))

# Variance inflation factors. Printed explicitly, like every other result in
# this script, so the values are not silently discarded when the file is run
# through source().
print(vif(interaction_DGEN))

# Model summary and AIC.
print(summary(interaction_DGEN))
print(AIC(interaction_DGEN))
|
|
|
|
# MODEL 3: Age + Sex + Interaction - EHI
# Regress ehi_global_mean on age_centered, the 0/1 sex indicator, and their
# interaction.
interaction_domain <- lm(ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data)

# Draw the lm diagnostic plots on one 2 x 2 page, then restore the previous
# graphics settings so later plots are not forced into the grid.
old_par <- par(mfrow = c(2, 2))
plot(interaction_domain)
par(old_par)

# Shapiro-Wilk test on the residuals.
print(shapiro.test(residuals(interaction_domain)))

# Variance inflation factors. Printed explicitly, like every other result in
# this script, so the values are not silently discarded when the file is run
# through source().
print(vif(interaction_domain))

# Model summary and AIC.
print(summary(interaction_domain))
print(AIC(interaction_domain))