eohi/.history/eohi1/regressions e1 - assumptions_20251016175309.r
2025-12-23 15:47:09 -05:00

374 lines
17 KiB
R

# Regression Analysis - Assumption Checking
# IVs: demo_sex, demo_age, demo_edu
# DVs: eohiDGEN_mean, ehi_global_mean
options(scipen = 999)
library(car)
library(performance)
#library(see)
library(ggplot2)
library(gridExtra)
library(dplyr)
library(lmtest) # For bptest and durbinWatsonTest
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
data <- read.csv("ehi1.csv")
# Check for missing values
missing_summary <- data %>%
select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
summarise_all(~sum(is.na(.)))
print(missing_summary)
# Remove rows with missing values
data_clean <- data %>%
select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
filter(complete.cases(.))
print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
# Check demo_edu levels and recode if needed
print("Original education levels:")
print(table(data_clean$demo_edu))
print("Education levels:")
print(levels(factor(data_clean$demo_edu)))
# Create dummy variables for education (k-1 dummy variables)
# First, convert to factor to ensure proper level ordering
data_clean$demo_edu_factor <- factor(data_clean$demo_edu)
# Create dummy variables (k-1), using the first level as reference
edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean)
edu_dummies <- edu_dummies[, -1] # Remove intercept column
# Add dummy variables to dataset
colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies))
data_clean <- cbind(data_clean, edu_dummies)
# Center the age variable for regression analysis
data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1]
# Verify recoding
print("Sex recoding (0=Female, 1=Male):")
print(table(data_clean$demo_sex_numeric))
print("Education dummy variables created:")
print(paste("Number of dummy variables:", ncol(edu_dummies)))
print("Dummy variable names:")
print(colnames(edu_dummies))
print("Dummy variable summary:")
print(summary(edu_dummies))
print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2)))
print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2)))
# =============================================================================
# REGRESSION MODELS
# =============================================================================
# Define the 6 regression models
models <- list()
# Model 1: demo_sex → eohiDGEN_mean
models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
# Model 2: demo_age_1 → eohiDGEN_mean (centered)
models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean)
# Model 3: demo_edu → eohiDGEN_mean (using dummy variables)
models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ ., data = data_clean[, c("eohiDGEN_mean", colnames(edu_dummies))])
# Model 4: demo_sex → ehi_global_mean
models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
# Model 5: demo_age_1 → ehi_global_mean (centered)
models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean)
# Model 6: demo_edu → ehi_global_mean (using dummy variables)
models$edu_ehi_global <- lm(ehi_global_mean ~ ., data = data_clean[, c("ehi_global_mean", colnames(edu_dummies))])
# =============================================================================
# ASSUMPTION CHECKING FUNCTIONS
# =============================================================================
# Function to check linearity assumption
check_linearity <- function(model, model_name) {
# Residuals vs Fitted plot
plot(model, which = 1, main = paste("Linearity:", model_name))
# Component + residual plot (partial residual plot)
crPlots(model, main = paste("Component+Residual Plot:", model_name))
return(NULL)
}
# Function to check normality of residuals
check_normality <- function(model, model_name) {
# Q-Q plot
plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
# Shapiro-Wilk test
residuals <- residuals(model)
shapiro_test <- shapiro.test(residuals)
print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
# Kolmogorov-Smirnov test
ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
# Histogram of residuals
hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
ggtitle(paste("Residuals Histogram:", model_name)) +
theme_minimal()
print(hist_plot)
return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
}
# Function to check homoscedasticity (constant variance)
check_homoscedasticity <- function(model, model_name) {
# Scale-Location plot
plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
# Breusch-Pagan test
bp_test <- bptest(model)
print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
# White test (if available)
tryCatch({
white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
}, error = function(e) {
print("White test not available for this model")
})
return(list(bp_p = bp_test$p.value))
}
# Function to check independence (no autocorrelation)
check_independence <- function(model, model_name) {
# Durbin-Watson test
dw_test <- durbinWatsonTest(model)
print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
# Residuals vs Order plot
residuals_vs_order <- ggplot(data.frame(
residuals = residuals(model),
order = seq_along(residuals(model))
), aes(x = order, y = residuals)) +
geom_point(color = "black") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
ggtitle(paste("Residuals vs Order:", model_name)) +
theme_minimal()
print(residuals_vs_order)
return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
}
# Function to check for influential observations
check_influence <- function(model, model_name) {
# Cook's Distance plot
plot(model, which = 4, main = paste("Cook's Distance:", model_name))
# Calculate influence measures
cooks_d <- cooks.distance(model)
leverage <- hatvalues(model)
dffits_val <- dffits(model)
# Identify influential observations
cooks_threshold <- 4/length(cooks_d) # Cook's D threshold
leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold
dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold
influential_cooks <- which(cooks_d > cooks_threshold)
influential_leverage <- which(leverage > leverage_threshold)
influential_dffits <- which(abs(dffits_val) > dffits_threshold)
print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5)))
print(paste("Influential observations (Cook's D):", length(influential_cooks)))
print(paste("Leverage threshold:", format(leverage_threshold, digits = 5)))
print(paste("High leverage observations:", length(influential_leverage)))
print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5)))
print(paste("Influential observations (DFFITS):", length(influential_dffits)))
if (length(influential_cooks) > 0) {
print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", ")))
}
if (length(influential_leverage) > 0) {
print(paste("High leverage cases:", paste(influential_leverage, collapse = ", ")))
}
if (length(influential_dffits) > 0) {
print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", ")))
}
return(list(influential_cooks = influential_cooks,
influential_leverage = influential_leverage,
influential_dffits = influential_dffits))
}
# Function to get comprehensive model summary
get_model_summary <- function(model, model_name) {
# Basic model summary
summary_model <- summary(model)
print(summary_model)
# R-squared and adjusted R-squared
print(paste("R-squared:", format(summary_model$r.squared, digits = 5)))
print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5)))
# AIC and BIC
aic_val <- AIC(model)
bic_val <- BIC(model)
print(paste("AIC:", format(aic_val, digits = 5)))
print(paste("BIC:", format(bic_val, digits = 5)))
return(list(summary = summary_model, r_squared = summary_model$r.squared,
adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val))
}
# =============================================================================
# RUN ASSUMPTION CHECKS - MODEL BY MODEL
# =============================================================================
# Model 1: Sex → EOHI-DGEN Mean
model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
# Model 2: Age (centered) → EOHI-DGEN Mean
model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
# Model 3: Education → EOHI-DGEN Mean
model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN")
model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN")
model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
# Model 4: Sex → EHI-Global Mean
model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global")
model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global")
model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global")
model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global")
model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global")
model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global")
# Model 5: Age (centered) → EHI-Global Mean
model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global")
model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global")
model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global")
model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global")
model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global")
model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global")
# Model 6: Education → EHI-Global Mean
model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global")
model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global")
model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global")
model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global")
model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global")
model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global")
# =============================================================================
# SUMMARY TABLE OF ASSUMPTION VIOLATIONS
# =============================================================================
# Create summary table
violation_summary <- data.frame(
Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
"Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
Normality = c(
ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK")
),
Homoscedasticity = c(
ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"),
ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"),
ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"),
ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"),
ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"),
ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK")
),
Independence = c(
ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"),
ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"),
ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"),
ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"),
ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"),
ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK")
),
Influential_Obs = c(
ifelse(length(model1_influence$influential_cooks) > 0,
paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"),
ifelse(length(model2_influence$influential_cooks) > 0,
paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"),
ifelse(length(model3_influence$influential_cooks) > 0,
paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"),
ifelse(length(model4_influence$influential_cooks) > 0,
paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"),
ifelse(length(model5_influence$influential_cooks) > 0,
paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"),
ifelse(length(model6_influence$influential_cooks) > 0,
paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO")
),
stringsAsFactors = FALSE
)
print(violation_summary)
# =============================================================================
# MODEL COMPARISON TABLE
# =============================================================================
# Create model comparison table
comparison_table <- data.frame(
Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
"Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
model4_summary$aic, model5_summary$aic, model6_summary$aic),
BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
model4_summary$bic, model5_summary$bic, model6_summary$bic),
Significant = c(
ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
),
stringsAsFactors = FALSE
)
print(comparison_table)