393 lines
14 KiB
R
393 lines
14 KiB
R
# Regression Analysis - Assumption Checking
|
|
# IVs: demo_sex, demo_age, demo_edu
|
|
# DVs: eohiDGEN_mean, ehi_global_mean
|
|
|
|
options(scipen = 999)
|
|
|
|
library(car)
|
|
library(performance)
|
|
library(see)
|
|
library(ggplot2)
|
|
library(gridExtra)
|
|
library(dplyr)
|
|
library(lmtest) # For bptest and durbinWatsonTest
|
|
|
|
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
|
|
data <- read.csv("ehi1.csv")
|
|
|
|
# Check for missing values
|
|
missing_summary <- data %>%
|
|
select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
|
|
summarise_all(~sum(is.na(.)))
|
|
print("Missing values check:")
|
|
print(missing_summary)
|
|
|
|
# Remove rows with missing values
|
|
data_clean <- data %>%
|
|
select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
|
|
filter(complete.cases(.))
|
|
|
|
print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
|
|
|
|
# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
|
|
data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
|
|
|
|
# Check demo_edu levels and recode if needed
|
|
print("Education levels:")
|
|
print(table(data_clean$demo_edu))
|
|
|
|
# Recode education as ordinal (assuming higher values = more education)
|
|
edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate",
|
|
"University - Undergraduate", "University - Graduate")
|
|
data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels)
|
|
|
|
# Verify recoding
|
|
print("Sex recoding (0=Female, 1=Male):")
|
|
print(table(data_clean$demo_sex_numeric))
|
|
print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):")
|
|
print(table(data_clean$demo_edu_numeric))
|
|
|
|
# =============================================================================
|
|
# REGRESSION MODELS
|
|
# =============================================================================
|
|
|
|
# Define the 6 regression models
|
|
models <- list()
|
|
|
|
# Model 1: demo_sex → eohiDGEN_mean
|
|
models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
|
|
|
|
# Model 2: demo_age → eohiDGEN_mean
|
|
models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean)
|
|
|
|
# Model 3: demo_edu → eohiDGEN_mean
|
|
models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean)
|
|
|
|
# Model 4: demo_sex → ehi_global_mean
|
|
models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
|
|
|
|
# Model 5: demo_age → ehi_global_mean
|
|
models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean)
|
|
|
|
# Model 6: demo_edu → ehi_global_mean
|
|
models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean)
|
|
|
|
# =============================================================================
|
|
# ASSUMPTION CHECKING FUNCTIONS
|
|
# =============================================================================
|
|
|
|
# Function to check linearity assumption
|
|
check_linearity <- function(model, model_name) {
|
|
print(paste("=== LINEARITY CHECK:", model_name, "==="))
|
|
|
|
# Residuals vs Fitted plot
|
|
plot(model, which = 1, main = paste("Linearity:", model_name))
|
|
|
|
# Component + residual plot (partial residual plot)
|
|
crPlots(model, main = paste("Component+Residual Plot:", model_name))
|
|
|
|
return(NULL)
|
|
}
|
|
|
|
# Function to check normality of residuals
|
|
check_normality <- function(model, model_name) {
|
|
print(paste("=== NORMALITY CHECK:", model_name, "==="))
|
|
|
|
# Q-Q plot
|
|
plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
|
|
|
|
# Shapiro-Wilk test
|
|
residuals <- residuals(model)
|
|
shapiro_test <- shapiro.test(residuals)
|
|
print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
|
|
|
|
# Kolmogorov-Smirnov test
|
|
ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
|
|
print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
|
|
|
|
# Histogram of residuals
|
|
hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
|
|
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
|
|
ggtitle(paste("Residuals Histogram:", model_name)) +
|
|
theme_minimal()
|
|
print(hist_plot)
|
|
|
|
return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
|
|
}
|
|
|
|
# Function to check homoscedasticity (constant variance)
|
|
check_homoscedasticity <- function(model, model_name) {
|
|
print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
|
|
|
|
# Scale-Location plot
|
|
plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
|
|
|
|
# Breusch-Pagan test
|
|
bp_test <- bptest(model)
|
|
print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
|
|
|
|
# White test (if available)
|
|
tryCatch({
|
|
white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
|
|
print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
|
|
}, error = function(e) {
|
|
print("White test not available for this model")
|
|
})
|
|
|
|
return(list(bp_p = bp_test$p.value))
|
|
}
|
|
|
|
# Function to check independence (no autocorrelation)
|
|
check_independence <- function(model, model_name) {
|
|
print(paste("=== INDEPENDENCE CHECK:", model_name, "==="))
|
|
|
|
# Durbin-Watson test
|
|
dw_test <- durbinWatsonTest(model)
|
|
print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
|
|
print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
|
|
|
|
# Residuals vs Order plot
|
|
residuals_vs_order <- ggplot(data.frame(
|
|
residuals = residuals(model),
|
|
order = seq_along(residuals(model))
|
|
), aes(x = order, y = residuals)) +
|
|
geom_point(color = "black") +
|
|
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
|
|
ggtitle(paste("Residuals vs Order:", model_name)) +
|
|
theme_minimal()
|
|
print(residuals_vs_order)
|
|
|
|
return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
|
|
}
|
|
|
|
# Function to check for influential observations
|
|
check_influence <- function(model, model_name) {
|
|
print(paste("=== INFLUENCE CHECK:", model_name, "==="))
|
|
|
|
# Cook's Distance plot
|
|
plot(model, which = 4, main = paste("Cook's Distance:", model_name))
|
|
|
|
# Calculate influence measures
|
|
cooks_d <- cooks.distance(model)
|
|
leverage <- hatvalues(model)
|
|
dffits_val <- dffits(model)
|
|
|
|
# Identify influential observations
|
|
cooks_threshold <- 4/length(cooks_d) # Cook's D threshold
|
|
leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold
|
|
dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold
|
|
|
|
influential_cooks <- which(cooks_d > cooks_threshold)
|
|
influential_leverage <- which(leverage > leverage_threshold)
|
|
influential_dffits <- which(abs(dffits_val) > dffits_threshold)
|
|
|
|
print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5)))
|
|
print(paste("Influential observations (Cook's D):", length(influential_cooks)))
|
|
print(paste("Leverage threshold:", format(leverage_threshold, digits = 5)))
|
|
print(paste("High leverage observations:", length(influential_leverage)))
|
|
print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5)))
|
|
print(paste("Influential observations (DFFITS):", length(influential_dffits)))
|
|
|
|
if (length(influential_cooks) > 0) {
|
|
print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", ")))
|
|
}
|
|
if (length(influential_leverage) > 0) {
|
|
print(paste("High leverage cases:", paste(influential_leverage, collapse = ", ")))
|
|
}
|
|
if (length(influential_dffits) > 0) {
|
|
print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", ")))
|
|
}
|
|
|
|
return(list(influential_cooks = influential_cooks,
|
|
influential_leverage = influential_leverage,
|
|
influential_dffits = influential_dffits))
|
|
}
|
|
|
|
# Function to get comprehensive model summary
|
|
get_model_summary <- function(model, model_name) {
|
|
print(paste("=== MODEL SUMMARY:", model_name, "==="))
|
|
|
|
# Basic model summary
|
|
summary_model <- summary(model)
|
|
print(summary_model)
|
|
|
|
# R-squared and adjusted R-squared
|
|
print(paste("R-squared:", format(summary_model$r.squared, digits = 5)))
|
|
print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5)))
|
|
|
|
# AIC and BIC
|
|
aic_val <- AIC(model)
|
|
bic_val <- BIC(model)
|
|
print(paste("AIC:", format(aic_val, digits = 5)))
|
|
print(paste("BIC:", format(bic_val, digits = 5)))
|
|
|
|
return(list(summary = summary_model, r_squared = summary_model$r.squared,
|
|
adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val))
|
|
}
|
|
|
|
# =============================================================================
|
|
# RUN ASSUMPTION CHECKS FOR ALL MODELS
|
|
# =============================================================================
|
|
|
|
# Create results storage
|
|
assumption_results <- list()
|
|
model_summaries <- list()
|
|
|
|
# Model names for reference
|
|
model_names <- c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN",
|
|
"Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global")
|
|
|
|
cat("\n", rep("=", 80), "\n")
|
|
cat("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS\n")
|
|
cat(rep("=", 80), "\n")
|
|
|
|
# Run assumption checks for each model
|
|
for (i in seq_along(models)) {
|
|
model_name <- model_names[i]
|
|
model <- models[[i]]
|
|
|
|
cat("\n", rep("-", 60), "\n")
|
|
cat("ANALYZING MODEL", i, ":", model_name, "\n")
|
|
cat(rep("-", 60), "\n")
|
|
|
|
# Store results
|
|
assumption_results[[i]] <- list()
|
|
assumption_results[[i]]$model_name <- model_name
|
|
|
|
# 1. Model Summary
|
|
model_summaries[[i]] <- get_model_summary(model, model_name)
|
|
assumption_results[[i]]$summary <- model_summaries[[i]]
|
|
|
|
# 2. Linearity Check
|
|
assumption_results[[i]]$linearity <- check_linearity(model, model_name)
|
|
|
|
# 3. Normality Check
|
|
assumption_results[[i]]$normality <- check_normality(model, model_name)
|
|
|
|
# 4. Homoscedasticity Check
|
|
assumption_results[[i]]$homoscedasticity <- check_homoscedasticity(model, model_name)
|
|
|
|
# 5. Independence Check
|
|
assumption_results[[i]]$independence <- check_independence(model, model_name)
|
|
|
|
# 6. Influence Check
|
|
assumption_results[[i]]$influence <- check_influence(model, model_name)
|
|
}
|
|
|
|
# =============================================================================
|
|
# SUMMARY TABLE OF ASSUMPTION VIOLATIONS
|
|
# =============================================================================
|
|
|
|
cat("\n", rep("=", 80), "\n")
|
|
cat("ASSUMPTION VIOLATION SUMMARY\n")
|
|
cat(rep("=", 80), "\n")
|
|
|
|
# Create summary table
|
|
violation_summary <- data.frame(
|
|
Model = character(),
|
|
Linearity = character(),
|
|
Normality = character(),
|
|
Homoscedasticity = character(),
|
|
Independence = character(),
|
|
Influential_Obs = character(),
|
|
stringsAsFactors = FALSE
|
|
)
|
|
|
|
# Populate summary table
|
|
for (i in seq_along(models)) {
|
|
model_name <- model_names[i]
|
|
|
|
# Check normality (Shapiro-Wilk p < 0.05 indicates violation)
|
|
normality_violation <- ifelse(assumption_results[[i]]$normality$shapiro_p < 0.05, "VIOLATED", "OK")
|
|
|
|
# Check homoscedasticity (Breusch-Pagan p < 0.05 indicates violation)
|
|
homosced_violation <- ifelse(assumption_results[[i]]$homoscedasticity$bp_p < 0.05, "VIOLATED", "OK")
|
|
|
|
# Check independence (Durbin-Watson p < 0.05 indicates violation)
|
|
independence_violation <- ifelse(assumption_results[[i]]$independence$dw_p < 0.05, "VIOLATED", "OK")
|
|
|
|
# Check for influential observations
|
|
influential_count <- length(assumption_results[[i]]$influence$influential_cooks)
|
|
influential_status <- ifelse(influential_count > 0, paste("YES (", influential_count, ")", sep = ""), "NO")
|
|
|
|
# Linearity is assessed visually, so we'll mark as "CHECK VISUALLY"
|
|
linearity_status <- "CHECK VISUALLY"
|
|
|
|
violation_summary <- rbind(violation_summary, data.frame(
|
|
Model = model_name,
|
|
Linearity = linearity_status,
|
|
Normality = normality_violation,
|
|
Homoscedasticity = homosced_violation,
|
|
Independence = independence_violation,
|
|
Influential_Obs = influential_status
|
|
))
|
|
}
|
|
|
|
print(violation_summary)
|
|
|
|
# =============================================================================
|
|
# MODEL COMPARISON TABLE
|
|
# =============================================================================
|
|
|
|
cat("\n", rep("=", 80), "\n")
|
|
cat("MODEL COMPARISON SUMMARY\n")
|
|
cat(rep("=", 80), "\n")
|
|
|
|
# Create model comparison table
|
|
comparison_table <- data.frame(
|
|
Model = model_names,
|
|
R_Squared = numeric(length(models)),
|
|
Adj_R_Squared = numeric(length(models)),
|
|
AIC = numeric(length(models)),
|
|
BIC = numeric(length(models)),
|
|
Significant = character(length(models)),
|
|
stringsAsFactors = FALSE
|
|
)
|
|
|
|
for (i in seq_along(models)) {
|
|
summary_model <- model_summaries[[i]]$summary
|
|
comparison_table$R_Squared[i] <- summary_model$r.squared
|
|
comparison_table$Adj_R_Squared[i] <- summary_model$adj.r.squared
|
|
comparison_table$AIC[i] <- model_summaries[[i]]$aic
|
|
comparison_table$BIC[i] <- model_summaries[[i]]$bic
|
|
|
|
# Check if predictor is significant (p < 0.05)
|
|
p_value <- summary_model$coefficients[2, 4]
|
|
comparison_table$Significant[i] <- ifelse(p_value < 0.05, "YES", "NO")
|
|
}
|
|
|
|
print(comparison_table)
|
|
|
|
# =============================================================================
|
|
# RECOMMENDATIONS
|
|
# =============================================================================
|
|
|
|
cat("\n", rep("=", 80), "\n")
|
|
cat("RECOMMENDATIONS FOR ASSUMPTION VIOLATIONS\n")
|
|
cat(rep("=", 80), "\n")
|
|
|
|
cat("\n1. NORMALITY VIOLATIONS:\n")
|
|
cat(" - If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)\n")
|
|
cat(" - Alternative: Use robust regression methods or bootstrapping\n")
|
|
|
|
cat("\n2. HOMOSCEDASTICITY VIOLATIONS:\n")
|
|
cat(" - If violated: Use weighted least squares or robust standard errors\n")
|
|
cat(" - Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors\n")
|
|
|
|
cat("\n3. INDEPENDENCE VIOLATIONS:\n")
|
|
cat(" - If violated: Check for clustering or repeated measures structure\n")
|
|
cat(" - Alternative: Use mixed-effects models or clustered standard errors\n")
|
|
|
|
cat("\n4. INFLUENTIAL OBSERVATIONS:\n")
|
|
cat(" - If present: Examine these cases for data entry errors\n")
|
|
cat(" - Consider: Running analysis with and without influential cases\n")
|
|
cat(" - Alternative: Use robust regression methods\n")
|
|
|
|
cat("\n5. LINEARITY VIOLATIONS:\n")
|
|
cat(" - If violated: Add polynomial terms or use splines\n")
|
|
cat(" - Alternative: Transform predictors or use non-parametric methods\n")
|
|
|
|
cat("\n", rep("=", 80), "\n")
|
|
cat("ANALYSIS COMPLETE\n")
|
|
cat(rep("=", 80), "\n")
|