# File-history metadata (editor .history snapshot), commented out so the
# script parses as valid R:
#   eohi/.history/eohi1/assumption_checks_before_cronbach_20250917154857.r
#   2025-12-23 15:47:09 -05:00 | 163 lines | 6.4 KiB | R
# Assumption Checks Before Cronbach's Alpha Analysis
# Run this BEFORE the main reliability analysis
#
# Dependencies are attached up front, script-style.
library(psych)    # psychometrics toolbox (Cronbach's alpha, omega, ...)
library(corrplot) # correlation-matrix plots -- NOTE(review): not used in the visible code; confirm it is needed
library(ggplot2)  # plotting -- NOTE(review): not used in the visible code; confirm it is needed
# Read the data
# NOTE(review): read.csv() assumes exp1.csv is in the current working directory
data <- read.csv("exp1.csv")
# Define scale variables
# Each "past difference" scale is five items sharing the NPastDiff_<stem>_
# prefix, so the column names are built from the item stems.
past_pref_vars <- paste0("NPastDiff_pref_",
                         c("read", "music", "tv", "nap", "travel"))
past_pers_vars <- paste0("NPastDiff_pers_",
                         c("extravert", "critical", "dependable",
                           "anxious", "complex"))
past_val_vars <- paste0("NPastDiff_val_",
                        c("obey", "trad", "opinion",
                          "performance", "justice"))
past_life_vars <- paste0("NPastDiff_life_",
                         c("ideal", "excellent", "satisfied",
                           "important", "change"))
# Function to check assumptions for a scale
#
# Prints a numbered diagnostic report (sample size, missingness, normality,
# inter-item correlations, item variances, multivariate outliers) plus an
# overall recommendation on whether Cronbach's alpha is appropriate.
#
# Args:
#   data:       data frame holding the raw item responses.
#   var_names:  character vector of column names that make up the scale.
#   scale_name: human-readable label printed in the report header.
#
# Returns:
#   FALSE when fewer than 30 complete cases are available (analysis aborted
#   early); TRUE otherwise. All diagnostics are side effects via cat().
check_assumptions <- function(data, var_names, scale_name) {
  # BUG FIX: the original used `"="*60`, which is a runtime error in R
  # ("non-numeric argument to binary operator"); strrep() builds the rule.
  divider <- strrep("=", 60)
  cat("\n", divider, "\n")
  cat("ASSUMPTION CHECKS FOR:", scale_name, "\n")
  cat(divider, "\n")

  # drop = FALSE keeps a data frame even if the "scale" has a single item
  scale_data <- data[, var_names, drop = FALSE]

  # 1. Sample size check: alpha estimates are unstable below ~30 cases
  complete_cases <- sum(complete.cases(scale_data))
  cat("1. SAMPLE SIZE CHECK:\n")
  cat(" Total participants:", nrow(data), "\n")
  cat(" Complete cases:", complete_cases, "\n")
  cat(" Adequate (≥30)?", if (complete_cases >= 30) "✓ YES" else "✗ NO", "\n")
  if (complete_cases < 30) {
    cat(" WARNING: Sample size too small for reliable alpha estimates\n")
    return(FALSE)
  }

  # 2. Missing data check (per item, as a percentage of all participants)
  cat("\n2. MISSING DATA CHECK:\n")
  missing_counts <- colSums(is.na(scale_data))
  missing_pct <- round(missing_counts / nrow(data) * 100, 2)
  cat(" Missing data by item:\n")
  for (i in seq_along(var_names)) {
    cat(" ", var_names[i], ":", missing_counts[i], "(", missing_pct[i], "%)\n")
  }
  max_missing <- max(missing_pct)
  cat(" Maximum missing:", max_missing, "%\n")
  cat(" Acceptable (<20%)?", if (max_missing < 20) "✓ YES" else "✗ NO", "\n")

  # 3. Use only complete cases for the remaining checks
  complete_data <- scale_data[complete.cases(scale_data), , drop = FALSE]

  # 4. Normality check (Shapiro-Wilk on the first item as an example).
  # Informational only: alpha is fairly robust to non-normality.
  cat("\n3. NORMALITY CHECK (Shapiro-Wilk test on first item):\n")
  if (nrow(complete_data) <= 5000) { # shapiro.test() rejects n > 5000
    shapiro_result <- shapiro.test(complete_data[, 1])
    cat(" p-value:", round(shapiro_result$p.value, 4), "\n")
    cat(" Normal?",
        if (shapiro_result$p.value > 0.05) "✓ YES" else "✗ NO (but alpha is robust)",
        "\n")
  } else {
    cat(" Sample too large for Shapiro-Wilk test (alpha is robust to non-normality)\n")
  }

  # 5. Inter-item correlations: items on one scale should correlate
  # positively, ideally > 0.30
  cat("\n4. INTER-ITEM CORRELATIONS CHECK:\n")
  cor_matrix <- cor(complete_data)
  # Keep only the upper triangle so each pair is counted once, no diagonal
  cor_matrix[lower.tri(cor_matrix, diag = TRUE)] <- NA
  cors <- as.vector(cor_matrix)
  cors <- cors[!is.na(cors)]
  positive_cors <- sum(cors > 0)
  strong_cors <- sum(cors > 0.30)
  negative_cors <- sum(cors < 0)
  cat(" Total correlations:", length(cors), "\n")
  cat(" Positive correlations:", positive_cors, "\n")
  cat(" Strong correlations (>0.30):", strong_cors, "\n")
  cat(" Negative correlations:", negative_cors, "\n")
  cat(" Mean correlation:", round(mean(cors), 4), "\n")
  cat(" Range:", round(min(cors), 4), "to", round(max(cors), 4), "\n")
  if (negative_cors > 0) {
    cat(" ⚠️ WARNING: Negative correlations suggest potential issues\n")
  }
  if (strong_cors / length(cors) < 0.5) {
    cat(" ⚠️ WARNING: Many weak correlations may indicate poor scale coherence\n")
  }

  # 6. Item variance check: grossly unequal variances can distort alpha.
  # vapply over columns avoids apply()'s data.frame -> matrix coercion.
  cat("\n5. ITEM VARIANCE CHECK:\n")
  item_vars <- vapply(complete_data, var, numeric(1))
  var_ratio <- max(item_vars) / min(item_vars)
  cat(" Item variances:", round(item_vars, 4), "\n")
  cat(" Variance ratio (max/min):", round(var_ratio, 4), "\n")
  cat(" Acceptable (<4:1)?", if (var_ratio < 4) "✓ YES" else "✗ NO", "\n")

  # 7. Outlier check via squared Mahalanobis distance; needs n > p for a
  # non-singular covariance matrix, hence the guard
  cat("\n6. OUTLIER CHECK:\n")
  if (nrow(complete_data) > ncol(complete_data)) {
    mahal_dist <- mahalanobis(complete_data, colMeans(complete_data),
                              cov(complete_data))
    outlier_threshold <- qchisq(0.999, df = ncol(complete_data))
    outliers <- sum(mahal_dist > outlier_threshold)
    cat(" Multivariate outliers (p<0.001):", outliers, "\n")
    cat(" Acceptable (<5%)?",
        if (outliers / nrow(complete_data) < 0.05) "✓ YES" else "✗ NO", "\n")
  }

  # 8. Summary recommendation: tally violated assumptions
  cat("\n7. OVERALL RECOMMENDATION:\n")
  issues <- 0
  # complete_cases >= 30 is guaranteed here by the early return above;
  # the check is kept as a safeguard should that early return be removed
  if (complete_cases < 30) issues <- issues + 1
  if (max_missing >= 20) issues <- issues + 1
  if (negative_cors > 0) issues <- issues + 1
  if (var_ratio >= 4) issues <- issues + 1
  if (issues == 0) {
    cat(" ✓ PROCEED with Cronbach's alpha analysis\n")
  } else if (issues <= 2) {
    cat(" ⚠️ PROCEED with CAUTION - some assumptions violated\n")
  } else {
    cat(" ✗ CONSIDER alternatives or data cleaning before proceeding\n")
  }
  TRUE
}
# Check assumptions for all past scales
cat("CRONBACH'S ALPHA ASSUMPTION CHECKS")
cat("\nData: exp1.csv")
cat("\nTotal sample size:", nrow(data))
past_scales <- list(
  "Past Preferences"       = past_pref_vars,
  "Past Personality"       = past_pers_vars,
  "Past Values"            = past_val_vars,
  "Past Life Satisfaction" = past_life_vars
)
for (scale_label in names(past_scales)) {
  # print() mirrors the auto-printing of the bare top-level calls
  print(check_assumptions(data, past_scales[[scale_label]], scale_label))
}
# Quick check of future scales (you can expand this)
fut_pref_vars <- c("NFutDiff_pref_read", "NFutDiff_pref_music",
                   "NFutDiff_pref_tv", "NFutDiff_pref_nap",
                   "NFutDiff_pref_travel")
print(check_assumptions(data, fut_pref_vars, "Future Preferences"))
# Closing summary: general guidance for interpreting the checks above.
# BUG FIX: the original used `"="*60`, which errors in R ("non-numeric
# argument to binary operator"); strrep() produces the intended rule.
cat("\n", strrep("=", 60), "\n")
cat("GENERAL GUIDELINES:\n")
cat(strrep("=", 60), "\n")
cat("✓ If most assumptions are met, Cronbach's alpha is appropriate\n")
cat("⚠️ If some assumptions are violated, interpret with caution\n")
cat("✗ If many assumptions are violated, consider alternative approaches:\n")
cat(" - Omega coefficient (more robust to violations)\n")
cat(" - Split-half reliability\n")
cat(" - Test-retest reliability\n")
cat(" - Factor analysis to check dimensionality\n")