# eohi/eohi2/datap 13 - ehi domain specific means.r
# 2025-12-23 15:47:09 -05:00
#
# 161 lines
# 7.8 KiB
# R

# Strongly favour fixed over scientific notation when numbers are printed
# or written out.
options(scipen = 999)

# Work from the eohi2 project folder so the relative file name below resolves.
# NOTE(review): this is a machine-specific absolute path; the script only runs
# where this directory exists.
setwd(dir = "C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the dataset into `data`.
data <- read.csv(file = "eohi2.csv")
# Calculate domain-specific mean scores for EHI variables across time intervals.
#
# For every interval prefix (ehi5, ehi10, ehi5.10) and every domain
# (pref, pers, val) a column `<interval>_<domain>_MEAN` is written to `data`,
# holding the row-wise mean of that domain's five item columns.
# Missing items are ignored (na.rm = TRUE); a row whose five items are all
# missing gets NaN, which is what rowMeans() returns in that case.

# Item suffixes that make up each domain (identical for all intervals).
ehi_domain_items <- list(
  pref = c("read", "music", "TV", "nap", "travel"),
  pers = c("extravert", "critical", "dependable", "anxious", "complex"),
  val = c("obey", "trad", "opinion", "performance", "justice")
)

# Interval prefixes, in the order their MEAN columns are created:
# 5-year, 10-year, then 5-10 year change.
ehi_intervals <- c("ehi5", "ehi10", "ehi5.10")

for (ehi_interval in ehi_intervals) {
  for (ehi_domain in names(ehi_domain_items)) {
    item_cols <- paste0(
      ehi_interval, "_", ehi_domain, "_", ehi_domain_items[[ehi_domain]]
    )

    # Fail early and name the offending columns instead of relying on the
    # generic "undefined columns selected" error from `[.data.frame`.
    missing_cols <- setdiff(item_cols, names(data))
    if (length(missing_cols) > 0) {
      stop(
        "Missing EHI item column(s): ",
        paste(missing_cols, collapse = ", "),
        call. = FALSE
      )
    }

    mean_col <- paste0(ehi_interval, "_", ehi_domain, "_MEAN")
    data[[mean_col]] <- rowMeans(data[, item_cols], na.rm = TRUE)
  }
}
# QA: Verify mean calculations
# Prints, for the first few rows of selected domains, the five item values,
# the mean recomputed from them, the stored *_MEAN value and a match marker.
cat("\n=== QUALITY ASSURANCE CHECK ===\n")
cat("Verifying EHI domain-specific mean calculations\n\n")

# Print a per-row comparison of a recomputed mean against a stored mean.
#
# df        data frame holding the item columns and the stored mean column
# item_cols character vector with the five item column names
# mean_col  name of the stored mean column to compare against
# n_show    maximum number of leading rows to print (capped at nrow(df), so
#           short datasets do not produce rows indexed past the end)
# tol       absolute tolerance for treating the two means as equal
#
# Returns NULL invisibly; called for its printed output.
print_row_checks <- function(df, item_cols, mean_col, n_show = 5L,
                             tol = 1e-10) {
  for (i in seq_len(min(n_show, nrow(df)))) {
    vals <- unlist(df[i, item_cols], use.names = FALSE)
    calc_mean <- mean(vals, na.rm = TRUE)
    actual_mean <- df[[mean_col]][i]

    # When all items are missing both means are NaN/NA; count that as a match
    # rather than letting the NA comparison leak into the output as "NA".
    if (is.na(calc_mean) || is.na(actual_mean)) {
      is_match <- is.na(calc_mean) && is.na(actual_mean)
    } else {
      is_match <- abs(calc_mean - actual_mean) < tol
    }

    cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n",
                i, vals[1], vals[2], vals[3], vals[4], vals[5],
                calc_mean, actual_mean, if (is_match) "✓" else "✗"))
  }
  invisible(NULL)
}

# Domains shown in the row-level printout: header text, items, stored mean.
row_check_specs <- list(
  list(
    header = "--- FIRST 5 ROWS: 5-YEAR PREFERENCES MEAN ---\n",
    items = c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV",
              "ehi5_pref_nap", "ehi5_pref_travel"),
    target = "ehi5_pref_MEAN"
  ),
  list(
    header = "\n--- FIRST 5 ROWS: 5-YEAR PERSONALITY MEAN ---\n",
    items = c("ehi5_pers_extravert", "ehi5_pers_critical",
              "ehi5_pers_dependable", "ehi5_pers_anxious",
              "ehi5_pers_complex"),
    target = "ehi5_pers_MEAN"
  ),
  list(
    header = "\n--- FIRST 5 ROWS: 10-YEAR PREFERENCES MEAN ---\n",
    items = c("ehi10_pref_read", "ehi10_pref_music", "ehi10_pref_TV",
              "ehi10_pref_nap", "ehi10_pref_travel"),
    target = "ehi10_pref_MEAN"
  ),
  list(
    header = "\n--- FIRST 5 ROWS: 5-10 YEAR CHANGE PREFERENCES MEAN ---\n",
    items = c("ehi5.10_pref_read", "ehi5.10_pref_music", "ehi5.10_pref_TV",
              "ehi5.10_pref_nap", "ehi5.10_pref_travel"),
    target = "ehi5.10_pref_MEAN"
  )
)

for (spec in row_check_specs) {
  cat(spec$header)
  print_row_checks(data, spec$items, spec$target)
}
# Overall QA check for all rows
# Recomputes every domain mean from its item columns and compares it with the
# stored *_MEAN column across the whole dataset. A row counts as a discrepancy
# when the two values differ by more than the tolerance, or when exactly one
# of them is missing. Sets `all_checks_passed` to FALSE if any check fails.
cat("\n--- OVERALL QA CHECK (ALL ROWS) ---\n")

# One entry per stored mean: item columns, target column, display label.
qa_checks <- list(
  # 5-year means
  list(
    vars = c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV",
             "ehi5_pref_nap", "ehi5_pref_travel"),
    target = "ehi5_pref_MEAN", name = "5-Year Preferences"
  ),
  list(
    vars = c("ehi5_pers_extravert", "ehi5_pers_critical",
             "ehi5_pers_dependable", "ehi5_pers_anxious",
             "ehi5_pers_complex"),
    target = "ehi5_pers_MEAN", name = "5-Year Personality"
  ),
  list(
    vars = c("ehi5_val_obey", "ehi5_val_trad", "ehi5_val_opinion",
             "ehi5_val_performance", "ehi5_val_justice"),
    target = "ehi5_val_MEAN", name = "5-Year Values"
  ),
  # 10-year means
  list(
    vars = c("ehi10_pref_read", "ehi10_pref_music", "ehi10_pref_TV",
             "ehi10_pref_nap", "ehi10_pref_travel"),
    target = "ehi10_pref_MEAN", name = "10-Year Preferences"
  ),
  list(
    vars = c("ehi10_pers_extravert", "ehi10_pers_critical",
             "ehi10_pers_dependable", "ehi10_pers_anxious",
             "ehi10_pers_complex"),
    target = "ehi10_pers_MEAN", name = "10-Year Personality"
  ),
  list(
    vars = c("ehi10_val_obey", "ehi10_val_trad", "ehi10_val_opinion",
             "ehi10_val_performance", "ehi10_val_justice"),
    target = "ehi10_val_MEAN", name = "10-Year Values"
  ),
  # 5-10 year change means
  list(
    vars = c("ehi5.10_pref_read", "ehi5.10_pref_music", "ehi5.10_pref_TV",
             "ehi5.10_pref_nap", "ehi5.10_pref_travel"),
    target = "ehi5.10_pref_MEAN", name = "5-10 Year Change Preferences"
  ),
  list(
    vars = c("ehi5.10_pers_extravert", "ehi5.10_pers_critical",
             "ehi5.10_pers_dependable", "ehi5.10_pers_anxious",
             "ehi5.10_pers_complex"),
    target = "ehi5.10_pers_MEAN", name = "5-10 Year Change Personality"
  ),
  list(
    vars = c("ehi5.10_val_obey", "ehi5.10_val_trad", "ehi5.10_val_opinion",
             "ehi5.10_val_performance", "ehi5.10_val_justice"),
    target = "ehi5.10_val_MEAN", name = "5-10 Year Change Values"
  )
)

# Absolute tolerance for treating recomputed and stored means as equal.
qa_tolerance <- 1e-10

all_checks_passed <- TRUE
for (check in qa_checks) {
  calc_mean <- rowMeans(data[, check$vars], na.rm = TRUE)
  actual_mean <- data[[check$target]]

  # A missing target column would otherwise give a zero-length comparison
  # and be reported as a pass.
  if (is.null(actual_mean)) {
    cat(sprintf("FAIL: %s mean (target column %s not found)\n",
                check$name, check$target))
    all_checks_passed <- FALSE
    next
  }

  # Comparisons involving NA/NaN evaluate to NA and would be dropped by
  # which(); handle missingness explicitly so a one-sided missing value is
  # flagged while "both missing" still counts as agreement.
  calc_missing <- is.na(calc_mean)
  actual_missing <- is.na(actual_mean)
  differs <- (calc_missing != actual_missing) |
    (!calc_missing & !actual_missing &
       abs(calc_mean - actual_mean) > qa_tolerance)
  discrepancies <- which(differs)

  if (length(discrepancies) > 0) {
    cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars)))
    # Only the first 10 offending row numbers are listed.
    cat(sprintf(" Found %d discrepancies in rows: %s\n",
                length(discrepancies),
                paste(head(discrepancies, 10), collapse = ", ")))
    all_checks_passed <- FALSE
  } else {
    cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n",
                check$name, length(check$vars), nrow(data)))
  }
}

cat("\n")
if (all_checks_passed) {
  cat("*** ALL QA CHECKS PASSED ***\n")
} else {
  cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n")
}
# Write `data` back to eohi2.csv in the working directory, without a
# row-name column. This is the same relative file name the script reads
# from, so the input file is overwritten.
write.csv(x = data, file = "eohi2.csv", row.names = FALSE)
cat("\n", "Dataset saved to eohi2.csv", "\n", sep = "")