# eohi3 data preparation: recodes survey variables and computes scale scores.
library(dplyr)

# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path used below ("eohi3.csv", "eohi3_2.csv") is resolved against it.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")

# Read the data (check.names = FALSE preserves the original column names).
# Only the literal string "NA" is converted to NA, so empty cells in character
# columns stay as empty strings. Blank fields in columns that read.csv() parses
# as numeric or logical are still read as NA.
df <- read.csv(
  "eohi3.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# =============================================================================
# 1. CREATE BACKUP
# =============================================================================
# file.copy() signals failure through its return value, not an error, so the
# result is checked explicitly: carrying on without a backup would defeat the
# purpose of this step.
backup_ok <- file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
if (!isTRUE(backup_ok)) {
  stop("Could not create backup copy eohi3_2.csv", call. = FALSE)
}

# =============================================================================
# HELPER FUNCTION: Check variable existence and values
# =============================================================================

# Confirm that every column a section needs is present before it runs.
#
# Arguments:
#   source_vars  character vector of input column names.
#   target_vars  character vector of output column names that must already
#                exist in the data.
#   data         data frame to inspect; defaults to the script-level `df`.
#
# Returns TRUE when nothing is missing. Stops with an error listing the
# missing source columns; if all sources exist, stops listing the missing
# target columns instead.
check_vars_exist <- function(source_vars, target_vars, data = df) {
  present <- names(data)

  missing_source <- setdiff(source_vars, present)
  if (length(missing_source) > 0) {
    stop(
      paste("Missing source variables:", paste(missing_source, collapse = ", ")),
      call. = FALSE
    )
  }

  missing_target <- setdiff(target_vars, present)
  if (length(missing_target) > 0) {
    stop(
      paste("Missing target variables:", paste(missing_target, collapse = ", ")),
      call. = FALSE
    )
  }

  TRUE
}

# Compare the distinct non-missing, non-empty values of one column against the
# values a recode expects, printing a warning line for each kind of mismatch.
#
# Arguments:
#   var_name         name of the column to inspect.
#   expected_values  vector of values the column is expected to contain.
#   data             data frame to inspect; defaults to the script-level `df`.
#
# Returns a list with two elements:
#   missing  expected values that never occur in the column.
#   extra    values that occur in the column but were not expected.
check_values_exist <- function(var_name, expected_values, data = df) {
  observed <- unique(data[[var_name]])
  # NA and empty strings are treated as "no answer", not as values to check.
  observed <- observed[!is.na(observed) & observed != ""]

  missing_vals <- setdiff(expected_values, observed)
  extra_vals <- setdiff(observed, expected_values)

  if (length(missing_vals) > 0) {
    cat(paste(" ⚠ Expected values not found in", var_name, ":", paste(missing_vals, collapse = ", "), "\n"))
  }
  if (length(extra_vals) > 0) {
    cat(paste(" ⚠ Unexpected values found in", var_name, ":", paste(extra_vals, collapse = ", "), "\n"))
  }

  list(missing = missing_vals, extra = extra_vals)
}

# =============================================================================
# 2. RECODE other_length2 TO other_length
# =============================================================================

cat("\n=== 1. RECODING other_length2 TO other_length ===\n\n")

# Check variables exist
check_vars_exist("other_length2", "other_length")

# Check values in source
cat("Checking source variable values...\n")
length_vals <- unique(df$other_length2[!is.na(df$other_length2) & df$other_length2 != ""])
cat(paste(" Unique values in other_length2:", paste(length_vals, collapse = ", "), "\n"))

# Recode into bins. The result starts as all-NA character so the column type
# is character even when no row matches; anything that falls in no bin
# (including numbers below 5 or above 19) stays NA.
length_raw <- df$other_length2

# Convert to numeric once; non-numeric entries such as "20+" become NA here
# and the resulting coercion warning is expected, hence suppressed.
num_length <- suppressWarnings(as.numeric(length_raw))

length_between <- function(lo, hi) {
  !is.na(num_length) & num_length >= lo & num_length <= hi
}

length_binned <- rep(NA_character_, length(length_raw))
length_binned[length_between(5, 9)] <- "5-9"
length_binned[length_between(10, 14)] <- "10-14"
length_binned[length_between(15, 19)] <- "15-19"
# "20+" is carried over verbatim and empty cells stay empty. %in% is used
# because it is FALSE (never NA) for missing values.
length_binned[length_raw %in% "20+"] <- "20+"
length_binned[length_raw %in% ""] <- ""

df$other_length <- length_binned

# Verification check
cat("\nVerification (5 random rows):\n")
set.seed(123)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))

# Display form of a single cell: NA and empty strings are spelled out.
show_value <- function(v) {
  if (is.na(v)) {
    "NA"
  } else if (v == "") {
    "empty"
  } else {
    as.character(v)
  }
}

for (i in sample_rows) {
  cat(sprintf(
    " Row %d: other_length2 = %s -> other_length = %s\n",
    i, show_value(df$other_length2[i]), show_value(df$other_length[i])
  ))
}

# =============================================================================
# 3. RECODE other_like2 TO other_like
# =============================================================================

cat("\n=== 2. RECODING other_like2 TO other_like ===\n\n")

# Check variables exist
check_vars_exist("other_like2", "other_like")

# Check expected values exist
expected_like <- c(
  "Dislike a great deal", "Dislike somewhat", "Neither like nor dislike",
  "Like somewhat", "Like a great deal"
)
check_values_exist("other_like2", expected_like)

# Recode by position: like_codes[k] is the code for expected_like[k].
# match() yields NA for NA, empty and unrecognized labels, so those rows come
# out as NA; empty cells are then restored to empty strings. The result is
# always a character vector.
like_codes <- c("-2", "-1", "0", "1", "2")
like_raw <- df$other_like2
like_recoded <- like_codes[match(like_raw, expected_like)]
like_recoded[like_raw %in% ""] <- ""
df$other_like <- like_recoded

# Verification check
cat("\nVerification (5 random rows):\n")
set.seed(456)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))

# Display form of a single cell: NA and empty strings are spelled out.
show_value <- function(v) {
  if (is.na(v)) {
    "NA"
  } else if (v == "") {
    "empty"
  } else {
    as.character(v)
  }
}

for (i in sample_rows) {
  cat(sprintf(
    " Row %d: other_like2 = %s -> other_like = %s\n",
    i, show_value(df$other_like2[i]), show_value(df$other_like[i])
  ))
}

# =============================================================================
# 4. CALCULATE aot_total
# =============================================================================

cat("\n=== 3. CALCULATING aot_total ===\n\n")

# Check variables exist
aot_vars <- c("aot01", "aot02", "aot03", "aot04_r", "aot05_r", "aot06_r", "aot07_r", "aot08")
check_vars_exist(aot_vars, "aot_total")

# Parse one column (or one cell) as numbers; empty strings count as missing.
aot_as_number <- function(x) {
  x[x %in% ""] <- NA
  as.numeric(x)
}

# Reverse code aot04_r through aot07_r by flipping the sign. Each result is
# stored in a new "<name>_reversed" column of df.
# NOTE(review): sign flipping only reverses a scale that is centred on zero;
# confirm the response coding of these items.
reverse_vars <- c("aot04_r", "aot05_r", "aot06_r", "aot07_r")
for (var in reverse_vars) {
  df[[paste0(var, "_reversed")]] <- -aot_as_number(df[[var]])
}

# Calculate mean of all 8 variables (4 reversed + 4 original)
all_aot_vars <- c(
  "aot01", "aot02", "aot03", "aot04_r_reversed", "aot05_r_reversed",
  "aot06_r_reversed", "aot07_r_reversed", "aot08"
)

# Build the numeric matrix column by column. cbind() keeps a rows-by-items
# matrix even for a single-row data set, and each column is converted from its
# own type rather than through a character matrix.
aot_numeric <- do.call(cbind, lapply(df[all_aot_vars], aot_as_number))

# Row mean over the items that were answered; a row with no answered item
# yields NaN.
df$aot_total <- rowMeans(aot_numeric, na.rm = TRUE)

# Verification check
cat("\nVerification (5 random rows):\n")
set.seed(789)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))

# Display form of a mean: "NA" for NA/NaN, otherwise rounded to 4 decimals.
show_mean <- function(v) {
  if (is.na(v)) "NA" else round(v, 4)
}

for (i in sample_rows) {
  # Recompute the mean from the individual cells of row i.
  row_values <- vapply(
    all_aot_vars,
    function(v) aot_as_number(df[[v]][i]),
    numeric(1)
  )
  expected_mean <- mean(row_values, na.rm = TRUE)
  actual_mean <- df$aot_total[i]
  cat(sprintf(
    " Row %d: aot_total = %s (expected: %s)\n",
    i, show_mean(actual_mean), show_mean(expected_mean)
  ))
}

# =============================================================================
# 5. PROCESS CRT QUESTIONS
# =============================================================================

cat("\n=== 4. PROCESSING CRT QUESTIONS ===\n\n")

# Check variables exist
check_vars_exist(c("crt01", "crt02", "crt03"), c("crt_correct", "crt_int"))

# Answer key. For each item the first response scores as correct (1,0), the
# second as intuitive (0,1); any other response, including NA or empty,
# scores (0,0).
crt_items <- c("crt01", "crt02", "crt03")
crt_right <- c(crt01 = "5 cents", crt02 = "5 minutes", crt03 = "47 days")
crt_lure <- c(crt01 = "10 cents", crt02 = "100 minutes", crt03 = "24 days")

# Number of items per row whose response equals the keyed answer. %in% is used
# instead of == because it returns FALSE for NA, so a missing response adds 0
# rather than turning the whole row total into NA.
count_crt_matches <- function(key) {
  hits <- lapply(crt_items, function(item) df[[item]] %in% key[[item]])
  Reduce(`+`, hits)
}

df$crt_correct <- count_crt_matches(crt_right)
df$crt_int <- count_crt_matches(crt_lure)

# Check expected values exist
expected_crt01 <- c("5 cents", "10 cents")
expected_crt02 <- c("5 minutes", "100 minutes")
expected_crt03 <- c("47 days", "24 days")
check_values_exist("crt01", expected_crt01)
check_values_exist("crt02", expected_crt02)
check_values_exist("crt03", expected_crt03)

# Verification check
cat("\nVerification (5 random rows):\n")
set.seed(1011)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))

for (i in sample_rows) {
  cat(sprintf(" Row %d:\n", i))
  for (item in crt_items) {
    answer <- df[[item]][i]
    shown <- if (is.na(answer) || answer == "") "NA/empty" else answer
    cat(sprintf(
      " %s = %s -> crt_correct = %d, crt_int = %d\n",
      item, shown,
      as.integer(answer %in% crt_right[[item]]),
      as.integer(answer %in% crt_lure[[item]])
    ))
  }
  cat(sprintf(
    " Total: crt_correct = %d, crt_int = %d\n\n",
    df$crt_correct[i], df$crt_int[i]
  ))
}

# =============================================================================
# 6. CALCULATE icar_verbal
# =============================================================================

cat("\n=== 5. CALCULATING icar_verbal ===\n\n")

# Check variables exist
verbal_vars <- c("verbal01", "verbal02", "verbal03", "verbal04", "verbal05")
check_vars_exist(verbal_vars, "icar_verbal")

# Correct answers (same order as verbal_vars)
correct_verbal <- c("5", "8", "It's impossible to tell", "47", "Sunday")

# Calculate proportion correct. One logical vector per item is summed
# element-wise, which works for any number of rows (including one). %in% is
# FALSE for NA, so a missing response simply counts as not correct.
verbal_hits <- Map(
  function(item, key) df[[item]] %in% key,
  verbal_vars, correct_verbal
)
correct_count <- Reduce(`+`, verbal_hits)
df$icar_verbal <- correct_count / length(verbal_vars)

# Verification check
cat("\nVerification (5 random rows):\n")
set.seed(1213)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))

for (i in sample_rows) {
  # Recount from the raw cells of row i, then show the stored score next to it
  # so the two can be compared.
  correct <- sum(mapply(
    function(item, key) isTRUE(df[[item]][i] == key),
    verbal_vars, correct_verbal
  ))
  cat(sprintf(
    " Row %d: Correct = %d/5, icar_verbal = %s\n",
    i, correct, round(df$icar_verbal[i], 4)
  ))
}

# =============================================================================
# 7. CALCULATE icar_matrix
# =============================================================================

cat("\n=== 6. CALCULATING icar_matrix ===\n\n")

# Check variables exist
matrix_vars <- c("matrix01", "matrix02", "matrix03", "matrix04", "matrix05")
check_vars_exist(matrix_vars, "icar_matrix")

# Correct answers (same order as matrix_vars)
correct_matrix <- c("D", "E", "B", "B", "D")

# Calculate proportion correct. One logical vector per item is summed
# element-wise, which works for any number of rows (including one). %in% is
# FALSE for NA, so a missing response simply counts as not correct.
matrix_hits <- Map(
  function(item, key) df[[item]] %in% key,
  matrix_vars, correct_matrix
)
correct_count <- Reduce(`+`, matrix_hits)
df$icar_matrix <- correct_count / length(matrix_vars)

# Verification check
cat("\nVerification (5 random rows):\n")
set.seed(1415)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))

for (i in sample_rows) {
  # Recount from the raw cells of row i, then show the stored score next to it
  # so the two can be compared.
  correct <- sum(mapply(
    function(item, key) isTRUE(df[[item]][i] == key),
    matrix_vars, correct_matrix
  ))
  cat(sprintf(
    " Row %d: Correct = %d/5, icar_matrix = %s\n",
    i, correct, round(df$icar_matrix[i], 4)
  ))
}

# =============================================================================
# 8. CALCULATE icar_total
# =============================================================================

cat("\n=== 7. CALCULATING icar_total ===\n\n")

# Check variables exist
check_vars_exist(c(verbal_vars, matrix_vars), "icar_total")

# Calculate proportion correct across all 10 items. Item names and answer key
# are the verbal and matrix lists defined in the two preceding sections,
# concatenated in the same order.
all_items <- c(verbal_vars, matrix_vars)
all_correct <- c(correct_verbal, correct_matrix)

# One logical vector per item, summed element-wise; this works for any number
# of rows (including one). %in% is FALSE for NA, so a missing response counts
# as not correct.
all_hits <- Map(
  function(item, key) df[[item]] %in% key,
  all_items, all_correct
)
correct_count <- Reduce(`+`, all_hits)
df$icar_total <- correct_count / length(all_items)

# Verification check
cat("\nVerification (5 random rows):\n")
set.seed(1617)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))

for (i in sample_rows) {
  # Recount from the raw cells of row i, then show the stored score next to it
  # so the two can be compared.
  correct <- sum(mapply(
    function(item, key) isTRUE(df[[item]][i] == key),
    all_items, all_correct
  ))
  cat(sprintf(
    " Row %d: Correct = %d/10, icar_total = %s\n",
    i, correct, round(df$icar_total[i], 4)
  ))
}

# =============================================================================
# 9. RECODE demo_sex TO sex
# =============================================================================

cat("\n=== 8. RECODING demo_sex TO sex ===\n\n")

# Check variables exist
check_vars_exist("demo_sex", "sex")

# Check values
sex_vals <- unique(df$demo_sex[!is.na(df$demo_sex) & df$demo_sex != ""])
cat(paste(" Unique values in demo_sex:", paste(sex_vals, collapse = ", "), "\n"))

# Recode: male = 0, female = 1, else = 2 (comparison ignores letter case).
# Missing or empty answers become NA.
sex_unanswered <- is.na(df$demo_sex) | df$demo_sex == ""
sex_lower <- tolower(df$demo_sex)
sex_code <- ifelse(
  sex_lower == "male",
  0,
  ifelse(sex_lower == "female", 1, 2)
)
df$sex <- ifelse(sex_unanswered, NA, sex_code)

# Verification check
cat("\nVerification (5 random rows):\n")
set.seed(1819)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))

for (i in sample_rows) {
  raw_sex <- df$demo_sex[i]
  coded_sex <- df$sex[i]
  raw_shown <- if (is.na(raw_sex) || raw_sex == "") "NA/empty" else raw_sex
  coded_shown <- if (is.na(coded_sex)) "NA" else coded_sex
  cat(sprintf(
    " Row %d: demo_sex = %s -> sex = %s\n",
    i, raw_shown, coded_shown
  ))
}

# =============================================================================
# 10. RECODE demo_edu TO education
# =============================================================================

cat("\n=== 9. RECODING demo_edu TO education ===\n\n")

# Check variables exist
check_vars_exist("demo_edu", "education")

# Check values
edu_vals <- unique(df$demo_edu[!is.na(df$demo_edu) & df$demo_edu != ""])
cat(paste(" Unique values in demo_edu:", paste(edu_vals, collapse = ", "), "\n"))

# Recode: each group name maps to the demo_edu labels it absorbs. The order of
# the groups is also the order of the factor levels below.
edu_groups <- list(
  HS_TS = c("High School (or equivalent)", "Trade School"),
  C_Ug = c("College Diploma/Certificate", "University - Undergraduate"),
  grad_prof = c(
    "University - Graduate (Masters)", "University - PhD",
    "Professional Degree (ex. JD/MD)"
  )
)
edu_labels <- unlist(edu_groups, use.names = FALSE)
edu_codes <- rep(names(edu_groups), lengths(edu_groups))

# Labels outside the mapping are recoded to NA; report them so the loss is
# visible instead of silent.
edu_unmapped <- setdiff(edu_vals, edu_labels)
if (length(edu_unmapped) > 0) {
  cat(paste(" ⚠ Unmapped values in demo_edu (recoded to NA):", paste(edu_unmapped, collapse = ", "), "\n"))
}

# match() yields NA for NA, empty and unmapped labels, so those rows come out
# as NA.
edu_recoded <- edu_codes[match(df$demo_edu, edu_labels)]

# Convert to ordered factor
df$education <- factor(edu_recoded, levels = names(edu_groups), ordered = TRUE)

# Verification check
cat("\nVerification (5 random rows):\n")
set.seed(2021)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))

for (i in sample_rows) {
  source_val <- df$demo_edu[i]
  target_val <- df$education[i]
  source_shown <- if (is.na(source_val) || source_val == "") "NA/empty" else source_val
  target_shown <- if (is.na(target_val)) "NA" else as.character(target_val)
  cat(sprintf(
    " Row %d: demo_edu = %s -> education = %s\n",
    i, source_shown, target_shown
  ))
}

# =============================================================================
# 11. SAVE UPDATED DATA
# =============================================================================
# COMMENTED OUT: Uncomment when ready to save

# write.csv(df, "eohi3.csv", row.names = FALSE, na = "")
# cat("\nUpdated data saved to: eohi3.csv\n")
# cat(paste("Total rows:", nrow(df), "\n"))
# cat(paste("Total columns:", ncol(df), "\n"))