# eohi/eohi3/datap 07 - scales and recodes.r
#
# 463 lines
# 16 KiB
# R

library(dplyr)
# Work from the project directory so the relative file names used below resolve.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")
# Load the data set.
#   check.names = FALSE  -> column names are kept exactly as written in the file
#   na.strings = "NA"    -> only the literal string "NA" is declared as missing,
#                           so empty text cells stay "" rather than becoming NA
# NOTE(review): per read.csv's documentation, blank fields in numeric/logical
# columns are still read as NA regardless of na.strings - confirm this is the
# intended behaviour for the numeric items.
df <- read.csv(
  file = "eohi3.csv",
  na.strings = "NA",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# =============================================================================
# 1. CREATE BACKUP
# =============================================================================
# Copy the input file to eohi3_2.csv, replacing any previous backup.
# file.copy() reports failure only through its logical return value, so the
# result is checked and a warning is raised instead of silently continuing
# without a backup.
backup_ok <- file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
if (!isTRUE(backup_ok)) {
  warning("Backup of eohi3.csv to eohi3_2.csv failed", call. = FALSE)
}
# =============================================================================
# HELPER FUNCTION: Check variable existence and values
# =============================================================================
#' Stop unless every required column is present in a data frame.
#'
#' @param source_vars Character vector of column names that will be read.
#' @param target_vars Character vector of column names that will be written.
#' @param data Data frame whose column names are checked. Defaults to the
#'   script-level `df`, looked up lazily at call time, so existing two-argument
#'   calls keep working unchanged.
#' @return `TRUE` when nothing is missing. Otherwise an error is raised that
#'   lists the missing source variables; if all source variables are present
#'   but target variables are missing, the error lists those instead.
check_vars_exist <- function(source_vars, target_vars, data = df) {
  available <- names(data)
  missing_source <- setdiff(source_vars, available)
  missing_target <- setdiff(target_vars, available)
  # Source columns are reported first; target columns only if sources are fine.
  if (length(missing_source) > 0) {
    stop(paste("Missing source variables:", paste(missing_source, collapse = ", ")))
  }
  if (length(missing_target) > 0) {
    stop(paste("Missing target variables:", paste(missing_target, collapse = ", ")))
  }
  return(TRUE)
}
#' Compare the values observed in one column against an expected set.
#'
#' NA and empty-string entries are ignored. A warning line is printed with
#' cat() for expected values that never occur and for observed values that are
#' not in the expected set; nothing is printed when both sets agree.
#'
#' @param var_name Name of the column to inspect (single string).
#' @param expected_values Vector of values the column is expected to contain.
#' @param data Data frame holding the column. Defaults to the script-level
#'   `df`, looked up lazily at call time, so existing two-argument calls keep
#'   working unchanged.
#' @return A list with elements `missing` (expected but not observed) and
#'   `extra` (observed but not expected).
check_values_exist <- function(var_name, expected_values, data = df) {
  unique_vals <- unique(data[[var_name]])
  # Drop NA and "" so that only real responses are compared.
  unique_vals <- unique_vals[!is.na(unique_vals) & unique_vals != ""]
  missing_vals <- setdiff(expected_values, unique_vals)
  extra_vals <- setdiff(unique_vals, expected_values)
  if (length(missing_vals) > 0) {
    cat(paste(" ⚠ Expected values not found in", var_name, ":", paste(missing_vals, collapse = ", "), "\n"))
  }
  if (length(extra_vals) > 0) {
    cat(paste(" ⚠ Unexpected values found in", var_name, ":", paste(extra_vals, collapse = ", "), "\n"))
  }
  return(list(missing = missing_vals, extra = extra_vals))
}
# =============================================================================
# 2. RECODE other_length2 TO other_length
# =============================================================================
cat("\n=== 1. RECODING other_length2 TO other_length ===\n\n")
# Both the source and the target column must already be present in df
check_vars_exist("other_length2", "other_length")
# Report the distinct source values, ignoring NA and empty cells
cat("Checking source variable values...\n")
has_length <- !is.na(df$other_length2) & df$other_length2 != ""
length_vals <- unique(df$other_length2[has_length])
cat(paste(" Unique values in other_length2:", paste(length_vals, collapse = ", "), "\n"))
# Numeric view of the source; non-numeric entries such as "20+" or "" become
# NA here (coercion warnings suppressed) and are handled by their own rules.
num_length <- suppressWarnings(as.numeric(df$other_length2))
# Start from NA and fill in each category with a mask. The rules are mutually
# exclusive, so anything that matches none of them (including NA) stays NA.
length_band <- rep(NA_character_, length(num_length))
length_band[!is.na(num_length) & num_length >= 5 & num_length <= 9] <- "5-9"
length_band[!is.na(num_length) & num_length >= 10 & num_length <= 14] <- "10-14"
length_band[!is.na(num_length) & num_length >= 15 & num_length <= 19] <- "15-19"
length_band[df$other_length2 %in% "20+"] <- "20+"
length_band[df$other_length2 %in% ""] <- ""
df$other_length <- length_band
# Verification check: print source and recoded value for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(123)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
source_txt <- df$other_length2[sample_rows]
source_txt <- ifelse(is.na(source_txt), "NA", ifelse(source_txt == "", "empty", source_txt))
target_txt <- df$other_length[sample_rows]
target_txt <- ifelse(is.na(target_txt), "NA", ifelse(target_txt == "", "empty", target_txt))
cat(
  sprintf(" Row %d: other_length2 = %s -> other_length = %s\n",
          sample_rows, source_txt, target_txt),
  sep = ""
)
# =============================================================================
# 3. RECODE other_like2 TO other_like
# =============================================================================
cat("\n=== 2. RECODING other_like2 TO other_like ===\n\n")
# Both the source and the target column must already be present in df
check_vars_exist("other_like2", "other_like")
# Response labels, ordered from most negative to most positive
expected_like <- c("Dislike a great deal", "Dislike somewhat", "Neither like nor dislike",
                   "Like somewhat", "Like a great deal")
check_values_exist("other_like2", expected_like)
# Recode via position lookup: the k-th label maps to the k-th code.
# match() yields NA for NA, "" and unknown labels, so those become NA;
# empty cells are then restored to "".
like_codes <- c("-2", "-1", "0", "1", "2")
like_recoded <- like_codes[match(df$other_like2, expected_like)]
like_recoded[df$other_like2 %in% ""] <- ""
df$other_like <- like_recoded
# Verification check: print source and recoded value for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(456)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
source_txt <- df$other_like2[sample_rows]
source_txt <- ifelse(is.na(source_txt), "NA", ifelse(source_txt == "", "empty", source_txt))
target_txt <- df$other_like[sample_rows]
target_txt <- ifelse(is.na(target_txt), "NA", ifelse(target_txt == "", "empty", target_txt))
cat(
  sprintf(" Row %d: other_like2 = %s -> other_like = %s\n",
          sample_rows, source_txt, target_txt),
  sep = ""
)
# =============================================================================
# 4. CALCULATE aot_total
# =============================================================================
cat("\n=== 3. CALCULATING aot_total ===\n\n")
# Check variables exist
aot_vars <- c("aot01", "aot02", "aot03", "aot04_r", "aot05_r", "aot06_r", "aot07_r", "aot08")
check_vars_exist(aot_vars, "aot_total")
# Convert one item column to numeric, treating empty strings like NA.
# Working column by column avoids apply() on a data frame, which goes through
# as.matrix() (text-formatting every column when types are mixed) and collapses
# to a plain vector when there is only one row.
aot_as_number <- function(x) {
  x[!is.na(x) & x == ""] <- NA
  as.numeric(x)
}
# Reverse code aot04_r through aot07_r by flipping the sign.
# NOTE(review): sign flipping assumes the items are coded symmetrically around
# 0 - confirm against the questionnaire coding.
reverse_vars <- c("aot04_r", "aot05_r", "aot06_r", "aot07_r")
for (var in reverse_vars) {
  df[[paste0(var, "_reversed")]] <- -aot_as_number(df[[var]])
}
# The 8 columns entering the mean (4 reversed + 4 original)
all_aot_vars <- c("aot01", "aot02", "aot03", "aot04_r_reversed", "aot05_r_reversed",
                  "aot06_r_reversed", "aot07_r_reversed", "aot08")
# Numeric matrix with one column per item; cbind() keeps it a matrix for any
# number of rows.
aot_numeric <- do.call(cbind, lapply(df[all_aot_vars], aot_as_number))
# Row mean over the available items. rowMeans(na.rm = TRUE) returns NaN for a
# row without any valid item, so those rows are set to NA explicitly.
df$aot_total <- rowMeans(aot_numeric, na.rm = TRUE)
df$aot_total[is.nan(df$aot_total)] <- NA
# Verification check: recompute the mean from df for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(789)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  aot_nums <- vapply(all_aot_vars, function(v) aot_as_number(df[[v]][i]), numeric(1))
  expected_mean <- if (all(is.na(aot_nums))) NA_real_ else mean(aot_nums, na.rm = TRUE)
  actual_mean <- df$aot_total[i]
  cat(sprintf(" Row %d: aot_total = %s (expected: %s)\n",
              i,
              if (is.na(actual_mean)) "NA" else round(actual_mean, 4),
              if (is.na(expected_mean)) "NA" else round(expected_mean, 4)))
}
# =============================================================================
# 5. PROCESS CRT QUESTIONS
# =============================================================================
cat("\n=== 4. PROCESSING CRT QUESTIONS ===\n\n")
# Check variables exist
check_vars_exist(c("crt01", "crt02", "crt03"), c("crt_correct", "crt_int"))
# Answer keys per item: the correct answer and the intuitive (incorrect) answer
crt_correct_key <- c(crt01 = "5 cents", crt02 = "5 minutes", crt03 = "47 days")
crt_int_key <- c(crt01 = "10 cents", crt02 = "100 minutes", crt03 = "24 days")
# Each item adds 1 to crt_correct for the correct answer, 1 to crt_int for the
# intuitive answer, and 0 to both otherwise. %in% is used instead of == so a
# missing response counts as "otherwise" (0,0) rather than turning the whole
# total into NA.
df$crt_correct <- as.numeric(
  (df$crt01 %in% crt_correct_key[["crt01"]]) +
    (df$crt02 %in% crt_correct_key[["crt02"]]) +
    (df$crt03 %in% crt_correct_key[["crt03"]])
)
df$crt_int <- as.numeric(
  (df$crt01 %in% crt_int_key[["crt01"]]) +
    (df$crt02 %in% crt_int_key[["crt02"]]) +
    (df$crt03 %in% crt_int_key[["crt03"]])
)
# Check expected values exist (any other response is reported as unexpected)
expected_crt01 <- c("5 cents", "10 cents")
expected_crt02 <- c("5 minutes", "100 minutes")
expected_crt03 <- c("47 days", "24 days")
check_values_exist("crt01", expected_crt01)
check_values_exist("crt02", expected_crt02)
check_values_exist("crt03", expected_crt03)
# Verification check: per-item contribution and totals for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(1011)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  cat(sprintf(" Row %d:\n", i))
  for (item in names(crt_correct_key)) {
    response <- df[[item]][i]
    cat(sprintf(" %s = %s -> crt_correct = %d, crt_int = %d\n",
                item,
                if (is.na(response) || response == "") "NA/empty" else response,
                as.integer(response %in% crt_correct_key[[item]]),
                as.integer(response %in% crt_int_key[[item]])))
  }
  cat(sprintf(" Total: crt_correct = %d, crt_int = %d\n\n",
              df$crt_correct[i], df$crt_int[i]))
}
# =============================================================================
# 6. CALCULATE icar_verbal
# =============================================================================
cat("\n=== 5. CALCULATING icar_verbal ===\n\n")
# The five verbal items and the target column must already be present in df
verbal_vars <- c("verbal01", "verbal02", "verbal03", "verbal04", "verbal05")
check_vars_exist(verbal_vars, "icar_verbal")
# Answer key, in the same order as verbal_vars
correct_verbal <- c("5", "8", "It's impossible to tell", "47", "Sunday")
# Pair each item column with its key; the result has one column per item.
# Missing responses compare as NA and are dropped from the row sums, i.e.
# they count as not correct.
verbal_responses <- df[, verbal_vars]
verbal_hits <- mapply(
  function(answers, key) answers == key,
  verbal_responses,
  correct_verbal
)
correct_count <- rowSums(verbal_hits, na.rm = TRUE)
# Proportion correct out of 5 items
df$icar_verbal <- correct_count / 5
# Verification check: recount the correct answers for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(1213)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
verbal_recount <- vapply(
  sample_rows,
  function(r) {
    sum(mapply(function(col, key) df[[col]][r] == key, verbal_vars, correct_verbal),
        na.rm = TRUE)
  },
  integer(1)
)
cat(
  sprintf(" Row %d: Correct = %d/5, icar_verbal = %s\n",
          sample_rows, verbal_recount, round(verbal_recount / 5, 4)),
  sep = ""
)
# =============================================================================
# 7. CALCULATE icar_matrix
# =============================================================================
cat("\n=== 6. CALCULATING icar_matrix ===\n\n")
# The five matrix items and the target column must already be present in df
matrix_vars <- c("matrix01", "matrix02", "matrix03", "matrix04", "matrix05")
check_vars_exist(matrix_vars, "icar_matrix")
# Answer key, in the same order as matrix_vars
correct_matrix <- c("D", "E", "B", "B", "D")
# Pair each item column with its key; the result has one column per item.
# Missing responses compare as NA and are dropped from the row sums, i.e.
# they count as not correct.
matrix_responses <- df[, matrix_vars]
matrix_hits <- mapply(
  function(answers, key) answers == key,
  matrix_responses,
  correct_matrix
)
correct_count <- rowSums(matrix_hits, na.rm = TRUE)
# Proportion correct out of 5 items
df$icar_matrix <- correct_count / 5
# Verification check: recount the correct answers for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(1415)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
matrix_recount <- vapply(
  sample_rows,
  function(r) {
    sum(mapply(function(col, key) df[[col]][r] == key, matrix_vars, correct_matrix),
        na.rm = TRUE)
  },
  integer(1)
)
cat(
  sprintf(" Row %d: Correct = %d/5, icar_matrix = %s\n",
          sample_rows, matrix_recount, round(matrix_recount / 5, 4)),
  sep = ""
)
# =============================================================================
# 8. CALCULATE icar_total
# =============================================================================
cat("\n=== 7. CALCULATING icar_total ===\n\n")
# All ten item columns and the target column must already be present in df
check_vars_exist(c(verbal_vars, matrix_vars), "icar_total")
# Combined item list and answer key (verbal items first, then matrix items)
all_correct <- c(correct_verbal, correct_matrix)
all_responses <- df[, c(verbal_vars, matrix_vars)]
# Pair each item column with its key; the result has one column per item.
# Missing responses compare as NA and are dropped from the row sums, i.e.
# they count as not correct.
all_hits <- mapply(
  function(answers, key) answers == key,
  all_responses,
  all_correct
)
correct_count <- rowSums(all_hits, na.rm = TRUE)
# Proportion correct out of 10 items
df$icar_total <- correct_count / 10
# Verification check: recount the correct answers for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(1617)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
total_recount <- vapply(
  sample_rows,
  function(r) {
    sum(mapply(function(col, key) df[[col]][r] == key,
               c(verbal_vars, matrix_vars), all_correct),
        na.rm = TRUE)
  },
  integer(1)
)
cat(
  sprintf(" Row %d: Correct = %d/10, icar_total = %s\n",
          sample_rows, total_recount, round(total_recount / 10, 4)),
  sep = ""
)
# =============================================================================
# 9. RECODE demo_sex TO sex
# =============================================================================
cat("\n=== 8. RECODING demo_sex TO sex ===\n\n")
# Both the source and the target column must already be present in df
check_vars_exist("demo_sex", "sex")
# Report the distinct source values, ignoring NA and empty cells
has_sex <- !is.na(df$demo_sex) & df$demo_sex != ""
sex_vals <- unique(df$demo_sex[has_sex])
cat(paste(" Unique values in demo_sex:", paste(sex_vals, collapse = ", "), "\n"))
# Recode (case-insensitive): male = 0, female = 1, any other answer = 2.
# Everything starts as 2, the two named categories are filled in, and missing
# or empty responses are set to NA last so they take precedence.
sex_lower <- tolower(df$demo_sex)
sex_code <- rep(2, length(sex_lower))
sex_code[sex_lower %in% "male"] <- 0
sex_code[sex_lower %in% "female"] <- 1
sex_code[is.na(df$demo_sex) | df$demo_sex == ""] <- NA
df$sex <- sex_code
# Verification check: print source and recoded value for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(1819)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
source_txt <- df$demo_sex[sample_rows]
source_txt <- ifelse(is.na(source_txt) | source_txt == "", "NA/empty", source_txt)
target_txt <- df$sex[sample_rows]
target_txt <- ifelse(is.na(target_txt), "NA", target_txt)
cat(
  sprintf(" Row %d: demo_sex = %s -> sex = %s\n",
          sample_rows, source_txt, target_txt),
  sep = ""
)
# =============================================================================
# 10. RECODE demo_edu TO education
# =============================================================================
cat("\n=== 9. RECODING demo_edu TO education ===\n\n")
# Both the source and the target column must already be present in df
check_vars_exist("demo_edu", "education")
# Report the distinct source values, ignoring NA and empty cells
has_edu <- !is.na(df$demo_edu) & df$demo_edu != ""
edu_vals <- unique(df$demo_edu[has_edu])
cat(paste(" Unique values in demo_edu:", paste(edu_vals, collapse = ", "), "\n"))
# Lookup table: original response label -> collapsed education group.
# Labels not listed here (as well as NA and empty cells) have no match and
# therefore end up as NA.
edu_groups <- c(
  "High School (or equivalent)" = "HS_TS",
  "Trade School" = "HS_TS",
  "College Diploma/Certificate" = "C_Ug",
  "University - Undergraduate" = "C_Ug",
  "University - Graduate (Masters)" = "grad_prof",
  "University - PhD" = "grad_prof",
  "Professional Degree (ex. JD/MD)" = "grad_prof"
)
edu_recoded <- unname(edu_groups[match(df$demo_edu, names(edu_groups))])
# Store as an ordered factor: HS_TS < C_Ug < grad_prof
df$education <- factor(
  edu_recoded,
  levels = c("HS_TS", "C_Ug", "grad_prof"),
  ordered = TRUE
)
# Verification check: print source and recoded value for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(2021)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
source_txt <- df$demo_edu[sample_rows]
source_txt <- ifelse(is.na(source_txt) | source_txt == "", "NA/empty", source_txt)
target_txt <- df$education[sample_rows]
target_txt <- ifelse(is.na(target_txt), "NA", as.character(target_txt))
cat(
  sprintf(" Row %d: demo_edu = %s -> education = %s\n",
          sample_rows, source_txt, target_txt),
  sep = ""
)
# =============================================================================
# 11. SAVE UPDATED DATA
# =============================================================================
# COMMENTED OUT: Uncomment when ready to save
# write.csv(df, "eohi3.csv", row.names = FALSE, na = "")
# cat("\nUpdated data saved to: eohi3.csv\n")
# cat(paste("Total rows:", nrow(df), "\n"))
# cat(paste("Total columns:", ncol(df), "\n"))