library(dplyr)

# NOTE(review): hard-coded, machine-specific working directory. It is kept
# because every file path below (including the commented-out save step) is
# relative to it.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")

# Read the data (check.names = FALSE preserves the original column names).
# Only the literal string "NA" is converted to NA; empty cells in character
# columns stay as empty strings.
df <- read.csv(
  "eohi3.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# =============================================================================
# 1. CREATE BACKUP
# =============================================================================

# file.copy() returns FALSE instead of raising an error, so check the result
# explicitly: the save step at the bottom overwrites eohi3.csv.
backup_ok <- file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
if (!isTRUE(backup_ok)) {
  stop("Could not create backup file eohi3_2.csv", call. = FALSE)
}

# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

# Stop with an informative error unless every source and target variable is
# already a column of `data`.
#   source_vars: character vector of column names that are read.
#   target_vars: character vector of column names that are written.
#   data:        data frame to check (defaults to the global `df`).
# Returns TRUE invisibly when all columns are present.
check_vars_exist <- function(source_vars, target_vars, data = df) {
  missing_source <- setdiff(source_vars, names(data))
  missing_target <- setdiff(target_vars, names(data))

  if (length(missing_source) > 0) {
    stop(paste("Missing source variables:",
               paste(missing_source, collapse = ", ")))
  }
  if (length(missing_target) > 0) {
    stop(paste("Missing target variables:",
               paste(missing_target, collapse = ", ")))
  }

  invisible(TRUE)
}

# Compare the distinct non-missing, non-empty values of one column with the
# values the recode expects, printing a warning line for each mismatch.
#   var_name:        name of the column to inspect.
#   expected_values: vector of values the recode knows how to handle.
#   data:            data frame to check (defaults to the global `df`).
# Returns (invisibly) list(missing = expected values never observed,
#                          extra   = observed values that were not expected).
check_values_exist <- function(var_name, expected_values, data = df) {
  unique_vals <- unique(data[[var_name]])
  unique_vals <- unique_vals[!is.na(unique_vals) & unique_vals != ""]

  missing_vals <- setdiff(expected_values, unique_vals)
  extra_vals <- setdiff(unique_vals, expected_values)

  if (length(missing_vals) > 0) {
    cat(paste(" ⚠ Expected values not found in", var_name, ":",
              paste(missing_vals, collapse = ", "), "\n"))
  }
  if (length(extra_vals) > 0) {
    cat(paste(" ⚠ Unexpected values found in", var_name, ":",
              paste(extra_vals, collapse = ", "), "\n"))
  }

  invisible(list(missing = missing_vals, extra = extra_vals))
}

# Reproducibly pick up to `n` row indices of `data` for a spot check.
# seq_len() keeps this safe for an empty data frame (1:0 would count down).
# Note: this resets the global RNG seed, as the verification steps intend.
pick_verification_rows <- function(seed, data = df, n = 5) {
  set.seed(seed)
  sample(seq_len(nrow(data)), min(n, nrow(data)))
}

# Format one scalar for the verification printout, substituting labels for
# NA and for the empty string.
show_value <- function(x, na_label = "NA", empty_label = "empty") {
  if (is.na(x)) {
    return(na_label)
  }
  if (x == "") {
    return(empty_label)
  }
  as.character(x)
}

# Convert a survey column to numeric, treating empty strings as missing.
# Numeric columns are passed through untouched to avoid a round trip through
# character.
as_numeric_item <- function(x) {
  if (is.numeric(x)) {
    return(as.numeric(x))
  }
  x <- as.character(x)
  x[!is.na(x) & x == ""] <- NA_character_
  as.numeric(x)
}

# Proportion of items answered correctly, per row.
#   data:       data frame holding the responses.
#   item_vars:  character vector of response column names.
#   answer_key: correct answers, in the same order as `item_vars`.
# %in% never returns NA, so missing or empty responses count as incorrect.
# Reduce() keeps the result a plain vector even for a one-row data frame,
# where rowSums(sapply(...)) would fail.
score_items <- function(data, item_vars, answer_key) {
  stopifnot(length(item_vars) == length(answer_key))
  is_correct <- Map(
    function(var, key) data[[var]] %in% key,
    item_vars,
    answer_key
  )
  Reduce(`+`, is_correct) / length(item_vars)
}

# Print a per-row recount of correct ICAR answers for a spot check.
#   item_vars, answer_key: as in score_items().
#   label: name of the score variable shown in the printout.
#   seed:  RNG seed used to pick the rows.
verify_item_score <- function(item_vars, answer_key, label, seed, data = df) {
  n_items <- length(item_vars)
  for (i in pick_verification_rows(seed, data)) {
    hits <- vapply(
      seq_len(n_items),
      function(j) isTRUE(data[[item_vars[j]]][i] == answer_key[j]),
      logical(1)
    )
    correct <- sum(hits)
    prop <- correct / n_items
    cat(sprintf(" Row %d: Correct = %d/%d, %s = %s\n",
                i, correct, n_items, label, round(prop, 4)))
  }
}

# =============================================================================
# 2. RECODE other_length2 TO other_length
# =============================================================================

cat("\n=== 1. RECODING other_length2 TO other_length ===\n\n")

# Check variables exist
check_vars_exist("other_length2", "other_length")

# Check values in source
cat("Checking source variable values...\n")
length_vals <- unique(
  df$other_length2[!is.na(df$other_length2) & df$other_length2 != ""]
)
cat(paste(" Unique values in other_length2:",
          paste(length_vals, collapse = ", "), "\n"))

# Recode: NA stays NA, "" stays "", "20+" is kept verbatim, and numeric values
# are binned into 5-9 / 10-14 / 15-19. Everything else becomes NA.
raw_length <- df$other_length2
# Convert once; non-numeric entries ("", "20+") deliberately become NA here.
num_length <- suppressWarnings(as.numeric(raw_length))

in_band <- function(lo, hi) {
  !is.na(num_length) & num_length >= lo & num_length <= hi
}

length_band <- rep(NA_character_, nrow(df))
length_band[!is.na(raw_length) & raw_length == ""] <- ""
length_band[!is.na(raw_length) & raw_length == "20+"] <- "20+"
length_band[in_band(5, 9)] <- "5-9"
length_band[in_band(10, 14)] <- "10-14"
length_band[in_band(15, 19)] <- "15-19"
df$other_length <- length_band

# Report non-missing source values that fell through every band, so the
# recode does not silently drop data.
unmapped_length <- unique(raw_length[!is.na(raw_length) & is.na(length_band)])
if (length(unmapped_length) > 0) {
  cat(paste(" ⚠ Values in other_length2 recoded to NA :",
            paste(unmapped_length, collapse = ", "), "\n"))
}

# Verification check
cat("\nVerification (5 random rows):\n")
for (i in pick_verification_rows(123)) {
  cat(sprintf(" Row %d: other_length2 = %s -> other_length = %s\n",
              i,
              show_value(df$other_length2[i]),
              show_value(df$other_length[i])))
}

# =============================================================================
# 3. RECODE other_like2 TO other_like
# =============================================================================

cat("\n=== 2. RECODING other_like2 TO other_like ===\n\n")

# Check variables exist
check_vars_exist("other_like2", "other_like")

# Response label -> code (stored as character, matching the original recode)
like_codes <- c(
  "Dislike a great deal" = "-2",
  "Dislike somewhat" = "-1",
  "Neither like nor dislike" = "0",
  "Like somewhat" = "1",
  "Like a great deal" = "2"
)

# Check expected values exist
expected_like <- names(like_codes)
check_values_exist("other_like2", expected_like)

# Recode: known labels get their code, "" stays "", everything else is NA
like_recoded <- unname(like_codes)[match(df$other_like2, expected_like)]
like_recoded[!is.na(df$other_like2) & df$other_like2 == ""] <- ""
df$other_like <- like_recoded

# Verification check
cat("\nVerification (5 random rows):\n")
for (i in pick_verification_rows(456)) {
  cat(sprintf(" Row %d: other_like2 = %s -> other_like = %s\n",
              i,
              show_value(df$other_like2[i]),
              show_value(df$other_like[i])))
}

# =============================================================================
# 4. CALCULATE aot_total
# =============================================================================

cat("\n=== 3. CALCULATING aot_total ===\n\n")

# Check variables exist
aot_vars <- c("aot01", "aot02", "aot03", "aot04_r", "aot05_r", "aot06_r",
              "aot07_r", "aot08")
check_vars_exist(aot_vars, "aot_total")

# Reverse code aot04_r through aot07_r by negation, stored in new
# "<name>_reversed" columns.
# NOTE(review): negation is only a valid reversal if the response scale is
# centred on 0 -- confirm against the questionnaire coding.
reverse_vars <- c("aot04_r", "aot05_r", "aot06_r", "aot07_r")
for (var in reverse_vars) {
  df[[paste0(var, "_reversed")]] <- as_numeric_item(df[[var]]) * -1
}

# Mean of all 8 variables (4 reversed + 4 original)
all_aot_vars <- c("aot01", "aot02", "aot03", "aot04_r_reversed",
                  "aot05_r_reversed", "aot06_r_reversed", "aot07_r_reversed",
                  "aot08")

# cbind() over the columns always yields a matrix, even for a single row
# (apply() on a one-row data frame would collapse to a vector).
aot_numeric <- do.call(cbind, lapply(df[all_aot_vars], as_numeric_item))

# rowMeans(na.rm = TRUE) gives NaN when a row has no answers at all; store NA
# so the value is treated as missing when the file is written.
aot_total <- rowMeans(aot_numeric, na.rm = TRUE)
aot_total[is.nan(aot_total)] <- NA_real_
df$aot_total <- aot_total

# Verification check
cat("\nVerification (5 random rows):\n")
for (i in pick_verification_rows(789)) {
  aot_nums <- as_numeric_item(unlist(df[i, all_aot_vars], use.names = FALSE))
  expected_mean <- mean(aot_nums, na.rm = TRUE)
  actual_mean <- df$aot_total[i]
  cat(sprintf(" Row %d: aot_total = %s (expected: %s)\n",
              i,
              if (is.na(actual_mean)) "NA" else round(actual_mean, 4),
              if (is.na(expected_mean)) "NA" else round(expected_mean, 4)))
}

# =============================================================================
# 5. PROCESS CRT QUESTIONS
# =============================================================================

cat("\n=== 4. PROCESSING CRT QUESTIONS ===\n\n")

# Check variables exist
crt_vars <- c("crt01", "crt02", "crt03")
check_vars_exist(crt_vars, c("crt_correct", "crt_int"))

# Per item: the correct answer and the intuitive (incorrect) answer.
# Any other response, including empty or NA, scores (0, 0).
crt_correct_key <- c(crt01 = "5 cents", crt02 = "5 minutes", crt03 = "47 days")
crt_intuitive_key <- c(crt01 = "10 cents", crt02 = "100 minutes",
                       crt03 = "24 days")

# %in% returns FALSE (not NA) for missing responses, so an NA answer counts as
# 0 instead of turning the whole total into NA.
df$crt_correct <- Reduce(
  `+`,
  lapply(crt_vars, function(v) df[[v]] %in% crt_correct_key[[v]])
)
df$crt_int <- Reduce(
  `+`,
  lapply(crt_vars, function(v) df[[v]] %in% crt_intuitive_key[[v]])
)

# Check expected values exist
for (v in crt_vars) {
  check_values_exist(v, c(crt_correct_key[[v]], crt_intuitive_key[[v]]))
}

# Verification check
cat("\nVerification (5 random rows):\n")
for (i in pick_verification_rows(1011)) {
  cat(sprintf(" Row %d:\n", i))
  for (v in crt_vars) {
    answer <- df[[v]][i]
    cat(sprintf(" %s = %s -> crt_correct = %d, crt_int = %d\n",
                v,
                show_value(answer, "NA/empty", "NA/empty"),
                as.integer(answer %in% crt_correct_key[[v]]),
                as.integer(answer %in% crt_intuitive_key[[v]])))
  }
  cat(sprintf(" Total: crt_correct = %d, crt_int = %d\n\n",
              df$crt_correct[i], df$crt_int[i]))
}

# =============================================================================
# 6. CALCULATE icar_verbal
# =============================================================================

cat("\n=== 5. CALCULATING icar_verbal ===\n\n")

# Check variables exist
verbal_vars <- c("verbal01", "verbal02", "verbal03", "verbal04", "verbal05")
check_vars_exist(verbal_vars, "icar_verbal")

# Correct answers
correct_verbal <- c("5", "8", "It's impossible to tell", "47", "Sunday")

# Proportion correct
df$icar_verbal <- score_items(df, verbal_vars, correct_verbal)

# Verification check
cat("\nVerification (5 random rows):\n")
verify_item_score(verbal_vars, correct_verbal, "icar_verbal", 1213)

# =============================================================================
# 7. CALCULATE icar_matrix
# =============================================================================

cat("\n=== 6. CALCULATING icar_matrix ===\n\n")

# Check variables exist
matrix_vars <- c("matrix01", "matrix02", "matrix03", "matrix04", "matrix05")
check_vars_exist(matrix_vars, "icar_matrix")

# Correct answers
correct_matrix <- c("D", "E", "B", "B", "D")

# Proportion correct
df$icar_matrix <- score_items(df, matrix_vars, correct_matrix)

# Verification check
cat("\nVerification (5 random rows):\n")
verify_item_score(matrix_vars, correct_matrix, "icar_matrix", 1415)

# =============================================================================
# 8. CALCULATE icar_total
# =============================================================================

cat("\n=== 7. CALCULATING icar_total ===\n\n")

# Check variables exist
icar_vars <- c(verbal_vars, matrix_vars)
check_vars_exist(icar_vars, "icar_total")

# Proportion correct across all 10 items
all_correct <- c(correct_verbal, correct_matrix)
df$icar_total <- score_items(df, icar_vars, all_correct)

# Verification check
cat("\nVerification (5 random rows):\n")
verify_item_score(icar_vars, all_correct, "icar_total", 1617)

# =============================================================================
# 9. RECODE demo_sex TO sex
# =============================================================================

cat("\n=== 8. RECODING demo_sex TO sex ===\n\n")

# Check variables exist
check_vars_exist("demo_sex", "sex")

# Check values
sex_vals <- unique(df$demo_sex[!is.na(df$demo_sex) & df$demo_sex != ""])
cat(paste(" Unique values in demo_sex:",
          paste(sex_vals, collapse = ", "), "\n"))

# Recode: male = 0, female = 1, any other non-empty value = 2, missing = NA.
# The comparison is case-insensitive.
sex_lower <- tolower(df$demo_sex)
sex_code <- rep(2, nrow(df))
sex_code[sex_lower %in% "male"] <- 0
sex_code[sex_lower %in% "female"] <- 1
sex_code[is.na(df$demo_sex) | df$demo_sex == ""] <- NA
df$sex <- sex_code

# Verification check
cat("\nVerification (5 random rows):\n")
for (i in pick_verification_rows(1819)) {
  cat(sprintf(" Row %d: demo_sex = %s -> sex = %s\n",
              i,
              show_value(df$demo_sex[i], "NA/empty", "NA/empty"),
              show_value(df$sex[i])))
}

# =============================================================================
# 10. RECODE demo_edu TO education
# =============================================================================

cat("\n=== 9. RECODING demo_edu TO education ===\n\n")

# Check variables exist
check_vars_exist("demo_edu", "education")

# Check values
edu_vals <- unique(df$demo_edu[!is.na(df$demo_edu) & df$demo_edu != ""])
cat(paste(" Unique values in demo_edu:",
          paste(edu_vals, collapse = ", "), "\n"))

# Survey label -> collapsed education band
edu_bands <- c(
  "High School (or equivalent)" = "HS_TS",
  "Trade School" = "HS_TS",
  "College Diploma/Certificate" = "C_Ug",
  "University - Undergraduate" = "C_Ug",
  "University - Graduate (Masters)" = "grad_prof",
  "University - PhD" = "grad_prof",
  "Professional Degree (ex. JD/MD)" = "grad_prof"
)

# Labels outside the map are recoded to NA below; flag them so this is not
# silent.
check_values_exist("demo_edu", names(edu_bands))

# Recode: NA, "" and unknown labels all become NA
edu_recoded <- unname(edu_bands)[match(df$demo_edu, names(edu_bands))]

# Convert to ordered factor
df$education <- factor(
  edu_recoded,
  levels = c("HS_TS", "C_Ug", "grad_prof"),
  ordered = TRUE
)

# Verification check
cat("\nVerification (5 random rows):\n")
for (i in pick_verification_rows(2021)) {
  cat(sprintf(" Row %d: demo_edu = %s -> education = %s\n",
              i,
              show_value(df$demo_edu[i], "NA/empty", "NA/empty"),
              show_value(as.character(df$education[i]))))
}

# =============================================================================
# 11. SAVE UPDATED DATA
# =============================================================================

# COMMENTED OUT: Uncomment when ready to save
# write.csv(df, "eohi3.csv", row.names = FALSE, na = "")
# cat("\nUpdated data saved to: eohi3.csv\n")
# cat(paste("Total rows:", nrow(df), "\n"))
# cat(paste("Total columns:", ncol(df), "\n"))