# eohi/eohi3/datap 07 - scales and recodes.r

library(dplyr)

# Run from the project directory so the relative file names below resolve.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")

# Read the data (check.names = FALSE preserves the original column names).
# Only the literal string "NA" is converted to NA. Empty cells stay as empty
# strings in character columns; in numeric/logical columns read.csv() still
# treats blank fields as NA.
df <- read.csv("eohi3.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = "NA")

# =============================================================================
# 1. CREATE BACKUP
# =============================================================================
# file.copy() reports failure by returning FALSE instead of raising an error,
# so the result is checked explicitly: carrying on without a backup would
# defeat the purpose of this step.
backup_ok <- file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
if (!backup_ok) {
  stop("Backup failed: could not copy eohi3.csv to eohi3_2.csv", call. = FALSE)
}

# =============================================================================
# HELPER FUNCTION: Check variable existence and values
# =============================================================================

# Verify that every requested column is present in the global data frame `df`.
#
# source_vars: character vector of column names that are read from.
# target_vars: character vector of column names that are written to.
#
# Stops with an error listing the absent names (source columns are checked
# first); returns TRUE when nothing is missing.
check_vars_exist <- function(source_vars, target_vars) {
  known_cols <- names(df)
  absent_source <- unique(source_vars[!(source_vars %in% known_cols)])
  absent_target <- unique(target_vars[!(target_vars %in% known_cols)])

  if (length(absent_source) > 0) {
    stop(paste("Missing source variables:", toString(absent_source)))
  }
  if (length(absent_target) > 0) {
    stop(paste("Missing target variables:", toString(absent_target)))
  }
  TRUE
}

# Compare the values observed in one column of the global data frame `df`
# against a set of expected values.
#
# var_name:        name of the column to inspect.
# expected_values: vector of values the column is expected to contain.
#
# NA and empty-string entries are ignored. A warning line is printed for
# expected values that never occur and for observed values that were not
# expected. Returns list(missing = ..., extra = ...).
check_values_exist <- function(var_name, expected_values) {
  column <- df[[var_name]]
  has_value <- !is.na(column) & column != ""
  observed <- unique(column[has_value])

  not_seen <- setdiff(expected_values, observed)
  not_expected <- setdiff(observed, expected_values)

  if (length(not_seen) > 0) {
    cat("  ⚠ Expected values not found in", var_name, ":", toString(not_seen), "\n")
  }
  if (length(not_expected) > 0) {
    cat("  ⚠ Unexpected values found in", var_name, ":", toString(not_expected), "\n")
  }
  list(missing = not_seen, extra = not_expected)
}

# =============================================================================
# 2. RECODE other_length2 TO other_length
# =============================================================================

cat("\n=== 1. RECODING other_length2 TO other_length ===\n\n")

# Check variables exist
check_vars_exist("other_length2", "other_length")

# Check values in source
cat("Checking source variable values...\n")
length_vals <- unique(df$other_length2[!is.na(df$other_length2) & df$other_length2 != ""])
cat(paste("  Unique values in other_length2:", paste(length_vals, collapse = ", "), "\n"))

# Recode raw lengths into bins:
#   NA      -> NA
#   ""      -> ""
#   "20+"   -> "20+"
#   5..9    -> "5-9", 10..14 -> "10-14", 15..19 -> "15-19"
#   other   -> NA (reported below)
length_src <- as.character(df$other_length2)
# Non-numeric entries such as "20+" cannot be parsed and become NA here; that
# is expected, so the coercion warning is suppressed.
num_length <- suppressWarnings(as.numeric(length_src))

# Start from an all-NA character vector so the result type never depends on
# which values happen to be present, then fill each (mutually exclusive) bin.
length_bin <- rep(NA_character_, length(length_src))
length_bin[!is.na(length_src) & length_src == ""] <- ""
length_bin[!is.na(length_src) & length_src == "20+"] <- "20+"
length_bin[!is.na(num_length) & num_length >= 5 & num_length <= 9] <- "5-9"
length_bin[!is.na(num_length) & num_length >= 10 & num_length <= 14] <- "10-14"
length_bin[!is.na(num_length) & num_length >= 15 & num_length <= 19] <- "15-19"
df$other_length <- length_bin

# Report non-missing source values that fell outside every bin instead of
# letting them turn into NA unnoticed.
unmapped_length <- unique(length_src[is.na(length_bin) & !is.na(length_src)])
if (length(unmapped_length) > 0) {
  cat(paste("  ⚠ Values in other_length2 with no matching bin (recoded to NA):",
            paste(unmapped_length, collapse = ", "), "\n"))
}

# Verification check: print source and recoded value for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(123)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  source_val <- df$other_length2[i]
  target_val <- df$other_length[i]
  source_txt <- if (is.na(source_val)) "NA" else if (source_val == "") "empty" else source_val
  target_txt <- if (is.na(target_val)) "NA" else if (target_val == "") "empty" else target_val
  cat(sprintf("  Row %d: other_length2 = %s -> other_length = %s\n",
              i, source_txt, target_txt))
}

# =============================================================================
# 3. RECODE other_like2 TO other_like
# =============================================================================

cat("\n=== 2. RECODING other_like2 TO other_like ===\n\n")

# Check variables exist
check_vars_exist("other_like2", "other_like")

# Check expected values exist
expected_like <- c("Dislike a great deal", "Dislike somewhat", "Neither like nor dislike",
                   "Like somewhat", "Like a great deal")
check_values_exist("other_like2", expected_like)

# Recode the five labels to "-2".."2" (as strings) with a positional lookup:
# like_codes[k] is the code for expected_like[k]. match() returns NA for NA
# and for any label not in expected_like, so those rows become NA. Empty
# strings are passed through as empty strings. The result is always a
# character vector, whatever values are present.
like_codes <- c("-2", "-1", "0", "1", "2")
like_src <- df$other_like2
like_recoded <- like_codes[match(like_src, expected_like)]
like_recoded[!is.na(like_src) & like_src == ""] <- ""
df$other_like <- like_recoded

# Verification check: print source and recoded value for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(456)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  source_val <- df$other_like2[i]
  target_val <- df$other_like[i]
  source_txt <- if (is.na(source_val)) "NA" else if (source_val == "") "empty" else source_val
  target_txt <- if (is.na(target_val)) "NA" else if (target_val == "") "empty" else target_val
  cat(sprintf("  Row %d: other_like2 = %s -> other_like = %s\n",
              i, source_txt, target_txt))
}

# =============================================================================
# 4. CALCULATE aot_total
# =============================================================================

cat("\n=== 3. CALCULATING aot_total ===\n\n")

# Check variables exist
aot_vars <- c("aot01", "aot02", "aot03", "aot04_r", "aot05_r", "aot06_r", "aot07_r", "aot08")
check_vars_exist(aot_vars, "aot_total")

# Reverse code aot04_r through aot07_r by flipping the sign. Results go into
# new "<name>_reversed" columns; empty strings and NA become NA.
reverse_vars <- c("aot04_r", "aot05_r", "aot06_r", "aot07_r")
for (item in reverse_vars) {
  raw_item <- df[[item]]
  raw_item[!is.na(raw_item) & raw_item == ""] <- NA
  df[[paste0(item, "_reversed")]] <- -as.numeric(raw_item)
}

# Calculate mean of all 8 variables (4 reversed + 4 original)
all_aot_vars <- c("aot01", "aot02", "aot03", "aot04_r_reversed", "aot05_r_reversed",
                  "aot06_r_reversed", "aot07_r_reversed", "aot08")

# Convert each item to numeric column by column and bind the columns together.
# cbind() always yields a matrix, so rowMeans() also works when df has a
# single row (apply() on the data frame would collapse to a plain vector
# there), and the values are not routed through a character matrix.
aot_numeric <- do.call(cbind, lapply(df[all_aot_vars], function(x) {
  x[!is.na(x) & x == ""] <- NA
  as.numeric(x)
}))

# Calculate mean over the answered items. rowMeans(na.rm = TRUE) gives NaN for
# a row with no answered items; store NA there so missing scores are
# represented consistently.
aot_mean <- rowMeans(aot_numeric, na.rm = TRUE)
aot_mean[is.nan(aot_mean)] <- NA
df$aot_total <- aot_mean

# Verification check: recompute the mean from the data frame row and print it
# next to the stored value for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(789)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  row_vals <- unlist(df[i, all_aot_vars], use.names = FALSE)
  row_vals[!is.na(row_vals) & row_vals == ""] <- NA
  expected_mean <- mean(as.numeric(row_vals), na.rm = TRUE)
  actual_mean <- df$aot_total[i]
  cat(sprintf("  Row %d: aot_total = %s (expected: %s)\n",
              i,
              if (is.na(actual_mean)) "NA" else round(actual_mean, 4),
              if (is.na(expected_mean)) "NA" else round(expected_mean, 4)))
}

# =============================================================================
# 5. PROCESS CRT QUESTIONS
# =============================================================================

cat("\n=== 4. PROCESSING CRT QUESTIONS ===\n\n")

# Check variables exist
check_vars_exist(c("crt01", "crt02", "crt03"), c("crt_correct", "crt_int"))

# Answer key: for each item, the correct answer and the intuitive (wrong) one.
# Any other response, including NA or empty, scores (0, 0).
crt_key <- list(
  crt01 = c(correct = "5 cents", intuitive = "10 cents"),
  crt02 = c(correct = "5 minutes", intuitive = "100 minutes"),
  crt03 = c(correct = "47 days", intuitive = "24 days")
)

# Count correct and intuitive answers across the three items. %in% never
# returns NA, so a missing response contributes 0 instead of turning the whole
# score into NA (which `ifelse(x == ..., ...)` does for NA input).
df$crt_correct <- as.numeric(
  (df$crt01 %in% crt_key$crt01[["correct"]]) +
    (df$crt02 %in% crt_key$crt02[["correct"]]) +
    (df$crt03 %in% crt_key$crt03[["correct"]])
)
df$crt_int <- as.numeric(
  (df$crt01 %in% crt_key$crt01[["intuitive"]]) +
    (df$crt02 %in% crt_key$crt02[["intuitive"]]) +
    (df$crt03 %in% crt_key$crt03[["intuitive"]])
)

# Check expected values exist
expected_crt01 <- c("5 cents", "10 cents")
expected_crt02 <- c("5 minutes", "100 minutes")
expected_crt03 <- c("47 days", "24 days")
check_values_exist("crt01", expected_crt01)
check_values_exist("crt02", expected_crt02)
check_values_exist("crt03", expected_crt03)

# Verification check: per-item scoring plus stored totals for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(1011)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  cat(sprintf("  Row %d:\n", i))
  for (item in names(crt_key)) {
    answer <- df[[item]][i]
    shown <- if (is.na(answer) || answer == "") "NA/empty" else answer
    cat(sprintf("    %s = %s -> crt_correct = %d, crt_int = %d\n",
                item, shown,
                as.integer(answer %in% crt_key[[item]][["correct"]]),
                as.integer(answer %in% crt_key[[item]][["intuitive"]])))
  }
  cat(sprintf("    Total: crt_correct = %d, crt_int = %d\n\n",
              df$crt_correct[i], df$crt_int[i]))
}

# =============================================================================
# 6. CALCULATE icar_verbal
# =============================================================================

cat("\n=== 5. CALCULATING icar_verbal ===\n\n")

# Check variables exist
verbal_vars <- c("verbal01", "verbal02", "verbal03", "verbal04", "verbal05")
check_vars_exist(verbal_vars, "icar_verbal")

# Correct answers (same order as verbal_vars)
correct_verbal <- c("5", "8", "It's impossible to tell", "47", "Sunday")

# Calculate proportion correct. Each item yields one logical vector (TRUE where
# the response equals the key); %in% treats NA as "not correct". Adding the
# vectors element-wise gives the per-row count and, unlike sapply() + rowSums(),
# also works when df has a single row.
verbal_hits <- Map(function(item, key) df[[item]] %in% key, verbal_vars, correct_verbal)
correct_count <- Reduce(`+`, verbal_hits)
df$icar_verbal <- correct_count / 5

# Verification check: recount the correct answers for up to 5 random rows and
# print them next to the value actually stored in icar_verbal
cat("\nVerification (5 random rows):\n")
set.seed(1213)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  correct <- sum(vapply(
    seq_along(verbal_vars),
    function(j) df[[verbal_vars[j]]][i] %in% correct_verbal[j],
    logical(1)
  ))
  cat(sprintf("  Row %d: Correct = %d/5, icar_verbal = %s\n",
              i, correct, round(df$icar_verbal[i], 4)))
}

# =============================================================================
# 7. CALCULATE icar_matrix
# =============================================================================

cat("\n=== 6. CALCULATING icar_matrix ===\n\n")

# Check variables exist
matrix_vars <- c("matrix01", "matrix02", "matrix03", "matrix04", "matrix05")
check_vars_exist(matrix_vars, "icar_matrix")

# Correct answers (same order as matrix_vars)
correct_matrix <- c("D", "E", "B", "B", "D")

# Calculate proportion correct. Each item yields one logical vector (TRUE where
# the response equals the key); %in% treats NA as "not correct". Adding the
# vectors element-wise gives the per-row count and, unlike sapply() + rowSums(),
# also works when df has a single row.
matrix_hits <- Map(function(item, key) df[[item]] %in% key, matrix_vars, correct_matrix)
correct_count <- Reduce(`+`, matrix_hits)
df$icar_matrix <- correct_count / 5

# Verification check: recount the correct answers for up to 5 random rows and
# print them next to the value actually stored in icar_matrix
cat("\nVerification (5 random rows):\n")
set.seed(1415)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  correct <- sum(vapply(
    seq_along(matrix_vars),
    function(j) df[[matrix_vars[j]]][i] %in% correct_matrix[j],
    logical(1)
  ))
  cat(sprintf("  Row %d: Correct = %d/5, icar_matrix = %s\n",
              i, correct, round(df$icar_matrix[i], 4)))
}

# =============================================================================
# 8. CALCULATE icar_total
# =============================================================================

cat("\n=== 7. CALCULATING icar_total ===\n\n")

# Check variables exist
check_vars_exist(c(verbal_vars, matrix_vars), "icar_total")

# Calculate proportion correct across all 10 items (5 verbal + 5 matrix).
# Items and keys are concatenated in the same order so position k of one
# matches position k of the other.
all_items <- c(verbal_vars, matrix_vars)
all_correct <- c(correct_verbal, correct_matrix)

# One logical vector per item (%in% treats NA as "not correct"), added
# element-wise into a per-row count. Unlike sapply() + rowSums(), this also
# works when df has a single row.
all_hits <- Map(function(item, key) df[[item]] %in% key, all_items, all_correct)
correct_count <- Reduce(`+`, all_hits)
df$icar_total <- correct_count / 10

# Verification check: recount the correct answers for up to 5 random rows and
# print them next to the value actually stored in icar_total
cat("\nVerification (5 random rows):\n")
set.seed(1617)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  correct <- sum(vapply(
    seq_along(all_items),
    function(j) df[[all_items[j]]][i] %in% all_correct[j],
    logical(1)
  ))
  cat(sprintf("  Row %d: Correct = %d/10, icar_total = %s\n",
              i, correct, round(df$icar_total[i], 4)))
}

# =============================================================================
# 9. RECODE demo_sex TO sex
# =============================================================================

cat("\n=== 8. RECODING demo_sex TO sex ===\n\n")

# Check variables exist
check_vars_exist("demo_sex", "sex")

# Show the distinct non-missing, non-empty source values
sex_present <- !is.na(df$demo_sex) & df$demo_sex != ""
sex_vals <- unique(df$demo_sex[sex_present])
cat(paste("  Unique values in demo_sex:", paste(sex_vals, collapse = ", "), "\n"))

# Recode (case-insensitive): male = 0, female = 1, anything else = 2;
# NA or empty source stays NA.
# match() gives position 1 for "male", 2 for "female" and 3 for everything
# else, so subtracting 1 yields the codes 0 / 1 / 2.
sex_blank <- is.na(df$demo_sex) | df$demo_sex == ""
sex_code <- match(tolower(df$demo_sex), c("male", "female"), nomatch = 3L) - 1
df$sex <- ifelse(sex_blank, NA, sex_code)

# Verification check: print source and recoded value for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(1819)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  source_val <- df$demo_sex[i]
  target_val <- df$sex[i]
  source_txt <- if (is.na(source_val) || source_val == "") "NA/empty" else source_val
  target_txt <- if (is.na(target_val)) "NA" else target_val
  cat(sprintf("  Row %d: demo_sex = %s -> sex = %s\n", i, source_txt, target_txt))
}

# =============================================================================
# 10. RECODE demo_edu TO education
# =============================================================================

cat("\n=== 9. RECODING demo_edu TO education ===\n\n")

# Check variables exist
check_vars_exist("demo_edu", "education")

# Show the distinct non-missing, non-empty source values
edu_present <- !is.na(df$demo_edu) & df$demo_edu != ""
edu_vals <- unique(df$demo_edu[edu_present])
cat(paste("  Unique values in demo_edu:", paste(edu_vals, collapse = ", "), "\n"))

# Recode: each group name maps to the source labels it collects. The list
# order is also the order of the factor levels (lowest to highest).
edu_groups <- list(
  HS_TS = c("High School (or equivalent)", "Trade School"),
  C_Ug = c("College Diploma/Certificate", "University - Undergraduate"),
  grad_prof = c("University - Graduate (Masters)", "University - PhD",
                "Professional Degree (ex. JD/MD)")
)

# Flatten the groups into two parallel vectors (label -> group code) and look
# each response up by position. NA, empty and unlisted labels find no match
# and therefore become NA.
edu_labels <- unlist(edu_groups, use.names = FALSE)
edu_codes <- rep(names(edu_groups), lengths(edu_groups))
edu_recoded <- edu_codes[match(df$demo_edu, edu_labels)]

# Convert to ordered factor
df$education <- factor(edu_recoded, levels = names(edu_groups), ordered = TRUE)

# Verification check: print source and recoded value for up to 5 random rows
cat("\nVerification (5 random rows):\n")
set.seed(2021)
sample_rows <- sample(seq_len(nrow(df)), min(5, nrow(df)))
for (i in sample_rows) {
  source_val <- df$demo_edu[i]
  target_val <- df$education[i]
  source_txt <- if (is.na(source_val) || source_val == "") "NA/empty" else source_val
  target_txt <- if (is.na(target_val)) "NA" else as.character(target_val)
  cat(sprintf("  Row %d: demo_edu = %s -> education = %s\n", i, source_txt, target_txt))
}

# =============================================================================
# 11. SAVE UPDATED DATA
# =============================================================================
# COMMENTED OUT: Uncomment when ready to save

# write.csv(df, "eohi3.csv", row.names = FALSE, na = "")
# cat("\nUpdated data saved to: eohi3.csv\n")
# cat(paste("Total rows:", nrow(df), "\n"))
# cat(paste("Total columns:", ncol(df), "\n"))