# eohi/eohi2/dataP 05 - recode scales VARS.r

# Script to compute AOT and CRT scales in eohi2.csv
# AOT: Reverse codes items 4-7, then averages all 8 items
# CRT: Calculates proportion of correct and intuitive responses

# Load necessary library
library(dplyr)

setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Load the survey export.
# check.names = FALSE leaves the CSV headers exactly as written;
# na.strings = NULL means no text is treated as NA, so blank cells in text
# columns arrive as "" rather than NA.
df <- read.csv(
  "eohi2.csv",
  na.strings = NULL,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Item columns feeding each scale
aot_cols <- paste0("aot_", 1:8)
crt_cols <- paste0("crt_", 1:3)

# Columns this script computes
target_cols <- c("aot_total", "crt_correct", "crt_int")

# Answer keys, listed in the same order as crt_cols
crt_correct_answers <- c("5 cents", "5 minutes", "47 days")
crt_intuitive_answers <- c("10 cents", "100 minutes", "24 days")

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
# Prints, for each group of columns, how many were expected / found / missing
# and lists the missing names. Aborts when too many columns are absent.

# Print one section of the report.
#   heading         - text printed before the counts
#   missing_heading - text printed before the list of missing names
#   found, missing  - character vectors of present / absent column names
print_column_report <- function(heading, missing_heading, found, missing) {
  cat(heading)
  cat("  Expected:", length(found) + length(missing), "columns\n")
  cat("  Found:", length(found), "columns\n")
  cat("  Missing:", length(missing), "columns\n")

  if (length(missing) > 0) {
    cat(missing_heading)
    cat(paste0("    - ", missing, " \n"), sep = "")
  }

  invisible(NULL)
}

cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Header names with surrounding whitespace stripped, used for matching only
df_cols <- trimws(names(df))

# Split each group of expected columns into present and absent
aot_found <- aot_cols %in% df_cols
missing_aot <- aot_cols[!aot_found]
existing_aot <- aot_cols[aot_found]

crt_found <- crt_cols %in% df_cols
missing_crt <- crt_cols[!crt_found]
existing_crt <- crt_cols[crt_found]

targets_found <- target_cols %in% df_cols
missing_targets <- target_cols[!targets_found]
existing_targets <- target_cols[targets_found]

print_column_report(
  "AOT Source Columns:\n", "\n  Missing AOT columns:\n",
  existing_aot, missing_aot
)
print_column_report(
  "\nCRT Source Columns:\n", "\n  Missing CRT columns:\n",
  existing_crt, missing_crt
)
print_column_report(
  "\nTarget Columns:\n", "\n  Missing target columns:\n",
  existing_targets, missing_targets
)

cat("\n=== END CHECK ===\n\n")

# Abort when more than 4 AOT, more than 1 CRT, or more than 1 target column
# is absent; smaller gaps are tolerated.
too_many_missing <- c(
  length(missing_aot) > 4,
  length(missing_crt) > 1,
  length(missing_targets) > 1
)
if (any(too_many_missing)) {
  stop("ERROR: Too many columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")

# ============= PROCESS AOT SCALE =============
# Computes df$aot_total as the per-row mean of the AOT items, with items 4-7
# reverse coded for the calculation only (the item columns keep their values).
cat("Processing AOT scale...\n")

# AOT item columns actually present in the data. The script tolerates up to
# four missing AOT columns, so this can be shorter than aot_cols.
aot_present <- aot_cols[aot_cols %in% names(df)]

# Convert AOT columns to numeric (values that are not numbers become NA)
for (col in aot_present) {
  df[[col]] <- as.numeric(df[[col]])
}

# Flag the reverse-keyed items (items 4-7 of the full scale) by column NAME.
# Indexing positions 4:7 of the present columns would reverse the wrong items
# as soon as any earlier AOT column is absent.
aot_reversed <- aot_present %in% aot_cols[4:7]

# NOTE(review): reversal is a sign flip, which presumes responses are scored
# on a scale centred on zero - confirm against the codebook.
df$aot_total <- apply(
  as.matrix(df[, aot_present, drop = FALSE]),
  1,
  function(item_values) {
    # Operates on a copy of the row; df itself is not modified here
    item_values[aot_reversed] <- item_values[aot_reversed] * -1

    # A row with no answered items stays missing (NA) instead of becoming the
    # NaN that a mean over zero values would produce
    if (all(is.na(item_values))) {
      return(NA_real_)
    }

    # Mean over the answered items only
    mean(item_values, na.rm = TRUE)
  }
)

cat("  AOT total scores calculated (items 4-7 reverse coded for calculation only).\n")
cat("  Original AOT item values preserved in dataframe.\n\n")

# ============= PROCESS CRT SCALES =============
# Computes, per row, the proportion of answered CRT items whose response
# equals the correct answer (crt_correct) and the proportion that equals the
# intuitive answer (crt_int). Matching ignores case and surrounding
# whitespace. Rows with no answered CRT item get NA in both columns.
cat("Processing CRT scales...\n")

# Per-row tallies, accumulated one CRT item (column) at a time so that every
# step works on a whole column instead of on single cells
n_respondents <- nrow(df)
crt_answered <- integer(n_respondents)
crt_correct_hits <- integer(n_respondents)
crt_int_hits <- integer(n_respondents)

for (j in seq_along(crt_cols)) {
  col <- crt_cols[j]
  if (!col %in% names(df)) {
    next
  }

  response <- trimws(tolower(as.character(df[[col]])))

  # Only non-missing, non-blank responses count towards the denominator
  answered <- !is.na(response) & response != ""

  crt_answered <- crt_answered + answered
  crt_correct_hits <- crt_correct_hits +
    (answered & response == tolower(crt_correct_answers[j]))
  crt_int_hits <- crt_int_hits +
    (answered & response == tolower(crt_intuitive_answers[j]))
}

# Proportions are defined only where at least one item was answered
has_response <- crt_answered > 0

crt_correct <- rep(NA_real_, n_respondents)
crt_correct[has_response] <-
  crt_correct_hits[has_response] / crt_answered[has_response]

crt_int <- rep(NA_real_, n_respondents)
crt_int[has_response] <-
  crt_int_hits[has_response] / crt_answered[has_response]

# Assign full-length vectors so this also works on a data frame with no rows
df$crt_correct <- crt_correct
df$crt_int <- crt_int

cat("  CRT correct and intuitive scores calculated.\n\n")

cat("=== PROCESSING COMPLETE ===\n\n")


# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
# Re-derives aot_total, crt_correct and crt_int for one row directly from the
# item columns and prints them next to the values stored in df.
#
# Reads the globals df, aot_cols, crt_cols, crt_correct_answers and
# crt_intuitive_answers. It only prints; df is not modified.
#
# Arguments:
#   row - row number to check. NULL (the default) picks a row at random, so
#         the function can be run repeatedly to spot-check different rows.
# Returns NULL invisibly.

qa_check_random_row <- function(row = NULL) {
  n_rows <- nrow(df)
  if (n_rows == 0) {
    cat("\nQA CHECK skipped: the data frame has no rows.\n")
    return(invisible(NULL))
  }

  if (is.null(row)) {
    # Pick a random row
    random_row <- sample.int(n_rows, 1)
  } else {
    stopifnot(
      is.numeric(row), length(row) == 1, !is.na(row),
      row >= 1, row <= n_rows, row == round(row)
    )
    random_row <- as.integer(row)
  }

  # Text shown for a single value; missing values are shown as "NA"
  show_value <- function(val) {
    if (is.na(val)) "NA" else as.character(val)
  }

  # Verdict text for a stored-vs-recomputed comparison. isTRUE() makes a
  # comparison involving a missing stored value count as a mismatch.
  match_label <- function(expected, actual) {
    if (isTRUE(abs(expected - actual) < 0.0001)) "YES ✓" else "NO ✗"
  }

  cat("\n========================================\n")
  cat("QA CHECK: Random Row #", random_row, "\n")
  cat("========================================\n\n")

  # AOT Check
  cat("--- AOT SCALE ---\n")
  cat("Source values (original in CSV):\n")
  aot_for_calc <- rep(NA_real_, length(aot_cols))

  for (i in seq_along(aot_cols)) {
    col <- aot_cols[i]
    val <- if (col %in% names(df)) df[random_row, col] else NA

    # Apply reversal for items 4-7
    if (i %in% 4:7) {
      aot_for_calc[i] <- val * -1
      cat(sprintf("  %s: %s (reversed to %s for calculation)\n",
                  col, show_value(val), show_value(val * -1)))
    } else {
      aot_for_calc[i] <- val
      cat(sprintf("  %s: %s\n", col, show_value(val)))
    }
  }

  # Manual calculation check
  valid_aot <- aot_for_calc[!is.na(aot_for_calc)]
  if (length(valid_aot) > 0) {
    expected_mean <- mean(valid_aot)
    actual_value <- df$aot_total[random_row]
    cat("\nCalculation check:\n")
    cat(sprintf("  Sum of reversed values: %s\n", paste(valid_aot, collapse = " + ")))
    cat(sprintf("  Average of %d valid items: %.5f\n", length(valid_aot), expected_mean))
    cat(sprintf("  Target value (aot_total): %.5f\n", actual_value))
    cat(sprintf("  Match: %s\n", match_label(expected_mean, actual_value)))
  } else {
    cat("\n  No valid AOT values to calculate.\n")
  }

  # CRT Check
  cat("\n--- CRT SCALE ---\n")
  cat("Source values:\n")
  crt_correct_count <- 0
  crt_int_count <- 0
  crt_n <- 0

  for (i in seq_along(crt_cols)) {
    col <- crt_cols[i]
    val <- if (col %in% names(df)) as.character(df[random_row, col]) else ""
    val_trimmed <- trimws(tolower(val))

    correct_ans <- crt_correct_answers[i]
    intuitive_ans <- crt_intuitive_answers[i]

    # A missing or blank response is unanswered: it matches neither key and
    # is left out of the denominator
    answered <- !is.na(val_trimmed) && val_trimmed != ""
    is_correct <- answered && val_trimmed == tolower(correct_ans)
    is_intuitive <- answered && val_trimmed == tolower(intuitive_ans)

    if (answered) {
      crt_n <- crt_n + 1
      if (is_correct) crt_correct_count <- crt_correct_count + 1
      if (is_intuitive) crt_int_count <- crt_int_count + 1
    }

    cat(sprintf("  %s: '%s'\n", col, val))
    cat(sprintf("    Correct answer: '%s' -> %s\n", correct_ans,
                if (is_correct) "CORRECT ✓" else "Not correct"))
    cat(sprintf("    Intuitive answer: '%s' -> %s\n", intuitive_ans,
                if (is_intuitive) "INTUITIVE ✓" else "Not intuitive"))
  }

  cat("\nCalculation check:\n")
  if (crt_n > 0) {
    expected_correct <- crt_correct_count / crt_n
    expected_int <- crt_int_count / crt_n
    actual_correct <- df$crt_correct[random_row]
    actual_int <- df$crt_int[random_row]

    cat(sprintf("  Correct: %d out of %d = %.5f\n", crt_correct_count, crt_n, expected_correct))
    cat(sprintf("  Target value (crt_correct): %.5f\n", actual_correct))
    cat(sprintf("  Match: %s\n", match_label(expected_correct, actual_correct)))

    cat(sprintf("\n  Intuitive: %d out of %d = %.5f\n", crt_int_count, crt_n, expected_int))
    cat(sprintf("  Target value (crt_int): %.5f\n", actual_int))
    cat(sprintf("  Match: %s\n", match_label(expected_int, actual_int)))
  } else {
    cat("  No valid CRT responses to calculate.\n")
  }

  cat("\n========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")

  invisible(NULL)
}

# Spot-check one randomly chosen row straight away
cat("\n\n")
qa_check_random_row()

# Tell the user how to repeat the spot check from the console
cat(
  "\n",
  "*** TO CHECK ANOTHER RANDOM ROW ***\n",
  "Run this command in R console:\n",
  "  qa_check_random_row()\n",
  "\n",
  sep = ""
)


# Save the modified dataframe back to CSV.
# This overwrites the input file eohi2.csv in place.
# na="" writes NA values as empty cells instead of "NA" text
write.csv(df, "eohi2.csv", row.names = FALSE, na = "")

# The messages below describe what actually happened: the write above is
# active, so the file has already been saved by the time they are printed.
cat("\n*** RESULTS WRITTEN TO eohi2.csv ***\n")
cat("Review the QA output above to confirm the saved values.\n")
cat("\nProcessing complete! AOT and CRT scales calculated and saved to eohi2.csv.\n")