# eohi/eohi2/dataP 01 - recode and combine past & future vars.r

# Script to combine and recode Likert scale items in eohi2.csv
# Combines 01 and 02 versions of items, then recodes text to numeric values

# Load necessary library
library(dplyr)

# NOTE(review): hard-coded, machine-specific working directory. The relative
# path "eohi2.csv" used for reading (and later for writing) resolves against it.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the data.
# - check.names = FALSE preserves the original column names; the source columns
#   start with digits (e.g. "01past5PrefItem_1"), which check.names would alter.
# - na.strings = NULL disables NA-string matching, so empty cells in text
#   columns are read as "" instead of being converted to NA.
df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)

# Recode 7-point Likert agreement labels to numeric scores.
#
# Args:
#   x: Vector of response labels (coerced to character). Matching ignores
#      letter case and leading/trailing whitespace.
#
# Returns:
#   A double vector the same length as `x`, from -3 ("strongly disagree") to
#   3 ("strongly agree"). Missing, empty, and unrecognised values become NA.
recode_likert <- function(x) {
  likert_labels <- c(
    "strongly disagree",
    "disagree",
    "somewhat disagree",
    "neither agree nor disagree",
    "somewhat agree",
    "agree",
    "strongly agree"
  )
  likert_scores <- c(-3, -2, -1, 0, 1, 2, 3)

  # Trim before matching so that stray whitespace around a label
  # (e.g. "Agree ") is not silently recoded to NA.
  cleaned <- trimws(tolower(x))

  # match() yields NA for anything not in the label set, which indexes to NA
  likert_scores[match(cleaned, likert_labels)]
}

# Define source column pairs (Set A = "01" prefix, Set B = "02" prefix).
# Names follow <prefix><period><domain>Item_<n>, ordered by period
# (past5, past10, fut5, fut10), then domain (Pref, Pers, Val), then item 1-5.
source_cols_A <- paste0(
  "01",
  rep(c("past5", "past10", "fut5", "fut10"), each = 15),
  rep(rep(c("Pref", "Pers", "Val"), each = 5), times = 4),
  "Item_",
  rep(1:5, times = 12)
)

# Set B carries the same names, in the same order, with the "02" prefix
source_cols_B <- sub("^01", "02", source_cols_A)

# Define target column names as <period>_<domain>_<item label>.
# The order mirrors the source vectors: position i of each source vector
# feeds position i of target_cols.
target_cols <- paste(
  rep(c("past_5", "past_10", "fut_5", "fut_10"), each = 15),
  rep(
    c(
      "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
      "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious",
      "pers_complex",
      "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
    ),
    times = 4
  ),
  sep = "_"
)

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Whitespace-trimmed column names actually present in the data frame
df_cols <- trimws(names(df))

# Show the leading column names (at most 30) with their lengths for debugging
cat("First 30 actual column names in CSV:\n")
preview_idx <- seq_len(min(30, length(df_cols)))
cat(
  sprintf(
    "  %2d. '%s' (length: %d)\n",
    preview_idx, df_cols[preview_idx], nchar(df_cols[preview_idx])
  ),
  sep = ""
)
cat("\n")

# Split one set of expected names into present/absent and print its summary.
# `lead_in` is emitted before the heading (Set B is preceded by a blank line).
# Returns list(missing = <absent names>, existing = <present names>).
report_source_set <- function(set_label, wanted, available, lead_in = "") {
  is_present <- wanted %in% available
  absent <- wanted[!is_present]

  cat(lead_in, "Source Set ", set_label, ":\n", sep = "")
  cat("  Expected: 60 columns\n")
  cat(sprintf("  Found: %d columns\n", sum(is_present)))
  cat(sprintf("  Missing: %d columns\n", length(absent)))

  if (length(absent) > 0) {
    cat("\n  Missing columns from Set ", set_label, ":\n", sep = "")
    cat(sprintf("    - %s \n", absent), sep = "")
  }

  list(missing = absent, existing = wanted[is_present])
}

# Check Source A columns
set_A_status <- report_source_set("A", source_cols_A, df_cols)
missing_A <- set_A_status[["missing"]]
existing_A <- set_A_status[["existing"]]

# Check Source B columns
set_B_status <- report_source_set("B", source_cols_B, df_cols, lead_in = "\n")
missing_B <- set_B_status[["missing"]]
existing_B <- set_B_status[["existing"]]

# Check for columns with similar names (potential typos/spaces)
all_missing <- c(missing_A, missing_B)
if (length(all_missing) > 0) {
  cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n")
  for (miss_col in all_missing) {
    # Candidates: actual columns containing the first 10 characters of the
    # missing name (case-insensitive)
    similar <- grep(
      substr(miss_col, 1, 10), df_cols,
      value = TRUE, ignore.case = TRUE
    )
    if (length(similar) == 0) {
      next
    }
    cat("\nLooking for:", miss_col)
    cat("\n  Similar columns found:\n")
    cat(sprintf("    - '%s' (length:%d)\n", similar, nchar(similar)), sep = "")
  }
}

cat("\n=== END CHECK ===\n\n")

# Stop if more than 30 columns of either set are missing
if (max(length(missing_A), length(missing_B)) > 30) {
  stop("ERROR: Too many columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")

# Process each pair of columns
#
# For every target column: take the Set A response where it is non-empty,
# otherwise the Set B response, then recode the label to its numeric score.
# A source column that is absent from the data is treated as all-missing.

# The three name vectors are paired by position, so they must align
stopifnot(
  length(source_cols_A) == length(target_cols),
  length(source_cols_B) == length(target_cols)
)

# Resolve source columns against whitespace-trimmed names, the same
# normalisation the existence check uses. Otherwise a header with stray
# whitespace is reported as found but then silently replaced by NA here.
trimmed_names <- trimws(names(df))
idx_A <- match(source_cols_A, trimmed_names)
idx_B <- match(source_cols_B, trimmed_names)
n_rows <- nrow(df)

for (i in seq_along(target_cols)) {
  target_col <- target_cols[i]

  # Get values by column position, handling missing columns
  vals_A <- if (is.na(idx_A[i])) rep(NA_character_, n_rows) else df[[idx_A[i]]]
  vals_B <- if (is.na(idx_B[i])) rep(NA_character_, n_rows) else df[[idx_B[i]]]

  # Coalesce: take value from vals_A if present, otherwise from vals_B
  use_A <- !is.na(vals_A) & vals_A != ""
  combined <- ifelse(use_A, vals_A, vals_B)

  # Recode to numeric
  df[[target_col]] <- recode_likert(combined)

  # Print progress
  cat("Processed:", target_col, "\n")
}

# ============= VERIFY TARGET COLUMNS WERE CREATED =============
cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n")

# Column names after processing (whitespace-trimmed)
df_cols_after <- trimws(names(df))

# Partition the target names by whether they are now present
target_present <- target_cols %in% df_cols_after
existing_targets <- target_cols[target_present]
missing_targets <- target_cols[!target_present]

cat("Target Columns:\n")
cat("  Expected: 60 columns\n")
cat(sprintf("  Created: %d columns\n", length(existing_targets)))
cat(sprintf("  Missing: %d columns\n", length(missing_targets)))

if (length(missing_targets) == 0) {
  cat("\n  SUCCESS: All 60 target columns created successfully!\n")
} else {
  # List every target that is absent, then abort the script
  cat("\n  WARNING: The following target columns were NOT created:\n")
  cat(sprintf("    - %s \n", missing_targets), sep = "")
  stop("\nERROR: Not all target columns were created successfully!")
}

cat("\n=== END VERIFICATION ===\n\n")


# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
# This function can be run multiple times to check different random rows

# Print, for one randomly chosen row, each source pair's raw text, which
# source supplied it, and the recoded numeric value stored in the target.
#
# Reads the globals `df`, `source_cols_A`, `source_cols_B` and `target_cols`.
# Produces console output only and returns NULL invisibly.
qa_check_random_row <- function() {
  n_rows <- nrow(df)

  # Nothing to sample from: 1:nrow(df) would be c(1, 0) on an empty data frame
  if (n_rows == 0) {
    cat("\nQA CHECK skipped: data frame has no rows\n\n")
    return(invisible(NULL))
  }

  # Pick a random row
  random_row <- sample.int(n_rows, 1)

  # Resolve columns against whitespace-trimmed names, matching how the
  # column existence check identifies them
  trimmed_names <- trimws(names(df))

  # Raw cell text of `col` in the chosen row, or "" when the column is absent
  source_text <- function(col) {
    idx <- match(col, trimmed_names)
    if (is.na(idx)) "" else df[random_row, idx]
  }

  cat("\n========================================\n")
  cat("QA CHECK: Random Row #", random_row, "\n")
  cat("========================================\n\n")

  # Check each source/target pair
  for (i in seq_along(target_cols)) {
    col_A <- source_cols_A[i]
    col_B <- source_cols_B[i]
    target_col <- target_cols[i]

    # Get values
    val_A <- source_text(col_A)
    val_B <- source_text(col_B)
    target_val <- df[random_row, target_col]

    # Determine which source had the value (Set A takes precedence)
    has_val_A <- !is.na(val_A) && val_A != ""
    has_val_B <- !is.na(val_B) && val_B != ""

    if (has_val_A) {
      source_used <- "A"
      original_text <- val_A
    } else if (has_val_B) {
      source_used <- "B"
      original_text <- val_B
    } else {
      source_used <- "NONE"
      original_text <- "(empty)"
    }

    numeric_text <- if (is.na(target_val)) "NA" else as.character(target_val)

    # Print the info
    cat(sprintf("Pair %2d:\n", i))
    cat(sprintf("  Source A: %-30s\n", col_A))
    cat(sprintf("  Source B: %-30s\n", col_B))
    cat(sprintf("  Target:   %-30s\n", target_col))
    cat(sprintf("  Value found in: Source %s\n", source_used))
    cat(sprintf("  Original text:  '%s'\n", original_text))
    cat(sprintf("  Numeric value:  %s\n", numeric_text))
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")

  invisible(NULL)
}

# Run QA check on first random row
cat("\n\n")
qa_check_random_row()

# Instructions for running additional checks
writeLines(c(
  "",
  "*** TO CHECK ANOTHER RANDOM ROW ***",
  "Run this command in R console:",
  "  qa_check_random_row()",
  ""
))


# Save the modified dataframe back to CSV. This writes to "eohi2.csv", the
# same file name the data was read from, so the input file is overwritten.
# na = "" writes NA values as empty cells instead of "NA" text
write.csv(df, file = "eohi2.csv", na = "", row.names = FALSE)

cat("\nProcessing complete! 60 new columns added to eohi2.csv\n")