# eohi/.history/eohi2/dataP - time interval differences_20251001130503.r

# Script to calculate absolute differences between time intervals in eohi2.csv
# Compares present vs past/future, and 5-year vs 10-year intervals

# Load necessary library
library(dplyr)

# All file names below are resolved relative to the project data folder.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Load the survey export.
#   check.names = FALSE  -> column headers are kept exactly as written in the CSV
#   na.strings  = NULL   -> intended to keep empty cells as empty strings
#                           rather than turning them into NA on read
df <- read.csv(
  file = "eohi2.csv",
  na.strings = NULL,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# The 15 item names, in the order shared by every time period
items <- c(
  # preferences
  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
  # personality
  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious",
  "pers_complex",
  # values
  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
)

# The "present" columns spell the TV item in lowercase ("pref_tv"); every
# other period uses "pref_TV". All remaining item names are identical.
items_present <- replace(items, items == "pref_TV", "pref_tv")

# All source columns (75 total): present first, then past 5/10, then future 5/10
source_cols <- c(
  paste0("present_", items_present),
  unlist(lapply(
    c("past_5_", "past_10_", "fut_5_", "fut_10_"),
    function(prefix) paste0(prefix, items)
  ))
)

# All target columns (90 total = 6 calculation types x 15 items)
target_NPast_5 <- sprintf("NPast_5_%s", items)
target_NPast_10 <- sprintf("NPast_10_%s", items)
target_NFut_5 <- sprintf("NFut_5_%s", items)
target_NFut_10 <- sprintf("NFut_10_%s", items)
target_5_10past <- sprintf("5.10past_%s", items)
target_5_10fut <- sprintf("5.10fut_%s", items)

target_cols <- c(
  target_NPast_5, target_NPast_10,
  target_NFut_5, target_NFut_10,
  target_5_10past, target_5_10fut
)

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
# Reports how many of the expected source columns are present in `df` and
# aborts when more than 30 of them are missing.
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Trimmed header names; used only to spot headers that differ from an expected
# name by leading/trailing whitespace.
df_cols <- trimws(names(df))

# Check Source columns against the EXACT header names. The rest of this script
# looks columns up through names(df), so a header that matches only after
# trimming would be skipped there and must be counted as missing here.
is_present <- source_cols %in% names(df)
missing_source <- source_cols[!is_present]
existing_source <- source_cols[is_present]
padded_source <- missing_source[missing_source %in% df_cols]
n_missing <- length(missing_source)

cat("Source Columns:\n")
cat("  Expected:", length(source_cols), "columns\n")
cat("  Found:", length(existing_source), "columns\n")
cat("  Missing:", n_missing, "columns\n")

if (n_missing > 0 && n_missing <= 10) {
  cat("\n  Missing source columns:\n")
  for (col in missing_source) {
    cat("    -", col, "\n")
  }
} else if (n_missing > 10) {
  cat("\n  Too many missing to list individually (", n_missing, "missing)\n")
}

# Point out the likely cause when a "missing" column is really a header with
# stray whitespace around it.
if (length(padded_source) > 0) {
  cat("\n  NOTE:", length(padded_source),
      "missing column(s) appear in the CSV header with surrounding whitespace:\n")
  for (col in padded_source) {
    cat("    -", col, "\n")
  }
}

cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing
if (n_missing > 30) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")

# ============= CALCULATE DIFFERENCES =============
cat("Calculating time interval differences...\n")

# Coerce every source column that is actually present in the data to numeric;
# expected columns that are absent are simply skipped.
for (src in intersect(source_cols, names(df))) {
  df[[src]] <- as.numeric(df[[src]])
}

# Helper function to calculate the element-wise absolute difference between
# two columns of a data frame.
#
# Args:
#   col1, col2: Column names (character scalars).
#   data:       Data frame to read from; defaults to the script-level `df`.
#
# Returns:
#   A numeric vector with one value per row of `data`. A column that does not
#   exist is treated as all-NA, so the result is always nrow(data) long (the
#   previous length-1 NA could not be assigned into a zero-row data frame and
#   produced a non-numeric column when both inputs were absent).
calc_abs_diff <- function(col1, col2, data = df) {
  column_or_na <- function(col) {
    if (col %in% names(data)) data[[col]] else rep(NA_real_, nrow(data))
  }
  abs(column_or_na(col1) - column_or_na(col2))
}

# Each entry describes one family of difference columns:
#   label   - name used in the progress message
#   note    - human-readable description of the comparison
#   targets - names of the columns to create
#   lhs/rhs - names of the two source columns compared, item by item
# The "present" side uses items_present because its TV item is spelled
# differently from the other periods.
present_sources <- paste0("present_", items_present)
diff_specs <- list(
  # NPast_5: |present - past_5|
  list(label = "NPast_5", note = "present vs past 5 years",
       targets = target_NPast_5,
       lhs = present_sources, rhs = paste0("past_5_", items)),
  # NPast_10: |present - past_10|
  list(label = "NPast_10", note = "present vs past 10 years",
       targets = target_NPast_10,
       lhs = present_sources, rhs = paste0("past_10_", items)),
  # NFut_5: |present - fut_5|
  list(label = "NFut_5", note = "present vs future 5 years",
       targets = target_NFut_5,
       lhs = present_sources, rhs = paste0("fut_5_", items)),
  # NFut_10: |present - fut_10|
  list(label = "NFut_10", note = "present vs future 10 years",
       targets = target_NFut_10,
       lhs = present_sources, rhs = paste0("fut_10_", items)),
  # 5.10past: |past_5 - past_10|
  list(label = "5.10past", note = "past 5 vs past 10 years",
       targets = target_5_10past,
       lhs = paste0("past_5_", items), rhs = paste0("past_10_", items)),
  # 5.10fut: |fut_5 - fut_10|
  list(label = "5.10fut", note = "future 5 vs future 10 years",
       targets = target_5_10fut,
       lhs = paste0("fut_5_", items), rhs = paste0("fut_10_", items))
)

# Create the difference columns family by family. Loop bounds follow the
# length of each target vector instead of a hard-coded 15, so the item list
# can change without touching this code.
n_created <- 0L
for (spec in diff_specs) {
  cat("  Calculating ", spec$label, " differences (", spec$note, ")...\n",
      sep = "")
  for (i in seq_along(spec$targets)) {
    df[[spec$targets[i]]] <- calc_abs_diff(spec$lhs[i], spec$rhs[i])
  }
  n_created <- n_created + length(spec$targets)
}

cat("\n=== CALCULATION COMPLETE ===\n")
cat("  ", n_created, " difference columns created.\n\n", sep = "")


# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
# This function can be run multiple times to check different random rows

# Print a QA report for one row of `df`: for a single item, recompute each of
# the six absolute differences from the source columns and compare the result
# with the value stored in the corresponding target column.
#
# Args:
#   row:      Row index to inspect. NULL (the default) picks one at random.
#   item_idx: Position in `items` of the item to verify (default 1).
#
# Reads the script-level objects df, items, items_present and the target_*
# name vectors. Called for its printed output; returns NULL invisibly.
qa_check_random_row <- function(row = NULL, item_idx = 1L) {
  n_rows <- nrow(df)
  if (n_rows < 1L) {
    # 1:nrow(df) would be c(1, 0) here and sample a non-existent row.
    cat("QA CHECK skipped: the data frame has no rows.\n")
    return(invisible(NULL))
  }
  stopifnot(
    length(item_idx) == 1L, item_idx >= 1, item_idx <= length(items)
  )

  # Pick a random row unless the caller asked for a specific one
  if (is.null(row)) {
    random_row <- sample.int(n_rows, 1L)
  } else {
    stopifnot(length(row) == 1L, row >= 1, row <= n_rows)
    random_row <- row
  }

  cat("\n========================================\n")
  cat("QA CHECK: Random Row #", random_row, "\n")
  cat("========================================\n\n")

  # Sample one calculation from each type for the chosen item
  test_item_idx <- item_idx
  item_name <- items[test_item_idx]
  present_col <- paste0("present_", items_present[test_item_idx])

  calculations <- list(
    list(name = "NPast_5", target = target_NPast_5[test_item_idx],
         source1 = present_col,
         source2 = paste0("past_5_", item_name),
         desc = "|present - past_5|"),
    list(name = "NPast_10", target = target_NPast_10[test_item_idx],
         source1 = present_col,
         source2 = paste0("past_10_", item_name),
         desc = "|present - past_10|"),
    list(name = "NFut_5", target = target_NFut_5[test_item_idx],
         source1 = present_col,
         source2 = paste0("fut_5_", item_name),
         desc = "|present - fut_5|"),
    list(name = "NFut_10", target = target_NFut_10[test_item_idx],
         source1 = present_col,
         source2 = paste0("fut_10_", item_name),
         desc = "|present - fut_10|"),
    list(name = "5.10past", target = target_5_10past[test_item_idx],
         source1 = paste0("past_5_", item_name),
         source2 = paste0("past_10_", item_name),
         desc = "|past_5 - past_10|"),
    list(name = "5.10fut", target = target_5_10fut[test_item_idx],
         source1 = paste0("fut_5_", item_name),
         source2 = paste0("fut_10_", item_name),
         desc = "|fut_5 - fut_10|")
  )

  # Value of `col` in the inspected row, or NA when the column does not exist
  cell <- function(col) {
    if (col %in% names(df)) df[[col]][random_row] else NA
  }
  # Text shown for a single value
  show <- function(value) {
    if (is.na(value)) "NA" else as.character(value)
  }

  cat("Checking sample item: ", item_name, "\n\n", sep = "")

  for (calc in calculations) {
    cat(sprintf("--- %s ---\n", calc$name))
    cat(sprintf("Formula: %s\n", calc$desc))

    val1 <- cell(calc$source1)
    val2 <- cell(calc$source2)
    target_val <- cell(calc$target)

    cat(sprintf("  %s: %s\n", calc$source1, show(val1)))
    cat(sprintf("  %s: %s\n", calc$source2, show(val2)))

    if (!is.na(val1) && !is.na(val2)) {
      expected_diff <- abs(val1 - val2)
      # isTRUE() makes a missing stored value count as a mismatch instead of
      # printing "NA" for the verdict.
      matches <- isTRUE(abs(expected_diff - target_val) < 0.0001)
      cat(sprintf("\n  Calculation: |%.5f - %.5f| = %.5f\n",
                  val1, val2, expected_diff))
      cat(sprintf("  Target (%s): %.5f\n", calc$target, target_val))
      cat(sprintf("  Match: %s\n", if (matches) "YES ✓" else "NO ✗"))
    } else {
      cat("  Cannot calculate (missing values)\n")
    }
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")
  invisible(NULL)
}

# Run QA check on first random row
cat("\n\n")
qa_check_random_row()

# Instructions for running additional checks
cat("\n")
cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
cat("Run this command in R console:\n")
cat("  qa_check_random_row()\n")
cat("\n")


# Save the modified dataframe back to CSV
# na="" writes NA values as empty cells instead of "NA" text
# NOTE: the target is the same "eohi2.csv" that was read at the top of this
# script, so uncommenting the call overwrites the input file.
# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")

cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
# Refer to the call itself rather than a line number, which goes stale as soon
# as the script is edited.
cat("Review the output above, then uncomment the write.csv() call at the end of this script to save changes.\n")
cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n")