# eohi/eohi2/dataP 04 - DGEN means.r

# Script to calculate DGEN means by time period in eohi2.csv
# Averages the 3 domain scores (Pref, Pers, Val) for each time period

# Load necessary library
library(dplyr)

# NOTE(review): hard-coded, machine-specific working directory; the relative
# file name passed to read.csv() below is resolved against it.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Load the dataset.
#   check.names = FALSE  keeps the CSV header names exactly as written.
#   na.strings = NULL    is intended to keep empty cells as empty strings
#                        instead of converting them to NA.
df <- read.csv(
  "eohi2.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = NULL
)

# Define groupings: each target (mean) column is paired with the 3 domain
# columns (Pref, Pers, Val) it averages. This list is the single place where
# column names are spelled out; the two vectors below are derived from it and
# follow the list order.
source_groups <- list(
  DGEN_past_5_mean = c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val"),
  DGEN_past_10_mean = c("DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val"),
  DGEN_fut_5_mean = c("DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val"),
  DGEN_fut_10_mean = c("DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val")
)

# Target columns (4 total): the group names
target_cols <- names(source_groups)

# Source columns (12 total): every grouped column, flattened without names
source_cols <- unlist(source_groups, use.names = FALSE)

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
# Reports which source and target columns are present in `df` before any
# calculation is attempted, and stops the script when more than half of the
# source columns are absent.
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Exact column names as stored in the data frame. The later processing steps
# look columns up by exact name, so presence is judged on these; judging it on
# trimmed names would report columns as found that are then skipped.
exact_cols <- names(df)

# Trimmed names are kept only to diagnose headers with stray whitespace.
df_cols <- trimws(exact_cols)

# Check Source columns
missing_source <- source_cols[!source_cols %in% exact_cols]
existing_source <- source_cols[source_cols %in% exact_cols]

# Missing columns that would match if the header whitespace were removed
whitespace_source <- missing_source[missing_source %in% df_cols]

cat("Source Columns:\n")
cat("  Expected:", length(source_cols), "columns\n")
cat("  Found:", length(existing_source), "columns\n")
cat("  Missing:", length(missing_source), "columns\n")

if (length(missing_source) > 0) {
  cat("\n  Missing source columns:\n")
  for (col in missing_source) {
    cat("    -", col, "\n")
  }
}

if (length(whitespace_source) > 0) {
  cat("\n  NOTE: These columns match only after trimming whitespace from the\n")
  cat("  CSV header, so they are treated as missing:\n")
  for (col in whitespace_source) {
    cat("    -", col, "\n")
  }
}

# Check Target columns
missing_targets <- target_cols[!target_cols %in% exact_cols]
existing_targets <- target_cols[target_cols %in% exact_cols]

cat("\nTarget Columns:\n")
cat("  Expected:", length(target_cols), "columns\n")
cat("  Found:", length(existing_targets), "columns\n")
cat("  Missing:", length(missing_targets), "columns\n")

if (length(missing_targets) > 0) {
  cat("\n  Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat("  WARNING: Some target columns already exist and will be overwritten.\n")
  }
} else {
  cat("  All target columns exist - will overwrite with calculated values.\n")
}

cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing (more than half of the source columns)
if (length(missing_source) > length(source_cols) / 2) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
} else if (length(missing_source) > 0) {
  # Processing continues, so make the consequence of the gap explicit.
  cat("WARNING: Some source columns are missing; affected means will be based on the remaining columns.\n\n")
}

cat("Proceeding with processing...\n\n")

# ============= CALCULATE MEANS =============
# Adds (or overwrites) one mean column per entry of `target_cols`, computed
# row-wise from whichever of its source columns are present in `df`.
cat("Calculating DGEN means by time period...\n")

# Convert source columns to numeric. Values that cannot be parsed as numbers
# become NA (as.numeric() warns when that happens for non-empty text).
for (col in source_cols) {
  if (col %in% names(df)) {
    df[[col]] <- as.numeric(df[[col]])
  }
}

# Calculate each target as the mean of its source columns
for (target in target_cols) {
  source_group <- source_groups[[target]]

  # Get the columns that exist
  existing_cols <- source_group[source_group %in% names(df)]

  if (length(existing_cols) == 0) {
    cat("  WARNING: No source columns found for", target, "\n")
    next
  }

  # A mean over fewer domains than intended should not pass silently.
  if (length(existing_cols) < length(source_group)) {
    cat(
      "  WARNING:", target, "is based on only", length(existing_cols),
      "of", length(source_group), "source columns\n"
    )
  }

  # Row means across the available domain columns, ignoring missing values
  row_means <- rowMeans(df[, existing_cols, drop = FALSE], na.rm = TRUE)

  # rowMeans() returns NaN for a row with no non-missing value; record such
  # rows as missing (NA) rather than as a not-a-number result.
  row_means[is.nan(row_means)] <- NA_real_

  df[[target]] <- row_means
  cat("  Processed:", target, "\n")
}

cat("\n=== CALCULATION COMPLETE ===\n\n")


# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
# This function can be run multiple times to check different random rows

# Prints a worked recomputation of every target mean for one randomly chosen
# row of `df` and states whether it matches the stored target value.
#
# Reads the script-level objects `df`, `target_cols` and `source_groups`;
# takes no arguments and returns NULL invisibly. Output goes to the console
# via cat(). Does nothing beyond printing a notice when `df` has no rows.
qa_check_random_row <- function() {
  n_rows <- nrow(df)
  if (n_rows < 1) {
    # 1:0 would be c(1, 0), so there is no valid row to sample from.
    cat("\nQA CHECK skipped: the data frame has no rows.\n\n")
    return(invisible(NULL))
  }

  # Pick a random row
  random_row <- sample.int(n_rows, 1)

  cat("\n========================================\n")
  cat("QA CHECK: Random Row #", random_row, "\n")
  cat("========================================\n\n")

  # Check each of the target columns
  for (target in target_cols) {
    source_group <- source_groups[[target]]

    cat(sprintf("Target: %s\n", target))
    cat("  Source columns:\n")

    # Get values from source columns (NA for a column absent from df)
    values <- numeric(length(source_group))
    for (i in seq_along(source_group)) {
      col <- source_group[i]
      val <- if (col %in% names(df)) df[random_row, col] else NA
      values[i] <- val
      shown <- if (is.na(val)) "NA" else as.character(val)
      cat(sprintf("    %s: %s\n", col, shown))
    }

    # Calculate expected mean
    valid_values <- values[!is.na(values)]
    if (length(valid_values) > 0) {
      expected_mean <- mean(valid_values)
      # A target column that was never created counts as a missing value.
      actual_value <- if (target %in% names(df)) df[random_row, target] else NA_real_

      # isTRUE() makes a missing target value a mismatch instead of "NA".
      is_match <- isTRUE(abs(expected_mean - actual_value) < 0.0001)

      cat(sprintf("\n  Calculation:\n"))
      cat(sprintf("    Sum: %s = %.5f\n", paste(valid_values, collapse = " + "), sum(valid_values)))
      cat(sprintf("    Average of %d values: %.5f\n", length(valid_values), expected_mean))
      cat(sprintf("    Target value: %.5f\n", actual_value))
      cat(sprintf("    Match: %s\n", if (is_match) "YES ✓" else "NO ✗"))
    } else {
      cat("    No valid values to calculate mean.\n")
    }
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")

  invisible(NULL)
}

# Run QA check on first random row
cat("\n\n")
qa_check_random_row()

# Instructions for running additional checks
cat("\n")
cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
cat("Run this command in R console:\n")
cat("  qa_check_random_row()\n")
cat("\n")


# Save the modified dataframe back to CSV
# na="" writes NA values as empty cells instead of "NA" text
# NOTE: the target file is the same "eohi2.csv" that was read, so saving
# overwrites the input in place.
# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")

# The message names the call rather than a line number, which goes stale as
# soon as the script is edited.
cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
cat("Review the output above, then uncomment the write.csv() line to save changes.\n")
cat("\nProcessing complete! 4 DGEN mean columns calculated (not yet saved to file).\n")