# Script to calculate DGEN means by time period in eohi2.csv
# Averages the 3 domain scores (Pref, Pers, Val) for each time period.
#
# Steps:
#   1. Read eohi2.csv and report which source/target columns are present.
#   2. Convert the source columns to numeric and write one row-mean column per
#      time period (missing values are ignored within a row).
#   3. Print a QA report for one random row so the result can be eyeballed.
# The final write back to disk is intentionally commented out.

# Load necessary library
library(dplyr)

# NOTE(review): hard-coded, machine-specific working directory. Kept so the
# script behaves as before; consider running from the project root instead.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the data (with check.names=FALSE to preserve original column names)
# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
df <- read.csv(
  "eohi2.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = NULL
)

# Define groupings: each target gets 3 source columns
source_groups <- list(
  DGEN_past_5_mean = c(
    "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val"
  ),
  DGEN_past_10_mean = c(
    "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val"
  ),
  DGEN_fut_5_mean = c(
    "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val"
  ),
  DGEN_fut_10_mean = c(
    "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val"
  )
)

# Source columns (12 total) and target columns (4 total) are derived from the
# groupings so the three definitions can never drift out of sync.
source_cols <- unlist(source_groups, use.names = FALSE)
target_cols <- names(source_groups)

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============

cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Column names are matched after trimming stray whitespace. The trimmed names
# are applied to the data frame itself so that the existence check and the
# processing below look at the same names (previously a header with stray
# whitespace passed the check but was then silently skipped).
trimmed_names <- trimws(names(df))
if (anyDuplicated(trimmed_names) == 0L) {
  names(df) <- trimmed_names
} else {
  warning(
    "Column names are not unique after trimming whitespace; ",
    "leaving the names untouched.",
    call. = FALSE
  )
}
df_cols <- names(df)

# Check Source columns
missing_source <- setdiff(source_cols, df_cols)
existing_source <- intersect(source_cols, df_cols)

cat("Source Columns:\n")
cat(" Expected: 12 columns\n")
cat(" Found:", length(existing_source), "columns\n")
cat(" Missing:", length(missing_source), "columns\n")

if (length(missing_source) > 0) {
  cat("\n Missing source columns:\n")
  for (col in missing_source) {
    cat(" -", col, "\n")
  }
}

# Check Target columns
missing_targets <- setdiff(target_cols, df_cols)
existing_targets <- intersect(target_cols, df_cols)

cat("\nTarget Columns:\n")
cat(" Expected: 4 columns\n")
cat(" Found:", length(existing_targets), "columns\n")
cat(" Missing:", length(missing_targets), "columns\n")

if (length(missing_targets) > 0) {
  cat("\n Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
  }
} else {
  cat(" All target columns exist - will overwrite with calculated values.\n")
}

cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing.
# NOTE(review): up to 6 missing source columns are tolerated; a target is then
# averaged over whichever of its source columns do exist.
if (length(missing_source) > 6) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")

# ============= CALCULATE MEANS =============

cat("Calculating DGEN means by time period...\n")

# Convert source columns to numeric
for (col in intersect(source_cols, names(df))) {
  df[[col]] <- as.numeric(df[[col]])
}

# Calculate each target as the mean of its 3 source columns
for (target in target_cols) {
  # Only the columns of this group that are actually present
  existing_cols <- intersect(source_groups[[target]], names(df))

  if (length(existing_cols) > 0) {
    # Row means across the domain columns, ignoring missing values
    group_means <- rowMeans(df[, existing_cols, drop = FALSE], na.rm = TRUE)
    # rowMeans() yields NaN when every value in a row is missing; store NA so
    # write.csv(..., na = "") emits an empty cell rather than the text "NaN"
    group_means[is.nan(group_means)] <- NA_real_
    df[[target]] <- group_means
    cat(" Processed:", target, "\n")
  } else {
    cat(" WARNING: No source columns found for", target, "\n")
  }
}

cat("\n=== CALCULATION COMPLETE ===\n\n")

# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============

# Print a QA report for one row: the source values of every group, the mean
# recomputed from them, the stored target value, and whether the two agree
# (absolute difference below 0.0001).
#
# This function can be run multiple times to check different random rows.
#
# Arguments (all optional; the defaults pick up the script's globals at call
# time, so qa_check_random_row() keeps working exactly as before):
#   data    data frame holding the source and target columns
#   targets character vector of target column names to check
#   groups  named list mapping each target to its source column names
#   row     row index to check; NULL (default) picks one at random
#
# Returns the checked row index invisibly (NULL if `data` has no rows).
qa_check_random_row <- function(data = df,
                                targets = target_cols,
                                groups = source_groups,
                                row = NULL) {
  n_rows <- nrow(data)
  if (n_rows == 0L) {
    # sample(1:0, 1) would draw from c(1, 0); bail out explicitly instead
    cat("QA CHECK skipped: data has no rows.\n")
    return(invisible(NULL))
  }

  if (is.null(row)) {
    random_row <- sample.int(n_rows, 1L)
  } else {
    stopifnot(
      is.numeric(row), length(row) == 1L, !is.na(row),
      row >= 1, row <= n_rows
    )
    random_row <- as.integer(row)
  }

  cat("\n========================================\n")
  cat("QA CHECK: Random Row #", random_row, "\n")
  cat("========================================\n\n")

  # Check each of the target columns
  for (target in targets) {
    source_group <- groups[[target]]

    cat(sprintf("Target: %s\n", target))
    cat(" Source columns:\n")

    # Get values from source columns (NA when a column is absent)
    values <- rep(NA_real_, length(source_group))
    for (i in seq_along(source_group)) {
      col <- source_group[i]
      if (col %in% names(data)) {
        values[i] <- as.numeric(data[[col]][random_row])
      }
      shown <- if (is.na(values[i])) "NA" else as.character(values[i])
      cat(sprintf(" %s: %s\n", col, shown))
    }

    # Calculate expected mean
    valid_values <- values[!is.na(values)]
    if (length(valid_values) == 0) {
      cat(" No valid values to calculate mean.\n")
    } else if (!target %in% names(data)) {
      # The target is never created when none of its source columns exist
      cat(" Target column not found in data.\n")
    } else {
      expected_mean <- mean(valid_values)
      actual_value <- as.numeric(data[[target]][random_row])
      # isTRUE() makes a missing target value count as a mismatch
      is_match <- isTRUE(abs(expected_mean - actual_value) < 0.0001)

      cat(sprintf("\n Calculation:\n"))
      cat(sprintf(
        " Sum: %s = %.5f\n",
        paste(valid_values, collapse = " + "),
        sum(valid_values)
      ))
      cat(sprintf(
        " Average of %d values: %.5f\n",
        length(valid_values),
        expected_mean
      ))
      cat(sprintf(" Target value: %.5f\n", actual_value))
      cat(sprintf(" Match: %s\n", if (is_match) "YES ✓" else "NO ✗"))
    }
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")

  invisible(random_row)
}

# Run QA check on first random row
cat("\n\n")
qa_check_random_row()

# Instructions for running additional checks
cat("\n")
cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
cat("Run this command in R console:\n")
cat(" qa_check_random_row()\n")
cat("\n")

# Save the modified dataframe back to CSV
# na="" writes NA values as empty cells instead of "NA" text
# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")

cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
cat("Review the output above, then uncomment the write.csv() line to save changes.\n")
cat("\nProcessing complete! 4 DGEN mean columns calculated (not yet saved to file).\n")