# Script to combine DGEN variables in eohi2.csv # Combines 01 and 02 versions of DGEN items (no recoding, just copying values) # Load necessary library library(dplyr) setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") # Read the data (with check.names=FALSE to preserve original column names) # na.strings=NULL keeps empty cells as empty strings instead of converting to NA df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) # Define source column pairs (Set A and Set B) # NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header source_cols_A <- c( "01past5PrefDGEN_1", "01past5PersDGEN_1", "01past5ValDGEN_1", "01past10PrefDGEN_1", "01past10PersDGEN_1", "01past10ValDGEN_1", "01fut5PrefDGEN_8", "01fut5PersDGEN_8", "01fut5ValuesDGEN_1", "01fut10PrefDGEN_8", "01fut10PersDGEN_8", "01fut10ValuesDGEN_1" ) source_cols_B <- c( "02past5PrefDGEN_1", "02past5PersDGEN_1", "02past5ValDGEN_1", "02past10PrefDGEN_1", "02past10PersDGEN_1", "02past10ValDGEN_1", "02fut5PrefDGEN_8", "02fut5PersDGEN_8", "02fut5ValDGEN_1", "02fut10PrefDGEN_8", "02fut10PersDGEN_8", "02fut10ValDGEN_1" ) # Define target column names target_cols <- c( "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val" ) # ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") # Get actual column names from dataframe (trimmed) df_cols <- trimws(names(df)) # Print first 30 actual column names for debugging cat("First 30 actual column names in CSV:\n") for (i in 1:min(30, length(df_cols))) { cat(sprintf(" %2d. '%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) } cat("\n") # Check Source A columns missing_A <- source_cols_A[!source_cols_A %in% df_cols] existing_A <- source_cols_A[source_cols_A %in% df_cols] cat("Source Set A:\n") cat(" Expected: 12 columns\n") cat(" Found:", length(existing_A), "columns\n") cat(" Missing:", length(missing_A), "columns\n") if (length(missing_A) > 0) { cat("\n Missing columns from Set A:\n") for (col in missing_A) { cat(" -", col, "\n") } } # Check Source B columns missing_B <- source_cols_B[!source_cols_B %in% df_cols] existing_B <- source_cols_B[source_cols_B %in% df_cols] cat("\nSource Set B:\n") cat(" Expected: 12 columns\n") cat(" Found:", length(existing_B), "columns\n") cat(" Missing:", length(missing_B), "columns\n") if (length(missing_B) > 0) { cat("\n Missing columns from Set B:\n") for (col in missing_B) { cat(" -", col, "\n") } } # Check for columns with similar names (potential typos/spaces) if (length(missing_A) > 0 || length(missing_B) > 0) { cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") all_missing <- c(missing_A, missing_B) for (miss_col in all_missing) { # Find columns that start with similar pattern pattern <- substr(miss_col, 1, 10) similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) if (length(similar) > 0) { cat("\nLooking for:", miss_col) cat("\n Similar columns found:\n") for (sim in similar) { cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") } } } } cat("\n=== END CHECK ===\n\n") # Stop if critical columns are missing if (length(missing_A) > 6 || length(missing_B) > 6) { stop("ERROR: Too many columns missing! Please check column names in CSV file.") } cat("Proceeding with processing...\n\n") # Process each pair of columns (just copy values, no recoding) for (i in 1:12) { col_A <- source_cols_A[i] col_B <- source_cols_B[i] target_col <- target_cols[i] # Get values from columns, handling missing columns vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) # Coalesce: take value from vals_A if present, otherwise from vals_B # No recoding - just copy the value directly combined <- ifelse(!is.na(vals_A) & vals_A != "", vals_A, vals_B) # Copy directly to target column (no recoding) df[[target_col]] <- combined # Print progress cat("Processed:", target_col, "\n") } # ============= VERIFY TARGET COLUMNS WERE CREATED ============= cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") # Get updated column names df_cols_after <- trimws(names(df)) # Check which target columns exist existing_targets <- target_cols[target_cols %in% df_cols_after] missing_targets <- target_cols[!target_cols %in% df_cols_after] cat("Target Columns:\n") cat(" Expected: 12 columns\n") cat(" Created:", length(existing_targets), "columns\n") cat(" Missing:", length(missing_targets), "columns\n") if (length(missing_targets) > 0) { cat("\n WARNING: The following target columns were NOT created:\n") for (col in missing_targets) { cat(" -", col, "\n") } stop("\nERROR: Not all target columns were created successfully!") } else { cat("\n SUCCESS: All 12 target columns created successfully!\n") } cat("\n=== END VERIFICATION ===\n\n") # ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= # This function can be run multiple times to check different random rows qa_check_random_row <- function() { # Pick a random row random_row <- sample(1:nrow(df), 1) cat("\n========================================\n") cat("QA CHECK: Random Row #", random_row, "\n") cat("========================================\n\n") # Check each of the 12 pairs for (i in 1:12) { col_A <- source_cols_A[i] col_B <- source_cols_B[i] target_col <- target_cols[i] # Get values val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" target_val <- df[random_row, target_col] # Determine which source had the value has_val_A <- !is.na(val_A) && val_A != "" has_val_B <- !is.na(val_B) && val_B != "" if (has_val_A) { source_used <- "A" original_value <- val_A } else if (has_val_B) { source_used <- "B" original_value <- val_B } else { source_used <- "NONE" original_value <- "(empty)" } # Print the info cat(sprintf("Pair %2d:\n", i)) cat(sprintf(" Source A: %-30s\n", col_A)) cat(sprintf(" Source B: %-30s\n", col_B)) cat(sprintf(" Target: %-30s\n", target_col)) cat(sprintf(" Value found in: Source %s\n", source_used)) cat(sprintf(" Original value: '%s'\n", original_value)) cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) cat("\n") } cat("========================================\n") cat("END QA CHECK\n") cat("========================================\n\n") } # Run QA check on first random row cat("\n\n") qa_check_random_row() # Instructions for running additional checks cat("\n") cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") cat("Run this command in R console:\n") cat(" qa_check_random_row()\n") cat("\n") # Save the modified dataframe back to CSV # na="" writes NA values as empty cells instead of "NA" text write.csv(df, "eohi2.csv", row.names = FALSE, na = "") cat("\nProcessing complete! 12 new columns added to eohi2.csv\n")