eohi/eohi2/dataP 03 - recode DGEN vars.r
2025-12-23 15:47:09 -05:00

254 lines
7.6 KiB
R

# Script to combine DGEN variables in eohi2.csv
# Combines 01 and 02 versions of DGEN items (no recoding, just copying values)
# Load necessary library
library(dplyr)
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
# Read the data (with check.names=FALSE to preserve original column names)
# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
# Define source column pairs (Set A and Set B)
# NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header
source_cols_A <- c(
"01past5PrefDGEN_1",
"01past5PersDGEN_1",
"01past5ValDGEN_1",
"01past10PrefDGEN_1",
"01past10PersDGEN_1",
"01past10ValDGEN_1",
"01fut5PrefDGEN_8",
"01fut5PersDGEN_8",
"01fut5ValuesDGEN_1",
"01fut10PrefDGEN_8",
"01fut10PersDGEN_8",
"01fut10ValuesDGEN_1"
)
source_cols_B <- c(
"02past5PrefDGEN_1",
"02past5PersDGEN_1",
"02past5ValDGEN_1",
"02past10PrefDGEN_1",
"02past10PersDGEN_1",
"02past10ValDGEN_1",
"02fut5PrefDGEN_8",
"02fut5PersDGEN_8",
"02fut5ValDGEN_1",
"02fut10PrefDGEN_8",
"02fut10PersDGEN_8",
"02fut10ValDGEN_1"
)
# Define target column names
target_cols <- c(
"DGEN_past_5_Pref",
"DGEN_past_5_Pers",
"DGEN_past_5_Val",
"DGEN_past_10_Pref",
"DGEN_past_10_Pers",
"DGEN_past_10_Val",
"DGEN_fut_5_Pref",
"DGEN_fut_5_Pers",
"DGEN_fut_5_Val",
"DGEN_fut_10_Pref",
"DGEN_fut_10_Pers",
"DGEN_fut_10_Val"
)
# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
# Get actual column names from dataframe (trimmed)
df_cols <- trimws(names(df))
# Print first 30 actual column names for debugging
cat("First 30 actual column names in CSV:\n")
for (i in 1:min(30, length(df_cols))) {
cat(sprintf(" %2d. '%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i])))
}
cat("\n")
# Check Source A columns
missing_A <- source_cols_A[!source_cols_A %in% df_cols]
existing_A <- source_cols_A[source_cols_A %in% df_cols]
cat("Source Set A:\n")
cat(" Expected: 12 columns\n")
cat(" Found:", length(existing_A), "columns\n")
cat(" Missing:", length(missing_A), "columns\n")
if (length(missing_A) > 0) {
cat("\n Missing columns from Set A:\n")
for (col in missing_A) {
cat(" -", col, "\n")
}
}
# Check Source B columns
missing_B <- source_cols_B[!source_cols_B %in% df_cols]
existing_B <- source_cols_B[source_cols_B %in% df_cols]
cat("\nSource Set B:\n")
cat(" Expected: 12 columns\n")
cat(" Found:", length(existing_B), "columns\n")
cat(" Missing:", length(missing_B), "columns\n")
if (length(missing_B) > 0) {
cat("\n Missing columns from Set B:\n")
for (col in missing_B) {
cat(" -", col, "\n")
}
}
# Check for columns with similar names (potential typos/spaces)
if (length(missing_A) > 0 || length(missing_B) > 0) {
cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n")
all_missing <- c(missing_A, missing_B)
for (miss_col in all_missing) {
# Find columns that start with similar pattern
pattern <- substr(miss_col, 1, 10)
similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE)
if (length(similar) > 0) {
cat("\nLooking for:", miss_col)
cat("\n Similar columns found:\n")
for (sim in similar) {
cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "")
}
}
}
}
cat("\n=== END CHECK ===\n\n")
# Stop if critical columns are missing
if (length(missing_A) > 6 || length(missing_B) > 6) {
stop("ERROR: Too many columns missing! Please check column names in CSV file.")
}
cat("Proceeding with processing...\n\n")
# Process each pair of columns (just copy values, no recoding)
for (i in 1:12) {
col_A <- source_cols_A[i]
col_B <- source_cols_B[i]
target_col <- target_cols[i]
# Get values from columns, handling missing columns
vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df))
vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df))
# Coalesce: take value from vals_A if present, otherwise from vals_B
# No recoding - just copy the value directly
combined <- ifelse(!is.na(vals_A) & vals_A != "",
vals_A,
vals_B)
# Copy directly to target column (no recoding)
df[[target_col]] <- combined
# Print progress
cat("Processed:", target_col, "\n")
}
# ============= VERIFY TARGET COLUMNS WERE CREATED =============
cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n")
# Get updated column names
df_cols_after <- trimws(names(df))
# Check which target columns exist
existing_targets <- target_cols[target_cols %in% df_cols_after]
missing_targets <- target_cols[!target_cols %in% df_cols_after]
cat("Target Columns:\n")
cat(" Expected: 12 columns\n")
cat(" Created:", length(existing_targets), "columns\n")
cat(" Missing:", length(missing_targets), "columns\n")
if (length(missing_targets) > 0) {
cat("\n WARNING: The following target columns were NOT created:\n")
for (col in missing_targets) {
cat(" -", col, "\n")
}
stop("\nERROR: Not all target columns were created successfully!")
} else {
cat("\n SUCCESS: All 12 target columns created successfully!\n")
}
cat("\n=== END VERIFICATION ===\n\n")
# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
# This function can be run multiple times to check different random rows
qa_check_random_row <- function() {
# Pick a random row
random_row <- sample(1:nrow(df), 1)
cat("\n========================================\n")
cat("QA CHECK: Random Row #", random_row, "\n")
cat("========================================\n\n")
# Check each of the 12 pairs
for (i in 1:12) {
col_A <- source_cols_A[i]
col_B <- source_cols_B[i]
target_col <- target_cols[i]
# Get values
val_A <- if (col_A %in% names(df)) df[random_row, col_A] else ""
val_B <- if (col_B %in% names(df)) df[random_row, col_B] else ""
target_val <- df[random_row, target_col]
# Determine which source had the value
has_val_A <- !is.na(val_A) && val_A != ""
has_val_B <- !is.na(val_B) && val_B != ""
if (has_val_A) {
source_used <- "A"
original_value <- val_A
} else if (has_val_B) {
source_used <- "B"
original_value <- val_B
} else {
source_used <- "NONE"
original_value <- "(empty)"
}
# Print the info
cat(sprintf("Pair %2d:\n", i))
cat(sprintf(" Source A: %-30s\n", col_A))
cat(sprintf(" Source B: %-30s\n", col_B))
cat(sprintf(" Target: %-30s\n", target_col))
cat(sprintf(" Value found in: Source %s\n", source_used))
cat(sprintf(" Original value: '%s'\n", original_value))
cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val))))
cat("\n")
}
cat("========================================\n")
cat("END QA CHECK\n")
cat("========================================\n\n")
}
# Run QA check on first random row
cat("\n\n")
qa_check_random_row()
# Instructions for running additional checks
cat("\n")
cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
cat("Run this command in R console:\n")
cat(" qa_check_random_row()\n")
cat("\n")
# Save the modified dataframe back to CSV
# na="" writes NA values as empty cells instead of "NA" text
write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
cat("\nProcessing complete! 12 new columns added to eohi2.csv\n")