# Recomputes the ehi_* difference columns of eohi3.csv as
# (past_* value) - (fut_* value), prints a spot check of a few rows, and
# writes the result back to eohi3.csv after taking a backup copy.

library(dplyr)

# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path below resolves against it.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")

# Read the data. check.names = FALSE preserves the original column names.
# Only the literal string "NA" is declared as a missing-value marker, so empty
# cells in character columns are kept as empty strings rather than NA.
df <- read.csv(
  "eohi3.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# =============================================================================
# 1. CREATE BACKUP
# =============================================================================

# eohi3.csv is overwritten in section 6, so a failed backup must abort the run.
# overwrite = TRUE replaces any existing eohi3_2.csv.
backup_ok <- file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
if (!isTRUE(backup_ok)) {
  stop("Backup of eohi3.csv to eohi3_2.csv failed; aborting before eohi3.csv is modified.")
}

# =============================================================================
# 2. DEFINE VARIABLE MAPPINGS
# =============================================================================

# Target variables (excluding those ending in _MEAN).
# Each target var = past_var - fut_var, where for an item suffix <item>:
#   ehi_<item> = past_<item> - fut_<item>
ehi_items <- c(
  # Preferences (PREF)
  "pref_hobbies", "pref_music", "pref_dress", "pref_exer", "pref_food",
  "pref_friends", "pref_DGEN",
  # Personality (PERS)
  "pers_open", "pers_goal", "pers_social", "pers_agree", "pers_stress",
  "pers_DGEN",
  # Values (VAL)
  "val_trad", "val_autonomy", "val_personal", "val_justice", "val_close",
  "val_connect", "val_DGEN"
)

# Named list: names are the target columns, each element is
# c(<past column>, <future column>).
ehi_mappings <- setNames(
  lapply(ehi_items, function(item) paste0(c("past_", "fut_"), item)),
  paste0("ehi_", ehi_items)
)

# =============================================================================
# 3. CHECK IF TARGET VARIABLES EXIST
# =============================================================================

target_vars <- names(ehi_mappings)
missing_targets <- target_vars[!target_vars %in% names(df)]

for (var in missing_targets) {
  cat(paste("⚠ Target variable not found:", var, "\n"))
}

if (length(missing_targets) > 0) {
  cat("\nERROR: The following target variables are missing from eohi3.csv:\n")
  for (var in missing_targets) {
    cat(paste(" -", var, "\n"))
  }
  stop("Cannot proceed without target variables. Please add them to the CSV file.")
}

# =============================================================================
# 4. CALCULATE EHI VARIABLES (past - future)
# =============================================================================

# Convert a column (or a single cell) to numeric, treating empty strings the
# same as NA. Non-numeric text still yields NA with R's usual coercion warning.
to_numeric <- function(x) {
  x[!is.na(x) & x == ""] <- NA
  as.numeric(x)
}

missing_source_cols <- list()

for (target_var in target_vars) {
  source_vars <- ehi_mappings[[target_var]]
  past_var <- source_vars[1]
  fut_var <- source_vars[2]

  # Skip (with a warning) any target whose source columns are absent.
  missing <- source_vars[!source_vars %in% names(df)]
  if (length(missing) > 0) {
    missing_source_cols[[target_var]] <- missing
    warning(paste("Skipping", target_var, "- missing source columns:",
                  paste(missing, collapse = ", ")))
    next
  }

  # Difference: past - future. NA in either source gives NA in the target.
  df[[target_var]] <- to_numeric(df[[past_var]]) - to_numeric(df[[fut_var]])

  cat(paste(" Calculated:", target_var, "=", past_var, "-", fut_var, "\n"))
}

# Report any missing source columns
if (length(missing_source_cols) > 0) {
  for (var in names(missing_source_cols)) {
    cat(paste(" ", var, ":",
              paste(missing_source_cols[[var]], collapse = ", "), "\n"))
  }
}

# =============================================================================
# 5. VALIDATION: CHECK 5 RANDOM ROWS
# =============================================================================

cat("\n=== VALIDATION: CHECKING 5 RANDOM ROWS ===\n\n")

# Display helpers for single cells.
show_raw <- function(value) {
  if (is.na(value) || value == "") "NA/empty" else value
}
show_num <- function(value) {
  if (is.na(value)) "NA" else value
}

# Set seed for reproducibility
set.seed(123)
sample_rows <- sort(sample(seq_len(nrow(df)), min(5, nrow(df))))

# A few representative variables, one from each category.
test_vars <- c("ehi_pref_hobbies", "ehi_pers_open", "ehi_val_trad")

for (i in sample_rows) {
  cat(paste("Row", i, ":\n"))

  for (target_var in test_vars) {
    if (!target_var %in% names(ehi_mappings)) {
      next
    }

    past_var <- ehi_mappings[[target_var]][1]
    fut_var <- ehi_mappings[[target_var]][2]
    if (!(past_var %in% names(df) && fut_var %in% names(df))) {
      next
    }

    past_val <- df[i, past_var]
    fut_val <- df[i, fut_var]
    ehi_val <- df[i, target_var]

    # Convert to numeric for the calculation check
    past_num <- to_numeric(past_val)
    fut_num <- to_numeric(fut_val)
    ehi_num <- as.numeric(ehi_val)

    # Expected value; NA if either source value is missing.
    expected <- past_num - fut_num

    # Values agree within a small tolerance, or both are NA.
    is_match <- if (!is.na(expected) && !is.na(ehi_num)) {
      abs(expected - ehi_num) < 0.0001 # Allow for floating point precision
    } else {
      is.na(expected) && is.na(ehi_num)
    }

    cat(sprintf(" %s:\n", target_var))
    cat(sprintf(" %s = %s\n", past_var, show_raw(past_val)))
    cat(sprintf(" %s = %s\n", fut_var, show_raw(fut_val)))
    cat(sprintf(" %s = %s\n", target_var, show_num(ehi_val)))
    cat(sprintf(" Expected: %s - %s = %s\n",
                show_num(past_num), show_num(fut_num), show_num(expected)))
    cat(sprintf(" Match: %s\n\n", if (is_match) "✓" else "✗ ERROR"))
  }
}

# =============================================================================
# 6. SAVE UPDATED DATA
# =============================================================================

# NOTE: the write below is ACTIVE and overwrites eohi3.csv in place (NA values
# are written as empty cells). Only the status messages are commented out.
# cat("\n=== SAVING DATA ===\n")
write.csv(df, "eohi3.csv", row.names = FALSE, na = "")
# cat("Updated data saved to: eohi3.csv\n")
# cat(paste("Total rows:", nrow(df), "\n"))
# cat(paste("Total columns:", ncol(df), "\n"))