library(dplyr)

# NOTE(review): machine-specific working directory; the relative path
# "eohi3.csv" used below is resolved against it.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")

# Load the raw data.
# - check.names = FALSE preserves the original column names.
# - na.strings = "NA" converts only the literal string "NA" to NA, so empty
#   cells are kept as empty strings rather than NA.
df <- read.csv(
  "eohi3.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# =============================================================================
# 1. CREATE BACKUP
# =============================================================================

# Backup step is currently disabled.
#file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)

# =============================================================================
# 2. DEFINE VARIABLE MAPPINGS
# =============================================================================

# Each mapping entry is named after the combined (target) column and holds a
# character vector c(<self source column>, <other source column>).

# Temporary helper: expand a shared column stem into its c(self, other) pair,
# e.g. "VAL_p5_trad" -> c("selfVAL_p5_trad", "otherVAL_p5_trad").
source_pair <- function(stem) {
  paste0(c("self", "other"), stem)
}

# Past variables:
# [self/other][VAL/PERS/PREF]_p5_[string] -> past_[val/pers/pref]_[string]
past_mappings <- list(
  # Values (VAL)
  past_val_trad     = source_pair("VAL_p5_trad"),
  past_val_autonomy = source_pair("VAL_p5_autonomy"),
  past_val_personal = source_pair("VAL_p5_personal"),
  past_val_justice  = source_pair("VAL_p5_justice"),
  past_val_close    = source_pair("VAL_p5_close"),
  past_val_connect  = source_pair("VAL_p5_connect"),
  past_val_DGEN     = source_pair("VAL_p5_dgen"),

  # Personality (PERS)
  past_pers_open   = source_pair("PERS_p5_open"),
  past_pers_goal   = source_pair("PERS_p5_goal"),
  past_pers_social = source_pair("PERS_p5_social"),
  past_pers_agree  = source_pair("PERS_p5_agree"),
  past_pers_stress = source_pair("PERS_p5_stress"),
  past_pers_DGEN   = source_pair("PERS_p5_dgen"),

  # Preferences (PREF)
  past_pref_hobbies = source_pair("PREF_p5_hobbies"),
  past_pref_music   = source_pair("PREF_p5_music"),
  past_pref_dress   = source_pair("PREF_p5_dress"),
  past_pref_exer    = source_pair("PREF_p5_exer"),
  past_pref_food    = source_pair("PREF_p5_food"),
  past_pref_friends = source_pair("PREF_p5_friends"),
  past_pref_DGEN    = source_pair("PREF_p5_dgen")
)

# Future variables:
# [self/other][VAL/PERS/PREF]_f5_[string] -> fut_[val/pers/pref]_[string]
future_mappings <- list(
  # Values (VAL)
  fut_val_trad     = source_pair("VAL_f5_trad"),
  fut_val_autonomy = source_pair("VAL_f5_autonomy"),
  fut_val_personal = source_pair("VAL_f5_personal"),
  fut_val_justice  = source_pair("VAL_f5_justice"),
  fut_val_close    = source_pair("VAL_f5_close"),
  fut_val_connect  = source_pair("VAL_f5_connect"),
  fut_val_DGEN     = source_pair("VAL_f5_dgen"),

  # Personality (PERS)
  fut_pers_open   = source_pair("PERS_f5_open"),
  fut_pers_goal   = source_pair("PERS_f5_goal"),
  fut_pers_social = source_pair("PERS_f5_social"),
  fut_pers_agree  = source_pair("PERS_f5_agree"),
  fut_pers_stress = source_pair("PERS_f5_stress"),
  fut_pers_DGEN   = source_pair("PERS_f5_dgen"),

  # Preferences (PREF)
  fut_pref_hobbies = source_pair("PREF_f5_hobbies"),
  fut_pref_music   = source_pair("PREF_f5_music"),
  fut_pref_dress   = source_pair("PREF_f5_dress"),
  fut_pref_exer    = source_pair("PREF_f5_exer"),
  fut_pref_food    = source_pair("PREF_f5_food"),
  fut_pref_friends = source_pair("PREF_f5_friends"),
  fut_pref_DGEN    = source_pair("PREF_f5_dgen")
)

# The helper was only needed to build the two lists above.
rm(source_pair)

# =============================================================================
# 3. COMBINE VARIABLES
# =============================================================================

# Combine a self column and an other column into a single vector.
#
# For each row the value is expected to exist in either the self OR the other
# column, never both. If both are filled, the self value wins (such rows are
# reported later by the validation checks).
#
# Args:
#   df:        data frame holding both source columns.
#   self_col:  name of the "self" source column.
#   other_col: name of the "other" source column.
#
# Returns:
#   A vector with one element per row of df: the self value when it is
#   neither NA nor "", otherwise the other value when it is neither NA nor "",
#   otherwise NA.
#
# Raises:
#   An error if either column is absent from df. Callers are expected to have
#   checked column existence beforehand, so this is only a safety net.
combine_vars <- function(df, self_col, other_col) {
  for (col in c(self_col, other_col)) {
    if (!col %in% names(df)) {
      stop(
        "ERROR: Column ", col,
        " not found. This should have been caught earlier.",
        call. = FALSE
      )
    }
  }

  self_vals <- df[[self_col]]
  other_vals <- df[[other_col]]

  # A cell counts as filled when it is neither NA nor an empty string.
  # `FALSE & NA` is FALSE, so NA cells never leak NA into the masks.
  has_self <- !is.na(self_vals) & self_vals != ""
  has_other <- !is.na(other_vals) & other_vals != ""

  ifelse(has_self, self_vals, ifelse(has_other, other_vals, NA))
}

# Apply one set of mappings to df, overwriting each target column with the
# combination of its self/other source columns.
#
# A mapping is skipped (with a warning) unless the target column AND both
# source columns already exist in df.
#
# Args:
#   df:       data frame to update.
#   mappings: named list; names are target columns, values are
#             c(self_col, other_col).
#   label:    text used in the missing-column report, e.g. "PAST".
#
# Returns:
#   list(df = <updated data frame>,
#        missing = <named list of missing-column descriptions per target>).
apply_mappings <- function(df, mappings, label) {
  missing_cols <- list()

  for (new_col in names(mappings)) {
    self_col <- mappings[[new_col]][1]
    other_col <- mappings[[new_col]][2]

    # Check that every required column exists.
    required <- c(target = new_col, self = self_col, other = other_col)
    absent <- required[!required %in% names(df)]

    if (length(absent) > 0) {
      missing <- paste0(names(absent), ": ", absent)
      missing_cols[[new_col]] <- missing
      warning(
        paste(
          "Skipping", new_col, "- missing columns:",
          paste(missing, collapse = ", ")
        ),
        call. = FALSE
      )
      next
    }

    # All columns exist, proceed with combination.
    df[[new_col]] <- combine_vars(df, self_col, other_col)
    cat(paste(" Updated:", new_col, "\n"))
  }

  # Report any missing columns.
  if (length(missing_cols) > 0) {
    cat(paste0("\n⚠ Missing columns detected in ", label, " variables:\n"))
    for (var in names(missing_cols)) {
      cat(paste(
        " ", var, ":",
        paste(missing_cols[[var]], collapse = ", "), "\n"
      ))
    }
  }

  list(df = df, missing = missing_cols)
}

# Apply past mappings
cat("\nCombining past variables...\n")
past_result <- apply_mappings(df, past_mappings, "PAST")
df <- past_result$df
missing_cols <- past_result$missing

# Apply future mappings
cat("\nCombining future variables...\n")
future_result <- apply_mappings(df, future_mappings, "FUTURE")
df <- future_result$df
missing_cols_future <- future_result$missing

# =============================================================================
# 4. VALIDATION CHECKS
# =============================================================================

cat("\n=== VALIDATION CHECKS ===\n\n")

# TRUE for every element that is neither NA nor an empty string.
is_filled <- function(x) {
  !is.na(x) & x != ""
}

# Check 1: Ensure no row has values in both self and other for the same
# variable.
#
# Args:
#   df:       data frame containing the source columns.
#   mappings: named list; names are target columns, values are
#             c(self_col, other_col).
#
# Returns:
#   A data frame with one row per variable that has at least one conflicting
#   row (columns: variable, self_col, other_col, n_conflicts, example_rows),
#   or an empty data frame when there are none. Variables whose source
#   columns are missing from df are skipped.
check_conflicts <- function(df, mappings) {
  per_variable <- lapply(names(mappings), function(new_col) {
    self_col <- mappings[[new_col]][1]
    other_col <- mappings[[new_col]][2]

    if (!(self_col %in% names(df) && other_col %in% names(df))) {
      return(NULL)
    }

    # Rows where both self and other have non-empty values.
    conflict_rows <- which(
      is_filled(df[[self_col]]) & is_filled(df[[other_col]])
    )
    if (length(conflict_rows) == 0) {
      return(NULL)
    }

    data.frame(
      variable = new_col,
      self_col = self_col,
      other_col = other_col,
      n_conflicts = length(conflict_rows),
      example_rows = paste(head(conflict_rows, 5), collapse = ", ")
    )
  })

  per_variable <- Filter(Negate(is.null), per_variable)
  if (length(per_variable) == 0) {
    return(data.frame())
  }
  do.call(rbind, per_variable)
}

past_conflicts <- check_conflicts(df, past_mappings)
future_conflicts <- check_conflicts(df, future_mappings)

if (nrow(past_conflicts) > 0) {
  cat("WARNING: Found conflicts in PAST variables (both self and other have values):\n")
  print(past_conflicts)
} else {
  cat("✓ No conflicts found in PAST variables\n")
}

if (nrow(future_conflicts) > 0) {
  cat("\nWARNING: Found conflicts in FUTURE variables (both self and other have values):\n")
  print(future_conflicts)
} else {
  cat("✓ No conflicts found in FUTURE variables\n")
}

# Check 2: Verify that combined columns have values where expected.
#
# Args:
#   df:       data frame containing source and target columns.
#   mappings: named list; names are target columns, values are
#             c(self_col, other_col).
#
# Returns:
#   A data frame with one row per mapping: the non-empty counts of the self,
#   other and combined columns, the expected combined count (self + other,
#   since the two are not supposed to overlap) and whether combined equals
#   expected. Counts and `match` are NA when a required column is missing.
check_coverage <- function(df, mappings) {
  # Number of filled cells in a column, or NA if the column does not exist.
  count_filled <- function(col) {
    if (!col %in% names(df)) {
      return(NA_integer_)
    }
    sum(is_filled(df[[col]]), na.rm = TRUE)
  }

  per_variable <- lapply(names(mappings), function(new_col) {
    self_count <- count_filled(mappings[[new_col]][1])
    other_count <- count_filled(mappings[[new_col]][2])
    combined_count <- count_filled(new_col)

    # NA propagates through `+` and `==`, so a missing column yields NA here.
    expected_count <- self_count + other_count

    data.frame(
      variable = new_col,
      self_non_empty = self_count,
      other_non_empty = other_count,
      combined_non_empty = combined_count,
      expected_non_empty = expected_count,
      match = combined_count == expected_count
    )
  })

  if (length(per_variable) == 0) {
    return(data.frame())
  }
  do.call(rbind, per_variable)
}

past_coverage <- check_coverage(df, past_mappings)
future_coverage <- check_coverage(df, future_mappings)

cat("\n=== COVERAGE CHECK ===\n")
cat("\nPAST variables:\n")
print(past_coverage)

cat("\nFUTURE variables:\n")
print(future_coverage)

# Check if all coverage matches. Rows with match = NA could not be verified
# (a required column is missing); they must not be reported as a success.
all_past_match <- all(past_coverage$match, na.rm = TRUE)
all_future_match <- all(future_coverage$match, na.rm = TRUE)
n_unverified <- sum(is.na(past_coverage$match)) +
  sum(is.na(future_coverage$match))

if (!(all_past_match && all_future_match)) {
  cat("\n⚠ Some variables may have missing coverage - check the table above\n")
} else if (n_unverified > 0) {
  cat(paste0(
    "\n⚠ Coverage could not be verified for ", n_unverified,
    " variable(s) with missing columns - check the table above\n"
  ))
} else {
  cat("\n✓ All combined variables have correct coverage\n")
}

# Check 3: Sample check - print a few rows for manual verification.
cat("\n=== SAMPLE ROW CHECK ===\n")
sample_rows <- min(5, nrow(df))
cat(paste("Checking first", sample_rows, "rows:\n\n"))

# One past variable is used as the spot check.
test_var <- "past_val_trad"

# Value of df[row, col], or NA when the column does not exist.
cell_or_na <- function(row, col) {
  if (col %in% names(df)) df[[col]][row] else NA
}

# Printable form of a single cell: "empty" for NA / "".
show_cell <- function(value) {
  if (is.na(value) || value == "") "empty" else value
}

# seq_len() yields no iterations for an empty data frame (1:0 would run twice).
for (i in seq_len(sample_rows)) {
  cat(paste("Row", i, ":\n"))

  self_val <- cell_or_na(i, past_mappings[[test_var]][1])
  other_val <- cell_or_na(i, past_mappings[[test_var]][2])
  combined_val <- cell_or_na(i, test_var)

  cat(sprintf(
    " %s: self=%s, other=%s, combined=%s\n",
    test_var,
    show_cell(self_val),
    show_cell(other_val),
    show_cell(combined_val)
  ))
}

# =============================================================================
# 5. SAVE UPDATED DATA
# =============================================================================

# NOTE(review): this overwrites the input file in place and writes NA as an
# empty cell; consider enabling the backup step before running.
write.csv(df, "eohi3.csv", row.names = FALSE, na = "")
cat("Updated data saved to: eohi3.csv\n")
cat(paste("Total rows:", nrow(df), "\n"))
cat(paste("Total columns:", ncol(df), "\n"))