# eohi/eohi3/datap 04 - combined vars.r
#
# 344 lines
# 12 KiB
# R

library(dplyr)
# Every relative path used below resolves against the project directory.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")
# Load the dataset.
#   check.names = FALSE : column names are kept exactly as written in the CSV.
#   na.strings = "NA"   : only the literal text "NA" is turned into NA, so an
#                         empty cell in a text column stays an empty string.
# NOTE(review): per ?read.table, blank fields in numeric/logical columns are
# still read as NA, so downstream code has to handle both NA and "".
df <- read.csv(
  file = "eohi3.csv",
  na.strings = "NA",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# =============================================================================
# 1. CREATE BACKUP
# =============================================================================
#file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
# =============================================================================
# 2. DEFINE VARIABLE MAPPINGS
# =============================================================================
# Past variables mapping:
#   [self/other][VAL/PERS/PREF]_p5_[item] -> past_[val/pers/pref]_[item]
# Each entry is named after the target column and holds c(self column,
# other column). Source names spell the domain in upper case and the item in
# lower case, so the target item "DGEN" reads from "..._p5_dgen".
past_mappings <- local({
  items_by_domain <- list(
    # Values (VAL)
    val = c(
      "trad", "autonomy", "personal", "justice", "close", "connect", "DGEN"
    ),
    # Personality (PERS)
    pers = c("open", "goal", "social", "agree", "stress", "DGEN"),
    # Preferences (PREF)
    pref = c(
      "hobbies", "music", "dress", "exer", "food", "friends", "DGEN"
    )
  )
  mappings <- list()
  for (domain in names(items_by_domain)) {
    for (item in items_by_domain[[domain]]) {
      source_stem <- paste0(toupper(domain), "_p5_", tolower(item))
      # Element 1 is the self column, element 2 the other column
      mappings[[paste0("past_", domain, "_", item)]] <-
        paste0(c("self", "other"), source_stem)
    }
  }
  mappings
})
# Future variables mapping:
#   [self/other][VAL/PERS/PREF]_f5_[item] -> fut_[val/pers/pref]_[item]
# Each entry is named after the target column and holds c(self column,
# other column). Source names spell the domain in upper case and the item in
# lower case, so the target item "DGEN" reads from "..._f5_dgen".
future_mappings <- local({
  items_by_domain <- list(
    # Values (VAL)
    val = c(
      "trad", "autonomy", "personal", "justice", "close", "connect", "DGEN"
    ),
    # Personality (PERS)
    pers = c("open", "goal", "social", "agree", "stress", "DGEN"),
    # Preferences (PREF)
    pref = c(
      "hobbies", "music", "dress", "exer", "food", "friends", "DGEN"
    )
  )
  mappings <- list()
  for (domain in names(items_by_domain)) {
    for (item in items_by_domain[[domain]]) {
      source_stem <- paste0(toupper(domain), "_f5_", tolower(item))
      # Element 1 is the self column, element 2 the other column
      mappings[[paste0("fut_", domain, "_", item)]] <-
        paste0(c("self", "other"), source_stem)
    }
  }
  mappings
})
# =============================================================================
# 3. COMBINE VARIABLES
# =============================================================================
# Merge one self/other column pair into a single vector.
# Per row: the self value is used when it is present (neither NA nor ""),
# otherwise the other value when that is present, otherwise NA.
# The data are expected to hold a value in at most one of the two columns for
# any given row; when both are filled the self value wins.
#
# df        : data frame holding both columns
# self_col  : name of the self column
# other_col : name of the other column
# Returns a vector with one element per row of df.
# Stops with an error when either column is absent, so callers should confirm
# that the columns exist before calling.
combine_vars <- function(df, self_col, other_col) {
  # Fail loudly on an unknown column name (self is checked before other)
  for (col in c(self_col, other_col)) {
    if (!col %in% names(df)) {
      stop(paste("ERROR: Column", col, "not found. This should have been caught earlier."))
    }
  }
  self_vals <- df[[self_col]]
  other_vals <- df[[other_col]]
  # "Present" means neither NA nor an empty string
  self_present <- !(is.na(self_vals) | self_vals == "")
  other_present <- !(is.na(other_vals) | other_vals == "")
  # Value to use on rows where self is absent
  fallback <- ifelse(other_present, other_vals, NA)
  ifelse(self_present, self_vals, fallback)
}
# Apply past mappings: fill every past_* target from its self/other pair
cat("\nCombining past variables...\n")
missing_cols <- list()
for (new_col in names(past_mappings)) {
  self_col <- past_mappings[[new_col]][1]
  other_col <- past_mappings[[new_col]][2]
  # A mapping is applied only when the target and both sources already exist
  required <- c(target = new_col, self = self_col, other = other_col)
  absent <- required[!required %in% names(df)]
  if (length(absent) == 0) {
    # All columns exist, proceed with combination
    df[[new_col]] <- combine_vars(df, self_col, other_col)
    cat(" Updated:", new_col, "\n")
  } else {
    # Remember what was absent, labelled by role, then skip this mapping
    missing <- paste(paste0(names(absent), ":"), absent)
    missing_cols[[new_col]] <- missing
    warning(paste("Skipping", new_col, "- missing columns:", paste(missing, collapse = ", ")))
  }
}
# Summarise every mapping that was skipped
if (length(missing_cols) > 0) {
  cat("\n⚠ Missing columns detected in PAST variables:\n")
  for (var in names(missing_cols)) {
    cat(" ", var, ":", paste(missing_cols[[var]], collapse = ", "), "\n")
  }
}
# Apply future mappings: fill every fut_* target from its self/other pair
cat("\nCombining future variables...\n")
missing_cols_future <- list()
for (new_col in names(future_mappings)) {
  self_col <- future_mappings[[new_col]][1]
  other_col <- future_mappings[[new_col]][2]
  # A mapping is applied only when the target and both sources already exist
  required <- c(target = new_col, self = self_col, other = other_col)
  absent <- required[!required %in% names(df)]
  if (length(absent) == 0) {
    # All columns exist, proceed with combination
    df[[new_col]] <- combine_vars(df, self_col, other_col)
    cat(" Updated:", new_col, "\n")
  } else {
    # Remember what was absent, labelled by role, then skip this mapping
    missing <- paste(paste0(names(absent), ":"), absent)
    missing_cols_future[[new_col]] <- missing
    warning(paste("Skipping", new_col, "- missing columns:", paste(missing, collapse = ", ")))
  }
}
# Summarise every mapping that was skipped
if (length(missing_cols_future) > 0) {
  cat("\n⚠ Missing columns detected in FUTURE variables:\n")
  for (var in names(missing_cols_future)) {
    cat(" ", var, ":", paste(missing_cols_future[[var]], collapse = ", "), "\n")
  }
}
# =============================================================================
# 4. VALIDATION CHECKS
# =============================================================================
cat("\n=== VALIDATION CHECKS ===\n\n")
# Check 1: a row must never carry a value in both the self and the other
# column of the same pair.
#
# df       : data frame to inspect
# mappings : named list; each element is c(self column, other column)
# Returns a data frame with one row per pair that has at least one conflict
# (variable, self_col, other_col, n_conflicts, example_rows = up to five row
# indices joined by ", "), or an empty data frame when there is none.
# Pairs with an absent source column are skipped.
check_conflicts <- function(df, mappings) {
  # TRUE where the column holds something other than NA or ""
  is_filled <- function(col_name) {
    vals <- df[[col_name]]
    !is.na(vals) & vals != ""
  }
  # One summary row for a conflicting pair, NULL for a clean or unknown pair
  describe_pair <- function(target) {
    self_name <- mappings[[target]][1]
    other_name <- mappings[[target]][2]
    if (!(self_name %in% names(df) && other_name %in% names(df))) {
      return(NULL)
    }
    hit_rows <- which(is_filled(self_name) & is_filled(other_name))
    if (length(hit_rows) == 0) {
      return(NULL)
    }
    data.frame(
      variable = target,
      self_col = self_name,
      other_col = other_name,
      n_conflicts = length(hit_rows),
      example_rows = paste(head(hit_rows, 5), collapse = ", ")
    )
  }
  per_pair <- Filter(Negate(is.null), lapply(names(mappings), describe_pair))
  if (length(per_pair) == 0) {
    return(data.frame())
  }
  do.call(rbind, per_pair)
}
# Run the conflict check on both mapping sets, then report each outcome:
# a confirmation line when clean, otherwise a warning plus the conflict table.
past_conflicts <- check_conflicts(df, past_mappings)
future_conflicts <- check_conflicts(df, future_mappings)
if (nrow(past_conflicts) == 0) {
  cat("✓ No conflicts found in PAST variables\n")
} else {
  cat("WARNING: Found conflicts in PAST variables (both self and other have values):\n")
  print(past_conflicts)
}
if (nrow(future_conflicts) == 0) {
  cat("✓ No conflicts found in FUTURE variables\n")
} else {
  cat("\nWARNING: Found conflicts in FUTURE variables (both self and other have values):\n")
  print(future_conflicts)
}
# Check 2: verify that each combined column holds as many filled cells as its
# self and other columns together.
#
# df       : data frame to inspect
# mappings : named list; names are target columns, each element is
#            c(self column, other column)
# Returns a data frame with one row per mapping: variable, self_non_empty,
# other_non_empty, combined_non_empty, expected_non_empty (self + other, which
# relies on the two never overlapping) and match. A count is NA when its column
# is absent; expected and match are NA when they cannot be computed.
check_coverage <- function(df, mappings) {
  # Filled (non-NA, non-"") cells in a column, or NA when the column is absent
  count_filled <- function(col_name) {
    if (!col_name %in% names(df)) {
      return(NA)
    }
    vals <- df[[col_name]]
    sum(!is.na(vals) & vals != "")
  }
  summarise_pair <- function(target) {
    n_self <- count_filled(mappings[[target]][1])
    n_other <- count_filled(mappings[[target]][2])
    n_combined <- count_filled(target)
    n_expected <- if (is.na(n_self) || is.na(n_other)) {
      NA
    } else {
      n_self + n_other
    }
    data.frame(
      variable = target,
      self_non_empty = n_self,
      other_non_empty = n_other,
      combined_non_empty = n_combined,
      expected_non_empty = n_expected,
      # Comparing against NA yields NA, i.e. "could not be checked"
      match = n_combined == n_expected
    )
  }
  per_pair <- lapply(names(mappings), summarise_pair)
  if (length(per_pair) == 0) {
    return(data.frame())
  }
  do.call(rbind, per_pair)
}
# Build the coverage tables for both mapping sets and print them
past_coverage <- check_coverage(df, past_mappings)
future_coverage <- check_coverage(df, future_mappings)
cat("\n=== COVERAGE CHECK ===\n")
cat("\nPAST variables:\n")
print(past_coverage)
cat("\nFUTURE variables:\n")
print(future_coverage)
# Check if all coverage matches.
# `match` is NA for a variable whose self, other or target column is absent.
# all(..., na.rm = TRUE) drops those NAs and is TRUE for an empty vector, so on
# its own it reports success even when nothing could be verified. Unverifiable
# variables are therefore counted separately and block the success message.
all_past_match <- all(past_coverage$match, na.rm = TRUE)
all_future_match <- all(future_coverage$match, na.rm = TRUE)
n_unverified <- sum(is.na(past_coverage$match)) +
  sum(is.na(future_coverage$match))
if (all_past_match && all_future_match && n_unverified == 0) {
  cat("\n✓ All combined variables have correct coverage\n")
} else {
  cat("\n⚠ Some variables may have missing coverage - check the table above\n")
}
if (n_unverified > 0) {
  cat(
    " ", n_unverified,
    "variable(s) could not be verified because a column is missing\n"
  )
}
# Check 3: Sample check - print the self, other and combined value of one
# variable for the first few rows so the merge can be verified by eye.
cat("\n=== SAMPLE ROW CHECK ===\n")
sample_rows <- min(5, nrow(df))
cat(paste("Checking first", sample_rows, "rows:\n\n"))
# Check one past variable
test_var <- "past_val_trad"
# Value of one cell, or NA when the column is not in df. The same guard is
# applied to the self, other and combined columns, so an absent target column
# prints "empty" instead of raising "undefined columns selected".
sample_cell <- function(row, col) {
  if (col %in% names(df)) df[[col]][row] else NA
}
# Display form of a single cell: "empty" for NA or "", otherwise the value
show_cell <- function(value) {
  if (is.na(value) || value == "") "empty" else value
}
# seq_len() gives zero iterations for an empty data frame, whereas
# 1:sample_rows would count down and visit rows 1 and 0.
for (i in seq_len(sample_rows)) {
  cat(paste("Row", i, ":\n"))
  self_val <- sample_cell(i, past_mappings[[test_var]][1])
  other_val <- sample_cell(i, past_mappings[[test_var]][2])
  combined_val <- sample_cell(i, test_var)
  cat(sprintf(
    " %s: self=%s, other=%s, combined=%s\n",
    test_var,
    show_cell(self_val),
    show_cell(other_val),
    show_cell(combined_val)
  ))
}
# =============================================================================
# 5. SAVE UPDATED DATA
# =============================================================================
# Write the updated data back over the input file (NA cells become empty
# fields, no row-name column), then print a short size summary.
write.csv(df, file = "eohi3.csv", na = "", row.names = FALSE)
cat("Updated data saved to: eohi3.csv\n")
cat("Total rows:", nrow(df), "\n")
cat("Total columns:", ncol(df), "\n")