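# Combine the self/other item columns of eohi3.csv into single past_* and
# fut_* columns, run validation checks, and write the result back to disk.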
library(dplyr)
|
|
|
|
# Work from the project directory so the relative file names below resolve there.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")

# Load the raw data.
#   * check.names = FALSE keeps the column names exactly as written in the file.
#   * na.strings = "NA" means only the literal text "NA" is turned into NA;
#     empty cells in character columns stay as empty strings.
#   NOTE(review): read.csv still reads blank fields in numeric/logical columns
#   as NA, so downstream code has to handle both NA and "".
df <- read.csv(
  file = "eohi3.csv",
  na.strings = "NA",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
|
|
|
|
# =============================================================================
|
|
# 1. CREATE BACKUP
|
|
# =============================================================================
|
|
#file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
|
|
|
|
# =============================================================================
|
|
# 2. DEFINE VARIABLE MAPPINGS
|
|
# =============================================================================
|
|
|
|
# Past (p5) items. Each entry is named after the combined target column and
# holds the pair c(<self source column>, <other source column>):
#   [self/other][VAL/PERS/PREF]_p5_[item] -> past_[val/pers/pref]_[item]
past_mappings <- list(
  # Values (VAL)
  past_val_trad     = c("selfVAL_p5_trad",     "otherVAL_p5_trad"),
  past_val_autonomy = c("selfVAL_p5_autonomy", "otherVAL_p5_autonomy"),
  past_val_personal = c("selfVAL_p5_personal", "otherVAL_p5_personal"),
  past_val_justice  = c("selfVAL_p5_justice",  "otherVAL_p5_justice"),
  past_val_close    = c("selfVAL_p5_close",    "otherVAL_p5_close"),
  past_val_connect  = c("selfVAL_p5_connect",  "otherVAL_p5_connect"),
  past_val_DGEN     = c("selfVAL_p5_dgen",     "otherVAL_p5_dgen"),

  # Personality (PERS)
  past_pers_open   = c("selfPERS_p5_open",   "otherPERS_p5_open"),
  past_pers_goal   = c("selfPERS_p5_goal",   "otherPERS_p5_goal"),
  past_pers_social = c("selfPERS_p5_social", "otherPERS_p5_social"),
  past_pers_agree  = c("selfPERS_p5_agree",  "otherPERS_p5_agree"),
  past_pers_stress = c("selfPERS_p5_stress", "otherPERS_p5_stress"),
  past_pers_DGEN   = c("selfPERS_p5_dgen",   "otherPERS_p5_dgen"),

  # Preferences (PREF)
  past_pref_hobbies = c("selfPREF_p5_hobbies", "otherPREF_p5_hobbies"),
  past_pref_music   = c("selfPREF_p5_music",   "otherPREF_p5_music"),
  past_pref_dress   = c("selfPREF_p5_dress",   "otherPREF_p5_dress"),
  past_pref_exer    = c("selfPREF_p5_exer",    "otherPREF_p5_exer"),
  past_pref_food    = c("selfPREF_p5_food",    "otherPREF_p5_food"),
  past_pref_friends = c("selfPREF_p5_friends", "otherPREF_p5_friends"),
  past_pref_DGEN    = c("selfPREF_p5_dgen",    "otherPREF_p5_dgen")
)
|
|
|
|
# Future (f5) items. Each entry is named after the combined target column and
# holds the pair c(<self source column>, <other source column>):
#   [self/other][VAL/PERS/PREF]_f5_[item] -> fut_[val/pers/pref]_[item]
future_mappings <- list(
  # Values (VAL)
  fut_val_trad     = c("selfVAL_f5_trad",     "otherVAL_f5_trad"),
  fut_val_autonomy = c("selfVAL_f5_autonomy", "otherVAL_f5_autonomy"),
  fut_val_personal = c("selfVAL_f5_personal", "otherVAL_f5_personal"),
  fut_val_justice  = c("selfVAL_f5_justice",  "otherVAL_f5_justice"),
  fut_val_close    = c("selfVAL_f5_close",    "otherVAL_f5_close"),
  fut_val_connect  = c("selfVAL_f5_connect",  "otherVAL_f5_connect"),
  fut_val_DGEN     = c("selfVAL_f5_dgen",     "otherVAL_f5_dgen"),

  # Personality (PERS)
  fut_pers_open   = c("selfPERS_f5_open",   "otherPERS_f5_open"),
  fut_pers_goal   = c("selfPERS_f5_goal",   "otherPERS_f5_goal"),
  fut_pers_social = c("selfPERS_f5_social", "otherPERS_f5_social"),
  fut_pers_agree  = c("selfPERS_f5_agree",  "otherPERS_f5_agree"),
  fut_pers_stress = c("selfPERS_f5_stress", "otherPERS_f5_stress"),
  fut_pers_DGEN   = c("selfPERS_f5_dgen",   "otherPERS_f5_dgen"),

  # Preferences (PREF)
  fut_pref_hobbies = c("selfPREF_f5_hobbies", "otherPREF_f5_hobbies"),
  fut_pref_music   = c("selfPREF_f5_music",   "otherPREF_f5_music"),
  fut_pref_dress   = c("selfPREF_f5_dress",   "otherPREF_f5_dress"),
  fut_pref_exer    = c("selfPREF_f5_exer",    "otherPREF_f5_exer"),
  fut_pref_food    = c("selfPREF_f5_food",    "otherPREF_f5_food"),
  fut_pref_friends = c("selfPREF_f5_friends", "otherPREF_f5_friends"),
  fut_pref_DGEN    = c("selfPREF_f5_dgen",    "otherPREF_f5_dgen")
)
|
|
|
|
# =============================================================================
|
|
# 3. COMBINE VARIABLES
|
|
# =============================================================================
|
|
|
|
# Merge a self column and an other column into one vector.
#
# The data are expected to hold a value in either the self OR the other
# column of a row, never both; if both are filled, the self value wins.
#
# Arguments:
#   df        data frame holding both source columns
#   self_col  name of the "self" source column
#   other_col name of the "other" source column
# Returns a vector with one element per row of df: the self value when it is
# neither NA nor "", otherwise the other value when that is neither NA nor "",
# otherwise NA.
# Stops with an error if either column is absent (callers are expected to have
# checked column existence beforehand).
combine_vars <- function(df, self_col, other_col) {
  # Guard: both source columns must be present (self is checked first).
  for (required in c(self_col, other_col)) {
    if (!required %in% names(df)) {
      stop(paste("ERROR: Column", required, "not found. This should have been caught earlier."))
    }
  }

  # TRUE where a cell carries a real value (neither NA nor an empty string).
  filled <- function(values) {
    !is.na(values) & values != ""
  }

  self_vals <- df[[self_col]]
  other_vals <- df[[other_col]]

  # Prefer self; fall back to other; NA when neither is filled.
  ifelse(
    filled(self_vals),
    self_vals,
    ifelse(filled(other_vals), other_vals, NA)
  )
}
|
|
|
|
# Fill every past_* target column from its self/other pair.
# A mapping is skipped (with a warning) unless the target column and both
# source columns are already present in df.
cat("\nCombining past variables...\n")
missing_cols <- list()
for (new_col in names(past_mappings)) {
  self_col <- past_mappings[[new_col]][1]
  other_col <- past_mappings[[new_col]][2]

  # Find which of the three required columns are absent, labelled by role.
  needed <- c(new_col, self_col, other_col)
  roles <- c("target:", "self:", "other:")
  absent <- !(needed %in% names(df))
  missing <- paste(roles[absent], needed[absent])

  if (any(absent)) {
    # Remember what was missing for the summary below, then move on.
    missing_cols[[new_col]] <- missing
    warning(paste("Skipping", new_col, "- missing columns:", paste(missing, collapse = ", ")))
    next
  }

  # Everything is present: overwrite the target with the merged values.
  df[[new_col]] <- combine_vars(df, self_col, other_col)
  cat(paste(" Updated:", new_col, "\n"))
}

# Summarise the skipped past variables, if any.
if (length(missing_cols) > 0) {
  cat("\n⚠ Missing columns detected in PAST variables:\n")
  for (skipped in names(missing_cols)) {
    cat(paste(" ", skipped, ":", paste(missing_cols[[skipped]], collapse = ", "), "\n"))
  }
}
|
|
|
|
# Fill every fut_* target column from its self/other pair.
# A mapping is skipped (with a warning) unless the target column and both
# source columns are already present in df.
cat("\nCombining future variables...\n")
missing_cols_future <- list()
for (new_col in names(future_mappings)) {
  self_col <- future_mappings[[new_col]][1]
  other_col <- future_mappings[[new_col]][2]

  # Find which of the three required columns are absent, labelled by role.
  needed <- c(new_col, self_col, other_col)
  roles <- c("target:", "self:", "other:")
  absent <- !(needed %in% names(df))
  missing <- paste(roles[absent], needed[absent])

  if (any(absent)) {
    # Remember what was missing for the summary below, then move on.
    missing_cols_future[[new_col]] <- missing
    warning(paste("Skipping", new_col, "- missing columns:", paste(missing, collapse = ", ")))
    next
  }

  # Everything is present: overwrite the target with the merged values.
  df[[new_col]] <- combine_vars(df, self_col, other_col)
  cat(paste(" Updated:", new_col, "\n"))
}

# Summarise the skipped future variables, if any.
if (length(missing_cols_future) > 0) {
  cat("\n⚠ Missing columns detected in FUTURE variables:\n")
  for (skipped in names(missing_cols_future)) {
    cat(paste(" ", skipped, ":", paste(missing_cols_future[[skipped]], collapse = ", "), "\n"))
  }
}
|
|
|
|
# =============================================================================
|
|
# 4. VALIDATION CHECKS
|
|
# =============================================================================
|
|
|
|
# Print the banner that opens the validation section of the console output.
cat("\n=== VALIDATION CHECKS ===\n\n")
|
|
|
|
# Check 1: no row should hold a value in both the self and the other column
# of the same variable.
#
# Arguments:
#   df        data frame to inspect
#   mappings  named list; each element is c(self column, other column)
# Returns a data frame with one row per variable that has at least one
# conflicting row (columns: variable, self_col, other_col, n_conflicts,
# example_rows = up to five offending row numbers). Returns an empty
# data.frame() when no conflicts are found. Mappings whose source columns are
# not both present in df are ignored.
check_conflicts <- function(df, mappings) {
  # TRUE where a cell carries a real value (neither NA nor an empty string).
  filled <- function(values) {
    !is.na(values) & values != ""
  }

  found <- list()
  for (new_col in names(mappings)) {
    self_col <- mappings[[new_col]][1]
    other_col <- mappings[[new_col]][2]

    # Only pairs whose two source columns both exist can be compared.
    if (!(self_col %in% names(df) && other_col %in% names(df))) {
      next
    }

    # Row numbers where both sides are filled.
    conflict_rows <- which(filled(df[[self_col]]) & filled(df[[other_col]]))
    if (length(conflict_rows) == 0) {
      next
    }

    found[[length(found) + 1]] <- data.frame(
      variable = new_col,
      self_col = self_col,
      other_col = other_col,
      n_conflicts = length(conflict_rows),
      example_rows = paste(head(conflict_rows, 5), collapse = ", ")
    )
  }

  if (length(found) == 0) {
    return(data.frame())
  }
  do.call(rbind, found)
}
|
|
|
|
# Run the conflict check for both time frames, then report each result.
past_conflicts <- check_conflicts(df, past_mappings)
future_conflicts <- check_conflicts(df, future_mappings)

# Past: either confirm a clean result or print the conflict table.
if (nrow(past_conflicts) == 0) {
  cat("✓ No conflicts found in PAST variables\n")
} else {
  cat("WARNING: Found conflicts in PAST variables (both self and other have values):\n")
  print(past_conflicts)
}

# Future: same report for the future variables.
if (nrow(future_conflicts) == 0) {
  cat("✓ No conflicts found in FUTURE variables\n")
} else {
  cat("\nWARNING: Found conflicts in FUTURE variables (both self and other have values):\n")
  print(future_conflicts)
}
|
|
|
|
# Check 2: the combined column should hold exactly as many non-empty values
# as the self and other columns together (they are not supposed to overlap).
#
# Arguments:
#   df        data frame to inspect
#   mappings  named list; names are combined columns, each element is
#             c(self column, other column)
# Returns a data frame with one row per mapping and the columns variable,
# self_non_empty, other_non_empty, combined_non_empty, expected_non_empty and
# match. A count is NA when its column is absent from df; expected_non_empty
# and match are NA whenever a count they depend on is NA. Returns an empty
# data.frame() when mappings has no named entries.
check_coverage <- function(df, mappings) {
  # Number of cells that are neither NA nor "", or NA if the column is absent.
  count_filled <- function(col) {
    if (!col %in% names(df)) {
      return(NA)
    }
    values <- df[[col]]
    sum(!is.na(values) & values != "", na.rm = TRUE)
  }

  rows <- lapply(names(mappings), function(new_col) {
    self_count <- count_filled(mappings[[new_col]][1])
    other_count <- count_filled(mappings[[new_col]][2])
    combined_count <- count_filled(new_col)

    # Expected total is only defined when both source counts are known.
    expected_count <- if (is.na(self_count) || is.na(other_count)) {
      NA
    } else {
      self_count + other_count
    }

    # The comparison is only defined when both sides are known.
    counts_agree <- if (is.na(combined_count) || is.na(expected_count)) {
      NA
    } else {
      combined_count == expected_count
    }

    data.frame(
      variable = new_col,
      self_non_empty = self_count,
      other_non_empty = other_count,
      combined_non_empty = combined_count,
      expected_non_empty = expected_count,
      match = counts_agree
    )
  })

  if (length(rows) == 0) {
    return(data.frame())
  }
  do.call(rbind, rows)
}
|
|
|
|
# Run the coverage check for both time frames and print the full tables.
past_coverage <- check_coverage(df, past_mappings)
future_coverage <- check_coverage(df, future_mappings)

cat("\n=== COVERAGE CHECK ===\n")
cat("\nPAST variables:\n")
print(past_coverage)

cat("\nFUTURE variables:\n")
print(future_coverage)

# Overall verdict. `match` is NA for variables whose columns were missing, so
# na.rm = TRUE alone would let those pass unnoticed; they are collected
# separately and reported instead of being counted as a success.
all_past_match <- all(past_coverage$match, na.rm = TRUE)
all_future_match <- all(future_coverage$match, na.rm = TRUE)
unverified <- as.character(c(
  as.character(past_coverage$variable[is.na(past_coverage$match)]),
  as.character(future_coverage$variable[is.na(future_coverage$match)])
))

if (!(all_past_match && all_future_match)) {
  cat("\n⚠ Some variables may have missing coverage - check the table above\n")
} else if (length(unverified) > 0) {
  # No mismatch was found, but some rows could not be checked at all.
  cat("\n⚠ No coverage mismatches found, but coverage could not be verified for:",
      paste(unverified, collapse = ", "), "\n")
} else {
  cat("\n✓ All combined variables have correct coverage\n")
}
|
|
|
|
# Check 3: print the self, other and combined values of one past variable for
# the first few rows so the merge can be verified by eye.
cat("\n=== SAMPLE ROW CHECK ===\n")
sample_rows <- min(5, nrow(df))
cat(paste("Checking first", sample_rows, "rows:\n\n"))

# The variable to inspect and its source columns do not change per row.
test_var <- "past_val_trad"
sample_self_col <- past_mappings[[test_var]][1]
sample_other_col <- past_mappings[[test_var]][2]

# Value of one cell, or NA when the column is not present in df. The combined
# column gets the same guard as the source columns, because it is absent
# whenever the mapping was skipped earlier.
sample_cell <- function(row, col) {
  if (col %in% names(df)) df[row, col] else NA
}

# Show NA and "" as the word "empty"; pass every other value through.
sample_label <- function(value) {
  if (is.na(value) || value == "") "empty" else value
}

# seq_len() yields an empty sequence for a zero-row df, whereas 1:0 would
# iterate over rows 1 and 0.
for (i in seq_len(sample_rows)) {
  cat(paste("Row", i, ":\n"))

  self_val <- sample_cell(i, sample_self_col)
  other_val <- sample_cell(i, sample_other_col)
  combined_val <- sample_cell(i, test_var)

  cat(sprintf(" %s: self=%s, other=%s, combined=%s\n",
              test_var,
              sample_label(self_val),
              sample_label(other_val),
              sample_label(combined_val)))
}
|
|
|
|
# =============================================================================
|
|
# 5. SAVE UPDATED DATA
|
|
# =============================================================================
|
|
|
|
|
|
# Write the updated data back to disk; NA values are written as empty cells.
# NOTE(review): this overwrites the same file that was read at the top of the
# script, and the backup step in section 1 is commented out.
write.csv(
  x = df,
  file = "eohi3.csv",
  na = "",
  row.names = FALSE
)

# Confirm what was written and report the final dimensions.
cat("Updated data saved to: eohi3.csv\n")
cat(
  paste("Total rows:", nrow(df), "\n"),
  paste("Total columns:", ncol(df), "\n"),
  sep = ""
)
|