# Compute EHI difference scores (past - future) for the columns in eohi3.csv.
library(dplyr)

# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path used below ("eohi3.csv", "eohi3_2.csv") is resolved against it.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")

# Read the data.
#   check.names = FALSE  preserves the original column names.
#   na.strings = "NA"    converts only the literal string "NA" to NA, so empty
#                        cells in character columns stay as empty strings.
df <- read.csv(
  "eohi3.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# =============================================================================
# 1. CREATE BACKUP
# =============================================================================

# file.copy() signals failure through its return value (FALSE), not an error,
# so the result must be checked explicitly: eohi3.csv is overwritten later in
# this script and must not be touched unless the backup exists.
# NOTE(review): overwrite = TRUE means a second run replaces the backup with
# the already-processed file - confirm that this is intended.
backup_ok <- file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
if (!isTRUE(backup_ok)) {
  stop(
    "Could not create backup eohi3_2.csv; aborting before eohi3.csv is modified.",
    call. = FALSE
  )
}

# =============================================================================
# 2. DEFINE VARIABLE MAPPINGS
# =============================================================================

# Target variables (excluding those ending in _MEAN).
# Each entry is named after its target column and holds the two source columns
# in the order c(past_var, fut_var); the target is computed as past - future.
# For a stem such as "pref_hobbies" the entry is
#   ehi_pref_hobbies = c("past_pref_hobbies", "fut_pref_hobbies")
ehi_mappings <- local({
  items_by_domain <- list(
    # Preferences (PREF)
    pref = c("hobbies", "music", "dress", "exer", "food", "friends", "DGEN"),
    # Personality (PERS)
    pers = c("open", "goal", "social", "agree", "stress", "DGEN"),
    # Values (VAL)
    val = c(
      "trad", "autonomy", "personal", "justice", "close", "connect", "DGEN"
    )
  )

  # "<domain>_<item>" stems, in domain order and then item order.
  stems <- unlist(lapply(names(items_by_domain), function(domain) {
    paste(domain, items_by_domain[[domain]], sep = "_")
  }))

  mapping <- lapply(stems, function(stem) paste0(c("past_", "fut_"), stem))
  names(mapping) <- paste0("ehi_", stems)
  mapping
})

# =============================================================================
# 3. CHECK IF TARGET VARIABLES EXIST
# =============================================================================

# Every target column named in ehi_mappings must already be present in df.
# Absent ones are listed (in mapping order) and the script then halts.
missing_targets <- names(ehi_mappings)[!(names(ehi_mappings) %in% names(df))]

for (absent_target in missing_targets) {
  cat(paste("⚠ Target variable not found:", absent_target, "\n"))
}

if (length(missing_targets) > 0) {
  cat("\nERROR: The following target variables are missing from eohi3.csv:\n")
  # One " - <name> \n" line per missing target.
  cat(paste(" -", missing_targets, "\n"), sep = "")
  stop("Cannot proceed without target variables. Please add them to the CSV file.")
}

# =============================================================================
# 4. CALCULATE EHI VARIABLES (past - future)
# =============================================================================

# Convert one source column to numeric.
#   x         column vector taken from df
#   col_name  column name, used only in the warning text
# Returns a numeric vector the same length as x. NA, empty and whitespace-only
# cells become NA. Any other text that cannot be parsed as a number also
# becomes NA, and a single warning naming the column and the count is raised
# instead of the anonymous "NAs introduced by coercion".
to_numeric_blank_na <- function(x, col_name) {
  if (is.numeric(x) || is.logical(x)) {
    # Already typed by read.csv (a column with no values at all is logical NA).
    return(as.numeric(x))
  }
  txt <- trimws(as.character(x))
  txt[!is.na(txt) & txt == ""] <- NA_character_
  num <- suppressWarnings(as.numeric(txt))
  n_unparsed <- sum(is.na(num) & !is.na(txt))
  if (n_unparsed > 0) {
    warning(
      sprintf(
        "%s: %d non-numeric value(s) converted to NA",
        col_name, n_unparsed
      ),
      call. = FALSE
    )
  }
  num
}

# Targets that were skipped, mapped to the source columns they were missing.
missing_source_cols <- list()

for (target_var in names(ehi_mappings)) {
  source_vars <- ehi_mappings[[target_var]]
  past_var <- source_vars[1]
  fut_var <- source_vars[2]

  # Skip (and remember) targets whose source columns are not in df.
  absent_sources <- setdiff(source_vars, names(df))
  if (length(absent_sources) > 0) {
    missing_source_cols[[target_var]] <- absent_sources
    warning(paste(
      "Skipping", target_var, "- missing source columns:",
      paste(absent_sources, collapse = ", ")
    ))
    next
  }

  # Difference score: past - future. NA in either source gives NA.
  past_vals <- to_numeric_blank_na(df[[past_var]], past_var)
  fut_vals <- to_numeric_blank_na(df[[fut_var]], fut_var)
  df[[target_var]] <- past_vals - fut_vals

  cat(paste(" Calculated:", target_var, "=", past_var, "-", fut_var, "\n"))
}

# Report any missing source columns
if (length(missing_source_cols) > 0) {
  cat("\nTargets skipped because source columns are missing:\n")
  for (skipped_target in names(missing_source_cols)) {
    cat(paste(
      " ", skipped_target, ":",
      paste(missing_source_cols[[skipped_target]], collapse = ", "), "\n"
    ))
  }
}

# =============================================================================
# 5. VALIDATION: CHECK 5 RANDOM ROWS
# =============================================================================

cat("\n=== VALIDATION: CHECKING 5 RANDOM ROWS ===\n\n")

# Set seed for reproducibility
set.seed(123)
# sample.int() draws from 1..nrow(df) and is well defined for 0 or 1 rows.
sample_rows <- sort(sample.int(nrow(df), min(5, nrow(df))))

# One representative variable from each category.
test_vars <- c(
  "ehi_pref_hobbies",
  "ehi_pers_open",
  "ehi_val_trad"
)

# Scalar helper: NA or "" -> NA, anything else through as.numeric().
cell_to_number <- function(cell) {
  if (is.na(cell) || cell == "") NA_real_ else as.numeric(cell)
}

# Scalar helper for printing: substitute `label` for a missing cell
# (and, when blank_too is TRUE, for an empty string as well).
cell_or_label <- function(cell, label, blank_too = FALSE) {
  if (is.na(cell) || (blank_too && cell == "")) label else cell
}

# Number of sampled cells whose stored value disagrees with past - future.
n_mismatch <- 0L

for (i in sample_rows) {
  cat(paste("Row", i, ":\n"))

  for (target_var in test_vars) {
    if (!(target_var %in% names(ehi_mappings))) {
      next
    }
    past_var <- ehi_mappings[[target_var]][1]
    fut_var <- ehi_mappings[[target_var]][2]

    # Nothing to verify unless the target and both sources are present.
    if (!all(c(target_var, past_var, fut_var) %in% names(df))) {
      next
    }

    past_val <- df[i, past_var]
    fut_val <- df[i, fut_var]
    ehi_val <- df[i, target_var]

    # Convert to numeric for calculation check
    past_num <- cell_to_number(past_val)
    fut_num <- cell_to_number(fut_val)
    ehi_num <- cell_to_number(ehi_val)

    # Expected value; NA when either source is NA.
    expected <- past_num - fut_num

    # Both NA counts as a match; otherwise compare within a small tolerance
    # to allow for floating point precision.
    is_match <- if (is.na(expected) || is.na(ehi_num)) {
      is.na(expected) && is.na(ehi_num)
    } else {
      abs(expected - ehi_num) < 0.0001
    }
    if (!is_match) {
      n_mismatch <- n_mismatch + 1L
    }

    cat(sprintf(" %s:\n", target_var))
    cat(sprintf(
      " %s = %s\n", past_var, cell_or_label(past_val, "NA/empty", TRUE)
    ))
    cat(sprintf(
      " %s = %s\n", fut_var, cell_or_label(fut_val, "NA/empty", TRUE)
    ))
    cat(sprintf(" %s = %s\n", target_var, cell_or_label(ehi_val, "NA")))
    cat(sprintf(
      " Expected: %s - %s = %s\n",
      cell_or_label(past_num, "NA"),
      cell_or_label(fut_num, "NA"),
      cell_or_label(expected, "NA")
    ))
    cat(sprintf(" Match: %s\n\n", if (is_match) "✓" else "✗ ERROR"))
  }
}

# A failed check must not be followed by the save step that overwrites
# eohi3.csv, so halt here instead of only printing the mismatch.
if (n_mismatch > 0L) {
  stop(
    sprintf(
      "Validation failed: %d sampled value(s) do not equal past - future.",
      n_mismatch
    ),
    call. = FALSE
  )
}

# =============================================================================
# 6. SAVE UPDATED DATA
# =============================================================================
# The save is ACTIVE: eohi3.csv (the file that was read at the top of this
# script) is overwritten in place. NA values are written as empty cells and
# row names are omitted.

cat("\n=== SAVING DATA ===\n")
write.csv(df, "eohi3.csv", row.names = FALSE, na = "")
cat("Updated data saved to: eohi3.csv\n")
cat(paste("Total rows:", nrow(df), "\n"))
cat(paste("Total columns:", ncol(df), "\n"))