# eohi/eohi3/datap 04 - combined vars.r

library(dplyr)

# All relative paths below resolve against the project data directory.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")

# Load the working data set.
#   check.names = FALSE      -> column names are kept exactly as in the file
#   stringsAsFactors = FALSE -> text columns stay character
#   na.strings = "NA"        -> in text columns only the literal string "NA"
#                               becomes NA; blank cells stay ""
# NOTE(review): read.csv still treats blank fields in logical/numeric columns
# as NA, so downstream code has to treat both NA and "" as "empty".
df <- read.csv(
  file = "eohi3.csv",
  na.strings = "NA",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# =============================================================================
# 1. CREATE BACKUP
# =============================================================================
#file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)

# =============================================================================
# 2. DEFINE VARIABLE MAPPINGS
# =============================================================================

# Past variables mapping:
#   [self/other][VAL/PERS/PREF]_p5_[item] -> past_[val/pers/pref]_[item]
# Each element is named after the combined (target) column and holds
# c(self_source_column, other_source_column).
# The general item is spelled "dgen" in the source columns and "DGEN" in the
# target column.
past_mappings <- local({
  # Item suffixes per domain, listed in output order
  items_by_domain <- list(
    val = c("trad", "autonomy", "personal", "justice", "close", "connect",
            "dgen"),
    pers = c("open", "goal", "social", "agree", "stress", "dgen"),
    pref = c("hobbies", "music", "dress", "exer", "food", "friends", "dgen")
  )

  mapping <- list()
  for (domain in names(items_by_domain)) {
    for (item in items_by_domain[[domain]]) {
      target_item <- if (item == "dgen") "DGEN" else item
      target <- paste0("past_", domain, "_", target_item)
      mapping[[target]] <- paste0(
        c("self", "other"), toupper(domain), "_p5_", item
      )
    }
  }
  mapping
})

# Future variables mapping:
#   [self/other][VAL/PERS/PREF]_f5_[item] -> fut_[val/pers/pref]_[item]
# Each element is named after the combined (target) column and holds
# c(self_source_column, other_source_column).
# The general item is spelled "dgen" in the source columns and "DGEN" in the
# target column.
future_mappings <- local({
  # Item suffixes per domain, listed in output order
  items_by_domain <- list(
    val = c("trad", "autonomy", "personal", "justice", "close", "connect",
            "dgen"),
    pers = c("open", "goal", "social", "agree", "stress", "dgen"),
    pref = c("hobbies", "music", "dress", "exer", "food", "friends", "dgen")
  )

  mapping <- list()
  for (domain in names(items_by_domain)) {
    for (item in items_by_domain[[domain]]) {
      target_item <- if (item == "dgen") "DGEN" else item
      target <- paste0("fut_", domain, "_", target_item)
      mapping[[target]] <- paste0(
        c("self", "other"), toupper(domain), "_f5_", item
      )
    }
  }
  mapping
})

# =============================================================================
# 3. COMBINE VARIABLES
# =============================================================================

# Merge a "self" column and its "other" counterpart into a single vector.
#
# Arguments:
#   df         data frame holding both source columns
#   self_col   name of the self source column
#   other_col  name of the other source column
#
# Returns a vector with one element per row of df: the self value where it is
# filled (neither NA nor ""), otherwise the other value where that is filled,
# otherwise NA. If both happen to be filled, the self value wins.
#
# Stops with an error when either column is absent from df; callers are
# expected to have verified column existence beforehand.
combine_vars <- function(df, self_col, other_col) {
  # Defensive guard; the self column is checked first
  for (required_col in c(self_col, other_col)) {
    if (!(required_col %in% names(df))) {
      stop(paste("ERROR: Column", required_col, "not found. This should have been caught earlier."))
    }
  }

  self_values <- df[[self_col]]
  other_values <- df[[other_col]]

  # A cell counts as filled when it is neither NA nor the empty string
  has_self <- !is.na(self_values) & self_values != ""
  has_other <- !is.na(other_values) & other_values != ""

  # Rows without a self value fall back to the other value (or NA)
  fallback <- ifelse(has_other, other_values, NA)
  ifelse(has_self, self_values, fallback)
}

# Apply past mappings
# Each past_* target column is overwritten with the combination of its
# self/other source pair. A variable is skipped (with a warning) unless the
# target column and both source columns are already present in df.
cat("\nCombining past variables...\n")
missing_cols <- list()
for (new_col in names(past_mappings)) {
  self_col <- past_mappings[[new_col]][1]
  other_col <- past_mappings[[new_col]][2]

  # Label every column this variable needs, then flag the absent ones
  required <- c(target = new_col, self = self_col, other = other_col)
  absent <- !(required %in% names(df))

  if (any(absent)) {
    missing <- paste(
      paste0(names(required)[absent], ":"),
      unname(required[absent])
    )
    missing_cols[[new_col]] <- missing
    warning(sprintf(
      "Skipping %s - missing columns: %s",
      new_col, paste(missing, collapse = ", ")
    ))
    next
  }

  # Everything is present: overwrite the target with the combined values
  df[[new_col]] <- combine_vars(df, self_col, other_col)
  cat(sprintf("  Updated: %s \n", new_col))
}

# Summarise every variable that had to be skipped
if (length(missing_cols) > 0) {
  cat("\n⚠ Missing columns detected in PAST variables:\n")
  for (var in names(missing_cols)) {
    cat(sprintf(
      "   %s : %s \n",
      var, paste(missing_cols[[var]], collapse = ", ")
    ))
  }
}

# Apply future mappings
# Each fut_* target column is overwritten with the combination of its
# self/other source pair. A variable is skipped (with a warning) unless the
# target column and both source columns are already present in df.
cat("\nCombining future variables...\n")
missing_cols_future <- list()
for (new_col in names(future_mappings)) {
  self_col <- future_mappings[[new_col]][1]
  other_col <- future_mappings[[new_col]][2]

  # Label every column this variable needs, then flag the absent ones
  required <- c(target = new_col, self = self_col, other = other_col)
  absent <- !(required %in% names(df))

  if (any(absent)) {
    missing <- paste(
      paste0(names(required)[absent], ":"),
      unname(required[absent])
    )
    missing_cols_future[[new_col]] <- missing
    warning(sprintf(
      "Skipping %s - missing columns: %s",
      new_col, paste(missing, collapse = ", ")
    ))
    next
  }

  # Everything is present: overwrite the target with the combined values
  df[[new_col]] <- combine_vars(df, self_col, other_col)
  cat(sprintf("  Updated: %s \n", new_col))
}

# Summarise every variable that had to be skipped
if (length(missing_cols_future) > 0) {
  cat("\n⚠ Missing columns detected in FUTURE variables:\n")
  for (var in names(missing_cols_future)) {
    cat(sprintf(
      "   %s : %s \n",
      var, paste(missing_cols_future[[var]], collapse = ", ")
    ))
  }
}

# =============================================================================
# 4. VALIDATION CHECKS
# =============================================================================

cat("\n=== VALIDATION CHECKS ===\n\n")

# Check 1: Ensure no row has values in both self and other for the same variable
#
# Arguments:
#   df        data frame containing the source columns
#   mappings  named list; each element is c(self_col, other_col) and the
#             element name is the combined variable
#
# Returns a data frame with one row per variable that has at least one row in
# which both sources are filled (neither NA nor ""). Columns: variable,
# self_col, other_col, n_conflicts, example_rows (up to five row indices).
# Variables whose two source columns are not both present are ignored.
# When nothing conflicts, an empty data frame is returned.
check_conflicts <- function(df, mappings) {
  is_filled <- function(values) !is.na(values) & values != ""

  per_variable <- lapply(names(mappings), function(new_col) {
    self_col <- mappings[[new_col]][1]
    other_col <- mappings[[new_col]][2]

    # Nothing to compare unless both source columns exist
    if (!(self_col %in% names(df)) || !(other_col %in% names(df))) {
      return(NULL)
    }

    # Row indices where both self and other carry a value
    conflict_rows <- which(is_filled(df[[self_col]]) & is_filled(df[[other_col]]))
    if (length(conflict_rows) == 0) {
      return(NULL)
    }

    data.frame(
      variable = new_col,
      self_col = self_col,
      other_col = other_col,
      n_conflicts = length(conflict_rows),
      example_rows = paste(head(conflict_rows, 5), collapse = ", ")
    )
  })

  # Keep only the variables that actually reported a conflict
  per_variable <- Filter(Negate(is.null), per_variable)
  if (length(per_variable) == 0) {
    return(data.frame())
  }
  do.call(rbind, per_variable)
}

# Run the conflict check on both mapping sets and print the outcome of each
past_conflicts <- check_conflicts(df, past_mappings)
future_conflicts <- check_conflicts(df, future_mappings)

if (nrow(past_conflicts) == 0) {
  cat("✓ No conflicts found in PAST variables\n")
} else {
  cat("WARNING: Found conflicts in PAST variables (both self and other have values):\n")
  print(past_conflicts)
}

if (nrow(future_conflicts) == 0) {
  cat("✓ No conflicts found in FUTURE variables\n")
} else {
  cat("\nWARNING: Found conflicts in FUTURE variables (both self and other have values):\n")
  print(future_conflicts)
}

# Check 2: Verify that combined columns have values where expected
#
# Arguments:
#   df        data frame containing source and combined columns
#   mappings  named list; each element is c(self_col, other_col) and the
#             element name is the combined (target) column
#
# Returns a data frame with one row per mapping and the columns
#   variable            name of the combined column
#   self_non_empty      filled cells in the self column (NA if column absent)
#   other_non_empty     filled cells in the other column (NA if column absent)
#   combined_non_empty  filled cells in the combined column (NA if absent)
#   expected_non_empty  self + other (NA if either count is NA)
#   match               combined == expected (NA if either side is NA)
# A cell is "filled" when it is neither NA nor "". An empty data frame is
# returned when mappings is empty.
check_coverage <- function(df, mappings) {
  # Number of filled cells in a column, or NA when the column does not exist
  count_filled <- function(col) {
    if (!(col %in% names(df))) {
      return(NA)
    }
    values <- df[[col]]
    sum(!is.na(values) & values != "", na.rm = TRUE)
  }

  rows <- lapply(names(mappings), function(new_col) {
    self_count <- count_filled(mappings[[new_col]][1])
    other_count <- count_filled(mappings[[new_col]][2])
    combined_count <- count_filled(new_col)

    # Self and other are not supposed to overlap, so the combined column
    # should hold exactly as many values as the two sources together
    expected_count <- if (is.na(self_count) || is.na(other_count)) {
      NA
    } else {
      self_count + other_count
    }

    counts_match <- if (is.na(combined_count) || is.na(expected_count)) {
      NA
    } else {
      combined_count == expected_count
    }

    data.frame(
      variable = new_col,
      self_non_empty = self_count,
      other_non_empty = other_count,
      combined_non_empty = combined_count,
      expected_non_empty = expected_count,
      match = counts_match
    )
  })

  if (length(rows) == 0) {
    return(data.frame())
  }
  do.call(rbind, rows)
}

# Build and print the coverage tables for both mapping sets
past_coverage <- check_coverage(df, past_mappings)
future_coverage <- check_coverage(df, future_mappings)

cat("\n=== COVERAGE CHECK ===\n", "\nPAST variables:\n", sep = "")
print(past_coverage)

cat("\nFUTURE variables:\n")
print(future_coverage)

# Check if all coverage matches.
# na.rm = TRUE means variables whose match is NA (a column was missing) are
# ignored here rather than counted as failures.
all_past_match <- all(past_coverage[["match"]], na.rm = TRUE)
all_future_match <- all(future_coverage[["match"]], na.rm = TRUE)

if (!(all_past_match && all_future_match)) {
  cat("\n⚠ Some variables may have missing coverage - check the table above\n")
} else {
  cat("\n✓ All combined variables have correct coverage\n")
}

# Check 3: Sample check - print the first few rows of one variable so the
# combination can be verified by eye
cat("\n=== SAMPLE ROW CHECK ===\n")
sample_rows <- min(5, nrow(df))
cat(paste("Checking first", sample_rows, "rows:\n\n"))

# Variable used for the spot check and its two source columns
# (loop-invariant, so resolved once up front)
test_var <- "past_val_trad"
test_self_col <- past_mappings[[test_var]][1]
test_other_col <- past_mappings[[test_var]][2]

# Value of one cell of df, or NA when the column is absent.
# The same guard is applied to the self, other AND combined column, so a
# missing target column is reported as "empty" instead of raising
# "undefined columns selected".
sample_cell <- function(row, col) {
  if (col %in% names(df)) df[row, col] else NA
}

# Text shown for a single cell: NA and "" are both displayed as "empty".
# Plain if/else is used because the input is always a scalar.
describe_cell <- function(value) {
  if (is.na(value) || value == "") "empty" else as.character(value)
}

# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:sample_rows would iterate over the non-existent rows 1 and 0.
for (i in seq_len(sample_rows)) {
  cat(paste("Row", i, ":\n"))

  cat(sprintf("  %s: self=%s, other=%s, combined=%s\n",
              test_var,
              describe_cell(sample_cell(i, test_self_col)),
              describe_cell(sample_cell(i, test_other_col)),
              describe_cell(sample_cell(i, test_var))))
}

# =============================================================================
# 5. SAVE UPDATED DATA
# =============================================================================


# Write the data frame back over the input file: NA values become empty
# cells and no row-name column is added.
# NOTE(review): the backup step in section 1 is commented out, so the file
# is overwritten without a fresh copy being taken first - confirm intended.
write.csv(df, file = "eohi3.csv", na = "", row.names = FALSE)
cat("Updated data saved to: eohi3.csv\n")
cat(sprintf("Total rows: %d \n", nrow(df)))
cat(sprintf("Total columns: %d \n", ncol(df)))