# Source: eohi/eohi3/datap 06 - mean vars.r (226 lines, 8.2 KiB, R)

library(dplyr)
# All relative file names used below resolve against this directory.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")
# Load the data set.
#   check.names = FALSE  -> column names are kept exactly as written in the CSV
#   na.strings = "NA"    -> only the literal string "NA" is read as NA, so
#                           blank cells in character columns stay as ""
#                           (read.csv still treats blank fields in numeric and
#                           logical columns as missing)
df <- read.csv(
  file = "eohi3.csv",
  na.strings = "NA",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# =============================================================================
# 1. CREATE BACKUP
# =============================================================================
# Copy the input file to eohi3_2.csv before anything is modified.
# file.copy() reports failure by returning FALSE rather than raising an error,
# so the result is checked explicitly: eohi3.csv is overwritten at the end of
# this script and must not be touched unless the backup copy exists.
backup_ok <- file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
if (!isTRUE(backup_ok)) {
  stop(
    "Backup failed: could not copy eohi3.csv to eohi3_2.csv. ",
    "Aborting before the original file is modified.",
    call. = FALSE
  )
}
# =============================================================================
# 2. DEFINE MEAN VARIABLE MAPPINGS
# =============================================================================
# mean_mappings: named list, one entry per target "<scale>_MEAN" column, whose
# value is the character vector of item columns "<scale>_<item>" to average.
# The item names are shared by the past_, fut_ and ehi_ versions of a domain,
# so the entries are generated from one item list per domain.
mean_mappings <- local({
  domain_items <- list(
    pref = c("hobbies", "music", "dress", "exer", "food", "friends"),
    pers = c("open", "goal", "social", "agree", "stress"),
    val = c("trad", "autonomy", "personal", "justice", "close", "connect")
  )
  domains <- names(domain_items)
  # Entry order: past/fut alternate within each domain (pref, pers, val),
  # followed by the three ehi_ scales.
  scales <- c(
    paste(c("past", "fut"), rep(domains, each = 2), sep = "_"),
    paste("ehi", domains, sep = "_")
  )
  scale_domain <- c(rep(domains, each = 2), domains)
  sources <- lapply(seq_along(scales), function(k) {
    paste(scales[k], domain_items[[scale_domain[k]]], sep = "_")
  })
  names(sources) <- paste0(scales, "_MEAN")
  sources
})
# Additional means: averages taken across the three EHI domains, once over the
# domain-specific means and once over the DGEN columns.
additional_means <- list(
  ehiDS_mean = paste("ehi", c("pref", "pers", "val"), "MEAN", sep = "_"),
  ehiDGEN_mean = paste("ehi", c("pref", "pers", "val"), "DGEN", sep = "_")
)
# =============================================================================
# 3. CHECK IF VARIABLES EXIST
# =============================================================================
# For every target in `mappings`, find the source columns that are not in
# `available`. Prints one line per affected target and returns a list, keyed
# by target name, of the absent column names (empty list when nothing is
# missing).
report_missing_sources <- function(mappings, available) {
  gaps <- list()
  for (target in names(mappings)) {
    absent <- setdiff(mappings[[target]], available)
    if (length(absent) == 0) {
      next
    }
    gaps[[target]] <- absent
    cat(paste("⚠ Missing source variables for", target, ":", paste(absent, collapse = ", "), "\n"))
  }
  gaps
}
# Source columns of the scale means, then of the additional means.
missing_source_vars <- report_missing_sources(mean_mappings, names(df))
missing_additional_vars <- report_missing_sources(additional_means, names(df))
# The target columns themselves must already be present in the CSV; the script
# refuses to continue otherwise.
expected_targets <- c(names(mean_mappings), names(additional_means))
actual_targets <- names(df)
missing_targets <- setdiff(expected_targets, actual_targets)
if (length(missing_targets) > 0) {
  cat("\nERROR: The following target variables are missing from eohi3.csv:\n")
  # One " - <name> " line per missing target.
  cat(paste(" -", missing_targets, "\n"), sep = "")
  stop("Cannot proceed without target variables. Please add them to the CSV file.")
}
# =============================================================================
# 4. CALCULATE MEAN VARIABLES
# =============================================================================
# Row-wise mean of the columns `source_vars` of `df`.
#
# Arguments:
#   df          data frame holding the source columns.
#   source_vars character vector of column names to average.
# Returns:
#   Numeric vector with one element per row of `df`. Missing entries (NA, the
#   empty string "" and the literal string "NA") are ignored; a row whose
#   sources are all missing gets NA (not NaN).
calculate_mean <- function(df, source_vars) {
  cols_data <- df[, source_vars, drop = FALSE]
  # Convert one column to numeric. Columns that are already numeric are used
  # as they are; everything else goes through character so that "" and "NA"
  # can be mapped to NA before the conversion.
  to_numeric <- function(x) {
    if (is.numeric(x)) {
      return(as.numeric(x))
    }
    x <- as.character(x)
    x[is.na(x) | x == "" | x == "NA"] <- NA_character_
    as.numeric(x)
  }
  # Convert column by column instead of apply()-ing over the data frame:
  # apply() goes through as.matrix(), which format()s numeric columns when
  # they are mixed with character ones, and it returns a plain vector for a
  # one-row input, which rowMeans() rejects. Building the matrix explicitly
  # keeps the (rows x columns) shape for any number of rows.
  numeric_matrix <- matrix(
    as.numeric(unlist(lapply(cols_data, to_numeric), use.names = FALSE)),
    nrow = nrow(cols_data),
    ncol = ncol(cols_data)
  )
  row_means <- rowMeans(numeric_matrix, na.rm = TRUE)
  # With na.rm = TRUE a row without any value averages to NaN; report it as NA
  # so it is written as an empty field by write.csv(na = "").
  row_means[is.nan(row_means)] <- NA_real_
  row_means
}
# Compute every target column: the scale means first, then the additional
# means (ehiDS_mean averages the ehi_*_MEAN columns filled in just before).
# c() keeps the entries of both lists in exactly that order.
mean_specs <- c(mean_mappings, additional_means)
for (target_var in names(mean_specs)) {
  source_vars <- mean_specs[[target_var]]
  # A target is skipped (with a warning) when any of its sources is absent.
  absent_vars <- setdiff(source_vars, names(df))
  if (length(absent_vars) > 0) {
    warning(paste("Skipping", target_var, "- missing source variables:", paste(absent_vars, collapse = ", ")))
    next
  }
  df[[target_var]] <- calculate_mean(df, source_vars)
  cat(paste(" Calculated:", target_var, "from", length(source_vars), "variables\n"))
}
# =============================================================================
# 5. VALIDATION: CHECK 5 RANDOM ROWS
# =============================================================================
# Recompute a few representative means for up to five randomly chosen rows and
# print them next to the stored values. The check only reports; it does not
# stop the script on a mismatch.
# Fixed seed so the same rows are inspected on every run.
set.seed(123)
# seq_len() rather than 1:nrow(df), which would yield c(1, 0) for empty data.
sample_rows <- sort(sample(seq_len(nrow(df)), min(5, nrow(df))))
# Representative targets to inspect and the lookup of their source columns
# (scale means take precedence over additional means, as names are looked up
# in that order).
test_vars <- c(
  "past_pref_MEAN",
  "ehi_pref_MEAN",
  "ehiDS_mean"
)
validation_specs <- c(mean_mappings, additional_means)
for (i in sample_rows) {
  cat(paste("Row", i, ":\n"))
  for (target_var in test_vars) {
    source_vars <- validation_specs[[target_var]]
    # Skip targets without a mapping or with absent source columns.
    if (is.null(source_vars) || !all(source_vars %in% names(df))) {
      next
    }
    # Flatten the one-row slice to a plain vector so the comparisons below
    # operate on a vector instead of a data frame.
    source_vals <- unlist(df[i, source_vars, drop = FALSE], use.names = FALSE)
    target_val <- df[i, target_var]
    # Numeric view of the sources: "", "NA" and NA all count as missing.
    if (is.numeric(source_vals)) {
      source_nums <- as.numeric(source_vals)
    } else {
      source_text <- as.character(source_vals)
      source_text[source_text %in% c("", "NA")] <- NA_character_
      source_nums <- as.numeric(source_text)
    }
    target_num <- as.numeric(target_val)
    # Expected mean over the available values; NA when there are none.
    expected <- if (all(is.na(source_nums))) {
      NA
    } else {
      mean(source_nums, na.rm = TRUE)
    }
    # Both present: compare within a tolerance for floating point noise.
    # Otherwise the values agree only if both are missing.
    is_match <- if (!is.na(expected) && !is.na(target_num)) {
      abs(expected - target_num) < 0.0001
    } else {
      is.na(expected) && is.na(target_num)
    }
    # Display form of the raw source values.
    shown_vals <- as.character(source_vals)
    shown_vals[is.na(source_vals) | source_vals == ""] <- "NA/empty"
    cat(sprintf(" %s:\n", target_var))
    cat(sprintf(" Source variables: %s\n", paste(source_vars, collapse = ", ")))
    cat(sprintf(" Source values: %s\n", paste(shown_vals, collapse = ", ")))
    cat(sprintf(" %s = %s\n", target_var, if (is.na(target_val)) "NA" else round(target_val, 4)))
    cat(sprintf(" Expected mean: %s\n", if (is.na(expected)) "NA" else round(expected, 4)))
    cat(sprintf(" Match: %s\n\n", if (is_match) "✓" else "✗ ERROR"))
  }
}
# =============================================================================
# 6. SAVE UPDATED DATA
# =============================================================================
# NOTE: this write is active. It overwrites eohi3.csv in place with the
# recalculated columns; NA values are written as empty fields and row names
# are omitted.
write.csv(df, file = "eohi3.csv", na = "", row.names = FALSE)
# Optional status output (disabled):
# cat("Updated data saved to: eohi3.csv\n")
# cat(paste("Total rows:", nrow(df), "\n"))
# cat(paste("Total columns:", ncol(df), "\n"))