# Source listing metadata: 226 lines, 8.2 KiB, R
library(dplyr)

# All relative file names used below are resolved against this directory.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")

# Load the data set.
#   - check.names = FALSE keeps the column names exactly as they appear in the
#     file.
#   - na.strings = "NA" makes only the literal string "NA" a missing value, so
#     empty cells in text columns stay as empty strings.
# NOTE(review): read.csv still reads blank fields in numeric/logical columns as
# NA regardless of na.strings -- confirm that is acceptable for this data.
df <- read.csv(
  file = "eohi3.csv",
  na.strings = "NA",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# =============================================================================
# 1. CREATE BACKUP
# =============================================================================

# Copy the input file to eohi3_2.csv before anything is modified.
# file.copy() reports failure by returning FALSE instead of raising an error,
# so the result is checked explicitly: eohi3.csv is overwritten at the end of
# this script and the run must not continue without a backup.
backup_ok <- file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
if (!isTRUE(backup_ok)) {
  stop(
    "Backup failed: could not copy eohi3.csv to eohi3_2.csv",
    call. = FALSE
  )
}
# =============================================================================
# 2. DEFINE MEAN VARIABLE MAPPINGS
# =============================================================================

# Named list: each element name is a target "<stem>_MEAN" column and each value
# is the character vector of item columns "<stem>_<item>" averaged into it.
# The list is generated from the stems and per-domain item suffixes; local()
# keeps the helper objects out of the script's workspace.
mean_mappings <- local({
  # Item suffixes per domain (preferences, personality, values)
  domain_items <- list(
    pref = c("hobbies", "music", "dress", "exer", "food", "friends"),
    pers = c("open", "goal", "social", "agree", "stress"),
    val = c("trad", "autonomy", "personal", "justice", "close", "connect")
  )

  # Stems in output order: past/future for each domain, then the EHI versions
  stems <- c(
    "past_pref", "fut_pref",
    "past_pers", "fut_pers",
    "past_val", "fut_val",
    "ehi_pref", "ehi_pers", "ehi_val"
  )

  mappings <- lapply(stems, function(stem) {
    # The domain key is whatever follows the last underscore of the stem
    domain <- sub("^.*_", "", stem)
    paste(stem, domain_items[[domain]], sep = "_")
  })
  names(mappings) <- paste0(stems, "_MEAN")
  mappings
})
# Additional means: each target is the average of the three EHI domain columns
# (preferences, personality, values) that share the given suffix.
additional_means <- list(
  ehiDS_mean = paste0("ehi_", c("pref", "pers", "val"), "_MEAN"),
  ehiDGEN_mean = paste0("ehi_", c("pref", "pers", "val"), "_DGEN")
)
# =============================================================================
# 3. CHECK IF VARIABLES EXIST
# =============================================================================

# For every target in `mappings`, collect the source columns that are not in
# `available`. Prints one warning line per affected target and returns a list
# (keyed by target name) of the missing column names; the list is empty when
# nothing is missing.
find_missing_sources <- function(mappings, available) {
  absent <- list()
  for (target in names(mappings)) {
    gaps <- setdiff(mappings[[target]], available)
    if (length(gaps) == 0) {
      next
    }
    absent[[target]] <- gaps
    cat(paste(
      "⚠ Missing source variables for", target, ":",
      paste(gaps, collapse = ", "), "\n"
    ))
  }
  absent
}

# Source columns required by the main mappings and by the additional means
missing_source_vars <- find_missing_sources(mean_mappings, names(df))
missing_additional_vars <- find_missing_sources(additional_means, names(df))

# Every target column must already be present in the data; abort otherwise.
expected_targets <- c(names(mean_mappings), names(additional_means))
actual_targets <- names(df)
missing_targets <- setdiff(expected_targets, actual_targets)

if (length(missing_targets) > 0) {
  cat("\nERROR: The following target variables are missing from eohi3.csv:\n")
  # One " - <name>" line per missing target
  cat(paste(" -", missing_targets, "\n"), sep = "")
  stop("Cannot proceed without target variables. Please add them to the CSV file.")
}
# =============================================================================
# 4. CALCULATE MEAN VARIABLES
# =============================================================================

# Row-wise mean of the columns `source_vars` of `df`.
#
# Arguments:
#   df          data frame holding the source columns
#   source_vars character vector of column names to average
#
# Returns a numeric vector with one element per row of `df`. Empty strings,
# NA and the literal string "NA" are treated as missing and ignored; a row
# with no usable value at all yields NA.
calculate_mean <- function(df, source_vars) {
  # Convert one column to numeric. Numeric columns are passed through
  # untouched so no precision is lost to text formatting; anything else goes
  # through character with the missing markers mapped to NA first.
  to_numeric <- function(x) {
    if (is.numeric(x)) {
      return(as.numeric(x))
    }
    x <- as.character(x)
    x[is.na(x) | x == "" | x == "NA"] <- NA_character_
    as.numeric(x)
  }

  converted <- lapply(df[, source_vars, drop = FALSE], to_numeric)

  # Build the matrix with explicit dimensions so that a data frame with a
  # single row (or no rows) still gives a two-dimensional object, which
  # rowMeans() requires.
  numeric_matrix <- matrix(
    as.numeric(unlist(converted, use.names = FALSE)),
    nrow = nrow(df),
    ncol = length(source_vars)
  )

  row_means <- rowMeans(numeric_matrix, na.rm = TRUE)

  # rowMeans(na.rm = TRUE) gives NaN for a row whose values are all missing;
  # report those rows as NA so they are handled like any other missing value.
  row_means[is.nan(row_means)] <- NA_real_
  row_means
}
# Fill in every target column: the main mappings are processed first, then the
# additional means, which are built from the *_MEAN columns just calculated.
# A target whose source columns are not all present is skipped with a warning.
for (mapping_set in list(mean_mappings, additional_means)) {
  for (target_var in names(mapping_set)) {
    source_vars <- mapping_set[[target_var]]
    absent <- setdiff(source_vars, names(df))

    if (length(absent) == 0) {
      df[[target_var]] <- calculate_mean(df, source_vars)
      cat(paste(
        " Calculated:", target_var, "from", length(source_vars),
        "variables\n"
      ))
    } else {
      warning(paste(
        "Skipping", target_var, "- missing source variables:",
        paste(absent, collapse = ", ")
      ))
    }
  }
}
# =============================================================================
# 5. VALIDATION: CHECK 5 RANDOM ROWS
# =============================================================================

# Fixed seed so the same rows are spot-checked on every run
set.seed(123)
sample_rows <- sort(sample(seq_len(nrow(df)), min(5, nrow(df))))

# A few representative mean variables to re-derive for each sampled row
test_vars <- c("past_pref_MEAN", "ehi_pref_MEAN", "ehiDS_mean")

# Display helper: "NA" for a missing value, otherwise the value rounded to 4 dp
show_value <- function(v) {
  if (is.na(v)) "NA" else round(v, 4)
}

for (i in sample_rows) {
  cat(paste("Row", i, ":\n"))

  for (target_var in test_vars) {
    # Look the target up in the main mappings first, then the additional means
    mapping_set <- Find(
      function(m) target_var %in% names(m),
      list(mean_mappings, additional_means)
    )
    if (is.null(mapping_set)) {
      next
    }
    source_vars <- mapping_set[[target_var]]

    # Nothing to verify when a source column is absent
    if (any(!source_vars %in% names(df))) {
      next
    }

    # Stored values for this row
    source_vals <- df[i, source_vars]
    target_val <- df[i, target_var]

    # Numeric versions; blank, NA and literal "NA" cells count as missing
    is_blank <- source_vals == "" | is.na(source_vals) | source_vals == "NA"
    source_nums <- as.numeric(ifelse(is_blank, NA, source_vals))
    target_num <- if (is.na(target_val)) NA_real_ else as.numeric(target_val)

    # Expected mean over the non-missing values (NA when there are none)
    expected <- if (all(is.na(source_nums))) {
      NA
    } else {
      mean(source_nums, na.rm = TRUE)
    }

    # Equal within a small tolerance, or both missing
    both_present <- !is.na(expected) && !is.na(target_num)
    is_match <- if (both_present) {
      abs(expected - target_num) < 0.0001 # Allow for floating point precision
    } else {
      is.na(expected) && is.na(target_num)
    }

    shown_vals <- ifelse(
      is.na(source_vals) | source_vals == "", "NA/empty", source_vals
    )

    cat(sprintf(" %s:\n", target_var))
    cat(sprintf(" Source variables: %s\n", paste(source_vars, collapse = ", ")))
    cat(sprintf(" Source values: %s\n", paste(shown_vals, collapse = ", ")))
    cat(sprintf(" %s = %s\n", target_var, show_value(target_val)))
    cat(sprintf(" Expected mean: %s\n", show_value(expected)))
    cat(sprintf(" Match: %s\n\n", ifelse(is_match, "✓", "✗ ERROR")))
  }
}
# =============================================================================
# 6. SAVE UPDATED DATA
# =============================================================================
# NOTE: the write below is active -- running this script overwrites eohi3.csv
# in place (section 1 copies the original to eohi3_2.csv first). NA values are
# written as empty cells and row names are omitted. Only the status messages
# underneath are commented out.

write.csv(x = df, file = "eohi3.csv", na = "", row.names = FALSE)
# cat("Updated data saved to: eohi3.csv\n")
# cat(paste("Total rows:", nrow(df), "\n"))
# cat(paste("Total columns:", ncol(df), "\n"))