library(dplyr)

# NOTE(review): hard-coded absolute working directory; every relative path
# below ("eohi3.csv", "eohi3_2.csv") is resolved against it.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3")

# Read the data (check.names = FALSE preserves the original column names).
# Only the literal string "NA" is converted to NA; empty cells in character
# columns stay as empty strings.
df <- read.csv(
  "eohi3.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# =============================================================================
# 1. CREATE BACKUP
# =============================================================================

# file.copy() signals failure only through its return value. Abort if the
# backup was not written, because this script later overwrites eohi3.csv.
backup_ok <- file.copy("eohi3.csv", "eohi3_2.csv", overwrite = TRUE)
if (!isTRUE(backup_ok)) {
  stop(
    "Could not create backup eohi3_2.csv; aborting before any changes.",
    call. = FALSE
  )
}

# =============================================================================
# 2. DEFINE MEAN VARIABLE MAPPINGS
# =============================================================================

# Each entry maps a target column (the list name) to the source columns whose
# row-wise mean it should hold.
mean_mappings <- list(
  # Past Preferences MEAN
  "past_pref_MEAN" = c(
    "past_pref_hobbies", "past_pref_music", "past_pref_dress",
    "past_pref_exer", "past_pref_food", "past_pref_friends"
  ),

  # Future Preferences MEAN
  "fut_pref_MEAN" = c(
    "fut_pref_hobbies", "fut_pref_music", "fut_pref_dress",
    "fut_pref_exer", "fut_pref_food", "fut_pref_friends"
  ),

  # Past Personality MEAN
  "past_pers_MEAN" = c(
    "past_pers_open", "past_pers_goal", "past_pers_social",
    "past_pers_agree", "past_pers_stress"
  ),

  # Future Personality MEAN
  "fut_pers_MEAN" = c(
    "fut_pers_open", "fut_pers_goal", "fut_pers_social",
    "fut_pers_agree", "fut_pers_stress"
  ),

  # Past Values MEAN
  "past_val_MEAN" = c(
    "past_val_trad", "past_val_autonomy", "past_val_personal",
    "past_val_justice", "past_val_close", "past_val_connect"
  ),

  # Future Values MEAN
  "fut_val_MEAN" = c(
    "fut_val_trad", "fut_val_autonomy", "fut_val_personal",
    "fut_val_justice", "fut_val_close", "fut_val_connect"
  ),

  # EHI Preferences MEAN
  "ehi_pref_MEAN" = c(
    "ehi_pref_hobbies", "ehi_pref_music", "ehi_pref_dress",
    "ehi_pref_exer", "ehi_pref_food", "ehi_pref_friends"
  ),

  # EHI Personality MEAN
  "ehi_pers_MEAN" = c(
    "ehi_pers_open", "ehi_pers_goal", "ehi_pers_social",
    "ehi_pers_agree", "ehi_pers_stress"
  ),

  # EHI Values MEAN
  "ehi_val_MEAN" = c(
    "ehi_val_trad", "ehi_val_autonomy", "ehi_val_personal",
    "ehi_val_justice", "ehi_val_close", "ehi_val_connect"
  )
)

# Additional means. ehiDS_mean averages three targets of mean_mappings, so it
# has to be computed after them.
additional_means <- list(
  "ehiDS_mean" = c("ehi_pref_MEAN", "ehi_pers_MEAN", "ehi_val_MEAN"),
  "ehiDGEN_mean" = c("ehi_pref_DGEN", "ehi_pers_DGEN", "ehi_val_DGEN")
)

# =============================================================================
# 3. CHECK IF VARIABLES EXIST
# =============================================================================

# Print a warning line for every target whose source columns are not all
# present in `available`.
#
# mappings:  named list, target name -> character vector of source columns.
# available: character vector of existing column names.
#
# Returns a named list (target -> missing sources) that contains only the
# targets with at least one missing source.
report_missing_sources <- function(mappings, available) {
  missing_by_target <- list()
  for (target_var in names(mappings)) {
    missing <- setdiff(mappings[[target_var]], available)
    if (length(missing) > 0) {
      missing_by_target[[target_var]] <- missing
      cat(paste(
        "⚠ Missing source variables for", target_var, ":",
        paste(missing, collapse = ", "), "\n"
      ))
    }
  }
  missing_by_target
}

# Check source variables for mean_mappings
missing_source_vars <- report_missing_sources(mean_mappings, names(df))

# Check source variables for additional_means
missing_additional_vars <- report_missing_sources(additional_means, names(df))

# Check if target variables exist. Missing sources only produce warnings
# above, but a missing target column stops the script.
expected_targets <- c(names(mean_mappings), names(additional_means))
actual_targets <- names(df)
missing_targets <- setdiff(expected_targets, actual_targets)

if (length(missing_targets) > 0) {
  cat("\nERROR: The following target variables are missing from eohi3.csv:\n")
  for (var in missing_targets) {
    cat(paste(" -", var, "\n"))
  }
  stop(
    "Cannot proceed without target variables. Please add them to the CSV file."
  )
}
# =============================================================================
# 4. CALCULATE MEAN VARIABLES
# =============================================================================

# Row-wise mean of a set of columns.
#
# df:          data frame holding the source columns.
# source_vars: character vector of column names to average.
#
# Empty strings, NA and the literal string "NA" are treated as missing and
# are ignored. Returns a numeric vector with one element per row of df; a row
# with no usable source value gets NA.
calculate_mean <- function(df, source_vars) {
  cols_data <- df[, source_vars, drop = FALSE]

  # Convert column by column. Going through apply()/as.matrix() would format
  # numeric columns as text whenever a character column is present.
  to_numeric <- function(x) {
    if (is.numeric(x)) {
      return(as.numeric(x))
    }
    x <- as.character(x)
    x[is.na(x) | x == "" | x == "NA"] <- NA_character_
    as.numeric(x)
  }
  numeric_cols <- lapply(cols_data, to_numeric)

  # Build the matrix with explicit dimensions so that a single-row data frame
  # still yields a 2-d object for rowMeans().
  numeric_matrix <- matrix(
    unlist(numeric_cols, use.names = FALSE),
    nrow = nrow(cols_data),
    ncol = length(numeric_cols)
  )

  row_means <- rowMeans(numeric_matrix, na.rm = TRUE)
  # rowMeans() gives NaN when every value in a row is missing; report NA.
  row_means[is.nan(row_means)] <- NA_real_
  row_means
}

# Main mappings first, then the additional means: ehiDS_mean is built from
# columns that the main mappings have just filled in.
all_mappings <- c(mean_mappings, additional_means)

for (target_var in names(all_mappings)) {
  source_vars <- all_mappings[[target_var]]

  # Skip (with a warning) any target whose source variables are incomplete
  missing <- setdiff(source_vars, names(df))
  if (length(missing) > 0) {
    warning(paste(
      "Skipping", target_var, "- missing source variables:",
      paste(missing, collapse = ", ")
    ))
    next
  }

  # Calculate mean
  df[[target_var]] <- calculate_mean(df, source_vars)
  cat(paste(
    " Calculated:", target_var, "from", length(source_vars), "variables\n"
  ))
}
# =============================================================================
# 5. VALIDATION: CHECK 5 RANDOM ROWS
# =============================================================================

# Recompute a few representative means for up to five sampled rows,
# independently of the calculation step, and print whether they agree with
# the stored values.

# Set seed for reproducibility
set.seed(123)
sample_rows <- sort(sample(seq_len(nrow(df)), min(5, nrow(df))))

# Representative mean variables to check
test_vars <- c("past_pref_MEAN", "ehi_pref_MEAN", "ehiDS_mean")

# Convert a single cell to a number; NA, "" and the string "NA" count as
# missing.
cell_to_number <- function(v) {
  if (is.na(v) || v == "" || v == "NA") NA_real_ else as.numeric(v)
}

# Text shown for a single cell in the "Source values" line.
cell_to_label <- function(v) {
  if (is.na(v) || v == "") "NA/empty" else as.character(v)
}

for (i in sample_rows) {
  cat(paste("Row", i, ":\n"))

  for (target_var in test_vars) {
    # Determine which mapping to use
    if (target_var %in% names(mean_mappings)) {
      source_vars <- mean_mappings[[target_var]]
    } else if (target_var %in% names(additional_means)) {
      source_vars <- additional_means[[target_var]]
    } else {
      next
    }

    # Targets with missing source variables cannot be checked
    if (!all(source_vars %in% names(df))) {
      next
    }

    # Get values; source_vals is a one-row data frame, walked cell by cell
    source_vals <- df[i, source_vars]
    target_val <- df[i, target_var]

    # Convert to numeric for calculation
    source_nums <- vapply(
      source_vals, cell_to_number, numeric(1), USE.NAMES = FALSE
    )
    target_num <- if (is.na(target_val)) NA_real_ else as.numeric(target_val)

    # Expected mean ignores missing values; NA when nothing is usable
    expected <- if (all(is.na(source_nums))) {
      NA
    } else {
      mean(source_nums, na.rm = TRUE)
    }

    # Both missing counts as a match; otherwise compare within a tolerance
    is_match <- if (!is.na(expected) && !is.na(target_num)) {
      abs(expected - target_num) < 0.0001 # Allow for floating point precision
    } else {
      is.na(expected) && is.na(target_num)
    }

    source_labels <- vapply(
      source_vals, cell_to_label, character(1), USE.NAMES = FALSE
    )

    cat(sprintf(" %s:\n", target_var))
    cat(sprintf(" Source variables: %s\n", paste(source_vars, collapse = ", ")))
    cat(sprintf(" Source values: %s\n", paste(source_labels, collapse = ", ")))
    cat(sprintf(
      " %s = %s\n", target_var,
      if (is.na(target_num)) "NA" else round(target_num, 4)
    ))
    cat(sprintf(
      " Expected mean: %s\n",
      if (is.na(expected)) "NA" else round(expected, 4)
    ))
    cat(sprintf(" Match: %s\n\n", if (is_match) "✓" else "✗ ERROR"))
  }
}
# =============================================================================
# 6. SAVE UPDATED DATA
# =============================================================================

# Overwrite eohi3.csv with the updated data frame. Row names are omitted and
# missing values are written as empty cells.
write.csv(df, "eohi3.csv", row.names = FALSE, na = "")

# Optional status output (disabled):
# cat("Updated data saved to: eohi3.csv\n")
# cat(paste("Total rows:", nrow(df), "\n"))
# cat(paste("Total columns:", ncol(df), "\n"))