# Script to calculate domain means for time interval differences in eohi2.csv
# Averages the 5 items within each domain (pref, pers, val) for each time
# interval type and stores the result in "<interval>_<domain>_MEAN" columns.
#
# Side effects:
#   - changes the working directory (setwd)
#   - reads eohi2.csv and, when save_to_file is TRUE, overwrites it in place

# Load necessary library
library(dplyr)

# NOTE(review): machine-specific absolute path; the script only runs where this
# directory exists.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Controls the final write step. TRUE overwrites eohi2.csv with the new columns;
# set to FALSE to review the console output without touching the file.
save_to_file <- TRUE

# Read the data (with check.names=FALSE to preserve original column names)
# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)

# Define the 15 item names (same order for all time periods)
items <- c(
  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
)

# Define domain groupings (indices in items vector)
pref_indices <- 1:5
pers_indices <- 6:10
val_indices <- 11:15

# Define time interval prefixes
time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "X5.10past", "X5.10fut")

# Define domain names
domain_names <- c("pref", "pers", "val")

# Map each domain name to the positions of its items in `items`
domain_indices <- list(pref = pref_indices, pers = pers_indices, val = val_indices)

# Define all source columns (90 total = 6 time intervals × 15 items),
# ordered interval by interval
source_cols <- unlist(lapply(time_prefixes, function(prefix) {
  paste0(prefix, "_", items)
}))

# Define all target columns (18 total = 6 time intervals × 3 domains)
target_cols <- unlist(lapply(time_prefixes, function(prefix) {
  paste0(prefix, "_", domain_names, "_MEAN")
}))

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============

cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Use the exact column names, because the calculation below looks columns up by
# exact name; checking trimmed names would report columns that are then skipped.
df_cols <- names(df)

# Check Source columns
missing_source <- source_cols[!source_cols %in% df_cols]
existing_source <- source_cols[source_cols %in% df_cols]

cat("Source Columns:\n")
cat(" Expected:", length(source_cols), "columns\n")
cat(" Found:", length(existing_source), "columns\n")
cat(" Missing:", length(missing_source), "columns\n")

if (length(missing_source) > 0 && length(missing_source) <= 20) {
  cat("\n Missing source columns:\n")
  for (col in missing_source) {
    cat(" -", col, "\n")
  }
} else if (length(missing_source) > 20) {
  cat("\n Too many missing to list individually (", length(missing_source), "missing)\n")
}

# Point out headers that would match if surrounding whitespace were removed;
# these are the likely cause of an unexpected "missing" column.
padded_source <- missing_source[missing_source %in% trimws(df_cols)]
if (length(padded_source) > 0) {
  cat("\n NOTE:", length(padded_source),
      "missing source column(s) exist in the CSV with leading/trailing whitespace in the header.\n")
}

# Check Target columns
missing_targets <- target_cols[!target_cols %in% df_cols]
existing_targets <- target_cols[target_cols %in% df_cols]

cat("\nTarget Columns:\n")
cat(" Expected:", length(target_cols), "columns\n")
cat(" Found:", length(existing_targets), "columns\n")
cat(" Missing:", length(missing_targets), "columns\n")

if (length(missing_targets) > 0) {
  cat("\n Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
  }
} else {
  cat(" All target columns exist - will overwrite with calculated values.\n")
}

cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing (more than half of the source columns)
if (length(missing_source) > 45) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")

# ============= CALCULATE DOMAIN MEANS =============

cat("Calculating domain means for time interval differences...\n")

# Convert source columns to numeric (empty strings become NA; any other
# non-numeric text becomes NA with R's usual coercion warning)
for (col in intersect(source_cols, names(df))) {
  df[[col]] <- as.numeric(df[[col]])
}

# Calculate means for each time interval × domain combination.
# A mean is computed from whichever of the 5 item columns exist; a combination
# with no existing item columns is skipped entirely.
created_cols <- character(0)
for (time_prefix in time_prefixes) {
  for (domain_name in domain_names) {
    domain_cols <- paste0(time_prefix, "_", items[domain_indices[[domain_name]]])
    existing_domain_cols <- domain_cols[domain_cols %in% names(df)]

    if (length(existing_domain_cols) > 0) {
      mean_col <- paste0(time_prefix, "_", domain_name, "_MEAN")
      row_means <- rowMeans(df[, existing_domain_cols, drop = FALSE], na.rm = TRUE)
      # With na.rm = TRUE a row whose items are all missing yields NaN; store NA
      # so that "no data" is represented the same way as everywhere else.
      row_means[is.nan(row_means)] <- NA_real_
      df[[mean_col]] <- row_means
      created_cols <- c(created_cols, mean_col)
      cat(" Processed:", mean_col, "\n")
    }
  }
}

cat("\n=== CALCULATION COMPLETE ===\n")
cat(sprintf(" %d domain mean columns created.\n\n", length(created_cols)))

# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK =============

# Print a manual verification of the domain means for one row and one interval.
#
# Recomputes the mean of the non-missing item values for each of the 3 domains
# and compares it with the stored *_MEAN value (tolerance 0.0001). Reads the
# global `df`, `items`, `time_prefixes`, `domain_names` and the *_indices
# vectors. Can be run multiple times to check different rows/intervals.
#
# Args:
#   row_num:           row index to check; NULL (default) picks a random row.
#   time_interval_num: index into `time_prefixes` (1-6); NULL (default) picks a
#                      random interval.
#
# Returns: NULL invisibly; all output is printed to the console.
qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) {
  n_rows <- nrow(df)
  if (n_rows < 1) {
    cat("ERROR: Data frame has no rows to check\n")
    return(invisible(NULL))
  }

  # Pick a random row or use specified row
  if (is.null(row_num)) {
    random_row <- sample(seq_len(n_rows), 1)
    cat("\n========================================\n")
    cat("QA CHECK: Random Row #", random_row, "\n")
  } else {
    if (row_num < 1 || row_num > n_rows) {
      cat("ERROR: Row number must be between 1 and", n_rows, "\n")
      return(invisible(NULL))
    }
    random_row <- row_num
    cat("\n========================================\n")
    cat("QA CHECK: Specified Row #", random_row, "\n")
  }

  # Pick a random time interval or use specified interval
  if (is.null(time_interval_num)) {
    test_interval_idx <- sample(seq_along(time_prefixes), 1)
    cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n")
  } else {
    if (time_interval_num < 1 || time_interval_num > length(time_prefixes)) {
      cat("ERROR: Time interval number must be between 1 and 6\n")
      cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = X5.10past, 6 = X5.10fut\n")
      return(invisible(NULL))
    }
    test_interval_idx <- time_interval_num
    cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n")
  }

  cat("========================================\n\n")

  time_prefix <- time_prefixes[test_interval_idx]

  # Item positions and display label for each domain, in domain_names order
  domain_item_indices <- list(pref_indices, pers_indices, val_indices)
  domain_labels <- c("Preferences", "Personality", "Values")

  # Check each of the 3 domains
  for (domain_idx in seq_along(domain_names)) {
    domain_name <- domain_names[domain_idx]
    item_indices <- domain_item_indices[[domain_idx]]

    cat(sprintf("--- %s: %s ---\n", time_prefix, domain_labels[domain_idx]))

    # Get source column names
    source_cols_domain <- paste0(time_prefix, "_", items[item_indices])
    target_col <- paste0(time_prefix, "_", domain_name, "_MEAN")

    # Get values (NA for item columns that are absent from the data)
    values <- numeric(length(source_cols_domain))
    cat("Source values:\n")
    for (i in seq_along(source_cols_domain)) {
      col <- source_cols_domain[i]
      val <- if (col %in% names(df)) df[random_row, col] else NA
      values[i] <- val
      shown <- if (is.na(val)) "NA" else sprintf("%.5f", val)
      cat(sprintf(" %s: %s\n", col, shown))
    }

    # Calculate expected mean
    valid_values <- values[!is.na(values)]
    if (length(valid_values) > 0) {
      expected_mean <- mean(valid_values)
      actual_value <- df[random_row, target_col]
      # isTRUE() makes a missing stored value count as a mismatch, not as NA
      is_match <- isTRUE(abs(expected_mean - actual_value) < 0.0001)

      cat(sprintf("\nCalculation:\n"))
      cat(sprintf(" Sum: %s = %.5f\n",
                  paste(sprintf("%.5f", valid_values), collapse = " + "),
                  sum(valid_values)))
      cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean))
      cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value))
      cat(sprintf(" Match: %s\n", if (is_match) "YES ✓" else "NO ✗"))
    } else {
      cat(" No valid values to calculate mean.\n")
    }
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")
  invisible(NULL)
}

# Run QA check on random row and random time interval
cat("\n\n")
qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below)

# Instructions for running additional checks
cat("\n")
cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n")
cat("For random row AND random time interval, run:\n")
cat(" qa_check_random_row()\n")
cat("\nFor specific row (e.g., row 118) with random interval:\n")
cat(" qa_check_random_row(118)\n")
cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n")
cat(" qa_check_random_row(time_interval_num = 3)\n")
cat("\nFor specific row AND specific interval:\n")
cat(" qa_check_random_row(118, 3)\n")
cat("\n")
cat("Time Interval Numbers:\n")
cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n")
cat(" 4 = NFut_10, 5 = X5.10past, 6 = X5.10fut\n")
cat("\n")

# Save the modified dataframe back to CSV
# na="" writes NA values as empty cells instead of "NA" text
# The console messages report what actually happened, so they cannot drift from
# the state of the write call.
if (save_to_file) {
  write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
  cat("\n*** CHANGES SAVED TO eohi2.csv ***\n")
  cat(sprintf("\nProcessing complete! %d domain mean columns calculated and saved to file.\n",
              length(created_cols)))
} else {
  cat("\n*** WRITE TO FILE IS DISABLED ***\n")
  cat("Review the output above, then set save_to_file <- TRUE to save changes.\n")
  cat(sprintf("\nProcessing complete! %d domain mean columns calculated (not yet saved to file).\n",
              length(created_cols)))
}