# Script to calculate absolute differences between time intervals in eohi2.csv
# Compares present vs past/future, and 5-year vs 10-year intervals
#
# Outline:
#   1. Read eohi2.csv and define the source / target column names.
#   2. Report which expected columns exist in the file.
#   3. Compute |a - b| for six pairs of time periods, one column per item.
#   4. Spot-check one row/item with qa_check_random_row().
#   5. (Optional, commented out) write the result back to eohi2.csv.

# Load necessary library
library(dplyr)

# NOTE(review): machine-specific working directory. It is kept because the
# relative "eohi2.csv" paths below depend on it; prefer starting R in the data
# directory (or using a project-relative path) over editing this line.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the data (check.names = FALSE preserves the original column names,
# including non-syntactic ones such as "5.10past_pref_read").
# na.strings = NULL keeps empty cells as empty strings instead of converting
# them to NA.
df <- read.csv(
  "eohi2.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = NULL
)

# Strip stray whitespace from the header once, so that the existence check and
# every later df[[col]] lookup agree on the column names. Checking trimmed
# names while looking up untrimmed ones would report a padded header as
# "found" and then silently skip it in the calculations.
names(df) <- trimws(names(df))

# Define the item names (same order for all time periods)
items <- c(
  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious",
  "pers_complex",
  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
)

# Note: present uses lowercase "tv", others use uppercase "TV"
items_present <- c(
  "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious",
  "pers_complex",
  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
)
stopifnot(length(items) == length(items_present))

# Source columns, one vector per time period (5 periods x items)
present_cols <- paste0("present_", items_present)
past_5_cols <- paste0("past_5_", items)
past_10_cols <- paste0("past_10_", items)
fut_5_cols <- paste0("fut_5_", items)
fut_10_cols <- paste0("fut_10_", items)

source_cols <- c(
  present_cols, past_5_cols, past_10_cols, fut_5_cols, fut_10_cols
)

# Target columns (6 calculation types x items)
target_NPast_5 <- paste0("NPast_5_", items)
target_NPast_10 <- paste0("NPast_10_", items)
target_NFut_5 <- paste0("NFut_5_", items)
target_NFut_10 <- paste0("NFut_10_", items)
target_5_10past <- paste0("5.10past_", items)
target_5_10fut <- paste0("5.10fut_", items)

target_cols <- c(
  target_NPast_5, target_NPast_10, target_NFut_5, target_NFut_10,
  target_5_10past, target_5_10fut
)

# Single description of every comparison. Both the calculation loop and the
# QA check read from this list, so the two cannot drift apart.
#   name    - short tag used in progress / QA output
#   label   - human-readable description for the progress message
#   desc    - formula text shown by the QA check
#   target  - output column names (one per item)
#   source1 - minuend column names (one per item)
#   source2 - subtrahend column names (one per item)
comparisons <- list(
  list(
    name = "NPast_5", label = "present vs past 5 years",
    desc = "|present - past_5|", target = target_NPast_5,
    source1 = present_cols, source2 = past_5_cols
  ),
  list(
    name = "NPast_10", label = "present vs past 10 years",
    desc = "|present - past_10|", target = target_NPast_10,
    source1 = present_cols, source2 = past_10_cols
  ),
  list(
    name = "NFut_5", label = "present vs future 5 years",
    desc = "|present - fut_5|", target = target_NFut_5,
    source1 = present_cols, source2 = fut_5_cols
  ),
  list(
    name = "NFut_10", label = "present vs future 10 years",
    desc = "|present - fut_10|", target = target_NFut_10,
    source1 = present_cols, source2 = fut_10_cols
  ),
  list(
    name = "5.10past", label = "past 5 vs past 10 years",
    desc = "|past_5 - past_10|", target = target_5_10past,
    source1 = past_5_cols, source2 = past_10_cols
  ),
  list(
    name = "5.10fut", label = "future 5 vs future 10 years",
    desc = "|fut_5 - fut_10|", target = target_5_10fut,
    source1 = fut_5_cols, source2 = fut_10_cols
  )
)

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============

# Missing source columns are listed individually only up to this many.
max_listed_missing <- 10
# Processing is aborted when more than this many source columns are missing.
max_missing_source <- 30

cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Actual column names from the data frame (already trimmed above)
df_cols <- names(df)

# Check Source columns
source_found <- source_cols %in% df_cols
missing_source <- source_cols[!source_found]
existing_source <- source_cols[source_found]
n_missing_source <- length(missing_source)

cat("Source Columns:\n")
cat(" Expected:", length(source_cols), "columns\n")
cat(" Found:", length(existing_source), "columns\n")
cat(" Missing:", n_missing_source, "columns\n")

if (n_missing_source > 0 && n_missing_source <= max_listed_missing) {
  cat("\n Missing source columns:\n")
  cat(sprintf(" - %s \n", missing_source), sep = "")
} else if (n_missing_source > max_listed_missing) {
  cat(
    "\n Too many missing to list individually (",
    n_missing_source, "missing)\n"
  )
}

# Check Target columns
target_found <- target_cols %in% df_cols
missing_targets <- target_cols[!target_found]
existing_targets <- target_cols[target_found]

cat("\nTarget Columns:\n")
cat(" Expected:", length(target_cols), "columns\n")
cat(" Found:", length(existing_targets), "columns\n")
cat(" Missing:", length(missing_targets), "columns\n")

if (length(missing_targets) > 0) {
  cat("\n Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
  }
} else {
  cat(" All target columns exist - will overwrite with calculated values.\n")
}

cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing. Up to max_missing_source missing
# columns are tolerated; differences involving them come out as NA.
if (n_missing_source > max_missing_source) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")

# ============= CALCULATE DIFFERENCES =============

cat("Calculating time interval differences...\n")

# Convert the source columns that exist to numeric. Values that are not
# parseable as numbers become NA (R warns about them as usual).
for (col in intersect(source_cols, names(df))) {
  df[[col]] <- as.numeric(df[[col]])
}

# Helper: absolute difference between two columns.
#   col1, col2 - column names (character scalars)
#   data       - data frame to read from; defaults to the script-level `df`
# Returns a numeric vector with one value per row of `data`. A column that does
# not exist is treated as all-NA, so the result is then all NA (and still has
# nrow(data) elements, which also keeps zero-row data frames working).
calc_abs_diff <- function(col1, col2, data = df) {
  column_or_na <- function(col) {
    if (col %in% names(data)) data[[col]] else rep(NA_real_, nrow(data))
  }
  abs(column_or_na(col1) - column_or_na(col2))
}

# One pass per comparison, one new/overwritten column per item
for (cmp in comparisons) {
  cat(sprintf(" Calculating %s differences (%s)...\n", cmp$name, cmp$label))
  for (i in seq_along(items)) {
    df[[cmp$target[i]]] <- calc_abs_diff(cmp$source1[i], cmp$source2[i])
  }
}

cat("\n=== CALCULATION COMPLETE ===\n")
cat(sprintf(" %d difference columns created.\n\n", length(target_cols)))

# ============= QUALITY ASSURANCE: RANDOM ROW & ITEM CHECK =============
# This function can be run multiple times to check different random rows and
# items.
#
# Arguments:
#   row_num  - row of `df` to inspect; NULL (default) picks one at random
#   item_num - index into `items` to inspect; NULL (default) picks one at random
# For the chosen row and item it prints, for each of the six comparisons, the
# two source values, the recomputed |a - b| and the stored target value, and
# whether they agree to within 0.0001.
# Returns NULL invisibly; on an invalid argument it prints an ERROR line and
# returns early. Reads the script-level `df`, `items` and `comparisons`.
qa_check_random_row <- function(row_num = NULL, item_num = NULL) {
  n_rows <- nrow(df)
  n_items <- length(items)
  rule <- "========================================\n"

  # TRUE only for a single, non-missing whole number in [1, upper]
  is_valid_index <- function(x, upper) {
    is.numeric(x) && length(x) == 1 && !is.na(x) &&
      x == trunc(x) && x >= 1 && x <= upper
  }

  # Scalar-safe replacement for ifelse(is.na(x), "NA", as.character(x))
  format_value <- function(x) {
    if (is.na(x)) "NA" else as.character(x)
  }

  # Nothing can be sampled or inspected in an empty data frame
  if (n_rows == 0) {
    cat("ERROR: Data frame has no rows to check\n")
    return(invisible(NULL))
  }

  # Pick a random row or use specified row
  if (is.null(row_num)) {
    random_row <- sample(seq_len(n_rows), 1)
    row_label <- "Random"
  } else {
    if (!is_valid_index(row_num, n_rows)) {
      cat("ERROR: Row number must be between 1 and", n_rows, "\n")
      return(invisible(NULL))
    }
    random_row <- row_num
    row_label <- "Specified"
  }
  cat("\n", rule, sep = "")
  cat("QA CHECK:", row_label, "Row #", random_row, "\n")

  # Pick a random item or use specified item
  if (is.null(item_num)) {
    test_item_idx <- sample(seq_len(n_items), 1)
    item_label <- "Random"
  } else {
    if (!is_valid_index(item_num, n_items)) {
      cat(sprintf("ERROR: Item number must be between 1 and %d\n", n_items))
      return(invisible(NULL))
    }
    test_item_idx <- item_num
    item_label <- "Specified"
  }
  cat(item_label, "Item #", test_item_idx, ": ", items[test_item_idx], "\n")
  cat(rule, "\n", sep = "")

  for (cmp in comparisons) {
    source1 <- cmp$source1[test_item_idx]
    source2 <- cmp$source2[test_item_idx]
    target <- cmp$target[test_item_idx]

    cat(sprintf("--- %s ---\n", cmp$name))
    cat(sprintf("Formula: %s\n", cmp$desc))

    # A missing source column is reported as NA, matching calc_abs_diff()
    val1 <- if (source1 %in% names(df)) df[[source1]][random_row] else NA
    val2 <- if (source2 %in% names(df)) df[[source2]][random_row] else NA
    target_val <- df[[target]][random_row]

    cat(sprintf(" %s: %s\n", source1, format_value(val1)))
    cat(sprintf(" %s: %s\n", source2, format_value(val2)))

    if (!is.na(val1) && !is.na(val2)) {
      expected_diff <- abs(val1 - val2)
      # isTRUE() makes an unexpected NA target count as a mismatch
      is_match <- isTRUE(abs(expected_diff - target_val) < 0.0001)
      cat(sprintf(
        "\n Calculation: |%.5f - %.5f| = %.5f\n",
        val1, val2, expected_diff
      ))
      cat(sprintf(" Target (%s): %.5f\n", target, target_val))
      cat(sprintf(" Match: %s\n", if (is_match) "YES ✓" else "NO ✗"))
    } else {
      cat(" Cannot calculate (missing values)\n")
    }
    cat("\n")
  }

  cat(rule)
  cat("END QA CHECK\n")
  cat(rule, "\n", sep = "")
  invisible(NULL)
}

# Run QA check on random row and random item
cat("\n\n")
# Leave blank for random row & item; specify parameters as needed
# (see the examples printed below)
qa_check_random_row()

# Instructions for running additional checks
cat("\n")
cat("*** TO CHECK ANOTHER ROW/ITEM ***\n")
cat("For random row AND random item, run:\n")
cat(" qa_check_random_row()\n")
cat("\nFor specific row (e.g., row 118) with random item:\n")
cat(" qa_check_random_row(118)\n")
cat("\nFor random row with specific item (e.g., item 5 = pref_travel):\n")
cat(" qa_check_random_row(item_num = 5)\n")
cat("\nFor specific row AND specific item:\n")
cat(" qa_check_random_row(118, 5)\n")
cat("\n")

# Save the modified dataframe back to CSV
# na = "" writes NA values as empty cells instead of "NA" text
# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")

cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
# Refer to the call itself rather than a line number, which goes stale as soon
# as the script is edited or reformatted.
cat("Review the output above, then uncomment the write.csv() call at the end of this script to save changes.\n")
cat(sprintf(
  "\nProcessing complete! %d difference columns calculated (not yet saved to file).\n",
  length(target_cols)
))