# Script 08: Create 5_10 DGEN Variables
#
# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN
#          ratings for both Past and Future directions.
# VARIABLES CREATED: 6 total (3 domains x 2 time directions)
# SIDE EFFECTS: changes the working directory and overwrites eohi2.csv in
#               place, with the six new columns added to the data.

library(tidyverse)

# NOTE(review): hard-coded, machine-specific path. Kept so the script reads and
# writes the same file as before; consider a project-relative path instead.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the data ----
data <- read.csv("eohi2.csv")

# cat() is used for status output throughout: print() on a string shows the
# escaped form (e.g. a literal "\n") instead of emitting the newline.
cat(sprintf("Dataset dimensions: %s\n", paste(dim(data), collapse = " x ")))
cat(sprintf(
  "Number of participants: %d\n",
  length(unique(data$ResponseId))
))

# Verify source columns exist ----
source_vars <- c(
  "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
  "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
  "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
  "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val"
)

missing_vars <- source_vars[!source_vars %in% colnames(data)]

# Fail before computing anything if any input column is absent.
if (length(missing_vars) > 0) {
  stop(paste(
    "ERROR: Missing source variables:",
    paste(missing_vars, collapse = ", ")
  ))
}

cat("All source DGEN variables found!\n")

# Calculate 5_10 DGEN variables ----
# Formula: |DGEN_5 - DGEN_10|
# NOTE: The new columns carry an "X" prefix because read.csv() (with its
# default check.names = TRUE) prepends "X" to column names that start with a
# digit; naming them this way keeps the names stable when the CSV is re-read.

# PAST direction
data$X5_10DGEN_past_pref <- abs(data$DGEN_past_5_Pref - data$DGEN_past_10_Pref)
data$X5_10DGEN_past_pers <- abs(data$DGEN_past_5_Pers - data$DGEN_past_10_Pers)
data$X5_10DGEN_past_val <- abs(data$DGEN_past_5_Val - data$DGEN_past_10_Val)

# FUTURE direction
data$X5_10DGEN_fut_pref <- abs(data$DGEN_fut_5_Pref - data$DGEN_fut_10_Pref)
data$X5_10DGEN_fut_pers <- abs(data$DGEN_fut_5_Pers - data$DGEN_fut_10_Pers)
data$X5_10DGEN_fut_val <- abs(data$DGEN_fut_5_Val - data$DGEN_fut_10_Val)

# Report the variables that were created ----
target_vars <- c(
  "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val",
  "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val"
)

cat("\n=== VARIABLES CREATED ===\n")
print(target_vars)

# Check for missing values ----
# A difference is NA whenever either of its two source ratings is NA.
n_rows <- nrow(data)
for (var in target_vars) {
  n_missing <- sum(is.na(data[[var]]))
  pct_missing <- round(100 * n_missing / n_rows, 2)
  cat(sprintf("%s: %d missing (%.2f%%)\n", var, n_missing, pct_missing))
}

# Quality check: display sample rows ----
# Shows the stored value next to the same formula recomputed for up to five
# randomly chosen rows, so the output can be eyeballed against the inputs.
cat("\n=== QUALITY CHECK: Sample Calculations ===\n")

# seq_len() rather than 1:nrow(data) so an empty data set yields no rows.
sample_rows <- sample(seq_len(n_rows), min(5, n_rows))

for (i in sample_rows) {
  cat(sprintf("\nRow %d:\n", i))
  cat(sprintf(
    " DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f\n",
    data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i]
  ))
  cat(sprintf(
    " → X5_10DGEN_past_pref = %.2f (expected: %.2f)\n",
    data$X5_10DGEN_past_pref[i],
    abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i])
  ))
  cat(sprintf(
    " DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f\n",
    data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i]
  ))
  cat(sprintf(
    " → X5_10DGEN_fut_pers = %.2f (expected: %.2f)\n",
    data$X5_10DGEN_fut_pers[i],
    abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i])
  ))
}

# Descriptive statistics ----
# One n/mean/sd/min/max set per new variable, NAs excluded; transposed on
# print so each statistic appears on its own row.
cat("\n=== DESCRIPTIVE STATISTICS ===\n")

desc_stats <- data %>%
  summarise(across(
    all_of(target_vars),
    list(
      n = ~sum(!is.na(.)),
      mean = ~round(mean(., na.rm = TRUE), 5),
      sd = ~round(sd(., na.rm = TRUE), 5),
      min = ~round(min(., na.rm = TRUE), 5),
      max = ~round(max(., na.rm = TRUE), 5)
    ),
    .names = "{.col}_{.fn}"
  ))

print(t(desc_stats))

# Save to CSV ----
# Overwrites the input file in place with the new columns included.
write.csv(data, "eohi2.csv", row.names = FALSE)

cat("\n=== PROCESSING COMPLETE ===\n")
cat("Data saved to eohi2.csv\n")
cat(sprintf("Total columns now: %d\n", ncol(data)))