# eohi/eohi2/dataP 08 - DGEN 510 vars.r

# Script 08: Create 5_10 DGEN Variables
# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN ratings
#          for both Past and Future directions
# VARIABLES CREATED: 6 total (3 domains × 2 time directions)

library(tidyverse)

# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path used by this script resolves against it, so the script only runs as-is
# on a machine that has this folder.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the data. read.csv() is called with its defaults, so check.names = TRUE
# applies and headers that are not valid R names are rewritten on import
# (e.g. a name starting with a digit gains an "X" prefix).
data <- read.csv("eohi2.csv")

# Report the size of the loaded data. The separator is " x " (space on both
# sides) so the line reads "rows x cols" rather than "rows xcols".
print(paste("Dataset dimensions:", paste(dim(data), collapse = " x ")))
# Counts distinct values of the ResponseId column and labels them participants.
print(paste("Number of participants:", length(unique(data$ResponseId))))

# Guard: all twelve DGEN rating columns (2 directions x 2 horizons x 3 domains)
# must be present in `data` before any difference score is computed.
source_vars <- c(
  "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
  "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
  "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
  "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val"
)

# Names listed above that do not appear among the data's columns, kept in the
# order they are listed.
missing_vars <- setdiff(source_vars, colnames(data))
if (length(missing_vars) > 0) {
  # Abort with the full list so every absent column is reported at once.
  stop(paste("ERROR: Missing source variables:", toString(missing_vars)))
}

print("All source DGEN variables found!")

# Build the six 5_10 DGEN variables: for every direction/domain pair, the
# absolute gap between the 5-year and the 10-year rating.
#   X5_10DGEN_<direction>_<domain> = |DGEN_<direction>_5_<Domain> -
#                                     DGEN_<direction>_10_<Domain>|
# The output names carry a leading "X" because R prepends one to column names
# that begin with a digit; naming them this way up front keeps the names
# stable across a write/read round trip.
# Iteration order (past before fut; Pref, Pers, Val within each) fixes the
# order in which new columns are appended to `data`.
for (direction in c("past", "fut")) {
  for (domain in c("Pref", "Pers", "Val")) {
    five_year_col <- paste0("DGEN_", direction, "_5_", domain)
    ten_year_col <- paste0("DGEN_", direction, "_10_", domain)
    # Derived names use the lower-case domain label (pref / pers / val).
    diff_col <- paste0("X5_10DGEN_", direction, "_", tolower(domain))

    data[[diff_col]] <- abs(data[[five_year_col]] - data[[ten_year_col]])
  }
}

# Names of the six derived columns, in the order they are reported below.
target_vars <- c("X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val",
                 "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val")

# Verify variables were created: stop if any derived column is absent.
# Without this check, data[[var]] would be NULL for an absent column and the
# loop below would report "0 missing" for it.
absent_targets <- setdiff(target_vars, colnames(data))
if (length(absent_targets) > 0) {
  stop(paste("ERROR: Derived variables not created:", toString(absent_targets)))
}

# cat() is used for the banner and report lines because print() on a character
# string shows escapes literally (the output would start with [1] "\n===...").
cat("\n=== VARIABLES CREATED ===\n")
print(target_vars)

# Check for missing values: count and percentage of NA per derived column.
n_rows <- nrow(data)
for (var in target_vars) {
  n_missing <- sum(is.na(data[[var]]))
  # Guard the zero-row case so the percentage is 0 rather than NaN (0 / 0).
  pct_missing <- if (n_rows > 0) round(100 * n_missing / n_rows, 2) else 0
  cat(sprintf("%s: %d missing (%.2f%%)\n", var, n_missing, pct_missing))
}

# Quality check: show the inputs and the derived value for up to five randomly
# chosen rows. No seed is set here, so the rows differ from run to run.
# The "expected" value is recomputed with the same formula as the stored
# column, so this display confirms the column holds what the formula gives;
# it is not an independent check of the formula itself.
# cat() is used because print() on a character string shows "\n" literally.
cat("\n=== QUALITY CHECK: Sample Calculations ===\n")
# sample.int() draws row indices directly and is well-defined for zero rows,
# avoiding the 1:0 pitfall of 1:nrow(data).
sample_rows <- sample.int(nrow(data), min(5, nrow(data)))

for (i in sample_rows) {
  cat(sprintf("\nRow %d:\n", i))

  # Past direction, Pref domain.
  cat(sprintf("  DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f\n",
              data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i]))
  cat(sprintf("  → X5_10DGEN_past_pref = %.2f (expected: %.2f)\n",
              data$X5_10DGEN_past_pref[i],
              abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i])))

  # Future direction, Pers domain.
  cat(sprintf("  DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f\n",
              data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i]))
  cat(sprintf("  → X5_10DGEN_fut_pers = %.2f (expected: %.2f)\n",
              data$X5_10DGEN_fut_pers[i],
              abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i])))
}

# Descriptive statistics for each derived column: non-missing count, mean, SD,
# minimum and maximum, all computed with NAs removed and rounded to 5 decimals.
# cat() is used for the banner because print() on a character string shows
# "\n" literally.
cat("\n=== DESCRIPTIVE STATISTICS ===\n")
desc_stats <- data %>%
  summarise(across(
    all_of(target_vars),
    list(
      n = function(x) sum(!is.na(x)),
      mean = function(x) round(mean(x, na.rm = TRUE), 5),
      sd = function(x) round(sd(x, na.rm = TRUE), 5),
      # min()/max() warn and return Inf/-Inf if a column is entirely NA.
      min = function(x) round(min(x, na.rm = TRUE), 5),
      max = function(x) round(max(x, na.rm = TRUE), 5)
    ),
    # Output columns are named <variable>_<statistic>.
    .names = "{.col}_{.fn}"
  ))

# The summary is a single wide row; transpose it so each statistic prints on
# its own line.
print(t(desc_stats))

# Save to CSV.
# NOTE(review): this writes to "eohi2.csv", the same file name the script reads
# its input from, so the input is overwritten in place with the derived columns
# added. Keep a backup if the pre-processing file must be preserved.
write.csv(data, "eohi2.csv", row.names = FALSE)

# cat() is used because print() on a character string shows "\n" literally
# instead of emitting a blank line.
cat("\n=== PROCESSING COMPLETE ===\n")
cat("Data saved to eohi2.csv\n")
cat(sprintf("Total columns now: %d\n", ncol(data)))