eohi/eohi2/dataP 08 - DGEN 510 vars.r
2025-12-23 15:47:09 -05:00

96 lines
3.8 KiB
R
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Script 08: Create 5_10 DGEN Variables
# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN ratings
# for both Past and Future directions
# VARIABLES CREATED: 6 total (3 domains × 2 time directions)
library(tidyverse)
# NOTE(review): the working directory is hard-coded to one machine. Both the
# read below and the write.csv() at the end of the script resolve "eohi2.csv"
# relative to it, so the script only runs where this path exists.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the data
data <- read.csv("eohi2.csv")

# Report the size of the loaded data as "<rows> x <cols>".
# The separator needs a space on both sides; " x" rendered e.g. "500 x120".
print(paste("Dataset dimensions:", paste(dim(data), collapse = " x ")))

# Participants are counted as the number of distinct ResponseId values.
print(paste("Number of participants:", length(unique(data$ResponseId))))
# Confirm that every DGEN rating column needed by the calculations is present.
# One row per time point (past 5, past 10, future 5, future 10); within a row,
# one entry per domain suffix (Pref, Pers, Val).
source_vars <- c(
  "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
  "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
  "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
  "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val"
)

# Required names that are not columns of the data, in the order listed above
missing_vars <- setdiff(source_vars, names(data))

# Abort with the full list of absent columns rather than failing later
if (length(missing_vars) != 0) {
  stop("ERROR: Missing source variables: ", toString(missing_vars))
}
print("All source DGEN variables found!")
# Build the six 5_10 DGEN variables: the absolute gap between the 5-year and
# 10-year DGEN rating, |DGEN_5 - DGEN_10|, for each time direction and domain.
# NOTE: output names start with "X" because R adds that prefix to column names
# that begin with a digit.
for (direction in c("past", "fut")) { # PAST first, then FUTURE
  for (domain in c("Pref", "Pers", "Val")) {
    rating_5 <- data[[paste0("DGEN_", direction, "_5_", domain)]]
    rating_10 <- data[[paste0("DGEN_", direction, "_10_", domain)]]

    # Source columns use a capitalised domain suffix; outputs use lower case
    out_name <- paste0("X5_10DGEN_", direction, "_", tolower(domain))
    data[[out_name]] <- abs(rating_5 - rating_10)
  }
}
# Names of the six difference variables; also used by the descriptive
# statistics step later in the script.
target_vars <- c(
  "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val",
  "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val"
)

# cat() instead of print(): print() displays the "\n" in a string as the two
# characters \n inside a quoted "[1]" line rather than starting a new line.
cat("\n=== VARIABLES CREATED ===\n")
print(target_vars)

# Check for missing values: count and percentage of NA per created variable
for (var in target_vars) {
  n_missing <- sum(is.na(data[[var]]))
  pct_missing <- round(100 * n_missing / nrow(data), 2)
  cat(sprintf("%s: %d missing (%.2f%%)\n", var, n_missing, pct_missing))
}
# Quality check: for up to five randomly chosen rows, show the two source
# ratings next to the stored difference and a freshly recomputed value.
# Output goes through cat() so the "\n" in the header and row labels becomes a
# real line break; print() would show it as a literal \n in a quoted string.
cat("\n=== QUALITY CHECK: Sample Calculations ===\n")

# sample.int() draws from 1..nrow(data) and returns nothing for an empty data
# frame, avoiding the 1:nrow(data) form, which evaluates to c(1, 0) when
# nrow(data) is 0.
sample_rows <- sample.int(nrow(data), min(5, nrow(data)))

for (i in sample_rows) {
  cat(sprintf("\nRow %d:\n", i))

  # Past direction, Pref domain
  cat(sprintf(
    " DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f\n",
    data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i]
  ))
  cat(sprintf(
    " → X5_10DGEN_past_pref = %.2f (expected: %.2f)\n",
    data$X5_10DGEN_past_pref[i],
    abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i])
  ))

  # Future direction, Pers domain
  cat(sprintf(
    " DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f\n",
    data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i]
  ))
  cat(sprintf(
    " → X5_10DGEN_fut_pers = %.2f (expected: %.2f)\n",
    data$X5_10DGEN_fut_pers[i],
    abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i])
  ))
}
# Descriptive statistics
# cat() so the leading "\n" is emitted as a blank line; print() would show it
# as a literal \n inside a quoted string.
cat("\n=== DESCRIPTIVE STATISTICS ===\n")

# One summary row holding n / mean / sd / min / max for each target variable,
# in columns named "<variable>_<statistic>". NA values are excluded from every
# statistic; n is the count of non-missing values.
# NOTE: min() and max() return Inf / -Inf with a warning if a column has no
# non-missing values.
desc_stats <- data %>%
  summarise(across(
    all_of(target_vars),
    list(
      n = ~ sum(!is.na(.)),
      mean = ~ round(mean(., na.rm = TRUE), 5),
      sd = ~ round(sd(., na.rm = TRUE), 5),
      min = ~ round(min(., na.rm = TRUE), 5),
      max = ~ round(max(., na.rm = TRUE), 5)
    ),
    .names = "{.col}_{.fn}"
  ))

# Transpose so each statistic is printed on its own row
print(t(desc_stats))
# Save to CSV
# NOTE: this writes to the same "eohi2.csv" that read.csv() loaded at the top
# of the script, so the input file is replaced by the augmented data.
write.csv(data, "eohi2.csv", row.names = FALSE)

# Status lines go through cat(): print() would show the header's "\n" as a
# literal \n inside a quoted string instead of a blank line.
cat("\n=== PROCESSING COMPLETE ===\n")
cat("Data saved to eohi2.csv\n")
cat(sprintf("Total columns now: %d\n", ncol(data)))