# Script 08: Create 5_10 DGEN Variables
# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN
#          ratings for both Past and Future directions
# VARIABLES CREATED: 6 total (3 domains × 2 time directions)

library(tidyverse)

# NOTE(review): machine-specific absolute path. The relative file name
# "eohi2.csv" used for reading (and for saving at the end of the script) is
# resolved against this directory, so the call is kept as-is.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the data
data <- read.csv("eohi2.csv")

# Report the size of the loaded dataset. The separator is " x " so the
# dimensions read "rows x cols" instead of running together as "rows xcols".
print(paste("Dataset dimensions:", paste(dim(data), collapse = " x ")))
print(paste("Number of participants:", length(unique(data$ResponseId))))

# Verify source columns exist
# Required inputs: DGEN ratings for each direction (past/fut), horizon (5/10)
# and domain (Pref/Pers/Val).
source_vars <- c(
  "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
  "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
  "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
  "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val"
)

# Abort before any computation if one or more required columns are absent;
# the error lists every missing name.
missing_vars <- setdiff(source_vars, names(data))
if (length(missing_vars) > 0) {
  stop(paste("ERROR: Missing source variables:", toString(missing_vars)))
}

print("All source DGEN variables found!")

# Calculate 5_10 DGEN variables (absolute differences between 5-year and 10-year)
# Formula: |DGEN_5 - DGEN_10|
# NOTE: Using X prefix because R adds it to column names starting with numbers

# Walk the directions (past, then fut) and, within each, the domains
# (Pref, Pers, Val) so the derived columns are written in that order.
# Output names use the lower-case domain, e.g. X5_10DGEN_past_pref.
for (direction in c("past", "fut")) {
  for (domain in c("Pref", "Pers", "Val")) {
    rating_5 <- data[[paste0("DGEN_", direction, "_5_", domain)]]
    rating_10 <- data[[paste0("DGEN_", direction, "_10_", domain)]]
    diff_name <- paste0("X5_10DGEN_", direction, "_", tolower(domain))
    data[[diff_name]] <- abs(rating_5 - rating_10)
  }
}

# Names of the six derived columns (past then future; pref, pers, val within
# each). This list drives the missing-value and descriptive summaries below.
target_vars <- c(
  "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val",
  "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val"
)

# cat() interprets the "\n" escape as a line break; print() would echo the
# string with the escape shown literally.
cat("\n=== VARIABLES CREATED ===\n")
print(target_vars)

# Check for missing values
# For each derived column, report how many rows are NA and what percentage
# of all rows that is (rounded to two decimals).
total_rows <- nrow(data)
na_counts <- vapply(
  target_vars,
  function(col_name) sum(is.na(data[[col_name]])),
  integer(1)
)
for (col_name in names(na_counts)) {
  na_count <- na_counts[[col_name]]
  na_share <- round(100 * na_count / total_rows, 2)
  print(sprintf("%s: %d missing (%.2f%%)", col_name, na_count, na_share))
}

# Quality check: Display sample rows
# All output here goes through cat(): the messages contain "\n", which
# print() would show as a literal escape instead of a line break.
cat("\n=== QUALITY CHECK: Sample Calculations ===\n")

# Draw up to five distinct row indices from 1..nrow(data). No seed is set in
# this script, so the rows shown differ from run to run.
sample_rows <- sample.int(nrow(data), min(5, nrow(data)))

# For each sampled row, show the inputs and the stored difference for one
# past variable (Pref) and one future variable (Pers). The "expected" value
# is recomputed with the same formula, so this is a visual aid rather than
# an independent check.
for (i in sample_rows) {
  cat(sprintf("\nRow %d:\n", i))

  cat(sprintf(
    " DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f\n",
    data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i]
  ))
  cat(sprintf(
    " → X5_10DGEN_past_pref = %.2f (expected: %.2f)\n",
    data$X5_10DGEN_past_pref[i],
    abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i])
  ))

  cat(sprintf(
    " DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f\n",
    data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i]
  ))
  cat(sprintf(
    " → X5_10DGEN_fut_pers = %.2f (expected: %.2f)\n",
    data$X5_10DGEN_fut_pers[i],
    abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i])
  ))
}

# Descriptive statistics
# cat() renders the leading "\n" as a blank line; print() would echo the
# escape literally.
cat("\n=== DESCRIPTIVE STATISTICS ===\n")

# For every derived column compute the non-missing count, mean, SD, minimum
# and maximum (NAs ignored, values rounded to 5 decimals). The result is a
# single row whose columns are named "<variable>_<statistic>".
desc_stats <- data %>%
  summarise(across(
    all_of(target_vars),
    list(
      n = ~sum(!is.na(.)),
      mean = ~round(mean(., na.rm = TRUE), 5),
      sd = ~round(sd(., na.rm = TRUE), 5),
      min = ~round(min(., na.rm = TRUE), 5),
      max = ~round(max(., na.rm = TRUE), 5)
    ),
    .names = "{.col}_{.fn}"
  ))

# Transpose the one-row summary so each statistic prints on its own line.
print(t(desc_stats))

# Save to CSV
# Writes to the same file name that was read at the top of the script, so
# the input file is replaced by the version that includes the new columns.
write.csv(data, "eohi2.csv", row.names = FALSE)

# Status lines use cat() so the leading "\n" produces a blank line instead
# of being echoed as a literal escape by print().
cat("\n=== PROCESSING COMPLETE ===\n")
cat("Data saved to eohi2.csv\n")
cat(sprintf("Total columns now: %d\n", ncol(data)))