# Script to calculate DGEN means by time period in eohi2.csv
# Averages the 3 domain scores (Pref, Pers, Val) for each time period

# Load necessary library
|
|
library(dplyr)
|
|
|
|
# Work from the project folder so the relative file name below resolves there.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the data.
#   check.names = FALSE  keeps the column headers exactly as written in the CSV
#   na.strings  = NULL   intended to keep empty cells as empty strings instead
#                        of converting them to NA
df <- read.csv(
  file = "eohi2.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = NULL
)

# Groupings: every target (mean) column is paired with the three domain
# columns (Pref, Pers, Val) that it averages. The list is written out once and
# the flat name vectors below are derived from it, so the three objects cannot
# drift apart.
source_groups <- list(
  DGEN_past_5_mean  = c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val"),
  DGEN_past_10_mean = c("DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val"),
  DGEN_fut_5_mean   = c("DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val"),
  DGEN_fut_10_mean  = c("DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val")
)

# Target columns (4 total): the names of the groups, in definition order.
target_cols <- names(source_groups)

# Source columns (12 total): all group members flattened, in definition order.
source_cols <- unlist(source_groups, use.names = FALSE)

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
# Prints which of the expected source and target columns are present in `df`
# and stops the script when more than half of the source columns are missing.
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Get actual column names from dataframe (trimmed).
# The processing steps further down look columns up by their exact name in
# `df`, so the trimmed names are written back to `df`. Without this, a header
# padded with whitespace would be counted as "found" here and then silently
# skipped when the means are calculated.
# NOTE: this changes the affected headers in `df` (and in any file later
# written from it) to their trimmed form.
df_cols <- trimws(names(df))
padded_names <- names(df) != df_cols
if (any(padded_names)) {
  cat("NOTE: removed surrounding whitespace from", sum(padded_names),
      "column name(s).\n\n")
  names(df) <- df_cols
}

# Check Source columns
missing_source <- source_cols[!source_cols %in% df_cols]
existing_source <- source_cols[source_cols %in% df_cols]

cat("Source Columns:\n")
cat(" Expected:", length(source_cols), "columns\n")
cat(" Found:", length(existing_source), "columns\n")
cat(" Missing:", length(missing_source), "columns\n")

if (length(missing_source) > 0) {
  cat("\n Missing source columns:\n")
  for (col in missing_source) {
    cat(" -", col, "\n")
  }
}

# Check Target columns
missing_targets <- target_cols[!target_cols %in% df_cols]
existing_targets <- target_cols[target_cols %in% df_cols]

cat("\nTarget Columns:\n")
cat(" Expected:", length(target_cols), "columns\n")
cat(" Found:", length(existing_targets), "columns\n")
cat(" Missing:", length(missing_targets), "columns\n")

if (length(missing_targets) > 0) {
  cat("\n Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
  }
} else {
  cat(" All target columns exist - will overwrite with calculated values.\n")
}

cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing: more than half of the source columns
# absent is treated as a naming problem in the CSV rather than missing data.
max_missing_source <- length(source_cols) / 2
if (length(missing_source) > max_missing_source) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")

# ============= CALCULATE MEANS =============
# Adds (or overwrites) one mean column per target in `df`, averaging whichever
# of the target's source columns are present.
cat("Calculating DGEN means by time period...\n")

# Convert source columns to numeric (columns absent from `df` are skipped).
for (col in source_cols) {
  if (col %in% names(df)) {
    df[[col]] <- as.numeric(df[[col]])
  }
}

# Calculate each target as the mean of its source columns
for (target in target_cols) {
  source_group <- source_groups[[target]]

  # Only the columns that actually exist in the data can be averaged
  existing_cols <- source_group[source_group %in% names(df)]

  if (length(existing_cols) == 0) {
    cat(" WARNING: No source columns found for", target, "\n")
    next
  }

  # Row means across the available domain columns, ignoring missing values
  row_means <- rowMeans(df[, existing_cols, drop = FALSE], na.rm = TRUE)

  # With na.rm = TRUE, rowMeans() returns NaN (not NA) for a row in which every
  # value is missing. Store NA so such rows are treated like any other missing
  # value downstream.
  row_means[is.nan(row_means)] <- NA_real_

  df[[target]] <- row_means
  cat(" Processed:", target, "\n")
}

cat("\n=== CALCULATION COMPLETE ===\n\n")

# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
# This function can be run multiple times to check different random rows

#' Print a hand calculation of the group means for one randomly chosen row
#'
#' For every target column the function prints the values of its source
#' columns in the chosen row, the mean of the non-missing values, the value
#' stored in the target column, and whether the two agree (tolerance 0.0001).
#'
#' @param data Data frame holding the source and target columns. Defaults to
#'   the script-level `df`.
#' @param targets Character vector of target column names. Defaults to the
#'   script-level `target_cols`.
#' @param groups Named list mapping each target name to its source column
#'   names. Defaults to the script-level `source_groups`.
#' @return `NULL`, invisibly. The function is called for its printed output.
qa_check_random_row <- function(data = df,
                                targets = target_cols,
                                groups = source_groups) {
  n_rows <- nrow(data)

  # Nothing to sample from: report it instead of indexing a non-existent row
  if (n_rows < 1) {
    cat("\nQA CHECK skipped: the data contain no rows.\n\n")
    return(invisible(NULL))
  }

  # Pick a random row
  random_row <- sample.int(n_rows, 1)

  cat("\n========================================\n")
  cat("QA CHECK: Random Row #", random_row, "\n")
  cat("========================================\n\n")

  # Check each of the target columns
  for (target in targets) {
    source_group <- groups[[target]]

    cat(sprintf("Target: %s\n", target))
    cat(" Source columns:\n")

    # Get values from source columns; a column absent from `data` counts as NA
    values <- rep(NA_real_, length(source_group))
    for (i in seq_along(source_group)) {
      col <- source_group[i]
      if (col %in% names(data)) {
        values[i] <- data[[col]][random_row]
      }
      val <- values[i]
      shown <- if (is.na(val)) "NA" else as.character(val)
      cat(sprintf(" %s: %s\n", col, shown))
    }

    # Calculate expected mean from the non-missing values only
    valid_values <- values[!is.na(values)]

    if (length(valid_values) == 0) {
      cat(" No valid values to calculate mean.\n")
    } else if (!target %in% names(data)) {
      # The target is only created when the means were calculated for it
      cat(" Target column not found in data - nothing to compare.\n")
    } else {
      expected_mean <- mean(valid_values)
      actual_value <- data[[target]][random_row]

      # isTRUE() turns a missing target value into a plain mismatch
      is_match <- isTRUE(abs(expected_mean - actual_value) < 0.0001)

      cat("\n Calculation:\n")
      cat(sprintf(" Sum: %s = %.5f\n",
                  paste(valid_values, collapse = " + "), sum(valid_values)))
      cat(sprintf(" Average of %d values: %.5f\n",
                  length(valid_values), expected_mean))
      cat(sprintf(" Target value: %.5f\n", actual_value))
      cat(sprintf(" Match: %s\n", if (is_match) "YES ✓" else "NO ✗"))
    }
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")

  invisible(NULL)
}

# Run QA check on first random row
cat("\n\n")
qa_check_random_row()

# Instructions for running additional checks
cat("\n")
cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
cat("Run this command in R console:\n")
cat(" qa_check_random_row()\n")
cat("\n")

# Save the modified dataframe back to CSV
# na="" writes NA values as empty cells instead of "NA" text
# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")

cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
# The message points at the write.csv() call rather than a line number, which
# goes stale whenever the script is edited.
cat("Review the output above, then uncomment the write.csv() line near the end of this script to save changes.\n")
cat("\nProcessing complete! 4 DGEN mean columns calculated (not yet saved to file).\n")