eohi/.history/eohi2/dataP - time interval differences_20251001130503.r
2025-12-23 15:47:09 -05:00

251 lines
9.0 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Script to calculate absolute differences between time intervals in eohi2.csv
# Compares present vs past/future, and 5-year vs 10-year intervals
# Load necessary library
library(dplyr)
# Switch to the project data folder so the relative file name below resolves.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Load the survey data.
#   check.names = FALSE      -> header names are kept exactly as written
#   na.strings = NULL        -> per the original note, empty cells are kept as
#                               empty strings rather than converted to NA
#   stringsAsFactors = FALSE -> text columns stay character
df <- read.csv(
  file = "eohi2.csv",
  na.strings = NULL,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# The 15 item names, grouped by domain; the order is shared by all time periods.
items <- c(
  paste0("pref_", c("read", "music", "TV", "nap", "travel")),
  paste0("pers_", c("extravert", "critical", "dependable", "anxious", "complex")),
  paste0("val_", c("obey", "trad", "opinion", "performance", "justice"))
)

# The present-period columns spell the TV item in lowercase ("pref_tv"); every
# other period uses "pref_TV". All other item names are identical.
items_present <- replace(items, items == "pref_TV", "pref_tv")

# All source columns (75 total): present first, then past 5/10, then future 5/10.
source_cols <- c(
  paste0("present_", items_present),
  unlist(lapply(c("past_5_", "past_10_", "fut_5_", "fut_10_"), paste0, items))
)

# All target columns (90 total = 6 calculation types x 15 items), built from
# one prefix per calculation type.
target_prefixes <- c(
  "NPast_5_", "NPast_10_", "NFut_5_", "NFut_10_", "5.10past_", "5.10fut_"
)
targets_by_type <- lapply(target_prefixes, paste0, items)

target_NPast_5 <- targets_by_type[[1]]
target_NPast_10 <- targets_by_type[[2]]
target_NFut_5 <- targets_by_type[[3]]
target_NFut_10 <- targets_by_type[[4]]
target_5_10past <- targets_by_type[[5]]
target_5_10fut <- targets_by_type[[6]]

# Concatenated in the same order as the per-type vectors above.
target_cols <- unlist(targets_by_type)
# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Strip stray leading/trailing whitespace from the header names IN PLACE.
# The rest of the script looks columns up through names(df); checking against
# a trimmed copy only would let a column such as " past_5_pref_read" pass this
# check and then be skipped by every later lookup, producing all-NA results.
names(df) <- trimws(names(df))
df_cols <- names(df)

# Split the expected source columns into found / missing.
source_found <- source_cols %in% df_cols
missing_source <- source_cols[!source_found]
existing_source <- source_cols[source_found]
n_missing <- length(missing_source)

cat("Source Columns:\n")
cat(" Expected:", length(source_cols), "columns\n")
cat(" Found:", length(existing_source), "columns\n")
cat(" Missing:", n_missing, "columns\n")

# List the missing names individually only when the list is short (<= 10).
if (n_missing > 0 && n_missing <= 10) {
  cat("\n Missing source columns:\n")
  for (col in missing_source) {
    cat(" -", col, "\n")
  }
} else if (n_missing > 10) {
  cat("\n Too many missing to list individually (", n_missing, "missing)\n")
}
cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing (more than 30 of the expected set).
# With fewer missing columns the script continues and the affected
# differences come out as NA.
if (n_missing > 30) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}
cat("Proceeding with processing...\n\n")
# ============= CALCULATE DIFFERENCES =============
cat("Calculating time interval differences...\n")

# Coerce every expected source column that is present in df to numeric;
# expected columns that are absent are simply left out.
cols_to_convert <- source_cols[source_cols %in% names(df)]
df[cols_to_convert] <- lapply(df[cols_to_convert], as.numeric)
# Helper function to calculate the absolute difference between two columns.
#
# Args:
#   col1, col2: column names (character scalars).
#   data:       data frame to read the columns from. Defaults to the
#               script-level `df`, so existing two-argument calls behave as
#               before; passing it explicitly makes the helper usable (and
#               testable) without relying on a global.
#
# Returns:
#   A vector with one element per row of `data` holding |col1 - col2|.
#   An element is NA where either value is NA. If either column is absent
#   from `data`, every element is NA (a full-length NA_real_ vector rather
#   than a single recycled NA).
calc_abs_diff <- function(col1, col2, data = df) {
  fetch_col <- function(col) {
    if (col %in% names(data)) {
      data[[col]]
    } else {
      rep(NA_real_, nrow(data))
    }
  }
  abs(fetch_col(col1) - fetch_col(col2))
}
# Compute all six families of absolute differences from one specification
# table instead of six copy-pasted loops with a hard-coded 1:15 bound.
#
# Each spec holds:
#   label   - name shown in the progress message
#   desc    - human-readable description shown in the progress message
#   targets - names of the columns to create (one per item)
#   lhs/rhs - names of the two source columns compared for each item
#
# The order of the specs is the order in which the new columns are appended
# to df: NPast_5, NPast_10, NFut_5, NFut_10, 5.10past, 5.10fut.
present_cols <- paste0("present_", items_present)
past_5_cols <- paste0("past_5_", items)
past_10_cols <- paste0("past_10_", items)
fut_5_cols <- paste0("fut_5_", items)
fut_10_cols <- paste0("fut_10_", items)

diff_specs <- list(
  # NPast_5: |present - past_5|
  list(label = "NPast_5", desc = "present vs past 5 years",
       targets = target_NPast_5, lhs = present_cols, rhs = past_5_cols),
  # NPast_10: |present - past_10|
  list(label = "NPast_10", desc = "present vs past 10 years",
       targets = target_NPast_10, lhs = present_cols, rhs = past_10_cols),
  # NFut_5: |present - fut_5|
  list(label = "NFut_5", desc = "present vs future 5 years",
       targets = target_NFut_5, lhs = present_cols, rhs = fut_5_cols),
  # NFut_10: |present - fut_10|
  list(label = "NFut_10", desc = "present vs future 10 years",
       targets = target_NFut_10, lhs = present_cols, rhs = fut_10_cols),
  # 5.10past: |past_5 - past_10|
  list(label = "5.10past", desc = "past 5 vs past 10 years",
       targets = target_5_10past, lhs = past_5_cols, rhs = past_10_cols),
  # 5.10fut: |fut_5 - fut_10|
  list(label = "5.10fut", desc = "future 5 vs future 10 years",
       targets = target_5_10fut, lhs = fut_5_cols, rhs = fut_10_cols)
)

for (spec in diff_specs) {
  cat(sprintf(" Calculating %s differences (%s)...\n", spec$label, spec$desc))
  # seq_along() keeps the loop in step with the item list rather than
  # assuming exactly 15 items.
  for (i in seq_along(spec$targets)) {
    df[[spec$targets[i]]] <- calc_abs_diff(spec$lhs[i], spec$rhs[i])
  }
}

cat("\n=== CALCULATION COMPLETE ===\n")
# Report the number of target columns instead of a literal 90.
cat(sprintf(" %d difference columns created.\n\n", length(target_cols)))
# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
# Spot-check the computed difference columns on one randomly chosen row.
#
# Reads the script-level objects `df`, `items`, `items_present` and the six
# `target_*` name vectors. For the first item it prints, for each of the six
# calculation types, the two source values, the recomputed absolute
# difference, the stored target value, and whether they agree within 0.0001.
#
# Takes no arguments and returns NULL invisibly; all output goes to the
# console. It can be run repeatedly to check different random rows.
qa_check_random_row <- function() {
  n_rows <- nrow(df)
  if (n_rows < 1) {
    # sample(1:0, 1) would pick row 1 or 0 of an empty frame; bail out instead.
    cat("QA CHECK skipped: data frame has no rows.\n")
    return(invisible(NULL))
  }

  # Pick a random row (same draw as sample(1:n_rows, 1) for n_rows >= 1).
  random_row <- sample.int(n_rows, 1)

  cat("\n========================================\n")
  cat("QA CHECK: Random Row #", random_row, "\n")
  cat("========================================\n\n")

  # Sample one calculation from each type, all for the same item.
  test_item_idx <- 1
  present_col <- paste0("present_", items_present[test_item_idx])
  past_5_col <- paste0("past_5_", items[test_item_idx])
  past_10_col <- paste0("past_10_", items[test_item_idx])
  fut_5_col <- paste0("fut_5_", items[test_item_idx])
  fut_10_col <- paste0("fut_10_", items[test_item_idx])

  calculations <- list(
    list(name = "NPast_5", target = target_NPast_5[test_item_idx],
         source1 = present_col, source2 = past_5_col,
         desc = "|present - past_5|"),
    list(name = "NPast_10", target = target_NPast_10[test_item_idx],
         source1 = present_col, source2 = past_10_col,
         desc = "|present - past_10|"),
    list(name = "NFut_5", target = target_NFut_5[test_item_idx],
         source1 = present_col, source2 = fut_5_col,
         desc = "|present - fut_5|"),
    list(name = "NFut_10", target = target_NFut_10[test_item_idx],
         source1 = present_col, source2 = fut_10_col,
         desc = "|present - fut_10|"),
    list(name = "5.10past", target = target_5_10past[test_item_idx],
         source1 = past_5_col, source2 = past_10_col,
         desc = "|past_5 - past_10|"),
    list(name = "5.10fut", target = target_5_10fut[test_item_idx],
         source1 = fut_5_col, source2 = fut_10_col,
         desc = "|fut_5 - fut_10|")
  )

  # Value of `col` in the chosen row, or NA when the column is absent.
  cell_value <- function(col) {
    if (col %in% names(df)) df[random_row, col] else NA
  }
  # Scalar-safe text form (plain if/else instead of ifelse() on a scalar).
  as_text <- function(value) {
    if (is.na(value)) "NA" else as.character(value)
  }

  # The label follows test_item_idx instead of being hard-coded.
  cat("Checking sample item: ", items[test_item_idx], "\n\n", sep = "")

  for (calc in calculations) {
    cat(sprintf("--- %s ---\n", calc$name))
    cat(sprintf("Formula: %s\n", calc$desc))

    val1 <- cell_value(calc$source1)
    val2 <- cell_value(calc$source2)
    target_val <- df[random_row, calc$target]

    cat(sprintf(" %s: %s\n", calc$source1, as_text(val1)))
    cat(sprintf(" %s: %s\n", calc$source2, as_text(val2)))

    if (!is.na(val1) && !is.na(val2)) {
      expected_diff <- abs(val1 - val2)
      # isTRUE() makes an NA target count as a mismatch.
      is_match <- isTRUE(abs(expected_diff - target_val) < 0.0001)
      cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff))
      cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val))
      cat(sprintf(" Match: %s\n", if (is_match) "YES ✓" else "NO ✗"))
    } else {
      cat(" Cannot calculate (missing values)\n")
    }
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")
  invisible(NULL)
}
# Run one QA pass on a random row right away.
cat("\n\n")
qa_check_random_row()

# Explain how to repeat the check interactively.
cat(
  "\n",
  "*** TO CHECK ANOTHER RANDOM ROW ***\n",
  "Run this command in R console:\n",
  " qa_check_random_row()\n",
  "\n",
  sep = ""
)

# Saving the modified data frame back to CSV is deliberately disabled while
# the output is reviewed. na = "" writes NA values as empty cells instead of
# the text "NA". Uncomment the next line when ready to save.
# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
# NOTE(review): the message below cites a fixed line number for the
# write.csv call - confirm it still matches where that line sits in the file.
cat(
  "\n*** WRITE TO FILE IS COMMENTED OUT ***\n",
  "Review the output above, then uncomment line 243 to save changes.\n",
  "\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n",
  sep = ""
)