# eohi/eohi2/dataP 06 - time interval differences.r
# 2025-12-23 15:47:09 -05:00

# 293 lines
# 11 KiB
# R
# Raw Blame History

# This file contains ambiguous Unicode characters

# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Script to calculate absolute differences between time intervals in eohi2.csv
# Compares present vs past/future, and 5-year vs 10-year intervals
# Load necessary library
library(dplyr)
# Work from the project folder so the relative CSV path below resolves.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
# Load the data set into `df`.
#   check.names = FALSE  - keep the header names exactly as written in the file
#                          (some target columns start with a digit, e.g. "5.10past_...").
#   na.strings = NULL    - do not treat any text (such as "NA") as a missing-value
#                          marker while reading; intended to keep empty cells as
#                          empty strings rather than NA.
df <- read.csv(
  file = "eohi2.csv",
  na.strings = NULL,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# The 15 item names, grouped by domain (preferences, personality, values).
# The same order is used for every time period.
items <- c(
  paste0("pref_", c("read", "music", "TV", "nap", "travel")),
  paste0("pers_", c("extravert", "critical", "dependable", "anxious", "complex")),
  paste0("val_", c("obey", "trad", "opinion", "performance", "justice"))
)
# The present-period columns spell the TV item in lowercase ("pref_tv");
# every other period uses uppercase ("pref_TV"). All other names are shared.
items_present <- replace(items, items == "pref_TV", "pref_tv")
# All source columns (75 total = 5 time periods x 15 items):
# present first, then past 5, past 10, future 5, future 10.
source_cols <- c(
  paste0("present_", items_present),
  unlist(lapply(c("past_5_", "past_10_", "fut_5_", "fut_10_"), paste0, items))
)
# Target column names, one vector per calculation type (15 names each).
target_NPast_5 <- paste0("NPast_5_", items)
target_NPast_10 <- paste0("NPast_10_", items)
target_NFut_5 <- paste0("NFut_5_", items)
target_NFut_10 <- paste0("NFut_10_", items)
target_5_10past <- paste0("5.10past_", items)
target_5_10fut <- paste0("5.10fut_", items)
# All target columns (90 total = 6 calculation types x 15 items), in the
# order in which they are calculated.
target_cols <- unlist(list(
  target_NPast_5, target_NPast_10,
  target_NFut_5, target_NFut_10,
  target_5_10past, target_5_10fut
))
# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
# Reports how many of the expected source and target columns are present in
# `df`, and stops the script when more than 30 source columns are missing.
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
# Strip stray leading/trailing whitespace from the header names on the data
# frame itself. Checking against a trimmed copy only would report a column such
# as "past_5_pref_read " as found, while every later lookup by exact name
# (numeric conversion, difference calculation, QA check) would miss it and
# silently produce NA.
names(df) <- trimws(names(df))
df_cols <- names(df)
# Check Source columns
missing_source <- source_cols[!source_cols %in% df_cols]
existing_source <- source_cols[source_cols %in% df_cols]
cat("Source Columns:\n")
cat(" Expected: 75 columns\n")
cat(" Found:", length(existing_source), "columns\n")
cat(" Missing:", length(missing_source), "columns\n")
# List missing names individually only when the list is short (1 to 10).
if (length(missing_source) > 0 && length(missing_source) <= 10) {
  cat("\n Missing source columns:\n")
  for (col in missing_source) {
    cat(" -", col, "\n")
  }
} else if (length(missing_source) > 10) {
  cat("\n Too many missing to list individually (", length(missing_source), "missing)\n")
}
# Check Target columns
missing_targets <- target_cols[!target_cols %in% df_cols]
existing_targets <- target_cols[target_cols %in% df_cols]
cat("\nTarget Columns:\n")
cat(" Expected: 90 columns\n")
cat(" Found:", length(existing_targets), "columns\n")
cat(" Missing:", length(missing_targets), "columns\n")
if (length(missing_targets) > 0) {
  cat("\n Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
  }
} else {
  cat(" All target columns exist - will overwrite with calculated values.\n")
}
cat("\n=== END CHECK ===\n\n")
# Stop if critical columns are missing. Up to 30 missing source columns are
# tolerated; differences that depend on a missing column come out as NA.
if (length(missing_source) > 30) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}
cat("Proceeding with processing...\n\n")
# ============= CALCULATE DIFFERENCES =============
cat("Calculating time interval differences...\n")
# Coerce every source column that is actually present in df to numeric, so the
# subtraction in the difference step operates on numbers. Source columns that
# are absent from df are skipped.
present_sources <- source_cols[source_cols %in% names(df)]
for (src in present_sources) {
  df[[src]] <- as.numeric(df[[src]])
}
# Helper function to calculate the absolute difference between two columns.
#
# Args:
#   col1, col2: Column names (character scalars) to compare.
#   data:       Data frame to read the columns from. Defaults to the
#               script-level `df`, so existing two-argument calls are unchanged.
#
# Returns:
#   A numeric vector with one element per row of `data`, holding
#   abs(data[[col1]] - data[[col2]]). A column name that is absent from `data`
#   is treated as NA in every row, so the result always has nrow(data)
#   elements (rather than a single NA) and can be assigned as a column even
#   when `data` has zero rows.
calc_abs_diff <- function(col1, col2, data = df) {
  # Fetch a column by exact name, or a full-length numeric NA placeholder.
  fetch <- function(col) {
    if (col %in% names(data)) data[[col]] else rep(NA_real_, nrow(data))
  }
  abs(fetch(col1) - fetch(col2))
}
# One entry per calculation type, processed in this order:
#   NPast_5  = |present - past_5|     NPast_10 = |present - past_10|
#   NFut_5   = |present - fut_5|      NFut_10  = |present - fut_10|
#   5.10past = |past_5 - past_10|     5.10fut  = |fut_5 - fut_10|
# Each entry carries its progress message, the 15 target column names, and the
# two source-column name vectors whose absolute difference is stored.
# Present-period names come from items_present (lowercase "tv").
diff_jobs <- list(
  list(
    msg = " Calculating NPast_5 differences (present vs past 5 years)...\n",
    targets = target_NPast_5,
    left = paste0("present_", items_present),
    right = paste0("past_5_", items)
  ),
  list(
    msg = " Calculating NPast_10 differences (present vs past 10 years)...\n",
    targets = target_NPast_10,
    left = paste0("present_", items_present),
    right = paste0("past_10_", items)
  ),
  list(
    msg = " Calculating NFut_5 differences (present vs future 5 years)...\n",
    targets = target_NFut_5,
    left = paste0("present_", items_present),
    right = paste0("fut_5_", items)
  ),
  list(
    msg = " Calculating NFut_10 differences (present vs future 10 years)...\n",
    targets = target_NFut_10,
    left = paste0("present_", items_present),
    right = paste0("fut_10_", items)
  ),
  list(
    msg = " Calculating 5.10past differences (past 5 vs past 10 years)...\n",
    targets = target_5_10past,
    left = paste0("past_5_", items),
    right = paste0("past_10_", items)
  ),
  list(
    msg = " Calculating 5.10fut differences (future 5 vs future 10 years)...\n",
    targets = target_5_10fut,
    left = paste0("fut_5_", items),
    right = paste0("fut_10_", items)
  )
)
for (job in diff_jobs) {
  cat(job$msg)
  # Items 1..15, matching the 15 entries of each name vector.
  for (k in seq_len(15)) {
    df[[job$targets[k]]] <- calc_abs_diff(job$left[k], job$right[k])
  }
}
cat("\n=== CALCULATION COMPLETE ===\n")
cat(" 90 difference columns created.\n\n")
# ============= QUALITY ASSURANCE: RANDOM ROW & ITEM CHECK =============
# Spot-checks the stored difference columns for one row and one item.
# It can be run repeatedly to check different rows and items.
#
# Args:
#   row_num:  Row of `df` to check (1..nrow(df)); NULL picks one at random.
#   item_num: Item index to check (1..15); NULL picks one at random.
#
# For the chosen row/item it prints, for each of the six calculation types,
# the two source values, the recomputed absolute difference, the stored target
# value, and whether they agree within 0.0001. Reads the script-level `df`,
# `items`, `items_present` and `target_*` vectors. Returns NULL; an
# out-of-range argument prints an error line and returns early.
qa_check_random_row <- function(row_num = NULL, item_num = NULL) {
  # ---- choose the row ----
  if (!is.null(row_num)) {
    if (row_num < 1 || row_num > nrow(df)) {
      cat("ERROR: Row number must be between 1 and", nrow(df), "\n")
      return()
    }
    random_row <- row_num
    cat("\n========================================\n")
    cat("QA CHECK: Specified Row #", random_row, "\n")
  } else {
    random_row <- sample(seq_len(nrow(df)), 1)
    cat("\n========================================\n")
    cat("QA CHECK: Random Row #", random_row, "\n")
  }

  # ---- choose the item ----
  if (!is.null(item_num)) {
    if (item_num < 1 || item_num > 15) {
      cat("ERROR: Item number must be between 1 and 15\n")
      return()
    }
    test_item_idx <- item_num
    cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n")
  } else {
    test_item_idx <- sample(1:15, 1)
    cat("Random Item #", test_item_idx, ": ", items[test_item_idx], "\n")
  }
  cat("========================================\n\n")

  # ---- source column names for this item ----
  # The present-period name comes from items_present (lowercase "tv").
  present_col <- paste0("present_", items_present[test_item_idx])
  past_5_col <- paste0("past_5_", items[test_item_idx])
  past_10_col <- paste0("past_10_", items[test_item_idx])
  fut_5_col <- paste0("fut_5_", items[test_item_idx])
  fut_10_col <- paste0("fut_10_", items[test_item_idx])

  # One entry per calculation type, in the order they are reported.
  calculations <- list(
    list(
      name = "NPast_5", target = target_NPast_5[test_item_idx],
      source1 = present_col, source2 = past_5_col,
      desc = "|present - past_5|"
    ),
    list(
      name = "NPast_10", target = target_NPast_10[test_item_idx],
      source1 = present_col, source2 = past_10_col,
      desc = "|present - past_10|"
    ),
    list(
      name = "NFut_5", target = target_NFut_5[test_item_idx],
      source1 = present_col, source2 = fut_5_col,
      desc = "|present - fut_5|"
    ),
    list(
      name = "NFut_10", target = target_NFut_10[test_item_idx],
      source1 = present_col, source2 = fut_10_col,
      desc = "|present - fut_10|"
    ),
    list(
      name = "5.10past", target = target_5_10past[test_item_idx],
      source1 = past_5_col, source2 = past_10_col,
      desc = "|past_5 - past_10|"
    ),
    list(
      name = "5.10fut", target = target_5_10fut[test_item_idx],
      source1 = fut_5_col, source2 = fut_10_col,
      desc = "|fut_5 - fut_10|"
    )
  )

  # ---- report each calculation ----
  for (chk in calculations) {
    cat(sprintf("--- %s ---\n", chk$name))
    cat(sprintf("Formula: %s\n", chk$desc))

    # A source column absent from df is treated as a missing value.
    left <- if (chk$source1 %in% names(df)) df[random_row, chk$source1] else NA
    right <- if (chk$source2 %in% names(df)) df[random_row, chk$source2] else NA
    stored <- df[random_row, chk$target]

    cat(sprintf(" %s: %s\n", chk$source1, if (is.na(left)) "NA" else as.character(left)))
    cat(sprintf(" %s: %s\n", chk$source2, if (is.na(right)) "NA" else as.character(right)))

    if (is.na(left) || is.na(right)) {
      cat(" Cannot calculate (missing values)\n")
    } else {
      expected_diff <- abs(left - right)
      cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", left, right, expected_diff))
      cat(sprintf(" Target (%s): %.5f\n", chk$target, stored))
      # ifelse() is kept here on purpose: it yields NA (printed as "NA")
      # rather than an error if the stored value is NA.
      cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - stored) < 0.0001, "YES ✓", "NO ✗")))
    }
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")
}
# Run QA check on random row and random item
cat("\n\n")
qa_check_random_row() # Leave blank for random row & item; specify parameters as needed (see examples below)
# Instructions for running additional checks
cat(
  "\n",
  "*** TO CHECK ANOTHER ROW/ITEM ***\n",
  "For random row AND random item, run:\n",
  " qa_check_random_row()\n",
  "\nFor specific row (e.g., row 118) with random item:\n",
  " qa_check_random_row(118)\n",
  "\nFor random row with specific item (e.g., item 5 = pref_travel):\n",
  " qa_check_random_row(item_num = 5)\n",
  "\nFor specific row AND specific item:\n",
  " qa_check_random_row(118, 5)\n",
  "\n",
  sep = ""
)
# Save the modified dataframe back to CSV
# na="" writes NA values as empty cells instead of "NA" text
# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
# The notice names the call rather than a line number: a hard-coded line number
# goes stale as soon as the script is edited and then points at the wrong line.
cat("Review the output above, then uncomment the write.csv() call near the end of this script to save changes.\n")
cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n")