# Script to calculate absolute differences between time intervals in eohi2.csv.
# Compares present vs past/future, and 5-year vs 10-year intervals.

# Load necessary library
library(dplyr)

# NOTE(review): machine-specific absolute path. Everything below reads (and the
# commented-out save step writes) "eohi2.csv" relative to this directory.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the data (with check.names = FALSE to preserve original column names).
# na.strings = NULL keeps empty cells in text columns as empty strings instead
# of converting them to NA. Blank fields in numeric/logical columns are still
# read as NA (documented read.table behaviour).
df <- read.csv(
  "eohi2.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = NULL
)

# Define the 15 item names (same order for all time periods)
items <- c(
  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious",
  "pers_complex",
  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
)

# Note: present uses lowercase "tv", others use uppercase "TV".
# Derived from `items` so the two vectors cannot drift out of order.
items_present <- sub("^pref_TV$", "pref_tv", items)

# Define all source columns (75 total = 5 time periods x 15 items)
source_cols <- c(
  paste0("present_", items_present),
  paste0("past_5_", items),
  paste0("past_10_", items),
  paste0("fut_5_", items),
  paste0("fut_10_", items)
)

# Define all target columns (90 total = 6 calculation types x 15 items)
target_NPast_5 <- paste0("NPast_5_", items)
target_NPast_10 <- paste0("NPast_10_", items)
target_NFut_5 <- paste0("NFut_5_", items)
target_NFut_10 <- paste0("NFut_10_", items)
target_5_10past <- paste0("5.10past_", items)
target_5_10fut <- paste0("5.10fut_", items)

target_cols <- c(
  target_NPast_5,
  target_NPast_10,
  target_NFut_5,
  target_NFut_10,
  target_5_10past,
  target_5_10fut
)

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
# Reports how many of the expected source and target columns are present in
# `df`, and stops the script when more than 30 source columns are missing.
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Trim stray whitespace from the header names on `df` itself. The steps that
# follow look columns up through names(df), so trimming only a local copy would
# let this report say "found" for a column that is later skipped.
names(df) <- trimws(names(df))
df_cols <- names(df)

# Check Source columns
source_present <- source_cols %in% df_cols
missing_source <- source_cols[!source_present]
existing_source <- source_cols[source_present]

cat("Source Columns:\n")
cat(" Expected: 75 columns\n")
cat(" Found:", length(existing_source), "columns\n")
cat(" Missing:", length(missing_source), "columns\n")

# List the missing names individually only when the list is short.
if (length(missing_source) > 0 && length(missing_source) <= 10) {
  cat("\n Missing source columns:\n")
  for (col in missing_source) {
    cat(" -", col, "\n")
  }
} else if (length(missing_source) > 10) {
  cat("\n Too many missing to list individually (", length(missing_source), "missing)\n")
}

# Check Target columns
target_present <- target_cols %in% df_cols
missing_targets <- target_cols[!target_present]
existing_targets <- target_cols[target_present]

cat("\nTarget Columns:\n")
cat(" Expected: 90 columns\n")
cat(" Found:", length(existing_targets), "columns\n")
cat(" Missing:", length(missing_targets), "columns\n")

if (length(missing_targets) > 0) {
  cat("\n Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
  }
} else {
  cat(" All target columns exist - will overwrite with calculated values.\n")
}

cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing
if (length(missing_source) > 30) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")

# ============= CALCULATE DIFFERENCES =============
cat("Calculating time interval differences...\n")

# Convert source columns to numeric. Expected columns that are absent from
# `df` are skipped; as.numeric() turns non-numeric text (including empty
# strings) into NA and emits a coercion warning when it does so.
for (col in intersect(source_cols, names(df))) {
  df[[col]] <- as.numeric(df[[col]])
}

# Helper function to calculate absolute difference
#
# Computes |data[[col1]] - data[[col2]]| element-wise.
#
# Arguments:
#   col1, col2  Column names (character scalars).
#   data        Data frame to read the columns from. Defaults to the
#               script-level `df`, so existing two-argument calls are unchanged.
#
# Returns a vector with one element per row of `data`. A column that is not
# present in `data` is treated as all-NA, so the result is then all NA; using
# a full-length NA_real_ vector (rather than a bare NA) keeps the result
# assignable to a column even when `data` has zero rows.
calc_abs_diff <- function(col1, col2, data = df) {
  absent <- rep(NA_real_, nrow(data))
  val1 <- if (col1 %in% names(data)) data[[col1]] else absent
  val2 <- if (col2 %in% names(data)) data[[col2]] else absent
  abs(val1 - val2)
}

# Source column names per time period, in the same item order as `items`.
present_cols <- paste0("present_", items_present)
past_5_cols <- paste0("past_5_", items)
past_10_cols <- paste0("past_10_", items)
fut_5_cols <- paste0("fut_5_", items)
fut_10_cols <- paste0("fut_10_", items)

# One entry per calculation type: the progress message to print, the target
# columns to fill, and the two source columns whose absolute difference is
# stored. Entries are processed in order, so new columns are appended to `df`
# in the order NPast_5, NPast_10, NFut_5, NFut_10, 5.10past, 5.10fut.
diff_specs <- list(
  # NPast_5: |present - past_5|
  list(
    msg = " Calculating NPast_5 differences (present vs past 5 years)...\n",
    targets = target_NPast_5, lhs = present_cols, rhs = past_5_cols
  ),
  # NPast_10: |present - past_10|
  list(
    msg = " Calculating NPast_10 differences (present vs past 10 years)...\n",
    targets = target_NPast_10, lhs = present_cols, rhs = past_10_cols
  ),
  # NFut_5: |present - fut_5|
  list(
    msg = " Calculating NFut_5 differences (present vs future 5 years)...\n",
    targets = target_NFut_5, lhs = present_cols, rhs = fut_5_cols
  ),
  # NFut_10: |present - fut_10|
  list(
    msg = " Calculating NFut_10 differences (present vs future 10 years)...\n",
    targets = target_NFut_10, lhs = present_cols, rhs = fut_10_cols
  ),
  # 5.10past: |past_5 - past_10|
  list(
    msg = " Calculating 5.10past differences (past 5 vs past 10 years)...\n",
    targets = target_5_10past, lhs = past_5_cols, rhs = past_10_cols
  ),
  # 5.10fut: |fut_5 - fut_10|
  list(
    msg = " Calculating 5.10fut differences (future 5 vs future 10 years)...\n",
    targets = target_5_10fut, lhs = fut_5_cols, rhs = fut_10_cols
  )
)

for (spec in diff_specs) {
  cat(spec$msg)
  for (i in seq_along(items)) {
    df[[spec$targets[i]]] <- calc_abs_diff(spec$lhs[i], spec$rhs[i])
  }
}

cat("\n=== CALCULATION COMPLETE ===\n")
cat(" 90 difference columns created.\n\n")

# ============= QUALITY ASSURANCE: RANDOM ROW & ITEM CHECK =============
# This function can be run multiple times to check different random rows and items
#
# For one row and one item, re-derives each of the six absolute differences
# from the source columns in the script-level `df` and prints it next to the
# value stored in the corresponding target column, with a YES/NO match flag
# (tolerance 0.0001).
#
# Arguments:
#   row_num   Row of `df` to check; NULL (default) picks a random row.
#   item_num  Index into `items` to check; NULL (default) picks a random item.
#
# Reads the script-level objects `df`, `items`, `items_present` and the
# `target_*` vectors. Prints its report with cat() and returns NULL invisibly;
# an out-of-range row or item prints an error line and returns early.
qa_check_random_row <- function(row_num = NULL, item_num = NULL) {
  n_items <- length(items)

  # Pick a random row or use specified row
  if (is.null(row_num)) {
    random_row <- sample(seq_len(nrow(df)), 1)
    row_label <- "Random"
  } else {
    if (row_num < 1 || row_num > nrow(df)) {
      cat("ERROR: Row number must be between 1 and", nrow(df), "\n")
      return(invisible(NULL))
    }
    random_row <- row_num
    row_label <- "Specified"
  }
  cat("\n========================================\n")
  cat(paste0("QA CHECK: ", row_label, " Row #"), random_row, "\n")

  # Pick a random item or use specified item
  if (is.null(item_num)) {
    test_item_idx <- sample(seq_len(n_items), 1)
    item_label <- "Random"
  } else {
    if (item_num < 1 || item_num > n_items) {
      cat(sprintf("ERROR: Item number must be between 1 and %d\n", n_items))
      return(invisible(NULL))
    }
    test_item_idx <- item_num
    item_label <- "Specified"
  }
  cat(paste0(item_label, " Item #"), test_item_idx, ": ", items[test_item_idx], "\n")

  cat("========================================\n\n")

  # Source column names for the chosen item in each time period.
  item_name <- items[test_item_idx]
  present_col <- paste0("present_", items_present[test_item_idx])
  past_5_col <- paste0("past_5_", item_name)
  past_10_col <- paste0("past_10_", item_name)
  fut_5_col <- paste0("fut_5_", item_name)
  fut_10_col <- paste0("fut_10_", item_name)

  calculations <- list(
    list(
      name = "NPast_5", target = target_NPast_5[test_item_idx],
      source1 = present_col, source2 = past_5_col,
      desc = "|present - past_5|"
    ),
    list(
      name = "NPast_10", target = target_NPast_10[test_item_idx],
      source1 = present_col, source2 = past_10_col,
      desc = "|present - past_10|"
    ),
    list(
      name = "NFut_5", target = target_NFut_5[test_item_idx],
      source1 = present_col, source2 = fut_5_col,
      desc = "|present - fut_5|"
    ),
    list(
      name = "NFut_10", target = target_NFut_10[test_item_idx],
      source1 = present_col, source2 = fut_10_col,
      desc = "|present - fut_10|"
    ),
    list(
      name = "5.10past", target = target_5_10past[test_item_idx],
      source1 = past_5_col, source2 = past_10_col,
      desc = "|past_5 - past_10|"
    ),
    list(
      name = "5.10fut", target = target_5_10fut[test_item_idx],
      source1 = fut_5_col, source2 = fut_10_col,
      desc = "|fut_5 - fut_10|"
    )
  )

  # Value of `col` in the checked row, or NA when `df` has no such column.
  cell_value <- function(col) {
    if (col %in% names(df)) df[random_row, col] else NA
  }

  # Text shown for a source value; NA is printed as the literal "NA".
  show_value <- function(val) {
    if (is.na(val)) "NA" else as.character(val)
  }

  for (calc in calculations) {
    cat(sprintf("--- %s ---\n", calc$name))
    cat(sprintf("Formula: %s\n", calc$desc))

    val1 <- cell_value(calc$source1)
    val2 <- cell_value(calc$source2)
    target_val <- df[random_row, calc$target]

    cat(sprintf(" %s: %s\n", calc$source1, show_value(val1)))
    cat(sprintf(" %s: %s\n", calc$source2, show_value(val2)))

    if (!is.na(val1) && !is.na(val2)) {
      expected_diff <- abs(val1 - val2)
      # isTRUE() makes an NA target count as a mismatch instead of printing NA.
      is_match <- isTRUE(abs(expected_diff - target_val) < 0.0001)
      cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff))
      cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val))
      cat(sprintf(" Match: %s\n", if (is_match) "YES ✓" else "NO ✗"))
    } else {
      cat(" Cannot calculate (missing values)\n")
    }
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")
  invisible(NULL)
}

# Run QA check on random row and random item
cat("\n\n")
# Leave blank for random row & item; specify parameters as needed
# (see the printed examples below).
qa_check_random_row()

# Instructions for running additional checks
cat(
  "\n",
  "*** TO CHECK ANOTHER ROW/ITEM ***\n",
  "For random row AND random item, run:\n",
  " qa_check_random_row()\n",
  "\nFor specific row (e.g., row 118) with random item:\n",
  " qa_check_random_row(118)\n",
  "\nFor random row with specific item (e.g., item 5 = pref_travel):\n",
  " qa_check_random_row(item_num = 5)\n",
  "\nFor specific row AND specific item:\n",
  " qa_check_random_row(118, 5)\n",
  "\n",
  sep = ""
)

# Save the modified dataframe back to CSV
# na = "" writes NA values as empty cells instead of "NA" text
# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")

cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
# Refer to the call itself, not a line number: a hard-coded line number goes
# stale whenever the script is edited.
cat("Review the output above, then uncomment the write.csv() call near the end of this script to save changes.\n")
cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n")