# File viewer metadata (not part of the script): 266 lines, 9.6 KiB, R
# Script to calculate domain means for time interval differences in eohi2.csv
# Averages the 5 items within each domain (pref, pers, val) for each time
# interval type.

# Load necessary library
library(dplyr)

# NOTE(review): machine-specific working directory; the relative "eohi2.csv"
# path used by this script is resolved against it.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Read the data.
#   check.names = FALSE  preserves the original column names.
#   na.strings = NULL    no string is treated as an NA marker on read (the
#                        original note: keeps empty cells as empty strings
#                        instead of converting to NA).
df <- read.csv(
  "eohi2.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = NULL
)

# The 15 item names (same order for all time periods): five preference items,
# then five personality items, then five value items.
items <- c(
  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious",
  "pers_complex",
  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
)

# Positions of each domain's items inside `items`
pref_indices <- 1:5
pers_indices <- 6:10
val_indices <- 11:15

# Time interval prefixes
time_prefixes <- c(
  "NPast_5", "NPast_10", "NFut_5", "NFut_10", "X5.10past", "X5.10fut"
)

# Domain names
domain_names <- c("pref", "pers", "val")

# All source columns (90 total): "<prefix>_<item>", grouped by time interval
# in the order of `time_prefixes`, items in the order of `items`.
source_cols <- unlist(lapply(
  time_prefixes,
  function(prefix) paste0(prefix, "_", items)
))

# All target columns (18 total = 6 time intervals x 3 domains):
# "<prefix>_<domain>_MEAN", grouped by time interval.
target_cols <- unlist(lapply(
  time_prefixes,
  function(prefix) paste0(prefix, "_", domain_names, "_MEAN")
))

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
# Reports how many of the expected source (item) columns and target (*_MEAN)
# columns are present in df, lists missing source columns when there are at
# most 20 of them, and stops the script when more than half of the source
# columns are missing.
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Get actual column names from dataframe (trimmed) and apply the trimmed names
# to df itself. Every later lookup uses names(df); without this, a header with
# stray whitespace would be reported as "found" here and then silently skipped
# by the calculations.
df_cols <- trimws(names(df))
names(df) <- df_cols

# Check Source columns
source_present <- source_cols %in% df_cols
missing_source <- source_cols[!source_present]
existing_source <- source_cols[source_present]

cat("Source Columns:\n")
cat(" Expected:", length(source_cols), "columns\n")
cat(" Found:", length(existing_source), "columns\n")
cat(" Missing:", length(missing_source), "columns\n")

if (length(missing_source) > 0 && length(missing_source) <= 20) {
  cat("\n Missing source columns:\n")
  for (col in missing_source) {
    cat(" -", col, "\n")
  }
} else if (length(missing_source) > 20) {
  cat("\n Too many missing to list individually (", length(missing_source), "missing)\n")
}

# Check Target columns
target_present <- target_cols %in% df_cols
missing_targets <- target_cols[!target_present]
existing_targets <- target_cols[target_present]

cat("\nTarget Columns:\n")
cat(" Expected:", length(target_cols), "columns\n")
cat(" Found:", length(existing_targets), "columns\n")
cat(" Missing:", length(missing_targets), "columns\n")

if (length(missing_targets) > 0) {
  cat("\n Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
  }
} else {
  cat(" All target columns exist - will overwrite with calculated values.\n")
}

cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing (more than half of the source columns;
# 45 of 90 with the definitions in this script).
if (length(missing_source) > length(source_cols) / 2) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")

# ============= CALCULATE DOMAIN MEANS =============
# For every time interval x domain combination, averages the domain's item
# columns row by row (ignoring NA) and stores the result in
# "<prefix>_<domain>_MEAN". Item columns absent from df are left out of the
# average; a combination with no item columns at all is skipped.
cat("Calculating domain means for time interval differences...\n")

# Convert source columns to numeric (non-numeric text becomes NA with a warning)
for (col in intersect(source_cols, names(df))) {
  df[[col]] <- as.numeric(df[[col]])
}

# Item positions per domain; the list names are the domain tags used in the
# target column names.
domain_item_indices <- list(
  pref = pref_indices,
  pers = pers_indices,
  val = val_indices
)

# Counts the mean columns actually written, so the summary below stays
# accurate when some combinations are skipped.
n_created <- 0L

# Calculate means for each time interval x domain combination
for (time_prefix in time_prefixes) {
  for (domain in names(domain_item_indices)) {
    item_cols <- paste0(time_prefix, "_", items[domain_item_indices[[domain]]])
    present_cols <- item_cols[item_cols %in% names(df)]
    if (length(present_cols) == 0) {
      next
    }

    mean_col <- paste0(time_prefix, "_", domain, "_MEAN")
    domain_mean <- rowMeans(df[, present_cols, drop = FALSE], na.rm = TRUE)
    # rowMeans(na.rm = TRUE) yields NaN for a row whose items are all NA;
    # store NA so the column uses a single missing-value marker.
    domain_mean[is.nan(domain_mean)] <- NA_real_

    df[[mean_col]] <- domain_mean
    n_created <- n_created + 1L
    cat(" Processed:", mean_col, "\n")
  }
}

cat("\n=== CALCULATION COMPLETE ===\n")
cat(sprintf(" %d domain mean columns created.\n\n", n_created))

# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK =============
# This function can be run multiple times to check different random rows and
# time intervals.
#
# qa_check_random_row(row_num = NULL, time_interval_num = NULL)
#
#   For one row of df and one time interval, prints the item values of each
#   domain, recomputes their mean (ignoring NA) and reports whether it matches
#   the stored "<prefix>_<domain>_MEAN" value to within 0.0001.
#
#   Reads the script-level objects df, items, time_prefixes, domain_names,
#   pref_indices, pers_indices and val_indices.
#
#   Arguments:
#     row_num            Row of df to check; NULL (default) picks a random row.
#     time_interval_num  Position in time_prefixes; NULL (default) picks a
#                        random interval.
#
#   Returns NULL invisibly. An out-of-range, NA, non-numeric or non-scalar
#   argument prints an ERROR line and returns without running the check.
qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) {
  # TRUE when x is a single, non-missing number within 1..upper
  is_valid_index <- function(x, upper) {
    is.numeric(x) && length(x) == 1L && !is.na(x) && x >= 1 && x <= upper
  }

  n_intervals <- length(time_prefixes)

  # Pick a random row or use specified row
  if (is.null(row_num)) {
    random_row <- sample(seq_len(nrow(df)), 1)
    row_label <- "Random"
  } else {
    if (!is_valid_index(row_num, nrow(df))) {
      cat("ERROR: Row number must be between 1 and", nrow(df), "\n")
      return(invisible(NULL))
    }
    random_row <- row_num
    row_label <- "Specified"
  }
  cat("\n========================================\n")
  cat("QA CHECK:", row_label, "Row #", random_row, "\n")

  # Pick a random time interval or use specified interval
  if (is.null(time_interval_num)) {
    test_interval_idx <- sample(seq_len(n_intervals), 1)
    interval_label <- "Random"
  } else {
    if (!is_valid_index(time_interval_num, n_intervals)) {
      cat(sprintf("ERROR: Time interval number must be between 1 and %d\n", n_intervals))
      # Legend built from time_prefixes so it cannot drift from the definition
      legend <- paste(
        sprintf("%d = %s", seq_along(time_prefixes), time_prefixes),
        collapse = ", "
      )
      cat(" ", legend, "\n", sep = "")
      return(invisible(NULL))
    }
    test_interval_idx <- time_interval_num
    interval_label <- "Specified"
  }
  time_prefix <- time_prefixes[test_interval_idx]
  cat(interval_label, "Time Interval #", test_interval_idx, ": ", time_prefix, "\n")

  cat("========================================\n\n")

  # Item positions and display labels, in the same order as domain_names
  domain_items <- list(pref_indices, pers_indices, val_indices)
  domain_labels <- c("Preferences", "Personality", "Values")

  # Check each of the domains
  for (domain_idx in seq_along(domain_items)) {
    domain_name <- domain_names[domain_idx]

    cat(sprintf("--- %s: %s ---\n", time_prefix, domain_labels[domain_idx]))

    # Source and target column names for this interval x domain
    source_cols_domain <- paste0(time_prefix, "_", items[domain_items[[domain_idx]]])
    target_col <- paste0(time_prefix, "_", domain_name, "_MEAN")

    # Collect the row's item values; a column absent from df counts as NA
    values <- rep(NA_real_, length(source_cols_domain))
    cat("Source values:\n")
    for (i in seq_along(source_cols_domain)) {
      col <- source_cols_domain[i]
      val <- if (col %in% names(df)) as.numeric(df[[col]][random_row]) else NA_real_
      values[i] <- val
      shown <- if (is.na(val)) "NA" else sprintf("%.5f", val)
      cat(sprintf(" %s: %s\n", col, shown))
    }

    # Calculate expected mean from the non-missing values
    valid_values <- values[!is.na(values)]
    if (length(valid_values) > 0) {
      expected_mean <- mean(valid_values)
      actual_value <- if (target_col %in% names(df)) {
        as.numeric(df[[target_col]][random_row])
      } else {
        NA_real_
      }
      # isTRUE() makes a missing stored mean count as a mismatch
      is_match <- isTRUE(abs(expected_mean - actual_value) < 0.0001)

      cat("\nCalculation:\n")
      cat(sprintf(
        " Sum: %s = %.5f\n",
        paste(sprintf("%.5f", valid_values), collapse = " + "),
        sum(valid_values)
      ))
      cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean))
      cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value))
      cat(sprintf(" Match: %s\n", if (is_match) "YES ✓" else "NO ✗"))
    } else {
      cat(" No valid values to calculate mean.\n")
    }
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")

  invisible(NULL)
}

# Run one QA check with a random row and a random time interval.
# Leave the call blank for random row & interval; pass arguments as shown in
# the usage text printed below.
cat("\n\n")
qa_check_random_row()

# Usage instructions for running additional checks, printed in one call
cat(
  "\n",
  "*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n",
  "For random row AND random time interval, run:\n",
  " qa_check_random_row()\n",
  "\nFor specific row (e.g., row 118) with random interval:\n",
  " qa_check_random_row(118)\n",
  "\nFor random row with specific interval (e.g., 3 = NFut_5):\n",
  " qa_check_random_row(time_interval_num = 3)\n",
  "\nFor specific row AND specific interval:\n",
  " qa_check_random_row(118, 3)\n",
  "\n",
  "Time Interval Numbers:\n",
  " 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n",
  " 4 = NFut_10, 5 = X5.10past, 6 = X5.10fut\n",
  "\n",
  sep = ""
)

# Save the modified dataframe back to CSV
# na = "" writes NA values as empty cells instead of "NA" text.
#
# The write overwrites the input file "eohi2.csv" in place. save_to_file
# controls both the write and the status messages, so the printed status always
# matches what happened. Set it to FALSE for a review-only run.
save_to_file <- TRUE

# Number of domain mean columns present in df at this point
n_mean_cols <- sum(target_cols %in% names(df))

if (save_to_file) {
  write.csv(df, "eohi2.csv", row.names = FALSE, na = "")

  cat("\n*** CHANGES WRITTEN TO eohi2.csv ***\n")
  cat(sprintf(
    "\nProcessing complete! %d domain mean columns calculated and saved to file.\n",
    n_mean_cols
  ))
} else {
  cat("\n*** WRITE TO FILE IS DISABLED ***\n")
  cat("Review the output above, then set save_to_file <- TRUE to save changes.\n")
  cat(sprintf(
    "\nProcessing complete! %d domain mean columns calculated (not yet saved to file).\n",
    n_mean_cols
  ))
}