eohi/eohi2/dataP 07 - domain means.r
2025-12-23 15:47:09 -05:00

266 lines
9.6 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Script to calculate domain means for time interval differences in eohi2.csv
# Averages the 5 items within each domain (pref, pers, val) for each time interval type
# Load necessary library
library(dplyr)
# Work from the project data folder so the relative file name "eohi2.csv"
# used for reading (and later writing) resolves correctly.
# NOTE(review): this absolute path is machine-specific - adjust it when the
# script is run on another computer.
project_dir <- "C:/Users/irina/Documents/DND/EOHI/eohi2"
setwd(project_dir)

# Read the data.
#   check.names = FALSE  preserves the original column names from the CSV header
#   na.strings = NULL    per the original author's note, keeps empty cells as
#                        empty strings instead of converting them to NA
df <- read.csv(
  file = "eohi2.csv",
  na.strings = NULL,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# The 15 item names (same order for every time period), built per domain:
# five preference items, five personality items, five value items.
items <- c(
  paste0("pref_", c("read", "music", "TV", "nap", "travel")),
  paste0("pers_", c("extravert", "critical", "dependable", "anxious", "complex")),
  paste0("val_", c("obey", "trad", "opinion", "performance", "justice"))
)

# Positions of each domain's items inside `items`, located by name prefix
pref_indices <- which(startsWith(items, "pref_"))
pers_indices <- which(startsWith(items, "pers_"))
val_indices <- which(startsWith(items, "val_"))

# Column-name prefixes of the six time-interval difference scores
time_prefixes <- c(
  "NPast_5", "NPast_10",
  "NFut_5", "NFut_10",
  "X5.10past", "X5.10fut"
)

# Short domain tags used in the output column names
domain_names <- c("pref", "pers", "val")

# All source columns (90 total = 6 time intervals x 15 items), grouped by
# time interval with the items in their canonical order inside each group
source_cols <- paste0(rep(time_prefixes, each = length(items)), "_", items)

# All target columns (18 total = 6 time intervals x 3 domains), grouped by
# time interval with the domains in the order given by `domain_names`
target_cols <- paste0(
  rep(time_prefixes, each = length(domain_names)),
  "_", domain_names, "_MEAN"
)
# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
# Reports how many of the expected source and target columns are present in
# `df`, and aborts when more than half of the source columns are missing.
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Normalise the header names in the data frame itself. The check used to
# compare against a trimmed copy of the names while every later step matched
# the untrimmed names, so a column with stray whitespace in its header was
# reported as "found" here and then silently skipped during processing.
trimmed_names <- trimws(names(df))
n_padded <- sum(trimmed_names != names(df))
if (n_padded > 0) {
  if (anyDuplicated(trimmed_names) > 0 && anyDuplicated(names(df)) == 0) {
    # Trimming would make two columns indistinguishable; leave names alone.
    cat(" WARNING:", n_padded,
        "column name(s) have surrounding whitespace, but trimming would create duplicates - names left unchanged.\n\n")
  } else {
    names(df) <- trimmed_names
    cat(" NOTE:", n_padded,
        "column name(s) had surrounding whitespace removed.\n\n")
  }
}

# Column names exactly as the processing steps below will see them
df_cols <- names(df)

# Check Source columns
missing_source <- source_cols[!source_cols %in% df_cols]
existing_source <- source_cols[source_cols %in% df_cols]
cat("Source Columns:\n")
cat(" Expected:", length(source_cols), "columns\n")
cat(" Found:", length(existing_source), "columns\n")
cat(" Missing:", length(missing_source), "columns\n")

# List the missing names only when the list is short enough to be readable
if (length(missing_source) > 20) {
  cat("\n Too many missing to list individually (", length(missing_source), "missing)\n")
} else if (length(missing_source) > 0) {
  cat("\n Missing source columns:\n")
  for (col in missing_source) {
    cat(" -", col, "\n")
  }
}

# Check Target columns
missing_targets <- target_cols[!target_cols %in% df_cols]
existing_targets <- target_cols[target_cols %in% df_cols]
cat("\nTarget Columns:\n")
cat(" Expected:", length(target_cols), "columns\n")
cat(" Found:", length(existing_targets), "columns\n")
cat(" Missing:", length(missing_targets), "columns\n")
if (length(missing_targets) > 0) {
  cat("\n Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
  }
} else {
  cat(" All target columns exist - will overwrite with calculated values.\n")
}
cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing: with more than half of the source
# columns absent the column naming in the CSV is almost certainly wrong.
if (length(missing_source) > length(source_cols) / 2) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}
cat("Proceeding with processing...\n\n")
# ============= CALCULATE DOMAIN MEANS =============
# For every time interval x domain combination, averages the domain's items
# row by row (ignoring missing items) and stores the result in
# "<time prefix>_<domain>_MEAN".
cat("Calculating domain means for time interval differences...\n")

# Convert source columns to numeric (blank cells become NA)
for (col in source_cols[source_cols %in% names(df)]) {
  df[[col]] <- as.numeric(df[[col]])
}

# Item positions (within `items`) for each domain, keyed by the domain tag
# that appears in the output column name. One loop replaces three
# copy-pasted stanzas so the domains cannot drift apart.
domain_item_indices <- list(
  pref = pref_indices,
  pers = pers_indices,
  val = val_indices
)

# Count what is actually written so the summary below is truthful
n_created <- 0L

# Calculate means for each time interval x domain combination
for (time_prefix in time_prefixes) {
  for (domain in names(domain_item_indices)) {
    mean_col <- paste0(time_prefix, "_", domain, "_MEAN")
    domain_cols <- paste0(time_prefix, "_", items[domain_item_indices[[domain]]])
    present_cols <- domain_cols[domain_cols %in% names(df)]

    if (length(present_cols) == 0) {
      cat(" Skipped:", mean_col, "(no source columns found)\n")
      next
    }

    domain_mean <- rowMeans(df[, present_cols, drop = FALSE], na.rm = TRUE)
    # rowMeans() returns NaN for a row whose items are all missing; store a
    # regular NA so the column carries one consistent missing-value marker.
    domain_mean[is.nan(domain_mean)] <- NA_real_

    df[[mean_col]] <- domain_mean
    n_created <- n_created + 1L
    cat(" Processed:", mean_col, "\n")

    if (length(present_cols) < length(domain_cols)) {
      cat("   NOTE: based on", length(present_cols), "of", length(domain_cols),
          "items (some source columns are missing)\n")
    }
  }
}

cat("\n=== CALCULATION COMPLETE ===\n")
cat(sprintf(" %d domain mean columns created.\n\n", n_created))
# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK =============
# This function can be run multiple times to check different random rows and time intervals
#
# Recomputes the three domain means for one row and one time interval from the
# item columns and compares each with the stored "<prefix>_<domain>_MEAN" value.
# Reads the script-level objects `df`, `items`, `time_prefixes`, `domain_names`,
# `pref_indices`, `pers_indices` and `val_indices`.
#
# Arguments:
#   row_num            Row of `df` to check; NULL (default) picks one at random.
#   time_interval_num  Position in `time_prefixes` to check; NULL (default)
#                      picks one at random.
# Returns: NULL invisibly; the report is printed to the console. Invalid
#   arguments print an error line and return without checking anything.
qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) {
  banner <- "========================================\n"
  n_rows <- nrow(df)
  n_intervals <- length(time_prefixes)

  # TRUE when `x` is a single, non-missing number inside [1, upper]
  is_valid_index <- function(x, upper) {
    is.numeric(x) && length(x) == 1 && !is.na(x) && x >= 1 && x <= upper
  }

  # Pick a random row or use specified row
  if (is.null(row_num)) {
    if (n_rows < 1) {
      cat("ERROR: Data has no rows to check.\n")
      return(invisible(NULL))
    }
    random_row <- sample(seq_len(n_rows), 1)
    row_label <- "Random"
  } else {
    if (!is_valid_index(row_num, n_rows)) {
      cat("ERROR: Row number must be between 1 and", n_rows, "\n")
      return(invisible(NULL))
    }
    random_row <- row_num
    row_label <- "Specified"
  }
  cat("\n", banner, sep = "")
  cat("QA CHECK:", row_label, "Row #", random_row, "\n")

  # Pick a random time interval or use specified interval
  if (is.null(time_interval_num)) {
    test_interval_idx <- sample(seq_len(n_intervals), 1)
    interval_label <- "Random"
  } else {
    if (!is_valid_index(time_interval_num, n_intervals)) {
      cat(sprintf("ERROR: Time interval number must be between 1 and %d\n", n_intervals))
      cat(" ", paste(sprintf("%d = %s", seq_len(n_intervals), time_prefixes),
                     collapse = ", "), "\n", sep = "")
      return(invisible(NULL))
    }
    test_interval_idx <- time_interval_num
    interval_label <- "Specified"
  }
  time_prefix <- time_prefixes[test_interval_idx]
  cat(interval_label, "Time Interval #", test_interval_idx, ": ", time_prefix, "\n")
  cat(banner, "\n", sep = "")

  # The three domains: output tag, human-readable label, item positions
  domain_specs <- list(
    list(tag = domain_names[1], label = "Preferences", idx = pref_indices),
    list(tag = domain_names[2], label = "Personality", idx = pers_indices),
    list(tag = domain_names[3], label = "Values", idx = val_indices)
  )

  # Check each of the 3 domains
  for (spec in domain_specs) {
    cat(sprintf("--- %s: %s ---\n", time_prefix, spec$label))

    source_cols_domain <- paste0(time_prefix, "_", items[spec$idx])
    target_col <- paste0(time_prefix, "_", spec$tag, "_MEAN")

    # Collect the item values for this row; an absent column counts as missing
    values <- rep(NA_real_, length(source_cols_domain))
    cat("Source values:\n")
    for (i in seq_along(source_cols_domain)) {
      col <- source_cols_domain[i]
      if (col %in% names(df)) {
        values[i] <- as.numeric(df[random_row, col])
      }
      shown <- if (is.na(values[i])) "NA" else sprintf("%.5f", values[i])
      cat(sprintf(" %s: %s\n", col, shown))
    }

    # Calculate expected mean from the non-missing items only
    valid_values <- values[!is.na(values)]
    if (length(valid_values) == 0) {
      cat(" No valid values to calculate mean.\n")
    } else {
      expected_mean <- mean(valid_values)
      cat("\nCalculation:\n")
      cat(sprintf(" Sum: %s = %.5f\n",
                  paste(sprintf("%.5f", valid_values), collapse = " + "),
                  sum(valid_values)))
      cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean))

      # A missing target column or a missing stored mean is a mismatch, not
      # an error and not an "NA" verdict.
      if (target_col %in% names(df)) {
        actual_value <- as.numeric(df[random_row, target_col])
        cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value))
        is_match <- !is.na(actual_value) &&
          abs(expected_mean - actual_value) < 0.0001
      } else {
        cat(sprintf(" Target (%s): column not found in data\n", target_col))
        is_match <- FALSE
      }
      cat(sprintf(" Match: %s\n", if (is_match) "YES ✓" else "NO ✗"))
    }
    cat("\n")
  }

  cat(banner)
  cat("END QA CHECK\n")
  cat(banner, "\n", sep = "")
  invisible(NULL)
}
# Run QA check on random row and random time interval
# (leave the arguments blank for a random row & interval; see the printed
# examples below for the other calling patterns)
cat("\n\n")
qa_check_random_row()

# Instructions for running additional checks
cat(
  "\n",
  "*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n",
  "For random row AND random time interval, run:\n",
  " qa_check_random_row()\n",
  "\nFor specific row (e.g., row 118) with random interval:\n",
  " qa_check_random_row(118)\n",
  "\nFor random row with specific interval (e.g., 3 = NFut_5):\n",
  " qa_check_random_row(time_interval_num = 3)\n",
  "\nFor specific row AND specific interval:\n",
  " qa_check_random_row(118, 3)\n",
  "\n",
  "Time Interval Numbers:\n",
  " 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n",
  " 4 = NFut_10, 5 = X5.10past, 6 = X5.10fut\n",
  "\n",
  sep = ""
)

# Save the modified dataframe back to CSV. This OVERWRITES the input file.
# na = "" writes NA values as empty cells instead of "NA" text.
# The write used to run while the surrounding messages claimed it was
# commented out; the flag below makes the choice explicit and the messages
# report what actually happened. Set it to FALSE to review the output above
# without touching the file.
save_to_file <- TRUE

# Number of domain mean columns actually present after processing
n_mean_cols <- sum(target_cols %in% names(df))

if (save_to_file) {
  write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
  cat("\n*** CHANGES SAVED TO eohi2.csv ***\n")
  cat(sprintf(
    "\nProcessing complete! %d domain mean columns calculated and saved to file.\n",
    n_mean_cols
  ))
} else {
  cat("\n*** WRITE TO FILE IS DISABLED ***\n")
  cat("Review the output above, then set save_to_file to TRUE to save changes.\n")
  cat(sprintf(
    "\nProcessing complete! %d domain mean columns calculated (not yet saved to file).\n",
    n_mean_cols
  ))
}