# eohi/eohi2/dataP 07 - domain means.r

# Script to calculate domain means for time interval differences in eohi2.csv
# Averages the 5 items within each domain (pref, pers, val) for each time interval type

# Load necessary library
library(dplyr)

setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Load the dataset.
#   check.names = FALSE  keeps the column headers exactly as written in the CSV.
#   na.strings = NULL    passes no NA marker strings to the reader; per the
#                        original author's note this is meant to keep empty
#                        cells as empty strings rather than NA.
df <- read.csv(
  "eohi2.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = NULL
)

# The 15 item names, listed domain by domain (5 preferences, 5 personality,
# 5 values). The same items are used for every time interval.
items <- c(
  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
)

# Positions of each domain's items inside the items vector
pref_indices <- seq_len(5)
pers_indices <- seq(6, 10)
val_indices <- seq(11, 15)

# Column-name prefix of each time interval
time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "X5.10past", "X5.10fut")

# Short domain names used in the target column names
domain_names <- c("pref", "pers", "val")

# All source columns (90 total = 6 time intervals x 15 items), ordered by
# time interval and then by item
source_cols <- unlist(lapply(
  time_prefixes,
  function(prefix) paste0(prefix, "_", items)
))

# All target columns (18 total = 6 time intervals x 3 domains), ordered by
# time interval and then by domain
target_cols <- unlist(lapply(
  time_prefixes,
  function(prefix) paste0(prefix, "_", domain_names, "_MEAN")
))

# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
# Reports how many expected source and target columns are present in df and
# stops the script when more than 45 of the 90 source columns are missing.
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# The calculation step looks columns up by their exact names in names(df), so
# this check must use the exact names too. Matching against trimmed names
# would report a whitespace-padded header as "found" even though the
# calculation would skip it.
actual_cols <- names(df)

# Trimmed names are only used to point out padded headers, which is the likely
# cause when an expected column is listed as missing below.
df_cols <- trimws(actual_cols)
expected_cols <- c(source_cols, target_cols)
padded_cols <- actual_cols[
  actual_cols != df_cols &
    df_cols %in% expected_cols &
    !(df_cols %in% actual_cols)
]
if (length(padded_cols) > 0) {
  cat("WARNING: These headers match expected columns only after trimming\n")
  cat("  whitespace and will NOT be used until they are renamed:\n")
  for (col in padded_cols) {
    cat("    - '", col, "'\n", sep = "")
  }
  cat("\n")
}

# Check Source columns
missing_source <- source_cols[!source_cols %in% actual_cols]
existing_source <- source_cols[source_cols %in% actual_cols]

cat("Source Columns:\n")
cat("  Expected: 90 columns\n")
cat("  Found:", length(existing_source), "columns\n")
cat("  Missing:", length(missing_source), "columns\n")

# List missing names individually only when the list is short enough to read
if (length(missing_source) > 0 && length(missing_source) <= 20) {
  cat("\n  Missing source columns:\n")
  for (col in missing_source) {
    cat("    -", col, "\n")
  }
} else if (length(missing_source) > 20) {
  cat("\n  Too many missing to list individually (", length(missing_source), "missing)\n")
}

# Check Target columns
missing_targets <- target_cols[!target_cols %in% actual_cols]
existing_targets <- target_cols[target_cols %in% actual_cols]

cat("\nTarget Columns:\n")
cat("  Expected: 18 columns\n")
cat("  Found:", length(existing_targets), "columns\n")
cat("  Missing:", length(missing_targets), "columns\n")

if (length(missing_targets) > 0) {
  cat("\n  Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat("  WARNING: Some target columns already exist and will be overwritten.\n")
  }
} else {
  cat("  All target columns exist - will overwrite with calculated values.\n")
}

cat("\n=== END CHECK ===\n\n")

# Stop if critical columns are missing (more than half of the 90 source columns)
if (length(missing_source) > 45) {
  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")

# ============= CALCULATE DOMAIN MEANS =============
# For every time interval x domain pair, averages that domain's item columns
# row by row and stores the result in <time_prefix>_<domain>_MEAN.
cat("Calculating domain means for time interval differences...\n")

# Convert source columns to numeric (as.numeric() turns empty strings into NA)
for (col in source_cols) {
  if (col %in% names(df)) {
    df[[col]] <- as.numeric(df[[col]])
  }
}

# Item positions for each domain, keyed by the short domain name that is used
# in the target column names
domain_items <- setNames(
  list(pref_indices, pers_indices, val_indices),
  domain_names
)

# Count the columns actually written so the summary below is accurate even
# when some domains are skipped for lack of source columns
n_created <- 0L

for (time_prefix in time_prefixes) {
  for (domain in names(domain_items)) {
    mean_col <- paste0(time_prefix, "_", domain, "_MEAN")
    item_cols <- paste0(time_prefix, "_", items[domain_items[[domain]]])
    present_cols <- item_cols[item_cols %in% names(df)]

    if (length(present_cols) == 0) {
      cat("  Skipped (no source columns found):", mean_col, "\n")
      next
    }

    # na.rm = TRUE averages over the non-missing items of each row; a row in
    # which every item is NA yields NaN
    df[[mean_col]] <- rowMeans(df[, present_cols, drop = FALSE], na.rm = TRUE)
    cat("  Processed:", mean_col, "\n")
    n_created <- n_created + 1L
  }
}

cat("\n=== CALCULATION COMPLETE ===\n")
cat(sprintf("  %d domain mean columns created.\n\n", n_created))


# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK =============
# This function can be run multiple times to check different random rows and time intervals

# Spot-check the stored domain means for one row and one time interval.
#
# For each of the three domains, prints the five source item values of the
# chosen row, recomputes their mean (ignoring NA) and compares it with the
# stored <prefix>_<domain>_MEAN value using a tolerance of 0.0001.
#
# df, items, time_prefixes, domain_names and the *_indices vectors are not
# arguments; they are looked up outside the function.
#
# Args:
#   row_num:           row of df to inspect; NULL (default) draws one at random.
#   time_interval_num: position in time_prefixes (1-6); NULL draws one at
#                      random.
#
# Returns NULL. All results are written to the console with cat().
qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) {
  # ---- Row: draw one at random, or validate the requested one ----
  use_random_row <- is.null(row_num)
  if (use_random_row) {
    chosen_row <- sample(seq_len(nrow(df)), 1)
  } else if (row_num < 1 || row_num > nrow(df)) {
    cat("ERROR: Row number must be between 1 and", nrow(df), "\n")
    return(NULL)
  } else {
    chosen_row <- row_num
  }

  row_heading <- if (use_random_row) {
    "QA CHECK: Random Row #"
  } else {
    "QA CHECK: Specified Row #"
  }
  cat("\n========================================\n")
  cat(row_heading, chosen_row, "\n")

  # ---- Time interval: same pattern as the row ----
  use_random_interval <- is.null(time_interval_num)
  if (use_random_interval) {
    interval_idx <- sample(1:6, 1)
  } else if (time_interval_num < 1 || time_interval_num > 6) {
    cat("ERROR: Time interval number must be between 1 and 6\n")
    cat("  1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = X5.10past, 6 = X5.10fut\n")
    return(NULL)
  } else {
    interval_idx <- time_interval_num
  }

  interval_heading <- if (use_random_interval) {
    "Random Time Interval #"
  } else {
    "Specified Time Interval #"
  }
  cat(interval_heading, interval_idx, ": ", time_prefixes[interval_idx], "\n")
  cat("========================================\n\n")

  prefix <- time_prefixes[interval_idx]

  # Item positions and display label per domain, in the order of domain_names
  domain_specs <- list(
    list(indices = pref_indices, label = "Preferences"),
    list(indices = pers_indices, label = "Personality"),
    list(indices = val_indices, label = "Values")
  )

  for (k in seq_along(domain_specs)) {
    spec <- domain_specs[[k]]
    cat(sprintf("--- %s: %s ---\n", prefix, spec$label))

    item_cols <- paste0(prefix, "_", items[spec$indices])
    mean_col <- paste0(prefix, "_", domain_names[k], "_MEAN")

    # Collect and print the five source values; a column absent from df
    # counts as NA
    item_values <- numeric(5)
    cat("Source values:\n")
    for (j in seq_len(5)) {
      item_col <- item_cols[j]
      if (item_col %in% names(df)) {
        v <- df[chosen_row, item_col]
      } else {
        v <- NA
      }
      item_values[j] <- v
      shown <- if (is.na(v)) "NA" else sprintf("%.5f", v)
      cat(sprintf("  %s: %s\n", item_col, shown))
    }

    # Recompute the mean from the non-missing values and compare
    present <- item_values[!is.na(item_values)]
    if (length(present) == 0) {
      cat("  No valid values to calculate mean.\n")
    } else {
      expected_mean <- mean(present)
      stored_mean <- df[chosen_row, mean_col]
      sum_terms <- paste(sprintf("%.5f", present), collapse = " + ")

      cat("\nCalculation:\n")
      cat(sprintf("  Sum: %s = %.5f\n", sum_terms, sum(present)))
      cat(sprintf("  Average of %d values: %.5f\n", length(present), expected_mean))
      cat(sprintf("  Target (%s): %.5f\n", mean_col, stored_mean))

      # ifelse() is kept on purpose: an NA stored mean prints "NA" instead of
      # raising an error
      within_tol <- abs(expected_mean - stored_mean) < 0.0001
      cat(sprintf("  Match: %s\n", ifelse(within_tol, "YES ✓", "NO ✗")))
    }
    cat("\n")
  }

  cat(
    "========================================\n",
    "END QA CHECK\n",
    "========================================\n\n",
    sep = ""
  )
}

# Run one QA pass on a randomly chosen row and time interval
cat("\n\n")
qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below)

# Usage notes for running further checks by hand, printed as one block
usage_lines <- c(
  "\n",
  "*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n",
  "For random row AND random time interval, run:\n",
  "  qa_check_random_row()\n",
  "\nFor specific row (e.g., row 118) with random interval:\n",
  "  qa_check_random_row(118)\n",
  "\nFor random row with specific interval (e.g., 3 = NFut_5):\n",
  "  qa_check_random_row(time_interval_num = 3)\n",
  "\nFor specific row AND specific interval:\n",
  "  qa_check_random_row(118, 3)\n",
  "\n",
  "Time Interval Numbers:\n",
  "  1 = NPast_5,  2 = NPast_10,  3 = NFut_5\n",
  "  4 = NFut_10,  5 = X5.10past,  6 = X5.10fut\n",
  "\n"
)
cat(usage_lines, sep = "")


# Save the modified dataframe back to CSV. This overwrites the input file
# eohi2.csv in place.
# na="" writes NA values as empty cells instead of "NA" text
# To review the output without saving, comment out the write.csv() call below.
write.csv(df, "eohi2.csv", row.names = FALSE, na = "")

# Report what was actually done: the write above is active, so the messages
# must not claim the data is unsaved. The count is taken from the dataframe
# rather than hard-coded.
n_saved <- sum(target_cols %in% names(df))
cat("\n*** CHANGES SAVED TO eohi2.csv ***\n")
cat(sprintf(
  "\nProcessing complete! %d domain mean columns written to eohi2.csv.\n",
  n_saved
))