eohi/.history/eohi1/mixed anova - domain means_20250915112521.r
2025-12-23 15:47:09 -05:00

410 lines
15 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# mixed anova not working
# 12/09/2025
# add sum contrasts
# Mixed ANOVA Analysis for Domain Means
# EOHI Experiment Data Analysis - Domain Level Analysis
# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
# Load required libraries
library(tidyverse)
library(ez)
library(car)
library(nortest) # For normality tests
library(ggplot2) # For plotting
library(emmeans) # For post-hoc comparisons
library(purrr) # For map functions
# Global options to remove scientific notation
options(scipen = 999)
# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
options(contrasts = c("contr.sum", "contr.poly"))
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
# Read the data
data <- read.csv("exp1.csv")
# Display basic information about the dataset
print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
print(paste("Number of participants:", length(unique(data$pID))))
# Verify the specific variables we need
required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
missing_vars <- required_vars[!required_vars %in% colnames(data)]
if (length(missing_vars) > 0) {
print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
} else {
print("All required domain mean variables found!")
}
# Define domain mapping
domain_mapping <- data.frame(
variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
time = c(rep("Past", 4), rep("Future", 4)),
domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
stringsAsFactors = FALSE
)
# Domain mapping created
# More efficient data pivoting (avoiding pivot_longer issues)
pivot_domain_means <- function(data, domain_mapping) {
# Pre-allocate the result data frame for better performance
n_rows <- nrow(data) * nrow(domain_mapping)
long_data <- data.frame(
pID = character(n_rows),
ResponseId = character(n_rows),
TEMPORAL_DO = character(n_rows),
TIME = character(n_rows),
DOMAIN = character(n_rows),
MEAN_DIFFERENCE = numeric(n_rows),
stringsAsFactors = FALSE
)
row_idx <- 1
for (i in 1:nrow(domain_mapping)) {
var_name <- domain_mapping$variable[i]
time_level <- domain_mapping$time[i]
domain_level <- domain_mapping$domain[i]
# Check if variable exists
if (!var_name %in% colnames(data)) {
print(paste("Warning: Variable", var_name, "not found in data"))
next
}
# Get the number of rows for this variable
n_data_rows <- nrow(data)
# Fill in the data for this variable
long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID)
long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId)
long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO)
long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level
long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level
long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]]
row_idx <- row_idx + n_data_rows
}
# Convert to factors with proper levels
long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future"))
long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
long_data$pID <- as.factor(long_data$pID)
long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO)
return(long_data)
}
# Pivot data to long format
tryCatch({
long_data <- pivot_domain_means(data, domain_mapping)
}, error = function(e) {
print(paste("Error in data pivoting:", e$message))
stop("Cannot proceed without proper data structure")
})
print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x")))
print(paste("Number of participants:", length(unique(long_data$pID))))
# =============================================================================
# DESCRIPTIVE STATISTICS
# =============================================================================
# Overall descriptive statistics by TIME and DOMAIN
desc_stats <- long_data %>%
group_by(TIME, DOMAIN) %>%
summarise(
n = n(),
mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
.groups = 'drop'
)
print("Descriptive statistics by TIME and DOMAIN:")
print(desc_stats)
# Descriptive statistics by between-subjects factors
desc_stats_by_temporal <- long_data %>%
group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
summarise(
n = n(),
mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
.groups = 'drop'
)
print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
print(desc_stats_by_temporal)
# =============================================================================
# ASSUMPTION TESTING
# =============================================================================
# Remove missing values for assumption testing
long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
# 1. Missing values check
missing_summary <- long_data %>%
group_by(TIME, DOMAIN) %>%
summarise(
n_total = n(),
n_missing = sum(is.na(MEAN_DIFFERENCE)),
pct_missing = round(100 * n_missing / n_total, 2),
.groups = 'drop'
)
print("Missing values by TIME and DOMAIN:")
print(missing_summary)
# 2. Outlier detection
outlier_summary <- long_data_clean %>%
group_by(TIME, DOMAIN) %>%
summarise(
n = n(),
mean = mean(MEAN_DIFFERENCE),
sd = sd(MEAN_DIFFERENCE),
q1 = quantile(MEAN_DIFFERENCE, 0.25),
q3 = quantile(MEAN_DIFFERENCE, 0.75),
iqr = q3 - q1,
lower_bound = q1 - 1.5 * iqr,
upper_bound = q3 + 1.5 * iqr,
n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
.groups = 'drop'
)
print("Outlier summary (IQR method):")
print(outlier_summary)
# 3. Anderson-Darling normality test (streamlined)
normality_results <- long_data_clean %>%
group_by(TIME, DOMAIN) %>%
summarise(
n = n(),
ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
.groups = 'drop'
)
print("Anderson-Darling normality test results:")
print(round(normality_results, 5))
# 4. Homogeneity of variance (Levene's test)
# Test homogeneity across TIME within each DOMAIN
homogeneity_time <- long_data_clean %>%
group_by(DOMAIN) %>%
summarise(
levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
.groups = 'drop'
)
print("Homogeneity of variance across TIME within each DOMAIN:")
print(homogeneity_time)
# Test homogeneity across DOMAIN within each TIME
homogeneity_domain <- long_data_clean %>%
group_by(TIME) %>%
summarise(
levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
.groups = 'drop'
)
print("Homogeneity of variance across DOMAIN within each TIME:")
print(homogeneity_domain)
# =============================================================================
# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
# =============================================================================
# Function to calculate Hartley's F-max ratio
calculate_hartley_ratio <- function(variances) {
max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
}
# =============================================================================
# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
# =============================================================================
# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
# within each combination of within-subjects factors (TIME × DOMAIN)
# First, let's check what values TEMPORAL_DO actually has
print("=== CHECKING TEMPORAL_DO VALUES ===")
print("Unique TEMPORAL_DO values:")
print(unique(long_data_clean$TEMPORAL_DO))
print("TEMPORAL_DO value counts:")
print(table(long_data_clean$TEMPORAL_DO))
print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
observed_temporal_ratios <- long_data_clean %>%
group_by(TIME, DOMAIN) %>%
summarise(
# Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
# Calculate F-max ratio
f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
.groups = 'drop'
) %>%
select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
print(observed_temporal_ratios)
# More efficient bootstrap function for Hartley's F-max test
bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
# Get unique groups and their sample sizes
groups <- unique(data[[group_var]])
# Calculate observed variances for each group
observed_vars <- data %>%
group_by(!!sym(group_var)) %>%
summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
pull(var)
# Handle invalid variances
if(any(observed_vars <= 0 | is.na(observed_vars))) {
observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
}
# Calculate observed F-max ratio
observed_ratio <- max(observed_vars) / min(observed_vars)
# Pre-allocate storage for bootstrap ratios
bootstrap_ratios <- numeric(n_iter)
# Get group data once
group_data_list <- map(groups, ~ {
group_data <- data[data[[group_var]] == .x, response_var]
group_data[!is.na(group_data)]
})
# Bootstrap with pre-allocated storage
for(i in 1:n_iter) {
# Bootstrap sample from each group independently
sample_vars <- map_dbl(group_data_list, ~ {
bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
var(bootstrap_sample, na.rm = TRUE)
})
bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
}
# Remove invalid ratios
valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
if(length(valid_ratios) == 0) {
stop("No valid bootstrap ratios generated")
}
# Calculate critical value (95th percentile)
critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
# Return only essential information
return(list(
observed_ratio = observed_ratio,
critical_95 = critical_95,
n_valid_iterations = length(valid_ratios)
))
}
# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
set.seed(123) # For reproducibility
hartley_temporal_results <- long_data_clean %>%
group_by(TIME, DOMAIN) %>%
summarise(
hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
.groups = 'drop'
) %>%
mutate(
observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
significant = observed_ratio > critical_95
) %>%
select(TIME, DOMAIN, observed_ratio, critical_95, significant)
print(hartley_temporal_results)
# =============================================================================
# MIXED ANOVA ANALYSIS
# =============================================================================
# Mixed ANOVA using aov()
# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
data = long_data_clean)
print("Mixed ANOVA Results:")
print(summary(mixed_anova_model))
# Extract effect sizes (generalized eta squared)
# For aov() objects, we need to extract from the summary
anova_summary <- summary(mixed_anova_model)
# Effect sizes will be calculated separately
# Post-hoc comparisons using emmeans
print("\n=== POST-HOC COMPARISONS ===")
# Main effect of TIME
print("Main Effect of TIME:")
time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
print(time_contrasts)
# Main effect of DOMAIN
print("\nMain Effect of DOMAIN:")
domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
print(domain_contrasts)
# Main effect of TEMPORAL_DO
print("\nMain Effect of TEMPORAL_DO:")
temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
print(temporal_contrasts)
# Two-way interactions
print("\nTIME × DOMAIN Interaction:")
time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
print(time_domain_contrasts)
print("\nTEMPORAL_DO × TIME Interaction:")
temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
print(temporal_time_contrasts)
print("\nTEMPORAL_DO × DOMAIN Interaction:")
temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
print(temporal_domain_contrasts)
# Three-way interaction
print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
print(three_way_contrasts)