eohi/.history/eohi1/mixed anova - domain means_20250912161937.r
2025-12-23 15:47:09 -05:00

404 lines
14 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Mixed ANOVA Analysis for Domain Means
# EOHI Experiment Data Analysis - Domain Level Analysis
# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
# Load required libraries
library(tidyverse)
library(ez)
library(car)
library(nortest) # For normality tests
library(ggplot2) # For plotting
library(emmeans) # For post-hoc comparisons
# Global options to remove scientific notation
options(scipen = 999)
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
# Read the data
data <- read.csv("exp1.csv")
# Display basic information about the dataset
print(dim(data))
print(length(unique(data$pID)))
# Check experimental conditions
print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
# Check what domain mean columns are available
domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
print(domain_mean_cols)
# Verify the specific variables we need
required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
missing_vars <- required_vars[!required_vars %in% colnames(data)]
if (length(missing_vars) > 0) {
print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
} else {
print("All required domain mean variables found!")
}
# Define domain mapping
domain_mapping <- data.frame(
variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
time = c(rep("Past", 4), rep("Future", 4)),
domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
stringsAsFactors = FALSE
)
print(domain_mapping)
# Function to pivot data to long format
pivot_domain_means <- function(data, domain_mapping) {
long_data <- data.frame()
for (i in 1:nrow(domain_mapping)) {
var_name <- domain_mapping$variable[i]
time_level <- domain_mapping$time[i]
domain_level <- domain_mapping$domain[i]
# Check if variable exists
if (!var_name %in% colnames(data)) {
print(paste("Warning: Variable", var_name, "not found in data"))
next
}
# Create subset for this variable
subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)]
subset_data$TIME <- time_level
subset_data$DOMAIN <- domain_level
subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]]
subset_data[[var_name]] <- NULL # Remove original column
# Add to long data
long_data <- rbind(long_data, subset_data)
}
# Convert to factors with proper levels
long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future"))
long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
long_data$pID <- as.factor(long_data$pID)
long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO)
return(long_data)
}
# Pivot data to long format
tryCatch({
long_data <- pivot_domain_means(data, domain_mapping)
}, error = function(e) {
print(paste("Error in data pivoting:", e$message))
stop("Cannot proceed without proper data structure")
})
print(dim(long_data))
print(length(unique(long_data$pID)))
print(levels(long_data$TIME))
print(levels(long_data$DOMAIN))
# Check data types
print(is.factor(long_data$TIME))
print(is.factor(long_data$DOMAIN))
print(is.factor(long_data$pID))
print(is.numeric(long_data$MEAN_DIFFERENCE))
# Show first 20 rows
print(utils::head(long_data, 20))
# Display structure and sample
str(long_data)
print(utils::head(long_data, 10))
# Show example data for one participant
participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")]
print(participant_1_data)
# =============================================================================
# DESCRIPTIVE STATISTICS
# =============================================================================
# Overall descriptive statistics by TIME and DOMAIN
desc_stats <- long_data %>%
group_by(TIME, DOMAIN) %>%
summarise(
n = n(),
mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
.groups = 'drop'
)
print("Descriptive statistics by TIME and DOMAIN:")
print(desc_stats)
# Descriptive statistics by between-subjects factors
desc_stats_by_temporal <- long_data %>%
group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
summarise(
n = n(),
mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
.groups = 'drop'
)
print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
print(desc_stats_by_temporal)
# =============================================================================
# ASSUMPTION TESTING
# =============================================================================
# Remove missing values for assumption testing
long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
print(paste("Data after removing missing values:", dim(long_data_clean)))
# 1. Missing values check
missing_summary <- long_data %>%
group_by(TIME, DOMAIN) %>%
summarise(
n_total = n(),
n_missing = sum(is.na(MEAN_DIFFERENCE)),
pct_missing = round(100 * n_missing / n_total, 2),
.groups = 'drop'
)
print("Missing values by TIME and DOMAIN:")
print(missing_summary)
# 2. Outlier detection
outlier_summary <- long_data_clean %>%
group_by(TIME, DOMAIN) %>%
summarise(
n = n(),
mean = mean(MEAN_DIFFERENCE),
sd = sd(MEAN_DIFFERENCE),
q1 = quantile(MEAN_DIFFERENCE, 0.25),
q3 = quantile(MEAN_DIFFERENCE, 0.75),
iqr = q3 - q1,
lower_bound = q1 - 1.5 * iqr,
upper_bound = q3 + 1.5 * iqr,
n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
.groups = 'drop'
)
print("Outlier summary (IQR method):")
print(outlier_summary)
# 3. Normality tests
normality_results <- long_data_clean %>%
group_by(TIME, DOMAIN) %>%
summarise(
n = n(),
shapiro_W = ifelse(n >= 3 & n <= 5000,
shapiro.test(MEAN_DIFFERENCE)$statistic,
NA),
shapiro_p = ifelse(n >= 3 & n <= 5000,
shapiro.test(MEAN_DIFFERENCE)$p.value,
NA),
anderson_A = ifelse(n >= 7,
ad.test(MEAN_DIFFERENCE)$statistic,
NA),
anderson_p = ifelse(n >= 7,
ad.test(MEAN_DIFFERENCE)$p.value,
NA),
.groups = 'drop'
)
print("Normality test results:")
print(normality_results)
# Note: Anderson-Darling p-values may appear identical due to machine precision limits
# All p-values are extremely small (< 2.2e-16) indicating strong non-normality
# The test statistics (A values) are actually different across conditions
# Debug: Check if Anderson-Darling test is working properly
print("\n=== DEBUG: Anderson-Darling Test Details ===")
# Get unique combinations of TIME and DOMAIN
unique_combos <- long_data_clean %>%
select(TIME, DOMAIN) %>%
distinct()
# Run Anderson-Darling test for each combination
for(i in 1:nrow(unique_combos)) {
time_val <- unique_combos$TIME[i]
domain_val <- unique_combos$DOMAIN[i]
# Subset data for this combination
subset_data <- long_data_clean %>%
filter(TIME == time_val, DOMAIN == domain_val)
cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n")
# Run Anderson-Darling test
if(nrow(subset_data) >= 7) {
ad_result <- ad.test(subset_data$MEAN_DIFFERENCE)
print(ad_result)
} else {
cat("Sample size too small for Anderson-Darling test\n")
}
cat("\n")
}
# 4. Homogeneity of variance (Levene's test)
# Test homogeneity across TIME within each DOMAIN
homogeneity_time <- long_data_clean %>%
group_by(DOMAIN) %>%
summarise(
levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
.groups = 'drop'
)
print("Homogeneity of variance across TIME within each DOMAIN:")
print(homogeneity_time)
# Test homogeneity across DOMAIN within each TIME
homogeneity_domain <- long_data_clean %>%
group_by(TIME) %>%
summarise(
levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
.groups = 'drop'
)
print("Homogeneity of variance across DOMAIN within each TIME:")
print(homogeneity_domain)
# =============================================================================
# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
# =============================================================================
# Function to calculate Hartley's F-max ratio
calculate_hartley_ratio <- function(variances) {
max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
}
# =============================================================================
# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
# =============================================================================
# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
# within each combination of within-subjects factors (TIME × DOMAIN)
# First, let's check what values TEMPORAL_DO actually has
print("=== CHECKING TEMPORAL_DO VALUES ===")
print("Unique TEMPORAL_DO values:")
print(unique(long_data_clean$TEMPORAL_DO))
print("TEMPORAL_DO value counts:")
print(table(long_data_clean$TEMPORAL_DO))
print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
# Get the actual TEMPORAL_DO levels
temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO))
print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", ")))
observed_temporal_ratios <- long_data_clean %>%
group_by(TIME, DOMAIN) %>%
summarise(
# Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
temporal_var_1 = var(MEAN_DIFFERENCE[TEMPORAL_DO == temporal_levels[1]], na.rm = TRUE),
temporal_var_2 = var(MEAN_DIFFERENCE[TEMPORAL_DO == temporal_levels[2]], na.rm = TRUE),
# Calculate F-max ratio
f_max_ratio = max(temporal_var_1, temporal_var_2) / min(temporal_var_1, temporal_var_2),
.groups = 'drop'
) %>%
select(TIME, DOMAIN, temporal_var_1, temporal_var_2, f_max_ratio)
print(observed_temporal_ratios)
# Function to bootstrap critical values for Hartley's F-max test
bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
# Get unique groups and their sample sizes
groups <- unique(data[[group_var]])
n_groups <- length(groups)
# Calculate observed variances for each group
observed_vars <- data %>%
group_by(!!sym(group_var)) %>%
summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
pull(var)
# Debug: Check for zero or invalid variances
if(any(observed_vars <= 0 | is.na(observed_vars))) {
cat("Warning: Invalid observed variances detected:", observed_vars, "\n")
observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
}
# Calculate observed F-max ratio
observed_ratio <- calculate_hartley_ratio(observed_vars)
# Bootstrap under null hypothesis (equal variances)
# Bootstrap from each group independently to create natural variation
group_data_list <- map(groups, ~ {
group_data <- data[data[[group_var]] == .x, response_var]
group_data[!is.na(group_data)] # Remove any NA values
})
# Bootstrap F-max ratios under null hypothesis
bootstrap_ratios <- replicate(n_iter, {
# Bootstrap sample from each group independently
sample_vars <- map_dbl(group_data_list, ~ {
bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
var(bootstrap_sample, na.rm = TRUE)
})
calculate_hartley_ratio(sample_vars)
})
# Calculate critical value (95th percentile)
# Remove any NaN or infinite values
valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
# Debug: Check what's happening
cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n")
cat("Valid bootstrap ratios:", length(valid_ratios), "\n")
cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n")
cat("Sample of valid ratios:", head(valid_ratios, 5), "\n")
if(length(valid_ratios) == 0) {
cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n")
stop("No valid bootstrap ratios generated")
}
critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
return(list(
observed_ratio = observed_ratio,
critical_95 = critical_95,
bootstrap_ratios = bootstrap_ratios
))
}
# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===")
set.seed(123) # For reproducibility
hartley_temporal_results <- long_data_clean %>%
group_by(TIME, DOMAIN) %>%
summarise(
hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
.groups = 'drop'
) %>%
mutate(
observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
significant = observed_ratio > critical_95
) %>%
select(TIME, DOMAIN, observed_ratio, critical_95, significant)
print(hartley_temporal_results)