eohi/.history/eohi2/mixed anova - domain means_20251003145845.r
2025-12-23 15:47:09 -05:00

280 lines
11 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Mixed ANOVA Analysis for Domain Means - EOHI2
# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor
# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc.
# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc.
# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc.
# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc.
# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN
# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN
# Load required libraries
library(tidyverse)
library(ez)
library(car)
library(afex) # For aov_ez (cleaner ANOVA output)
library(nortest) # For normality tests
library(emmeans) # For post-hoc comparisons
library(purrr) # For map functions
library(effsize) # For Cohen's d calculations
library(effectsize) # For effect size calculations
# Global options to remove scientific notation
options(scipen = 999)
# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
options(contrasts = c("contr.sum", "contr.poly"))
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
# Read the data
data <- read.csv("eohi2.csv")
# Display basic information about the dataset
print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
print(paste("Number of participants:", length(unique(data$ResponseId))))
# Verify the specific variables we need
required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN",
"NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN",
"NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN",
"NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN",
"5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN",
"5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN")
missing_vars <- required_vars[!required_vars %in% colnames(data)]
if (length(missing_vars) > 0) {
print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
} else {
print("All required domain mean variables found!")
}
# Define domain mapping with TIME, DOMAIN, and INTERVAL factors
domain_mapping <- data.frame(
variable = required_vars,
time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
rep("Past", 3), rep("Future", 3)),
domain = rep(c("Preferences", "Personality", "Values"), 6),
interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
rep("5_10", 3), rep("5_10", 3)),
stringsAsFactors = FALSE
)
print("Domain mapping created:")
print(domain_mapping)
# Efficient data pivoting using pivot_longer
long_data <- data %>%
select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
pivot_longer(
cols = all_of(required_vars),
names_to = "variable",
values_to = "MEAN_DIFFERENCE"
) %>%
left_join(domain_mapping, by = "variable") %>%
# Convert to factors with proper levels
mutate(
TIME = factor(time, levels = c("Past", "Future")),
DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")),
INTERVAL = factor(interval, levels = c("5", "10", "5_10")),
ResponseId = as.factor(ResponseId),
TEMPORAL_DO = as.factor(TEMPORAL_DO),
INTERVAL_DO = as.factor(INTERVAL_DO)
) %>%
# Select final columns and remove any rows with missing values
select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>%
filter(!is.na(MEAN_DIFFERENCE))
print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x")))
print(paste("Number of participants:", length(unique(long_data$ResponseId))))
# =============================================================================
# DESCRIPTIVE STATISTICS
# =============================================================================
# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL
desc_stats <- long_data %>%
group_by(TIME, DOMAIN, INTERVAL) %>%
summarise(
n = n(),
mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
.groups = 'drop'
)
print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:")
print(desc_stats)
# Descriptive statistics by between-subjects factors
desc_stats_by_between <- long_data %>%
group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>%
summarise(
n = n(),
mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
.groups = 'drop'
)
print("Descriptive statistics by between-subjects factors:")
print(desc_stats_by_between)
# Summary by between-subjects factors only
desc_stats_between_only <- long_data %>%
group_by(TEMPORAL_DO, INTERVAL_DO) %>%
summarise(
n = n(),
mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
.groups = 'drop'
)
print("Descriptive statistics by between-subjects factors only:")
print(desc_stats_between_only)
# =============================================================================
# ASSUMPTION TESTING
# =============================================================================
# Remove missing values for assumption testing
long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
# 1. Missing values check
missing_summary <- long_data %>%
group_by(TIME, DOMAIN, INTERVAL) %>%
summarise(
n_total = n(),
n_missing = sum(is.na(MEAN_DIFFERENCE)),
pct_missing = round(100 * n_missing / n_total, 2),
.groups = 'drop'
)
print("Missing values by TIME, DOMAIN, and INTERVAL:")
print(missing_summary)
# 2. Outlier detection
outlier_summary <- long_data_clean %>%
group_by(TIME, DOMAIN, INTERVAL) %>%
summarise(
n = n(),
mean = mean(MEAN_DIFFERENCE),
sd = sd(MEAN_DIFFERENCE),
q1 = quantile(MEAN_DIFFERENCE, 0.25),
q3 = quantile(MEAN_DIFFERENCE, 0.75),
iqr = q3 - q1,
lower_bound = q1 - 1.5 * iqr,
upper_bound = q3 + 1.5 * iqr,
n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
.groups = 'drop'
)
print("Outlier summary (IQR method):")
print(outlier_summary)
# 3. Anderson-Darling normality test
normality_results <- long_data_clean %>%
group_by(TIME, DOMAIN, INTERVAL) %>%
summarise(
n = n(),
ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
.groups = 'drop'
)
print("Anderson-Darling normality test results:")
# Round only the numeric columns
normality_results_rounded <- normality_results %>%
mutate(across(where(is.numeric), ~ round(.x, 5)))
print(normality_results_rounded)
# 4. Homogeneity of variance (Levene's test)
# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
homogeneity_time <- long_data_clean %>%
group_by(DOMAIN, INTERVAL) %>%
summarise(
levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
.groups = 'drop'
)
print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:")
print(homogeneity_time)
# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
homogeneity_domain <- long_data_clean %>%
group_by(TIME, INTERVAL) %>%
summarise(
levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
.groups = 'drop'
)
print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:")
print(homogeneity_domain)
# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
homogeneity_interval <- long_data_clean %>%
group_by(TIME, DOMAIN) %>%
summarise(
levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1],
levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1],
.groups = 'drop'
)
print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:")
print(homogeneity_interval)
# 5. Hartley's F-max test for between-subjects factors
print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===")
# Check what values the between-subjects factors actually have
print("Unique TEMPORAL_DO values:")
print(unique(long_data_clean$TEMPORAL_DO))
print("Unique INTERVAL_DO values:")
print(unique(long_data_clean$INTERVAL_DO))
# Function to calculate Hartley's F-max ratio
calculate_hartley_ratio <- function(variances) {
max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
}
# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination
print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===")
observed_temporal_ratios <- long_data_clean %>%
group_by(TIME, DOMAIN, INTERVAL) %>%
summarise(
# Calculate variances for each TEMPORAL_DO level within this combination
past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
# Calculate F-max ratio
f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
.groups = 'drop'
) %>%
select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio)
print(observed_temporal_ratios)
# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination
print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===")
observed_interval_ratios <- long_data_clean %>%
group_by(TIME, DOMAIN, TEMPORAL_DO) %>%
summarise(
# Calculate variances for each INTERVAL_DO level within this combination
int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE),
int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE),
# Calculate F-max ratio
f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var),
.groups = 'drop'
) %>%
select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio)
print(observed_interval_ratios)