# Mixed ANOVA Analysis for Past vs Future Differences
# EOHI Experiment Data Analysis

# Load required libraries
library(tidyverse)
library(ez)
library(car)
library(nortest)      # For normality tests

# Read the data (path is relative to the current working directory)
data <- read.csv("eohi1/exp1.csv")

# Display basic information about the dataset
cat("Dataset dimensions:", dim(data), "\n")
cat("Number of participants:", length(unique(data$pID)), "\n")

# Check experimental conditions.
# print() is called explicitly so the cross-tabulation is also shown when the
# script is run through source(), which does not auto-print top-level values.
cat("\nExperimental conditions:\n")
print(table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO))

# STEP 1: PROPER DATA RESHAPING
# Lookup table pairing every item (the suffix of its NPastDiff_/NFutDiff_
# column) with its domain category. Items are listed in runs of five per
# category, in the same order as the category labels below.
domain_info <- data.frame(
  domain = c(
    # Preferences
    "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
    # Personality
    "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious",
    "pers_complex",
    # Values
    "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice",
    # Life satisfaction
    "life_ideal", "life_excellent", "life_satisfied", "life_important",
    "life_change"
  ),
  domain_type = rep(
    c("Preferences", "Personality", "Values", "Life_Satisfaction"),
    each = 5
  ),
  stringsAsFactors = FALSE
)

# Reshape ALL domains at once into long format, tagging every row with its
# domain information.
#
# For each row of `domain_info`, the wide columns `NPastDiff_<domain>` and
# `NFutDiff_<domain>` are stacked into a single `Difference` column, and
# `TimePerspective` records which of the two columns a row came from.
#
# Args:
#   data:        Wide data frame holding the identifier/covariate columns
#                listed in `id_cols` below plus the NPastDiff_*/NFutDiff_*
#                columns.
#   domain_info: Data frame with columns `domain` (column-name suffix) and
#                `domain_type` (category label).
#
# Returns:
#   A data frame with the identifier columns followed by TimePerspective,
#   Difference, Domain_Type and Domain_Item. Within each domain the "Past"
#   rows come first, then the "Future" rows. TimePerspective, Domain_Type,
#   Domain_Item and pID are converted to factors.
#
# Domains whose past or future column is absent are reported on the console
# and skipped. An error is raised when an identifier column is missing or
# when no domain at all could be reshaped.
reshape_all_domains <- function(data, domain_info) {
  id_cols <- c(
    "pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO",
    "COC_DO", "demo_sex", "demo_age_1", "AOT_total", "CRT_correct"
  )

  absent_ids <- setdiff(id_cols, colnames(data))
  if (length(absent_ids) > 0) {
    stop(
      "Required columns missing from data: ",
      paste(absent_ids, collapse = ", "),
      call. = FALSE
    )
  }

  # Loop-invariant pieces: the identifier columns and the row count.
  id_data <- data[, id_cols, drop = FALSE]
  n_rows <- nrow(data)

  # Build the long-format rows for one value column of one domain.
  # rep(..., n_rows) keeps the assignment valid for zero-row input too.
  stack_one <- function(value_col, perspective, domain_name, domain_type) {
    long <- id_data
    long$TimePerspective <- rep(perspective, n_rows)
    long$Difference <- data[[value_col]]
    long$Domain_Type <- rep(domain_type, n_rows)   # e.g., "Preferences"
    long$Domain_Item <- rep(domain_name, n_rows)   # e.g., "pref_read"
    long
  }

  # Collect per-domain pieces in a preallocated list and bind once at the end
  # instead of growing a data frame inside the loop.
  pieces <- vector("list", nrow(domain_info))

  # seq_len() iterates zero times for an empty `domain_info` (1:0 would not).
  for (i in seq_len(nrow(domain_info))) {
    domain_name <- as.character(domain_info$domain[i])
    domain_type <- as.character(domain_info$domain_type[i])

    past_col <- paste0("NPastDiff_", domain_name)
    fut_col <- paste0("NFutDiff_", domain_name)

    # Check if columns exist; skipped domains leave their slot as NULL
    if (!all(c(past_col, fut_col) %in% colnames(data))) {
      cat("Warning: Columns", past_col, "or", fut_col, "not found\n")
      next
    }

    # Combine past and future data for this domain (past rows first)
    pieces[[i]] <- rbind(
      stack_one(past_col, "Past", domain_name, domain_type),
      stack_one(fut_col, "Future", domain_name, domain_type)
    )
  }

  pieces <- Filter(Negate(is.null), pieces)
  if (length(pieces) == 0) {
    stop(
      "No domain columns found: none of the NPastDiff_*/NFutDiff_* column ",
      "pairs listed in domain_info exist in data.",
      call. = FALSE
    )
  }

  all_long_data <- do.call(rbind, pieces)
  rownames(all_long_data) <- NULL

  # Convert to factors
  factor_cols <- c("TimePerspective", "Domain_Type", "Domain_Item", "pID")
  all_long_data[factor_cols] <- lapply(all_long_data[factor_cols], as.factor)

  all_long_data
}

# Reshape all data to long format
cat("\nReshaping data to long format...\n")
long_data <- reshape_all_domains(data, domain_info)

cat("Long format data dimensions:", dim(long_data), "\n")
cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n")
# Domain_Type is a factor; cat() would print its integer codes, so convert the
# distinct values to their labels first.
cat("Domain types:", as.character(unique(long_data$Domain_Type)), "\n")

# STEP 2: ASSUMPTION CHECKING
# strrep() builds the 80-character rule; `"=" * 80` is an error in R because
# `*` only accepts numeric operands.
cat("\n", strrep("=", 80), "\n")
cat("STEP 2: CHECKING ASSUMPTIONS\n")
cat(strrep("=", 80), "\n")

# 2.1 Check for missing values
# Count total and missing Difference scores in every
# domain type x item x time perspective cell, then express the missing
# share as a percentage rounded to two decimals.
missing_summary <- long_data %>%
  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
  summarise(
    n_total = n(),
    n_missing = sum(is.na(Difference)),
    .groups = "drop"
  ) %>%
  mutate(pct_missing = round(100 * n_missing / n_total, 2))

cat("\nMissing values by domain and time perspective:\n")
print(missing_summary)

# Remove missing values: keep only rows that carry a Difference score
long_data_clean <- long_data[!is.na(long_data[["Difference"]]), ]
cat("\nData after removing missing values:", dim(long_data_clean), "\n")

# 2.2 Outlier detection
cat("\nChecking for outliers...\n")
# Per cell, a score counts as an outlier when it falls outside
# [q1 - 1.5 * IQR, q3 + 1.5 * IQR].
outlier_summary <- long_data_clean %>%
  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
  summarise(
    n = n(),
    mean = mean(Difference),
    sd = sd(Difference),
    q1 = quantile(Difference, probs = 0.25),
    q3 = quantile(Difference, probs = 0.75),
    iqr = q3 - q1,
    lower_bound = q1 - 1.5 * iqr,
    upper_bound = q3 + 1.5 * iqr,
    # "not inside the fences" is the same set as "below lower or above upper"
    n_outliers = sum(
      !(Difference >= lower_bound & Difference <= upper_bound)
    ),
    .groups = "drop"
  )

cat("Outlier summary (IQR method):\n")
print(outlier_summary)

# 2.3 Normality tests
# Runs Shapiro-Wilk and Anderson-Darling per domain item x time perspective
# cell and flags a cell as normal when every available test has p > 0.05.
cat("\nTesting normality...\n")
normality_results <- long_data_clean %>%
  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
  summarise(
    n = n(),
    # `n` is a single number per group, so a plain if/else is used instead of
    # the vectorised ifelse(). shapiro.test() accepts 3 to 5000 observations.
    shapiro_p = if (n >= 3 && n <= 5000) {
      shapiro.test(Difference)$p.value
    } else {
      NA_real_
    },
    # nortest::ad.test() stops with "sample size must be greater than 7",
    # so the smallest usable group size is 8 (not 7).
    anderson_p = if (n >= 8) {
      ad.test(Difference)$p.value
    } else {
      NA_real_
    },
    .groups = "drop"
  ) %>%
  mutate(
    shapiro_normal = shapiro_p > 0.05,
    anderson_normal = anderson_p > 0.05,
    # Combine whichever tests could be run; NA when neither was possible
    overall_normal = case_when(
      !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal,
      !is.na(shapiro_p) ~ shapiro_normal,
      !is.na(anderson_p) ~ anderson_normal,
      TRUE ~ NA
    )
  )

cat("Normality test results:\n")
print(normality_results)

# 2.4 Homogeneity of variance (Levene's test)
cat("\nTesting homogeneity of variance...\n")
# One Levene test per domain item, comparing the spread of Difference
# between the TimePerspective levels; p > 0.05 is labelled homogeneous.
homogeneity_results <- long_data_clean %>%
  group_by(Domain_Type, Domain_Item) %>%
  summarise(
    levene_p = leveneTest(Difference ~ TimePerspective)[["Pr(>F)"]][1],
    .groups = "drop"
  ) %>%
  mutate(homogeneous = levene_p > 0.05)

cat("Homogeneity of variance results:\n")
print(homogeneity_results)

# STEP 3: DESCRIPTIVE STATISTICS
# strrep() builds the 80-character rule; `"=" * 80` is an error in R because
# `*` only accepts numeric operands.
cat("\n", strrep("=", 80), "\n")
cat("STEP 3: DESCRIPTIVE STATISTICS\n")
cat(strrep("=", 80), "\n")

# Location and spread of the Difference scores for every
# domain type x item x time perspective cell.
desc_stats <- long_data_clean %>%
  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
  summarise(
    n = n(),
    # central tendency and dispersion
    mean = mean(Difference),
    sd = sd(Difference),
    median = median(Difference),
    # quartiles
    q1 = quantile(Difference, probs = 0.25),
    q3 = quantile(Difference, probs = 0.75),
    # observed range
    min = min(Difference),
    max = max(Difference),
    .groups = "drop"
  )

cat("Descriptive statistics:\n")
print(desc_stats)

# Summary of all results
# strrep() builds the 80-character rule; `"=" * 80` is an error in R because
# `*` only accepts numeric operands.
cat("\n", strrep("=", 80), "\n")
cat("SUMMARY OF ALL DOMAINS\n")
cat(strrep("=", 80), "\n")

# Build one summary row per analysed domain from `results_list`.
#
# NOTE(review): `results_list` is not created anywhere in the visible part of
# this script. Each element is read below as a list with `descriptive`
# (columns TimePerspective and mean), `cohens_d`, and either `anova$ANOVA`
# (columns Effect and p) or `t_test$p.value`. Fail fast with an actionable
# message instead of R's bare "object not found".
if (!exists("results_list")) {
  stop(
    "`results_list` is not defined: run the per-domain analysis that builds ",
    "it before creating the summary table.",
    call. = FALSE
  )
}

# Empty, correctly typed table; also the result when `results_list` is empty.
summary_df <- data.frame(
  Domain = character(),
  Past_Mean = numeric(),
  Future_Mean = numeric(),
  Cohen_d = numeric(),
  Significant = logical(),
  stringsAsFactors = FALSE
)

# First element of `x`, or NA when `x` is NULL or empty, so that a missing
# statistic becomes an NA cell rather than breaking data.frame().
first_or_na <- function(x) {
  if (length(x) > 0) x[[1]] else NA_real_
}

# Turn one entry of `results_list` into a one-row summary data frame.
summarise_domain_result <- function(domain, result) {
  desc <- result$descriptive
  past_mean <- first_or_na(desc$mean[desc$TimePerspective == "Past"])
  fut_mean <- first_or_na(desc$mean[desc$TimePerspective == "Future"])
  cohens_d <- first_or_na(result$cohens_d)

  # Prefer the ANOVA p-value for the TimePerspective effect; the t-test is
  # consulted only when no ANOVA table is present.
  p_val <- NA_real_
  if (!is.null(result$anova) && !is.null(result$anova$ANOVA)) {
    anova_table <- result$anova$ANOVA
    p_val <- first_or_na(
      anova_table$p[anova_table$Effect == "TimePerspective"]
    )
  } else if (!is.null(result$t_test)) {
    p_val <- first_or_na(result$t_test$p.value)
  }

  data.frame(
    Domain = domain,
    Past_Mean = round(past_mean, 3),
    Future_Mean = round(fut_mean, 3),
    Cohen_d = round(cohens_d, 5),
    # Significant at p < 0.05; a missing p-value counts as not significant
    Significant = !is.na(p_val) && p_val < 0.05,
    stringsAsFactors = FALSE
  )
}

# Create all rows first and bind once, rather than growing the table with
# rbind() inside a loop.
domain_rows <- lapply(names(results_list), function(domain) {
  summarise_domain_result(domain, results_list[[domain]])
})
summary_df <- do.call(rbind, c(list(summary_df), domain_rows))

# Sort by effect size (absolute value)
summary_df <- summary_df[order(abs(summary_df$Cohen_d), decreasing = TRUE), ]

print(summary_df)

# Create visualization
library(ggplot2)

# Prepare data for plotting: absolute effect size, the sign of the effect as
# a label, and a domain category derived from the item-name prefix.
plot_data <- summary_df %>%
  mutate(
    Effect_Size = abs(Cohen_d),
    Direction = ifelse(Cohen_d > 0, "Past > Future", "Future > Past"),
    # Patterns are anchored so only a true name prefix decides the category
    Domain_Type = case_when(
      grepl("^pref_", Domain) ~ "Preferences",
      grepl("^pers_", Domain) ~ "Personality",
      grepl("^val_", Domain) ~ "Values",
      grepl("^life_", Domain) ~ "Life Satisfaction",
      TRUE ~ "Other"
    )
  )

# Effect size plot: horizontal bars of |d|, coloured by direction and faded
# when the effect is not significant.
p1 <- ggplot(plot_data, aes(x = reorder(Domain, Effect_Size), y = Effect_Size,
                           fill = Direction, alpha = Significant)) +
  geom_col() +
  coord_flip() +
  # Values are named so TRUE is always opaque and FALSE always faded; unnamed
  # values are assigned by position, which gives TRUE the faded alpha
  # whenever every effect is significant.
  scale_alpha_manual(values = c("FALSE" = 0.5, "TRUE" = 1),
                     name = "Significant\n(p < 0.05)") +
  scale_fill_manual(values = c("Past > Future" = "#E74C3C", "Future > Past" = "#3498DB")) +
  labs(
    title = "Effect Sizes: Past vs Future Differences",
    subtitle = "Absolute Cohen's d values across domains",
    x = "Domain",
    y = "|Cohen's d|",
    fill = "Direction"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))

print(p1)

# Mean differences plot
# Stack Past_Mean and Future_Mean into one value column so that every domain
# contributes one bar per time perspective; the "_Mean" suffix is stripped
# from the former column names to obtain the labels "Past" and "Future".
plot_data_long <- summary_df %>%
  select(Domain, Past_Mean, Future_Mean) %>%
  pivot_longer(
    cols = c(Past_Mean, Future_Mean),
    names_to = "TimePerspective",
    values_to = "Mean_Difference"
  ) %>%
  mutate(TimePerspective = sub("_Mean", "", TimePerspective, fixed = TRUE))

# Geometry: side-by-side bars per domain, drawn horizontally
p2 <- ggplot(
  plot_data_long,
  aes(
    x = reorder(Domain, Mean_Difference),
    y = Mean_Difference,
    fill = TimePerspective
  )
) +
  geom_col(position = "dodge") +
  coord_flip()

# Colours and labelling
p2 <- p2 +
  scale_fill_manual(values = c("Past" = "#E74C3C", "Future" = "#3498DB")) +
  labs(
    title = "Mean Differences by Time Perspective",
    subtitle = "Past vs Future difference scores",
    x = "Domain",
    y = "Mean Difference Score",
    fill = "Time Perspective"
  )

# Theme
p2 <- p2 +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))

print(p2)

# Closing console report: the first three domains of summary_df and the
# count of significant rows.
cat("\nAnalysis complete! Check the plots and summary table above.\n")
cat("Key findings:\n")
cat(
  "- Domains with largest effect sizes:",
  toString(head(summary_df$Domain, n = 3)),
  "\n"
)
cat(
  "- Number of significant differences:",
  sum(summary_df$Significant),
  "out of",
  nrow(summary_df),
  "\n"
)