eohi/.history/eohi1/correlations - eohi x calibration_20250916095303.r

# Load required libraries
library(Hmisc)
library(knitr)
library(dplyr)
library(corrr)
library(broom)
library(purrr)
library(tidyr)
library(tibble)
library(boot)

options(scipen = 999)

setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
# Load data
df1 <- read.csv("exp1.csv")

# Remove columns with all NA values
df1 <- df1 %>% select(where(~ !all(is.na(.))))

# Select variables of interest
eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean",
               "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean")
cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false")

# Create dataset with selected variables
df <- df1[, c(eohi_vars, cal_vars)]

# Ensure all selected variables are numeric
df <- df %>%
  mutate(across(everything(), as.numeric))

# Remove rows with any missing values for correlation analysis
df_complete <- df[complete.cases(df), ]

cat("Sample size for correlation analysis:", nrow(df_complete), "\n")
cat("Total sample size:", nrow(df), "\n")

str(df)
summary(df)
####==== DESCRIPTIVE STATISTICS ====

# Function to compute descriptive statistics
get_descriptives <- function(data, vars) {
  desc_stats <- data %>%
    select(all_of(vars)) %>%
    summarise(across(everything(), list(
      n = ~sum(!is.na(.)),
      mean = ~mean(., na.rm = TRUE),
      sd = ~sd(., na.rm = TRUE),
      min = ~min(., na.rm = TRUE),
      max = ~max(., na.rm = TRUE),
      median = ~median(., na.rm = TRUE),
      q25 = ~quantile(., 0.25, na.rm = TRUE),
      q75 = ~quantile(., 0.75, na.rm = TRUE)
    ))) %>%
    pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
    separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>%
    pivot_wider(names_from = stat, values_from = value) %>%
    mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5)))

  return(desc_stats)
}

# Get descriptives for EOHI variables
eohi_descriptives <- get_descriptives(df, eohi_vars)
cat("\n=== EOHI Variables Descriptives ===\n")
print(eohi_descriptives)

# Get descriptives for calibration variables
cal_descriptives <- get_descriptives(df, cal_vars)
cat("\n=== Calibration Variables Descriptives ===\n")
print(cal_descriptives)

# Check for bimodal or unusual distributions
hist(df$eohi_pref)
hist(df$cal_selfActual)

# Look for extreme values
# boxplot(df$eohi_pref)
# boxplot(df$cal_selfActual)

####==== PEARSON CORRELATIONS ====

# Compute correlation matrix with p-values
cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson")

# Extract correlation coefficients
cor_pearson <- cor_results_pearson$r

# Extract p-values
p_matrix_pearson <- cor_results_pearson$P

# Function to add significance stars
corstars <- function(cor_mat, p_mat) {
  stars <- ifelse(p_mat < 0.001, "***",
                  ifelse(p_mat < 0.01, "**",
                         ifelse(p_mat < 0.05, "*", "")))

  # Combine correlation values with stars, rounded to 5 decimal places
  cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars),
                           nrow = nrow(cor_mat))

  # Set row and column names
  rownames(cor_with_stars) <- rownames(cor_mat)
  colnames(cor_with_stars) <- colnames(cor_mat)

  return(cor_with_stars)
}

# Apply the function
cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson)

cat("\n=== PEARSON CORRELATIONS ===\n")
print(cor_table_pearson, quote = FALSE)

# Extract specific correlations between EOHI and calibration variables
eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars]
eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars]

cat("\n=== EOHI x Calibration Pearson Correlations ===\n")
for(i in 1:nrow(eohi_cal_correlations)) {
  for(j in 1:ncol(eohi_cal_correlations)) {
    cor_val <- eohi_cal_correlations[i, j]
    p_val <- eohi_cal_pvalues[i, j]
    star <- ifelse(p_val < 0.001, "***",
                   ifelse(p_val < 0.01, "**",
                          ifelse(p_val < 0.05, "*", "")))
    cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n",
                rownames(eohi_cal_correlations)[i],
                colnames(eohi_cal_correlations)[j],
                cor_val, star, p_val))
  }
}

####==== SPEARMAN CORRELATIONS ====

# Compute Spearman correlation matrix with p-values
cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman")

# Extract correlation coefficients
cor_spearman <- cor_results_spearman$r

# Extract p-values
p_matrix_spearman <- cor_results_spearman$P

# Apply the function
cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman)

cat("\n=== SPEARMAN CORRELATIONS ===\n")
print(cor_table_spearman, quote = FALSE)

# Extract specific correlations between EOHI and calibration variables
eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars]
eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars]

cat("\n=== EOHI x Calibration Spearman Correlations ===\n")
for(i in 1:nrow(eohi_cal_correlations_spearman)) {
  for(j in 1:ncol(eohi_cal_correlations_spearman)) {
    cor_val <- eohi_cal_correlations_spearman[i, j]
    p_val <- eohi_cal_pvalues_spearman[i, j]
    star <- ifelse(p_val < 0.001, "***",
                   ifelse(p_val < 0.01, "**",
                          ifelse(p_val < 0.05, "*", "")))
    cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n",
                rownames(eohi_cal_correlations_spearman)[i],
                colnames(eohi_cal_correlations_spearman)[j],
                cor_val, star, p_val))
  }
}

####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ====

# Function to compute correlation with bootstrap CI
bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) {
  # Remove missing values
  complete_data <- data[complete.cases(data[, c(var1, var2)]), ]

  if(nrow(complete_data) < 3) {
    return(data.frame(
      correlation = NA,
      ci_lower = NA,
      ci_upper = NA,
      n = nrow(complete_data)
    ))
  }

  # Bootstrap function
  boot_fun <- function(data, indices) {
    cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs")
  }

  # Perform bootstrap
  set.seed(123)  # for reproducibility
  boot_results <- boot(complete_data, boot_fun, R = R)

  # Calculate confidence interval
  ci <- boot.ci(boot_results, type = "perc")

  return(data.frame(
    correlation = boot_results$t0,
    ci_lower = ci$perc[4],
    ci_upper = ci$perc[5],
    n = nrow(complete_data)
  ))
}

# Compute bootstrap CIs for all EOHI x Calibration correlations
cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n")
bootstrap_results_pearson <- expand.grid(
  eohi_var = eohi_vars,
  cal_var = cal_vars,
  stringsAsFactors = FALSE
) %>%
  pmap_dfr(function(eohi_var, cal_var) {
    result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000)
    result$eohi_var <- eohi_var
    result$cal_var <- cal_var
    result$method <- "pearson"
    return(result)
  }) %>%
  mutate(
    correlation = round(correlation, 5),
    ci_lower = round(ci_lower, 5),
    ci_upper = round(ci_upper, 5)
  )

print(bootstrap_results_pearson)

cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n")
bootstrap_results_spearman <- expand.grid(
  eohi_var = eohi_vars,
  cal_var = cal_vars,
  stringsAsFactors = FALSE
) %>%
  pmap_dfr(function(eohi_var, cal_var) {
    result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000)
    result$eohi_var <- eohi_var
    result$cal_var <- cal_var
    result$method <- "spearman"
    return(result)
  }) %>%
  mutate(
    correlation = round(correlation, 5),
    ci_lower = round(ci_lower, 5),
    ci_upper = round(ci_upper, 5)
  )

print(bootstrap_results_spearman)

####==== SUMMARY TABLE ====

# Create comprehensive summary table
summary_table <- bootstrap_results_pearson %>%
  select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>%
  rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>%
  left_join(
    bootstrap_results_spearman %>%
      select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>%
      rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper),
    by = c("eohi_var", "cal_var")
  ) %>%
  # Add p-values
  left_join(
    expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>%
      pmap_dfr(function(eohi_var, cal_var) {
        pearson_p <- p_matrix_pearson[eohi_var, cal_var]
        spearman_p <- p_matrix_spearman[eohi_var, cal_var]
        data.frame(
          eohi_var = eohi_var,
          cal_var = cal_var,
          pearson_p = pearson_p,
          spearman_p = spearman_p
        )
      }),
    by = c("eohi_var", "cal_var")
  ) %>%
  mutate(
    pearson_p = round(pearson_p, 5),
    spearman_p = round(spearman_p, 5)
  )

cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n")
print(summary_table)

# Save results to CSV
# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE)
# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n")