options(scipen = 999) setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") # Load required libraries library(corrplot) library(Hmisc) library(psych) # Load the data exp1_data <- read.csv("exp1.csv") # Define the two sets of variables set1_vars <- c("eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean") set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") # Create subset with only the variables of interest correlation_data <- exp1_data[, c(set1_vars, set2_vars)] # ===== NORMALITY CHECKS ===== # Shapiro-Wilk tests for normality (n < 5000) for(var in names(correlation_data)) { if(length(na.omit(correlation_data[[var]])) <= 5000) { shapiro_result <- shapiro.test(correlation_data[[var]]) cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", var, shapiro_result$p.value, ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) } } # Kolmogorov-Smirnov test for normality for(var in names(correlation_data)) { ks_result <- ks.test(correlation_data[[var]], "pnorm", mean = mean(correlation_data[[var]], na.rm = TRUE), sd = sd(correlation_data[[var]], na.rm = TRUE)) cat(sprintf("%s: KS p = %.5f %s\n", var, ks_result$p.value, ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) } # Visual normality checks pdf("normality_plots.pdf", width = 12, height = 8) par(mfrow = c(2, 4)) for(var in names(correlation_data)) { # Histogram with normal curve overlay hist(correlation_data[[var]], main = paste("Histogram:", var), xlab = var, freq = FALSE) curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), sd = sd(correlation_data[[var]], na.rm = TRUE)), add = TRUE, col = "red", lwd = 2) # Q-Q plot qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) qqline(correlation_data[[var]], col = "red", lwd = 2) } dev.off() # ===== LINEARITY CHECKS ===== # Check linearity between variable pairs pdf("linearity_plots.pdf", width = 15, height = 10) par(mfrow = c(3, 5)) for(i in 1:length(set1_vars)) { for(j in 1:length(set2_vars)) { var1 <- set1_vars[i] var2 <- set2_vars[j] # Scatter plot with regression line plot(correlation_data[[var1]], correlation_data[[var2]], main = paste(var1, "vs", var2), xlab = var1, ylab = var2, pch = 16, cex = 0.6) # Add linear regression line lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) abline(lm_fit, col = "red", lwd = 2) # Add LOESS smooth line for non-linear pattern detection loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), max(correlation_data[[var1]], na.rm = TRUE), length = 100) loess_pred <- predict(loess_fit, x_seq) lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) # Calculate R-squared for linear fit r_squared <- summary(lm_fit)$r.squared cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) } } dev.off() # Residual analysis for linearity pdf("residual_plots.pdf", width = 15, height = 10) par(mfrow = c(3, 5)) for(i in 1:length(set1_vars)) { for(j in 1:length(set2_vars)) { var1 <- set1_vars[i] var2 <- set2_vars[j] lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) residuals <- residuals(lm_fit) fitted <- fitted(lm_fit) plot(fitted, residuals, main = paste("Residuals:", var1, "vs", var2), xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) abline(h = 0, col = "red", lwd = 2) # Add smooth line to residuals lines(lowess(fitted, residuals), col = "blue", lwd = 2) } } dev.off() # Calculate correlation matrix (Spearman only) cor_matrix_spearman <- cor(correlation_data, method = "spearman") # Print correlation matrix with 5 decimal places print(round(cor_matrix_spearman, 5)) # Separate correlations between the two sets (Spearman) set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] print(round(set1_set2_cor, 5)) # Calculate correlations within each set (Spearman) set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] # Statistical significance tests (Spearman) cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") for(i in 1:length(set1_vars)) { for(j in 1:length(set2_vars)) { var1 <- set1_vars[i] var2 <- set2_vars[j] p_val <- cor_test_results_spearman$P[var1, var2] cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) } } # Create correlation plots for both methods pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) corrplot(cor_matrix_spearman, method = "color", type = "upper", order = "hclust", tl.cex = 0.8, tl.col = "black", addCoef.col = "black", number.cex = 0.7, title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") dev.off() pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) corrplot(cor_matrix_pearson, method = "color", type = "upper", order = "hclust", tl.cex = 0.8, tl.col = "black", addCoef.col = "black", number.cex = 0.7, title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") dev.off() # Summary statistics desc_stats <- describe(correlation_data) print(round(desc_stats, 5)) # Save results to CSV files write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") write.csv(round(desc_stats, 5), "descriptive_statistics.csv") # Save correlation results in a formatted table cor_results <- data.frame( Variable1 = character(), Variable2 = character(), Spearman_r = numeric(), P_value = numeric(), stringsAsFactors = FALSE ) # Extract significant correlations between sets for(i in 1:length(set1_vars)) { for(j in 1:length(set2_vars)) { var1 <- set1_vars[i] var2 <- set2_vars[j] r_val <- cor_matrix_spearman[var1, var2] p_val <- cor_test_results_spearman$P[var1, var2] cor_results <- rbind(cor_results, data.frame( Variable1 = var1, Variable2 = var2, Spearman_r = r_val, P_value = p_val, stringsAsFactors = FALSE )) } } write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE)