# Item-level descriptives for the _T / _F variables in exp1.csv ----
#
# For every column whose name ends in `_T` or `_F`, this script computes the
# number of non-missing numeric responses, the number of responses equal to 1,
# and the proportion of responses equal to 1. It then checks that proportion
# against the range expected for the difficulty number embedded in the
# variable name, counts variables per proportion bin, and draws a histogram.

library(tidyverse)
library(ggplot2)

# NOTE(review): machine-specific absolute path; the script only runs where this
# directory exists. Kept as-is because the read below depends on it.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")

# Read data
data <- read.csv("exp1.csv")

# Select variables ending exactly with _T or _F. `matches()` ignores case by
# default, so `ignore.case = FALSE` is required for an exact-case match.
df <- data %>%
  select(matches("(_T|_F)$", ignore.case = FALSE))

# Remove demo_f variable (if present)
df <- df %>%
  select(-any_of("demo_f"))

# Fail early with a clear message instead of an obscure error further down.
if (ncol(df) == 0) {
  stop("No columns ending in _T or _F were found in exp1.csv", call. = FALSE)
}

str(df)

# Coerce to numeric where possible; values that cannot be parsed become NA.
df_num <- df %>%
  mutate(across(everything(), ~ suppressWarnings(as.numeric(.))))

# Expected proportion-correct range (inclusive) for each difficulty number.
expected_ranges <- list(
  "15" = c(0.15, 0.25),
  "35" = c(0.35, 0.45),
  "55" = c(0.55, 0.65),
  "75" = c(0.75, 0.85)
)

# Variable names are expected to look like `<prefix>_<digits>_T` / `_F`.
difficulty_pattern <- ".*_([0-9]+)_[TF]$"

# Return the difficulty number in `name` as a string key (e.g. "15"), or
# NA_character_ when the name does not follow the expected pattern.
difficulty_key <- function(name) {
  if (!grepl(difficulty_pattern, name)) {
    return(NA_character_)
  }
  # Round-trip through numeric so that e.g. "015" maps to the key "15".
  as.character(as.numeric(sub(difficulty_pattern, "\\1", name)))
}

# Classify a proportion against the expected range for the variable's
# difficulty: "YES" / "NO" when both are known, "UNKNOWN" when the name carries
# no recognised difficulty, and NA when there is no proportion to compare
# (no non-missing responses).
classify_difficulty <- function(prop, name) {
  key <- difficulty_key(name)
  if (is.na(key) || !key %in% names(expected_ranges)) {
    return("UNKNOWN")
  }
  if (is.na(prop)) {
    # `if (NA)` is an error in R, so this case must be handled explicitly.
    return(NA_character_)
  }
  bounds <- expected_ranges[[key]]
  if (prop >= bounds[1] && prop <= bounds[2]) "YES" else "NO"
}

# Compute count and proportion correct per variable
descriptives <- purrr::imap_dfr(df_num, function(col, name) {
  x <- col[!is.na(col)]
  n_total <- length(x)
  n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1)
  prop <- if (n_total == 0) NA_real_ else n_correct / n_total

  tibble(
    variable = name,
    n_total = n_total,
    n_correct = n_correct,
    prop_correct = round(prop, 5),
    match_difficulty = classify_difficulty(prop, name)
  )
}) %>%
  arrange(variable)

# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per
# bin. Bins are left-closed ([lo, hi)); values outside [0.10, 1.00) get NA.
# Breaks are built as integer / 10 rather than seq(0.10, 1.00, by = 0.10):
# repeated addition yields edges such as 0.30000000000000004, which would push
# a proportion of exactly 0.30 into the 0.20-0.29 bin.
bin_breaks <- (1:10) / 10
bin_lows <- head(bin_breaks, -1)
bin_levels <- sprintf("%.2f-%.2f", bin_lows, bin_lows + 0.09)
bin_factor <- cut(
  descriptives$prop_correct,
  breaks = bin_breaks,
  right = FALSE,
  include.lowest = FALSE,
  labels = bin_levels
)
# Bins with no variables are omitted from the table.
bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>%
  count(bin, name = "num_variables")

# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 (same construction,
# shifted by 0.05).
bin15_breaks <- seq(15, 95, by = 10) / 100
bin15_lows <- head(bin15_breaks, -1)
bin15_levels <- sprintf("%.2f-%.2f", bin15_lows, bin15_lows + 0.09)
bin15_factor <- cut(
  descriptives$prop_correct,
  breaks = bin15_breaks,
  right = FALSE,
  include.lowest = FALSE,
  labels = bin15_levels
)
bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>%
  count(bin, name = "num_variables")

# View
print(descriptives, n = Inf)
cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n")
print(bin_counts, n = Inf)
cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n")
print(bin15_counts, n = Inf)

# Histogram of proportion correct with custom bins. `closed = "left"` makes the
# bars use the same left-closed intervals as the 0.15-based table above
# (the ggplot2 default is right-closed).
histogram <- ggplot(descriptives, aes(x = prop_correct)) +
  geom_histogram(
    breaks = bin15_breaks,
    closed = "left",
    fill = "lightblue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Distribution of Proportion Correct",
    x = "Proportion Correct",
    y = "Number of Variables"
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = bin15_breaks)

print(histogram)

# Optionally save
# readr::write_csv(descriptives, "exp1_TF_descriptives.csv")