library(dplyr)

# NOTE(review): this absolute path ties the script to one machine; every
# relative path used below (the input CSV and the report file) resolves
# against it.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")

# Load the raw export. check.names = FALSE keeps the header text exactly as it
# appears in the file, and na.strings = "NA" limits explicit NA conversion to
# the literal string "NA", so blank cells in text columns are read as "".
# NOTE(review): per the read.csv documentation, blank fields in columns parsed
# as numeric or logical are still read as NA -- confirm this is acceptable.
df <- read.csv(
  "eohi3_raw.csv",
  na.strings = "NA",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Drop every column whose header is missing or blank (dplyr requires all
# columns to have names).
empty_cols <- which(is.na(names(df)) | !nzchar(names(df)))
if (length(empty_cols) > 0) {
  df <- df[, -empty_cols, drop = FALSE]
}

# Report settings: when save_to_doc is TRUE, all distributions are also
# written to the file named by doc_filename.
doc_filename <- "eohi3_quotas.txt"
save_to_doc <- TRUE

# =============================================================================
# SINGLE VARIABLE DISTRIBUTIONS
# =============================================================================

# One frequency table per variable, most frequent value first. Each table is
# printed as soon as it is built.

# Age
dist_age <- count(df, taq_age, sort = TRUE)
print(dist_age)

# Sex
dist_sex <- count(df, taq_sex, sort = TRUE)
print(dist_sex)

# Citizenship
dist_citizenship <- count(df, citizenship, sort = TRUE)
print(dist_citizenship)

# Group
dist_group <- count(df, group, sort = TRUE)
print(dist_group)

# temporalDO
dist_temporalDO <- count(df, temporalDO, sort = TRUE)
print(dist_temporalDO)

# Perspective
dist_perspective <- count(df, perspective, sort = TRUE)
print(dist_perspective)

# =============================================================================
# NESTED DISTRIBUTIONS
# =============================================================================

# Two-way frequency tables: counts of the second variable within each level
# of the first, ordered by the first variable and then the second.

# Age within citizenship
dist_age_citizenship <- arrange(
  count(df, citizenship, taq_age),
  citizenship, taq_age
)
print(dist_age_citizenship)

# Sex within citizenship
dist_sex_citizenship <- arrange(
  count(df, citizenship, taq_sex),
  citizenship, taq_sex
)
print(dist_sex_citizenship)

# Age within temporalDO
dist_age_temporalDO <- arrange(
  count(df, temporalDO, taq_age),
  temporalDO, taq_age
)
print(dist_age_temporalDO)

# Age within perspective
dist_age_perspective <- arrange(
  count(df, perspective, taq_age),
  perspective, taq_age
)
print(dist_age_perspective)

# Sex within temporalDO
dist_sex_temporalDO <- arrange(
  count(df, temporalDO, taq_sex),
  temporalDO, taq_sex
)
print(dist_sex_temporalDO)

# Sex within perspective
dist_sex_perspective <- arrange(
  count(df, perspective, taq_sex),
  perspective, taq_sex
)
print(dist_sex_perspective)

# =============================================================================
# OPTIONAL: SAVE ALL DISTRIBUTIONS TO DOCUMENT
# =============================================================================

# Write the distribution report to `path`.
#
# `single` and `nested` are named lists of tables; each name is the heading
# line printed above its table. Console output is diverted to `path` with
# sink() for the duration of this call only: on.exit() removes the diversion
# even when printing fails part-way through, so an error can no longer leave
# all later console output silently redirected into the report file.
#
# Returns `path` invisibly.
.write_distribution_report <- function(path, single, nested) {
  sink(path)
  on.exit(sink(), add = TRUE)

  # Print each heading and its table followed by a gap. `last_gap` is the gap
  # written after the final table, which lets the report end on one newline.
  print_sections <- function(sections, last_gap = "\n\n") {
    for (i in seq_along(sections)) {
      cat(names(sections)[i], "\n", sep = "")
      print(sections[[i]])
      cat(if (i == length(sections)) last_gap else "\n\n")
    }
  }

  cat("DISTRIBUTION REPORT\n")
  cat("==================\n\n")

  cat("SINGLE VARIABLE DISTRIBUTIONS\n")
  cat("------------------------------\n\n")
  print_sections(single)

  cat("NESTED DISTRIBUTIONS\n")
  cat("---------------------\n\n")
  print_sections(nested, last_gap = "\n")

  invisible(path)
}

# Save every distribution computed above to `doc_filename` when requested.
if (save_to_doc) {
  .write_distribution_report(
    doc_filename,
    single = list(
      "Distribution of taq_age:" = dist_age,
      "Distribution of taq_sex:" = dist_sex,
      "Distribution of citizenship:" = dist_citizenship,
      "Distribution of group:" = dist_group,
      "Distribution of temporalDO:" = dist_temporalDO,
      "Distribution of perspective:" = dist_perspective
    ),
    nested = list(
      "Age within Citizenship:" = dist_age_citizenship,
      "Sex within Citizenship:" = dist_sex_citizenship,
      "Age within temporalDO:" = dist_age_temporalDO,
      "Age within perspective:" = dist_age_perspective,
      "Sex within temporalDO:" = dist_sex_temporalDO,
      "Sex within perspective:" = dist_sex_perspective
    )
  )
  # Runs only after the diversion has been removed, so this goes to the console.
  cat("Distributions saved to:", doc_filename, "\n")
}