eohi/eohi3/dataREVIEW-JAN21/datap 01 - age and sex match.r

library(dplyr)

setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")

# Read the data (with check.names=FALSE to preserve original column names)
# Keep empty cells as empty strings, not NA
# Only convert the literal string "NA" to NA, not empty strings
df <- read.csv("eohi3_raw.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = "NA")

# RATIONALE column should exist in the CSV
# Ensure RATIONALE is character and convert any NA values to empty strings
if (!is.character(df$RATIONALE)) {
  df$RATIONALE <- as.character(df$RATIONALE)
}
df$RATIONALE[is.na(df$RATIONALE)] <- ""

# Function to check if age falls within range
check_age_range <- function(age_num, age_range_str) {
  # Check if data is missing or empty
  if (is.na(age_num) || is.null(age_num) || age_range_str == "" || is.na(age_range_str) || trimws(age_range_str) == "") {
    return(NULL)  # Can't check if data is missing - return NULL to indicate skip
  }

  # Parse range string (e.g., "46 - 52" or "25 - 31")
  range_parts <- strsplit(trimws(age_range_str), "\\s*-\\s*")[[1]]
  if (length(range_parts) != 2) {
    return(NULL)  # Invalid range format - return NULL to indicate skip
  }

  min_age <- as.numeric(trimws(range_parts[1]))
  max_age <- as.numeric(trimws(range_parts[2]))

  if (is.na(min_age) || is.na(max_age)) {
    return(NULL)  # Couldn't parse numbers - return NULL to indicate skip
  }

  # Check if age falls within range (inclusive)
  return(age_num >= min_age && age_num <= max_age)
}

# Function to check if a value is empty (empty string or whitespace only)
# Empty cells are kept as empty strings, not NA
# Vectorized to handle both single values and vectors
is_empty <- function(x) {
  if (is.null(x)) return(TRUE)
  # Handle vectors
  if (length(x) > 1) {
    result <- rep(FALSE, length(x))
    result[is.na(x)] <- TRUE
    if (is.character(x)) {
      result[trimws(x) == ""] <- TRUE
      result[x == ""] <- TRUE
    }
    return(result)
  }
  # Handle single value
  if (is.na(x)) return(TRUE)
  if (is.character(x) && trimws(x) == "") return(TRUE)
  if (is.character(x) && x == "") return(TRUE)
  return(FALSE)
}

# 1. Check sex match
# Only check if both values are non-empty
sex_mismatch <- rep(FALSE, nrow(df))
for (i in seq_len(nrow(df))) {
  demo_sex_val <- ifelse(is.na(df$demo_sex[i]), "", trimws(df$demo_sex[i]))
  taq_sex_val <- ifelse(is.na(df$taq_sex[i]), "", trimws(df$taq_sex[i]))

  # Only check if both are non-empty
  if (demo_sex_val != "" && taq_sex_val != "") {
    if (tolower(demo_sex_val) != tolower(taq_sex_val)) {
      sex_mismatch[i] <- TRUE
    }
  }
}

# 2. Check age range match
age_mismatch <- rep(FALSE, nrow(df))
for (i in seq_len(nrow(df))) {
  # Only check if demo_age is not empty/NA and taq_age is not empty
  if (!is.na(df$demo_age[i]) && !is_empty(df$taq_age[i])) {
    age_check <- check_age_range(df$demo_age[i], df$taq_age[i])
    # age_check is NULL if we can't check, FALSE if mismatch, TRUE if match
    if (!is.null(age_check) && !age_check) {
      age_mismatch[i] <- TRUE
    }
  }
}

# 3. Check citizenship (taq_cit_1 or taq_cit_2)
no_cit <- is_empty(df$taq_cit_1) & is_empty(df$taq_cit_2)

# 4. Check IP address duplicates
# Find IP addresses that appear more than once (non-empty IPs only)
ip_duplicate <- rep(FALSE, nrow(df))
if ("IPAddress" %in% colnames(df)) {
  # Get non-empty IP addresses
  ip_addresses <- ifelse(is.na(df$IPAddress), "", trimws(df$IPAddress))

  # Count occurrences of each IP
  ip_counts <- table(ip_addresses)
  # Get IPs that appear more than once (and are not empty)
  duplicate_ips <- names(ip_counts)[ip_counts > 1 & names(ip_counts) != ""]

  # Mark rows with duplicate IPs
  if (length(duplicate_ips) > 0) {
    for (dup_ip in duplicate_ips) {
      ip_duplicate[ip_addresses == dup_ip] <- TRUE
    }
  }
}

# Build RATIONALE column - only populate when there are issues
# Start with empty strings to preserve existing empty cells
rationale_parts <- rep("", nrow(df))

# Add sex mismatch
rationale_parts[sex_mismatch] <- "sex mismatch"

# Add age mismatch (append if sex mismatch already exists)
for (i in seq_len(nrow(df))) {
  if (age_mismatch[i]) {
    if (rationale_parts[i] != "") {
      rationale_parts[i] <- paste(rationale_parts[i], "age mismatch", sep = "; ")
    } else {
      rationale_parts[i] <- "age mismatch"
    }
  }
}

# Add no cit (append if other issues already exist)
for (i in seq_len(nrow(df))) {
  if (no_cit[i]) {
    if (rationale_parts[i] != "") {
      rationale_parts[i] <- paste(rationale_parts[i], "no cit", sep = "; ")
    } else {
      rationale_parts[i] <- "no cit"
    }
  }
}

# Add IP duplicate (append if other issues already exist)
for (i in seq_len(nrow(df))) {
  if (ip_duplicate[i]) {
    if (rationale_parts[i] != "") {
      rationale_parts[i] <- paste(rationale_parts[i], "IP duplicate", sep = "; ")
    } else {
      rationale_parts[i] <- "IP duplicate"
    }
  }
}

# Update RATIONALE column - only set when there are issues, otherwise keep existing value
# If no issues found, keep the cell empty (or existing value if any)
for (i in seq_len(nrow(df))) {
  if (rationale_parts[i] != "") {
    df$RATIONALE[i] <- rationale_parts[i]
  }
  # If rationale_parts[i] is empty, leave RATIONALE as is (preserves existing empty or other values)
}

# Summary - using multiple methods to ensure output appears
# Try message() first (better for debug console)
message("Validation Summary:")
message("Sex mismatches: ", sum(sex_mismatch))
message("Age mismatches: ", sum(age_mismatch))
message("No citizenship: ", sum(no_cit))
message("IP duplicates: ", sum(ip_duplicate))
message("Total rows with issues: ", sum(rationale_parts != ""))

# Also use cat() to stdout (for terminal)
cat("Validation Summary:\n", file = stdout())
cat("Sex mismatches:", sum(sex_mismatch), "\n", file = stdout())
cat("Age mismatches:", sum(age_mismatch), "\n", file = stdout())
cat("No citizenship:", sum(no_cit), "\n", file = stdout())
cat("IP duplicates:", sum(ip_duplicate), "\n", file = stdout())
cat("Total rows with issues:", sum(rationale_parts != ""), "\n", file = stdout())
flush(stdout())

# Write the updated data
# Preserve empty strings as empty (not NA)
# Convert character column NAs to empty strings to preserve empty cells
for (col in names(df)) {
  if (is.character(df[[col]])) {
    df[[col]][is.na(df[[col]])] <- ""
  }
}
write.csv(df, "eohi3_raw2.csv", row.names = FALSE, na = "", quote = TRUE)