eohi/eohi3/dataREVIEW-JAN21/datap 01 - age and sex match.r
2026-01-22 17:55:35 -05:00

190 lines
6.3 KiB
R

library(dplyr)
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")
# Read the data (with check.names=FALSE to preserve original column names)
# Keep empty cells as empty strings, not NA
# Only convert the literal string "NA" to NA, not empty strings
df <- read.csv("eohi3_raw.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = "NA")
# RATIONALE column should exist in the CSV
# Ensure RATIONALE is character and convert any NA values to empty strings
if (!is.character(df$RATIONALE)) {
df$RATIONALE <- as.character(df$RATIONALE)
}
df$RATIONALE[is.na(df$RATIONALE)] <- ""
# Function to check if age falls within range
check_age_range <- function(age_num, age_range_str) {
# Check if data is missing or empty
if (is.na(age_num) || is.null(age_num) || age_range_str == "" || is.na(age_range_str) || trimws(age_range_str) == "") {
return(NULL) # Can't check if data is missing - return NULL to indicate skip
}
# Parse range string (e.g., "46 - 52" or "25 - 31")
range_parts <- strsplit(trimws(age_range_str), "\\s*-\\s*")[[1]]
if (length(range_parts) != 2) {
return(NULL) # Invalid range format - return NULL to indicate skip
}
min_age <- as.numeric(trimws(range_parts[1]))
max_age <- as.numeric(trimws(range_parts[2]))
if (is.na(min_age) || is.na(max_age)) {
return(NULL) # Couldn't parse numbers - return NULL to indicate skip
}
# Check if age falls within range (inclusive)
return(age_num >= min_age && age_num <= max_age)
}
# Function to check if a value is empty (empty string or whitespace only)
# Empty cells are kept as empty strings, not NA
# Vectorized to handle both single values and vectors
is_empty <- function(x) {
if (is.null(x)) return(TRUE)
# Handle vectors
if (length(x) > 1) {
result <- rep(FALSE, length(x))
result[is.na(x)] <- TRUE
if (is.character(x)) {
result[trimws(x) == ""] <- TRUE
result[x == ""] <- TRUE
}
return(result)
}
# Handle single value
if (is.na(x)) return(TRUE)
if (is.character(x) && trimws(x) == "") return(TRUE)
if (is.character(x) && x == "") return(TRUE)
return(FALSE)
}
# 1. Check sex match
# Only check if both values are non-empty
sex_mismatch <- rep(FALSE, nrow(df))
for (i in seq_len(nrow(df))) {
demo_sex_val <- ifelse(is.na(df$demo_sex[i]), "", trimws(df$demo_sex[i]))
taq_sex_val <- ifelse(is.na(df$taq_sex[i]), "", trimws(df$taq_sex[i]))
# Only check if both are non-empty
if (demo_sex_val != "" && taq_sex_val != "") {
if (tolower(demo_sex_val) != tolower(taq_sex_val)) {
sex_mismatch[i] <- TRUE
}
}
}
# 2. Check age range match
age_mismatch <- rep(FALSE, nrow(df))
for (i in seq_len(nrow(df))) {
# Only check if demo_age is not empty/NA and taq_age is not empty
if (!is.na(df$demo_age[i]) && !is_empty(df$taq_age[i])) {
age_check <- check_age_range(df$demo_age[i], df$taq_age[i])
# age_check is NULL if we can't check, FALSE if mismatch, TRUE if match
if (!is.null(age_check) && !age_check) {
age_mismatch[i] <- TRUE
}
}
}
# 3. Check citizenship (taq_cit_1 or taq_cit_2)
no_cit <- is_empty(df$taq_cit_1) & is_empty(df$taq_cit_2)
# 4. Check IP address duplicates
# Find IP addresses that appear more than once (non-empty IPs only)
ip_duplicate <- rep(FALSE, nrow(df))
if ("IPAddress" %in% colnames(df)) {
# Get non-empty IP addresses
ip_addresses <- ifelse(is.na(df$IPAddress), "", trimws(df$IPAddress))
# Count occurrences of each IP
ip_counts <- table(ip_addresses)
# Get IPs that appear more than once (and are not empty)
duplicate_ips <- names(ip_counts)[ip_counts > 1 & names(ip_counts) != ""]
# Mark rows with duplicate IPs
if (length(duplicate_ips) > 0) {
for (dup_ip in duplicate_ips) {
ip_duplicate[ip_addresses == dup_ip] <- TRUE
}
}
}
# Build RATIONALE column - only populate when there are issues
# Start with empty strings to preserve existing empty cells
rationale_parts <- rep("", nrow(df))
# Add sex mismatch
rationale_parts[sex_mismatch] <- "sex mismatch"
# Add age mismatch (append if sex mismatch already exists)
for (i in seq_len(nrow(df))) {
if (age_mismatch[i]) {
if (rationale_parts[i] != "") {
rationale_parts[i] <- paste(rationale_parts[i], "age mismatch", sep = "; ")
} else {
rationale_parts[i] <- "age mismatch"
}
}
}
# Add no cit (append if other issues already exist)
for (i in seq_len(nrow(df))) {
if (no_cit[i]) {
if (rationale_parts[i] != "") {
rationale_parts[i] <- paste(rationale_parts[i], "no cit", sep = "; ")
} else {
rationale_parts[i] <- "no cit"
}
}
}
# Add IP duplicate (append if other issues already exist)
for (i in seq_len(nrow(df))) {
if (ip_duplicate[i]) {
if (rationale_parts[i] != "") {
rationale_parts[i] <- paste(rationale_parts[i], "IP duplicate", sep = "; ")
} else {
rationale_parts[i] <- "IP duplicate"
}
}
}
# Update RATIONALE column - only set when there are issues, otherwise keep existing value
# If no issues found, keep the cell empty (or existing value if any)
for (i in seq_len(nrow(df))) {
if (rationale_parts[i] != "") {
df$RATIONALE[i] <- rationale_parts[i]
}
# If rationale_parts[i] is empty, leave RATIONALE as is (preserves existing empty or other values)
}
# Summary - using multiple methods to ensure output appears
# Try message() first (better for debug console)
message("Validation Summary:")
message("Sex mismatches: ", sum(sex_mismatch))
message("Age mismatches: ", sum(age_mismatch))
message("No citizenship: ", sum(no_cit))
message("IP duplicates: ", sum(ip_duplicate))
message("Total rows with issues: ", sum(rationale_parts != ""))
# Also use cat() to stdout (for terminal)
cat("Validation Summary:\n", file = stdout())
cat("Sex mismatches:", sum(sex_mismatch), "\n", file = stdout())
cat("Age mismatches:", sum(age_mismatch), "\n", file = stdout())
cat("No citizenship:", sum(no_cit), "\n", file = stdout())
cat("IP duplicates:", sum(ip_duplicate), "\n", file = stdout())
cat("Total rows with issues:", sum(rationale_parts != ""), "\n", file = stdout())
flush(stdout())
# Write the updated data
# Preserve empty strings as empty (not NA)
# Convert character column NAs to empty strings to preserve empty cells
for (col in names(df)) {
if (is.character(df[[col]])) {
df[[col]][is.na(df[[col]])] <- ""
}
}
write.csv(df, "eohi3_raw2.csv", row.names = FALSE, na = "", quote = TRUE)