190 lines
6.3 KiB
R
190 lines
6.3 KiB
R
library(dplyr)
|
|
|
|
# Work from the review-data directory; the relative input and output file
# names used by this script are resolved against it.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")

# Read the raw data.
#   check.names = FALSE  preserves the original column names
#   na.strings = "NA"    only the literal string "NA" is converted to NA, so
#                        empty cells in character columns stay empty strings
df <- read.csv(
  "eohi3_raw.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# The RATIONALE column should exist in the CSV. Look it up by exact name:
# `$` partially matches data-frame column names, so a longer name starting
# with "RATIONALE" could otherwise be picked up silently.
if (!"RATIONALE" %in% names(df)) {
  stop("Column 'RATIONALE' not found in eohi3_raw.csv", call. = FALSE)
}

# Ensure RATIONALE is character (it may have been read as another type) and
# turn any NA into "" so empty cells stay empty.
df[["RATIONALE"]] <- as.character(df[["RATIONALE"]])
df[["RATIONALE"]][is.na(df[["RATIONALE"]])] <- ""
|
|
|
|
# Check whether an age falls inside an age range written as text.
#
# Arguments:
#   age_num        Age to test: a number, or a numeric string such as "47".
#   age_range_str  Range in the form "<min> - <max>", e.g. "46 - 52" or
#                  "25-31" (whitespace around the hyphen is optional).
#
# Returns:
#   TRUE when min <= age <= max (inclusive), FALSE when the age lies outside
#   the range, and NULL when the check cannot be made: missing or empty
#   input, a range that does not split into exactly two parts, or values
#   that are not numeric.
check_age_range <- function(age_num, age_range_str) {
  # NULL / zero-length input must be ruled out first: is.na() on it returns a
  # zero-length result that cannot be used as a condition.
  if (length(age_num) == 0L || length(age_range_str) == 0L) {
    return(NULL)
  }
  if (is.na(age_num) || is.na(age_range_str)) {
    return(NULL)
  }

  range_text <- trimws(age_range_str)
  if (range_text == "") {
    return(NULL)
  }

  # A non-numeric age (e.g. from a column read as character) would be compared
  # as text, where "5" sorts between "46" and "52". Coerce it, and skip the
  # check when it is not a number.
  if (!is.numeric(age_num)) {
    age_num <- suppressWarnings(as.numeric(trimws(as.character(age_num))))
    if (is.na(age_num)) {
      return(NULL)
    }
  }

  # Split on the hyphen, tolerating surrounding whitespace.
  range_parts <- strsplit(range_text, "\\s*-\\s*")[[1]]
  if (length(range_parts) != 2) {
    return(NULL)
  }

  # Unparseable bounds become NA and are handled explicitly, so the coercion
  # warning carries no extra information.
  bounds <- suppressWarnings(as.numeric(trimws(range_parts)))
  if (anyNA(bounds)) {
    return(NULL)
  }

  age_num >= bounds[1] && age_num <= bounds[2]
}
|
|
|
|
# Report which values are empty.
#
# A value counts as empty when it is NA or, for character input, an empty or
# whitespace-only string. Empty cells are kept as empty strings (not NA) in
# this script, so both forms have to be recognised.
#
# Arguments:
#   x  A vector of any length, or NULL.
#
# Returns:
#   TRUE when x is NULL; otherwise an unnamed logical vector with one element
#   per element of x (so a zero-length x gives logical(0)).
is_empty <- function(x) {
  if (is.null(x)) {
    return(TRUE)
  }

  # as.vector() drops names so the result is a plain logical vector.
  empty <- as.vector(is.na(x))

  if (is.character(x)) {
    # `%in%` never returns NA, so NA strings (already flagged above) do not
    # leak NA into the result. Trimming also covers the plain "" case.
    empty <- empty | trimws(x) %in% ""
  }

  empty
}
|
|
|
|
# 1. Check sex match
# Flags rows where demo_sex and taq_sex are both filled in but differ. Values
# are trimmed and compared case-insensitively; a row where either value is NA
# or blank is not checked and stays FALSE.
missing_sex_cols <- setdiff(c("demo_sex", "taq_sex"), names(df))
if (length(missing_sex_cols) > 0) {
  # Fail with a clear message instead of an obscure error further down.
  stop(
    "Missing column(s) for sex check: ",
    paste(missing_sex_cols, collapse = ", "),
    call. = FALSE
  )
}

# Normalise whole columns at once: trim whitespace and turn NA into "".
demo_sex_val <- trimws(as.character(df[["demo_sex"]]))
demo_sex_val[is.na(demo_sex_val)] <- ""
taq_sex_val <- trimws(as.character(df[["taq_sex"]]))
taq_sex_val[is.na(taq_sex_val)] <- ""

# Only rows with both values present can be a mismatch.
sex_mismatch <- demo_sex_val != "" & taq_sex_val != "" &
  tolower(demo_sex_val) != tolower(taq_sex_val)
|
|
|
|
# 2. Check age range match
# One logical per row: TRUE only when demo_age is present, taq_age is
# non-empty, and check_age_range() explicitly reports FALSE. A NULL result
# (range could not be checked) leaves the row unflagged.
age_mismatch <- vapply(
  seq_len(nrow(df)),
  function(row) {
    age <- df$demo_age[row]
    age_range <- df$taq_age[row]

    # Nothing to compare when either side is missing.
    if (is.na(age) || is_empty(age_range)) {
      return(FALSE)
    }

    # NULL = cannot check, TRUE = match, FALSE = mismatch.
    isFALSE(check_age_range(age, age_range))
  },
  logical(1)
)
|
|
|
|
# 3. Check citizenship (taq_cit_1 or taq_cit_2)
# A row is flagged only when both citizenship columns are empty.
cit_1_empty <- is_empty(df$taq_cit_1)
cit_2_empty <- is_empty(df$taq_cit_2)
no_cit <- cit_1_empty & cit_2_empty
|
|
|
|
# 4. Check IP address duplicates
# Flags every row whose non-empty IP address (after trimming whitespace) also
# appears on at least one other row. When the data has no IPAddress column,
# no row is flagged.
ip_duplicate <- rep(FALSE, nrow(df))
if ("IPAddress" %in% colnames(df)) {
  # Treat NA like an empty address so it can never count as a duplicate.
  ip_addresses <- trimws(as.character(df[["IPAddress"]]))
  ip_addresses[is.na(ip_addresses)] <- ""

  # duplicated() skips the first occurrence of each value; scanning from both
  # ends marks every member of a repeated group.
  repeated <- duplicated(ip_addresses) |
    duplicated(ip_addresses, fromLast = TRUE)

  ip_duplicate <- ip_addresses != "" & repeated
}
|
|
|
|
# Build RATIONALE column - only populate when there are issues
# Every row starts as an empty string, so rows without issues stay empty.
rationale_parts <- rep("", nrow(df))

# Issue flags in the order their labels appear in the rationale text.
issue_flags <- list(
  "sex mismatch" = sex_mismatch,
  "age mismatch" = age_mismatch,
  "no cit" = no_cit,
  "IP duplicate" = ip_duplicate
)

for (issue_label in names(issue_flags)) {
  flagged <- issue_flags[[issue_label]]

  # Each flag vector must have one entry per row; a shorter vector would be
  # recycled silently and label the wrong rows.
  stopifnot(length(flagged) == length(rationale_parts))

  # Rows that already carry an issue get "; <label>" appended; rows without
  # one start their text with the label.
  append_to <- flagged & rationale_parts != ""
  start_new <- flagged & !append_to

  rationale_parts[append_to] <- paste(
    rationale_parts[append_to],
    issue_label,
    sep = "; "
  )
  rationale_parts[start_new] <- issue_label
}
|
|
|
|
# Update RATIONALE column
# Only rows with at least one detected issue are overwritten; every other row
# keeps whatever RATIONALE already held (empty or an existing value).
has_issue <- rationale_parts != ""
df$RATIONALE[has_issue] <- rationale_parts[has_issue]
|
|
|
|
# Summary
# The counts are reported twice on purpose so the output appears wherever the
# script is run: once through message() (debug console) and once through
# cat() on stdout (terminal).
n_sex_mismatch <- sum(sex_mismatch)
n_age_mismatch <- sum(age_mismatch)
n_no_cit <- sum(no_cit)
n_ip_duplicate <- sum(ip_duplicate)
n_rows_with_issues <- sum(rationale_parts != "")

# message() output
message("Validation Summary:")
message("Sex mismatches: ", n_sex_mismatch)
message("Age mismatches: ", n_age_mismatch)
message("No citizenship: ", n_no_cit)
message("IP duplicates: ", n_ip_duplicate)
message("Total rows with issues: ", n_rows_with_issues)

# cat() output to stdout, flushed so it is not held back by buffering
cat("Validation Summary:\n", file = stdout())
cat("Sex mismatches:", n_sex_mismatch, "\n", file = stdout())
cat("Age mismatches:", n_age_mismatch, "\n", file = stdout())
cat("No citizenship:", n_no_cit, "\n", file = stdout())
cat("IP duplicates:", n_ip_duplicate, "\n", file = stdout())
cat("Total rows with issues:", n_rows_with_issues, "\n", file = stdout())
flush(stdout())
|
|
|
|
# Write the updated data
# Character columns get their NA values replaced by "" first, so empty cells
# are written as empty strings rather than as missing values.
for (col in names(df)) {
  column <- df[[col]]
  if (!is.character(column)) {
    next
  }
  column[is.na(column)] <- ""
  df[[col]] <- column
}

# Any NA left in non-character columns is written as an empty field (na = "").
write.csv(df, "eohi3_raw2.csv", row.names = FALSE, na = "", quote = TRUE)
|