# Validation pass over the raw EOHI3 export.
#
# Reads "eohi3_raw.csv", flags rows with (1) a sex mismatch between demo_sex
# and taq_sex, (2) a demo_age outside the taq_age range, (3) no citizenship
# answer in taq_cit_1 / taq_cit_2, (4) an IP address shared with another row,
# writes the flags into the RATIONALE column and saves "eohi3_raw2.csv".

library(dplyr)

# NOTE(review): machine-specific absolute path; the script only runs where
# this directory exists.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")

# Read the data (with check.names = FALSE to preserve original column names).
# Only the literal string "NA" is converted to NA; empty cells in character
# columns stay as empty strings.
df <- read.csv(
  "eohi3_raw.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# These columns are indexed row by row below; fail early with a readable
# message instead of a cryptic zero-length error further down.
required_cols <- c("RATIONALE", "demo_sex", "taq_sex", "demo_age")
missing_cols <- setdiff(required_cols, colnames(df))
if (length(missing_cols) > 0) {
  stop(
    "Required column(s) missing from eohi3_raw.csv: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Ensure RATIONALE is character (an all-empty column is read as logical NA)
# and turn any NA into an empty string.
df$RATIONALE <- as.character(df$RATIONALE)
df$RATIONALE[is.na(df$RATIONALE)] <- ""

# Check whether a single age falls within a range given as text.
#
# age_num:       one age; numeric, or text that parses as a number.
# age_range_str: one range such as "46 - 52" (bounds are inclusive).
#
# Returns TRUE / FALSE when the check can be made, and NULL ("skip") when
# either input is NULL, missing, blank or cannot be parsed.
check_age_range <- function(age_num, age_range_str) {
  # Length test first: is.na(NULL) is logical(0), which `||` rejects.
  if (length(age_num) == 0L || length(age_range_str) == 0L) {
    return(NULL)
  }
  stopifnot(length(age_num) == 1L, length(age_range_str) == 1L)

  # Coerce to numeric so a character age column is not compared as text
  # ("9" >= "46" is TRUE for strings). Unparseable ages become NA -> skip.
  age_value <- suppressWarnings(as.numeric(age_num))
  range_text <- trimws(as.character(age_range_str))
  if (is.na(age_value) || is.na(range_text) || range_text == "") {
    return(NULL)
  }

  # Parse range string (e.g., "46 - 52" or "25 - 31")
  range_parts <- strsplit(range_text, "\\s*-\\s*")[[1]]
  if (length(range_parts) != 2) {
    return(NULL)
  }

  bounds <- suppressWarnings(as.numeric(trimws(range_parts)))
  if (anyNA(bounds)) {
    return(NULL)
  }

  age_value >= bounds[1] && age_value <= bounds[2]
}

# Test whether values are empty: NA, "" or whitespace only.
#
# x: NULL, a single value or a vector.
#
# Returns TRUE for NULL, otherwise a logical vector the same length as x
# (including length zero).
is_empty <- function(x) {
  if (is.null(x)) {
    return(TRUE)
  }
  empty <- unname(is.na(x))
  if (is.character(x)) {
    # TRUE | NA is TRUE, so NA entries stay flagged and no NA leaks out.
    empty <- empty | unname(trimws(x) == "")
  }
  empty
}

# Trim a column to character and map NA to "" so it can be compared safely.
clean_text <- function(x) {
  cleaned <- trimws(as.character(x))
  cleaned[is.na(cleaned)] <- ""
  cleaned
}

# Append `label` to the rationale text of every flagged row, using "; " as
# the separator when the row already carries an earlier issue.
append_issue <- function(parts, flagged, label) {
  extend <- flagged & parts != ""
  start <- flagged & parts == ""
  if (any(extend)) {
    parts[extend] <- paste(parts[extend], label, sep = "; ")
  }
  parts[start] <- label
  parts
}

# 1. Check sex match
# Only compared when both values are non-empty; comparison ignores case.
demo_sex <- tolower(clean_text(df$demo_sex))
taq_sex <- tolower(clean_text(df$taq_sex))
sex_mismatch <- demo_sex != "" & taq_sex != "" & demo_sex != taq_sex

# 2. Check age range match
# check_age_range() returns NULL when it cannot check (missing/blank/invalid
# data); only an explicit FALSE counts as a mismatch.
age_mismatch <- vapply(
  seq_len(nrow(df)),
  function(i) {
    in_range <- check_age_range(df$demo_age[i], df$taq_age[i])
    !is.null(in_range) && !in_range
  },
  logical(1)
)

# 3. Check citizenship (taq_cit_1 or taq_cit_2)
# is_empty() treats an absent column (NULL) as empty; rep_len() keeps the
# flag vector at one entry per row in that case.
no_cit <- rep_len(
  is_empty(df$taq_cit_1) & is_empty(df$taq_cit_2),
  nrow(df)
)

# 4. Check IP address duplicates
# Flag every row whose non-empty IP address appears more than once.
ip_duplicate <- rep(FALSE, nrow(df))
if ("IPAddress" %in% colnames(df)) {
  ip_addresses <- clean_text(df$IPAddress)
  ip_duplicate <- ip_addresses != "" &
    (duplicated(ip_addresses) | duplicated(ip_addresses, fromLast = TRUE))
}

# Build RATIONALE text - only populated when there are issues.
# Order of the labels within a row: sex, age, citizenship, IP.
rationale_parts <- rep("", nrow(df))
rationale_parts <- append_issue(rationale_parts, sex_mismatch, "sex mismatch")
rationale_parts <- append_issue(rationale_parts, age_mismatch, "age mismatch")
rationale_parts <- append_issue(rationale_parts, no_cit, "no cit")
rationale_parts <- append_issue(rationale_parts, ip_duplicate, "IP duplicate")

# Update RATIONALE column - overwrite only rows with issues; rows without
# issues keep whatever value (empty or otherwise) they already had.
has_issue <- rationale_parts != ""
df$RATIONALE[has_issue] <- rationale_parts[has_issue]

# Summary - emitted twice on purpose: message() for the debug console (stderr)
# and cat() for the terminal (stdout).
summary_counts <- c(
  "Sex mismatches" = sum(sex_mismatch),
  "Age mismatches" = sum(age_mismatch),
  "No citizenship" = sum(no_cit),
  "IP duplicates" = sum(ip_duplicate),
  "Total rows with issues" = sum(has_issue)
)

message("Validation Summary:")
for (label in names(summary_counts)) {
  message(label, ": ", summary_counts[[label]])
}

cat("Validation Summary:\n", file = stdout())
for (label in names(summary_counts)) {
  cat(paste0(label, ":"), summary_counts[[label]], "\n", file = stdout())
}
flush(stdout())

# Write the updated data.
# Convert NA in character columns to "" so empty cells are written as empty
# strings. Iterate by position: with check.names = FALSE column names may be
# duplicated, and name-based access would only ever reach the first one.
for (j in seq_along(df)) {
  if (is.character(df[[j]])) {
    df[[j]][is.na(df[[j]])] <- ""
  }
}

write.csv(df, "eohi3_raw2.csv", row.names = FALSE, na = "", quote = TRUE)