library(dplyr)

# Work from the directory that holds the raw export; the relative file name
# used below is resolved against it.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")

# Load the raw data.
#   check.names = FALSE   keep the column names exactly as they are in the file
#   na.strings  = "NA"    only the literal text NA is an NA marker, so blank
#                         cells in character columns are read as ""
# NOTE: read.csv still reads blank cells as NA in columns it types as
# logical, integer or numeric.
df <- read.csv(
  file = "eohi3_raw.csv",
  na.strings = "NA",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Populate the citizenship column from taq_cit_1 and taq_cit_2:
#   - both columns have a value      -> "Both"
#   - exactly one column has a value -> that column's value
#   - neither column has a value     -> citizenship is left as it was
# A cell "has a value" when it is not NA and not blank (empty or
# whitespace-only).

# Fail early with a clear message if a source column is absent, rather than
# letting the assignments below stop with a less helpful error.
missing_cit_cols <- setdiff(c("taq_cit_1", "taq_cit_2"), names(df))
if (length(missing_cit_cols) > 0) {
  stop(
    "Required column(s) missing from the data: ",
    paste(missing_cit_cols, collapse = ", "),
    call. = FALSE
  )
}

# Make sure the target column exists, starting out as empty strings.
# rep() keeps this valid for a data frame with zero rows as well.
if (!"citizenship" %in% names(df)) {
  df$citizenship <- rep("", nrow(df))
}

# Replace NA with "" in the source columns so the comparisons below never
# yield NA (this also turns the columns into character vectors).
# `[[` is used instead of `$` to avoid partial name matching.
df[["taq_cit_1"]][is.na(df[["taq_cit_1"]])] <- ""
df[["taq_cit_2"]][is.na(df[["taq_cit_2"]])] <- ""

# TRUE where the cell holds something other than whitespace
has_cit1 <- nzchar(trimws(df[["taq_cit_1"]]))
has_cit2 <- nzchar(trimws(df[["taq_cit_2"]]))

# Row masks for the three cases that change citizenship
both_have_values <- has_cit1 & has_cit2
only_cit1 <- has_cit1 & !has_cit2
only_cit2 <- has_cit2 & !has_cit1

# Assign values; rows matched by none of the masks keep their existing
# citizenship value (which may be an empty string).
df$citizenship[both_have_values] <- "Both"
df$citizenship[only_cit1] <- df[["taq_cit_1"]][only_cit1]
df$citizenship[only_cit2] <- df[["taq_cit_2"]][only_cit2]
# Write the updated data back over the input file. NA cells are written as
# empty fields and character columns are quoted.
# The data are first written to a temporary file in the same directory and
# only renamed over "eohi3_raw.csv" once the write has succeeded, so a failure
# part-way through cannot leave the raw file truncated.
tmp_out <- tempfile(pattern = "eohi3_raw_", tmpdir = ".", fileext = ".csv.tmp")
tryCatch(
  write.csv(df, tmp_out, row.names = FALSE, na = "", quote = TRUE),
  error = function(e) {
    unlink(tmp_out)
    stop("Failed to write updated data: ", conditionMessage(e), call. = FALSE)
  }
)
if (!file.rename(tmp_out, "eohi3_raw.csv")) {
  unlink(tmp_out)
  stop("Could not replace eohi3_raw.csv with the updated data", call. = FALSE)
}