# Script to recode present-time Likert scale items in eohi2.csv
# Recodes prePrefItem, prePersItem, and preValItem to numeric values

# Packages used by this script
library(dplyr)

# Relative file paths below resolve against this project folder.
# NOTE(review): hard-coded, machine-specific path - the script only runs as-is
# on a machine where this directory exists.
setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")

# Load the data.
#   check.names = FALSE  -> keep the CSV's column headers exactly as written
#   na.strings  = NULL   -> keep empty cells as empty strings instead of NA
df <- read.csv(
  file = "eohi2.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = NULL
)
|
|
|
|
# Recode 7-point agreement labels to numeric scores.
#
# Matching ignores letter case and leading/trailing whitespace, so
# " Agree " and "AGREE" both score 2. Anything that is not one of the seven
# labels (empty strings, NA, unexpected text) becomes NA_real_.
#
# Args:
#   x: vector of response labels (coerced with as.character()).
#
# Returns:
#   An unnamed double vector, the same length as x, with values in -3..3 or NA.
recode_likert <- function(x) {
  likert_codes <- c(
    "strongly disagree" = -3,
    "disagree" = -2,
    "somewhat disagree" = -1,
    "neither agree nor disagree" = 0,
    "somewhat agree" = 1,
    "agree" = 2,
    "strongly agree" = 3
  )

  # Normalise once instead of once per label
  key <- tolower(trimws(as.character(x)))

  # match() yields NA for unknown labels, and indexing by NA yields NA_real_
  unname(likert_codes[match(key, names(likert_codes))])
}
|
|
|
|
# Source -> target column pairing (15 pairs), written as one row per pair so
# the correspondence is visible at a glance. Column 1 is the source name,
# column 2 the target name.
column_map <- matrix(
  c(
    "prePrefItem_1", "present_pref_read",
    "prePrefItem_2", "present_pref_music",
    "prePrefItem_3", "present_pref_tv",
    "prePrefItem_4", "present_pref_nap",
    "prePrefItem_5", "present_pref_travel",
    "prePersItem_1", "present_pers_extravert",
    "prePersItem_2", "present_pers_critical",
    "prePersItem_3", "present_pers_dependable",
    "prePersItem_4", "present_pers_anxious",
    "prePersItem_5", "present_pers_complex",
    "preValItem_1", "present_val_obey",
    "preValItem_2", "present_val_trad",
    "preValItem_3", "present_val_opinion",
    "preValItem_4", "present_val_performance",
    "preValItem_5", "present_val_justice"
  ),
  ncol = 2,
  byrow = TRUE
)

# Plain character vectors, in matching order, used by the rest of the script
source_cols <- column_map[, 1]
target_cols <- column_map[, 2]
|
|
|
|
# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")

# Column names of the loaded data with surrounding whitespace removed;
# the existence checks that follow compare against these trimmed names.
df_cols <- trimws(names(df))

# Print up to the first 30 column names, quoted and with their character
# count, so stray characters in a header are easy to spot.
cat("First 30 actual column names in CSV:\n")
# seq_len() yields an empty sequence when there are no columns, whereas
# 1:min(30, 0) would iterate over c(1, 0).
for (i in seq_len(min(30, length(df_cols)))) {
  cat(sprintf(" %2d. '%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i])))
}
cat("\n")
|
|
|
|
# Split the expected source columns into those found in df_cols and those not
source_found <- source_cols %in% df_cols
existing_source <- source_cols[source_found]
missing_source <- source_cols[!source_found]

cat("Source Columns:\n")
cat(" Expected: 15 columns\n")
cat(" Found:", length(existing_source), "columns\n")
cat(" Missing:", length(missing_source), "columns\n")

if (length(missing_source) > 0) {
  # List every missing column, one per line
  cat("\n Missing columns:\n")
  cat(paste(" -", missing_source, "\n"), sep = "")

  # For each missing column, show existing columns that share its first
  # 10 characters (case-insensitive) - these are likely typos/space variants
  cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n")
  for (miss_col in missing_source) {
    pattern <- substr(miss_col, 1, 10)
    similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE)
    if (length(similar) == 0) {
      next
    }
    cat("\nLooking for:", miss_col)
    cat("\n Similar columns found:\n")
    cat(paste0(" - '", similar, "' (length:", nchar(similar), ")\n"), sep = "")
  }
}

cat("\n=== END CHECK ===\n\n")

# Abort when more than 7 of the source columns are missing; with 7 or fewer
# missing the script carries on.
if (length(missing_source) > 7) {
  stop("ERROR: Too many columns missing! Please check column names in CSV file.")
}

cat("Proceeding with processing...\n\n")
|
|
|
|
# Report which target columns are already present (compared against df_cols)
cat("\n=== CHECKING TARGET COLUMNS ===\n")
target_found <- target_cols %in% df_cols
existing_targets <- target_cols[target_found]
missing_targets <- target_cols[!target_found]

cat("Target Columns:\n")
cat(" Expected: 15 columns\n")
cat(" Found:", length(existing_targets), "columns\n")
cat(" Missing:", length(missing_targets), "columns\n")

if (length(missing_targets) == 0) {
  # Every target is already there: all of them get overwritten
  cat(" All target columns exist - will overwrite with recoded values.\n")
} else {
  # At least one target has to be created; warn if others will be replaced
  cat("\n Target columns do NOT exist yet - will create them.\n")
  if (length(existing_targets) > 0) {
    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
  }
}
cat("\n")
|
|
|
|
# Recode each source column into its paired target column
# (existing target columns are overwritten, missing ones are created).
stopifnot(length(source_cols) == length(target_cols))

# Columns are located by whitespace-trimmed name, the same way the existence
# checks compare names (trimws(names(df))). Looking up the raw names instead
# would treat a column whose header has stray spaces as missing and silently
# recode it to all NA, even though the check reported it as found.
trimmed_names <- trimws(names(df))

for (i in seq_along(source_cols)) {
  source_col <- source_cols[i]
  target_col <- target_cols[i]

  # Source values; a source column that is absent yields all-NA input
  src_idx <- match(source_col, trimmed_names)
  source_vals <- if (is.na(src_idx)) rep(NA, nrow(df)) else df[[src_idx]]

  recoded <- recode_likert(source_vals)

  # Overwrite the existing target in place, or append a new column.
  # Appended columns go to the end, so earlier positions stay valid.
  tgt_idx <- match(target_col, trimmed_names)
  if (is.na(tgt_idx)) {
    df[[target_col]] <- recoded
  } else {
    df[[tgt_idx]] <- recoded
  }

  # Print progress
  cat("Processed:", target_col, "\n")
}

cat("\n=== RECODING COMPLETE ===\n\n")
|
|
|
|
|
|
# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
# Print, for one randomly chosen row, the original text of every source
# column next to the numeric value stored in its paired target column.
# Can be run repeatedly to spot-check different rows.
#
# Args (all optional; the defaults are the script's global objects, so a bare
#       qa_check_random_row() call works as before):
#   data:    data frame to inspect.
#   sources: character vector of source column names.
#   targets: character vector of target column names, paired with sources.
#
# Returns:
#   Invisibly, the row number that was checked (NULL when data has no rows).
qa_check_random_row <- function(data = df,
                                sources = source_cols,
                                targets = target_cols) {
  # Nothing to sample from: say so instead of indexing a non-existent row
  if (nrow(data) == 0) {
    cat("\nQA CHECK: data has no rows - nothing to check.\n\n")
    return(invisible(NULL))
  }

  # Pick a random row
  random_row <- sample.int(nrow(data), 1)

  # Locate columns by trimmed name, matching how the existence checks compare
  col_names <- trimws(names(data))

  cat("\n========================================\n")
  cat("QA CHECK: Random Row #", random_row, "\n")
  cat("========================================\n\n")

  for (i in seq_along(sources)) {
    source_col <- sources[i]
    target_col <- targets[i]

    # A column that is absent is shown as empty text / NA value
    src_idx <- match(source_col, col_names)
    tgt_idx <- match(target_col, col_names)
    source_val <- if (is.na(src_idx)) "" else data[[src_idx]][random_row]
    target_val <- if (is.na(tgt_idx)) NA else data[[tgt_idx]][random_row]

    # Empty strings and NA both count as "no response"
    has_val <- !is.na(source_val) && source_val != ""
    original_text <- if (has_val) source_val else "(empty)"
    numeric_text <- if (is.na(target_val)) "NA" else as.character(target_val)

    cat(sprintf("Column %2d:\n", i))
    cat(sprintf(" Source: %-30s\n", source_col))
    cat(sprintf(" Target: %-30s\n", target_col))
    cat(sprintf(" Original text: '%s'\n", original_text))
    cat(sprintf(" Numeric value: %s\n", numeric_text))
    cat("\n")
  }

  cat("========================================\n")
  cat("END QA CHECK\n")
  cat("========================================\n\n")

  invisible(random_row)
}
|
|
|
|
# Spot-check one random row right away
cat("\n\n")
qa_check_random_row()

# Tell the user how to repeat the spot-check from the console
cat(
  "\n",
  "*** TO CHECK ANOTHER RANDOM ROW ***\n",
  "Run this command in R console:\n",
  " qa_check_random_row()\n",
  "\n",
  sep = ""
)
|
|
|
|
|
|
# Save the modified dataframe back to CSV.
# na = "" writes NA values as empty cells instead of "NA" text.
#
# NOTE: this overwrites the input file eohi2.csv in place. Set save_to_file
# to FALSE to review the output above without touching the file.
# The status messages below follow this flag; previously they always claimed
# the write was commented out even though write.csv() was being executed.
save_to_file <- TRUE

if (save_to_file) {
  write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
  cat("\n*** CHANGES WRITTEN TO eohi2.csv ***\n")
  cat(sprintf(
    "\nProcessing complete! %d recoded columns saved to file.\n",
    length(target_cols)
  ))
} else {
  cat("\n*** WRITE TO FILE IS DISABLED ***\n")
  cat("Review the output above, then set save_to_file <- TRUE to save changes.\n")
  cat(sprintf(
    "\nProcessing complete! %d recoded columns created (not yet saved to file).\n",
    length(target_cols)
  ))
}
|