eohi3-updates #3
55
.vscode/launch.json
vendored
Normal file
55
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
{
|
||||||
|
// Use IntelliSense to learn about possible attributes.
|
||||||
|
// Hover to view descriptions of existing attributes.
|
||||||
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"type": "R-Debugger",
|
||||||
|
"name": "Launch R-Workspace",
|
||||||
|
"request": "launch",
|
||||||
|
"debugMode": "workspace",
|
||||||
|
"workingDirectory": "${workspaceFolder}",
|
||||||
|
"splitOverwrittenOutput": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "R-Debugger",
|
||||||
|
"name": "Debug R-File",
|
||||||
|
"request": "launch",
|
||||||
|
"debugMode": "file",
|
||||||
|
"workingDirectory": "${workspaceFolder}",
|
||||||
|
"file": "${file}",
|
||||||
|
"splitOverwrittenOutput": true,
|
||||||
|
"stopOnEntry": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "R-Debugger",
|
||||||
|
"name": "Debug R-Function",
|
||||||
|
"request": "launch",
|
||||||
|
"debugMode": "function",
|
||||||
|
"workingDirectory": "${workspaceFolder}",
|
||||||
|
"file": "${file}",
|
||||||
|
"mainFunction": "main",
|
||||||
|
"allowGlobalDebugging": false,
|
||||||
|
"splitOverwrittenOutput": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "R-Debugger",
|
||||||
|
"name": "Debug R-Package",
|
||||||
|
"request": "launch",
|
||||||
|
"debugMode": "workspace",
|
||||||
|
"workingDirectory": "${workspaceFolder}",
|
||||||
|
"includePackageScopes": true,
|
||||||
|
"loadPackages": [
|
||||||
|
"."
|
||||||
|
],
|
||||||
|
"splitOverwrittenOutput": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "R-Debugger",
|
||||||
|
"request": "attach",
|
||||||
|
"name": "Attach to R process",
|
||||||
|
"splitOverwrittenOutput": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
425
eohi3/dataREVIEW-JAN05/eohi3_filter2.csv
Normal file
425
eohi3/dataREVIEW-JAN05/eohi3_filter2.csv
Normal file
File diff suppressed because one or more lines are too long
490
eohi3/dataREVIEW-JAN05/eohi3_raw.csv
Normal file
490
eohi3/dataREVIEW-JAN05/eohi3_raw.csv
Normal file
File diff suppressed because one or more lines are too long
490
eohi3/dataREVIEW-JAN05/eohi3_raw2.csv
Normal file
490
eohi3/dataREVIEW-JAN05/eohi3_raw2.csv
Normal file
File diff suppressed because one or more lines are too long
543
eohi3/dataREVIEW-JAN05/eohi3_unprocessed.csv
Normal file
543
eohi3/dataREVIEW-JAN05/eohi3_unprocessed.csv
Normal file
File diff suppressed because one or more lines are too long
68
eohi3/dataREVIEW-JAN05/response ID for replacement.csv
Normal file
68
eohi3/dataREVIEW-JAN05/response ID for replacement.csv
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
ResponseId,RATIONALE
|
||||||
|
R_12EXYt8gHauPaCb,duration
|
||||||
|
R_142iZtlDp1Vam14,duration
|
||||||
|
R_16eRiaoFPG5CpE4,duration
|
||||||
|
R_1aK2JWzCFkpefUg,duration
|
||||||
|
R_1FEuEk6VzuwxZby,duration
|
||||||
|
R_1IsHUv4sb6oOphv,duration
|
||||||
|
R_1J2cryciskOYjOV,duration
|
||||||
|
R_1JFsZ1GXM7jDWmh,duration
|
||||||
|
R_1JlV9H7AJKtNZ8g,duration
|
||||||
|
R_1kgjhkT4sJwhfuV,duration
|
||||||
|
R_1MAMwGkBHTTSyAh,duration
|
||||||
|
R_1O6dV9hTlqpsYjP,duration
|
||||||
|
R_1qatgZwcLPGctnd,age mismatch
|
||||||
|
R_1QE5KaKNkt66Cer,duration
|
||||||
|
R_1QsYazd3eOH62js,duration
|
||||||
|
R_1vwOg7l0kSLHGRX,duration
|
||||||
|
R_1YJ2G01dpxYqKAm,duration
|
||||||
|
R_1YoddNWqybPbaNN,feedback in french
|
||||||
|
R_1ZOjQ97Ph1VtRwp,duration
|
||||||
|
R_347ABt6LFPUeVZS,duration
|
||||||
|
R_34Ain6V2NbEDeQm,duration
|
||||||
|
R_38J0VDB8JE8Dd0o,duration
|
||||||
|
R_3DptQmS26X0Z8Wu,IP duplicate
|
||||||
|
R_3Foc2aYGpXFrbnX,age mismatch + duration
|
||||||
|
R_3HLz0FyaULkIPKu,IP duplicate
|
||||||
|
R_3jUhefm4hAEQ6PC,duration
|
||||||
|
R_3n8b0ndM4habNjB,age mismatch
|
||||||
|
R_3nTLzs9jMwDHbFy,duration
|
||||||
|
R_3rGudTtAd2oVze3,duration
|
||||||
|
R_3t6giyCy5IwZgom,duration
|
||||||
|
R_3WwXkl4IatPYDZ0,age mismatch
|
||||||
|
R_5ByssDsdjMcQgUV,duration
|
||||||
|
R_5cNBH4nxBlH8OSB,duration
|
||||||
|
R_5FkttTgBeMePzhk,sex mismatch
|
||||||
|
R_5FyLW7dHpyFojo5,duration
|
||||||
|
R_5M3urkuYhhSG06E,duration
|
||||||
|
R_5MRp7eFKMm59t14,feedback in french
|
||||||
|
R_5n6H7xuYTQgvFEf,duration
|
||||||
|
R_5rrbHXjKol6Zl9U,duration
|
||||||
|
R_5youAGSa5hLGkuZ,age mismatch + duration
|
||||||
|
R_5z5DYfTnai5Pj3j,duration
|
||||||
|
R_64nOi2TWI4XCYkt,duration
|
||||||
|
R_6BcdSiP0Nibxx1D,duration
|
||||||
|
R_6C4v9kRnGm9Iqyj,IP duplicate
|
||||||
|
R_6CpjN5tJoj8dYuB,duration
|
||||||
|
R_6cwKXrr8R99m5ez,duration
|
||||||
|
R_6F4ld4gRlKjsb06,age mismatch + duration
|
||||||
|
R_6GqjTqXrehkbG0x,duration
|
||||||
|
R_6HCtgHyy16nNMQ4,age mismatch
|
||||||
|
R_6hQN1DUFkxGpDGD,IP duplicate
|
||||||
|
R_6JKscJDUeAt7k1y,age mismatch
|
||||||
|
R_6lKqtees5Z1hj2L,duration
|
||||||
|
R_6m1NYZLedxbAxui,duration
|
||||||
|
R_6pM4ierZhbT1FEb,duration
|
||||||
|
R_6rQCiwlJHKrWWKB,duration
|
||||||
|
R_7AwVrmL8AM0KLKx,duration
|
||||||
|
R_7bH15XzvHpDCZO1,duration
|
||||||
|
R_7Cl7KFkEiuYwdZn,duration
|
||||||
|
R_7EfALTPED13tduG,duration
|
||||||
|
R_7flJBV9qf88XSM5,duration
|
||||||
|
R_7H0dTzsyEC1Pzyh,duration
|
||||||
|
R_7HM0FXjrAoTeGqt,duration
|
||||||
|
R_7HRMvwMPw3OBE7g,duration
|
||||||
|
R_7o7FORJHlgWAahS,age mismatch
|
||||||
|
R_7sTsQ9AI42QQgSV,duration
|
||||||
|
R_7VJCRyovK5KAddn,duration
|
||||||
|
R_7w4ggvRoPBkyTle,duration
|
||||||
|
189
eohi3/dataREVIEW-JAN21/datap 01 - age and sex match.r
Normal file
189
eohi3/dataREVIEW-JAN21/datap 01 - age and sex match.r
Normal file
@ -0,0 +1,189 @@
|
|||||||
|
library(dplyr)
|
||||||
|
|
||||||
|
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")
|
||||||
|
|
||||||
|
# Load the raw export.
# check.names = FALSE keeps the original column headers untouched, and
# na.strings = "NA" means only the literal text "NA" becomes NA; blank cells
# in character columns stay as empty strings.
df <- read.csv(
  "eohi3_raw.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# The CSV is expected to carry a RATIONALE column already.
# Force it to character and turn any NA into an empty string.
rationale_col <- df$RATIONALE
if (!is.character(rationale_col)) {
  rationale_col <- as.character(rationale_col)
}
df$RATIONALE <- replace(rationale_col, is.na(rationale_col), "")
|
||||||
|
|
||||||
|
# Check whether a reported age lies inside a textual age bracket.
#
# Arguments:
#   age_num       A single age; numeric, or a string holding a number.
#   age_range_str A single bracket string such as "46 - 52" or "25 - 31".
#
# Returns:
#   TRUE  if min <= age <= max (both bounds inclusive),
#   FALSE if the age is outside the bracket,
#   NULL  if the check cannot be made (missing/empty input, non-scalar input,
#         non-numeric age, or a bracket that is not "<number> - <number>").
check_age_range <- function(age_num, age_range_str) {
  # NULL and length must be tested first: is.na(NULL) is logical(0), which is
  # not a valid operand for || and would fail before is.null() was reached.
  if (is.null(age_num) || is.null(age_range_str) ||
        length(age_num) != 1L || length(age_range_str) != 1L) {
    return(NULL)
  }
  if (is.na(age_num) || is.na(age_range_str) || trimws(age_range_str) == "") {
    return(NULL)
  }

  # Coerce explicitly so an age read as text is compared numerically rather
  # than lexically ("2" would otherwise sort inside "18".."24").
  age_value <- suppressWarnings(as.numeric(age_num))
  if (is.na(age_value)) {
    return(NULL)
  }

  # Split "min - max" on the hyphen, tolerating any spacing around it.
  bounds <- strsplit(trimws(age_range_str), "\\s*-\\s*")[[1]]
  if (length(bounds) != 2) {
    return(NULL)
  }

  # Unparseable bounds become NA and are reported as "cannot check".
  min_age <- suppressWarnings(as.numeric(trimws(bounds[1])))
  max_age <- suppressWarnings(as.numeric(trimws(bounds[2])))
  if (is.na(min_age) || is.na(max_age)) {
    return(NULL)
  }

  age_value >= min_age && age_value <= max_age
}
|
||||||
|
|
||||||
|
# Test whether values are "empty": NA, an empty string, or a string made up
# only of whitespace. Non-character values are empty only when they are NA.
#
# Arguments:
#   x  NULL, a single value, or an atomic vector.
#
# Returns:
#   TRUE for NULL; otherwise an unnamed logical vector the same length as x
#   (logical(0) for zero-length input), TRUE where the element is empty.
is_empty <- function(x) {
  if (is.null(x)) {
    return(TRUE)
  }

  empty <- unname(is.na(x))
  if (is.character(x)) {
    # For NA elements the comparison is NA, but `TRUE | NA` is TRUE, so the
    # result never contains NA.
    empty <- empty | trimws(x) == ""
  }
  empty
}
|
||||||
|
|
||||||
|
# 1. Check sex match
# A row is flagged only when demo_sex and taq_sex are both non-empty after
# trimming and differ when compared case-insensitively. NA counts as empty.

# Fail with a clear message if either column is absent from the export.
stopifnot(all(c("demo_sex", "taq_sex") %in% names(df)))

# Trim whitespace and map NA to "" for a whole column at once.
trimmed_or_blank <- function(x) {
  out <- trimws(as.character(x))
  out[is.na(out)] <- ""
  out
}

demo_sex_clean <- trimmed_or_blank(df$demo_sex)
taq_sex_clean <- trimmed_or_blank(df$taq_sex)

sex_mismatch <- demo_sex_clean != "" &
  taq_sex_clean != "" &
  tolower(demo_sex_clean) != tolower(taq_sex_clean)
|
||||||
|
|
||||||
|
# 2. Check age range match
# A row is flagged when demo_age is present, taq_age is non-empty, and
# check_age_range() reports FALSE. A NULL result from check_age_range()
# ("could not check") leaves the row unflagged.
age_row_mismatch <- function(i) {
  if (is.na(df$demo_age[i]) || is_empty(df$taq_age[i])) {
    return(FALSE)
  }
  verdict <- check_age_range(df$demo_age[i], df$taq_age[i])
  !is.null(verdict) && !verdict
}
age_mismatch <- vapply(seq_len(nrow(df)), age_row_mismatch, logical(1))

# 3. Check citizenship (taq_cit_1 or taq_cit_2)
# Flag rows where neither citizenship field holds a value.
cit_1_blank <- is_empty(df$taq_cit_1)
cit_2_blank <- is_empty(df$taq_cit_2)
no_cit <- cit_1_blank & cit_2_blank
|
||||||
|
|
||||||
|
# 4. Check IP address duplicates
# Flag every row whose trimmed, non-empty IP address occurs on more than one
# row. If the IPAddress column is absent, no row is flagged.
ip_duplicate <- rep(FALSE, nrow(df))
if ("IPAddress" %in% colnames(df)) {
  # Treat NA as an empty address.
  ip_addresses <- trimws(df$IPAddress)
  ip_addresses[is.na(ip_addresses)] <- ""

  # Scanning from both ends marks all members of a repeated group,
  # including the first occurrence.
  seen_elsewhere <- duplicated(ip_addresses) |
    duplicated(ip_addresses, fromLast = TRUE)

  ip_duplicate <- ip_addresses != "" & seen_elsewhere
}
|
||||||
|
|
||||||
|
# Build RATIONALE column - only populate when there are issues.
# Labels are joined with "; " in a fixed order: sex mismatch, age mismatch,
# no cit, IP duplicate.

# Add `label` to the flagged entries of `parts`: set it outright where the
# entry is still empty, otherwise append it after "; ".
append_issue <- function(parts, flagged, label) {
  hit <- which(flagged)
  parts[hit] <- ifelse(
    parts[hit] == "",
    label,
    paste(parts[hit], label, sep = "; ")
  )
  parts
}

rationale_parts <- rep("", nrow(df))
rationale_parts <- append_issue(rationale_parts, sex_mismatch, "sex mismatch")
rationale_parts <- append_issue(rationale_parts, age_mismatch, "age mismatch")
rationale_parts <- append_issue(rationale_parts, no_cit, "no cit")
rationale_parts <- append_issue(rationale_parts, ip_duplicate, "IP duplicate")

# Overwrite RATIONALE only on rows where an issue was found; rows without an
# issue keep whatever value they already had.
has_issue <- rationale_parts != ""
df$RATIONALE[has_issue] <- rationale_parts[has_issue]
|
||||||
|
|
||||||
|
# Summary - reported twice on purpose: message() goes to stderr (shows up in
# the debug console) and cat() goes to stdout (shows up in a terminal).
issue_counts <- c(
  "Sex mismatches" = sum(sex_mismatch),
  "Age mismatches" = sum(age_mismatch),
  "No citizenship" = sum(no_cit),
  "IP duplicates" = sum(ip_duplicate),
  "Total rows with issues" = sum(rationale_parts != "")
)

message("Validation Summary:")
for (label in names(issue_counts)) {
  message(label, ": ", issue_counts[[label]])
}

cat("Validation Summary:\n", file = stdout())
for (label in names(issue_counts)) {
  cat(paste0(label, ":"), issue_counts[[label]], "\n", file = stdout())
}
flush(stdout())
|
||||||
|
|
||||||
|
# Write the updated data.
# Replace NA with "" in every character column first, so empty cells are
# written as empty strings rather than as missing values.
for (col_name in names(df)) {
  column <- df[[col_name]]
  if (!is.character(column)) {
    next
  }
  df[[col_name]] <- replace(column, is.na(column), "")
}

write.csv(
  df,
  "eohi3_raw2.csv",
  row.names = FALSE,
  na = "",
  quote = TRUE
)
|
||||||
39
eohi3/dataREVIEW-JAN21/datap 02 - citizenship.r
Normal file
39
eohi3/dataREVIEW-JAN21/datap 02 - citizenship.r
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
library(dplyr)
|
||||||
|
|
||||||
|
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")
|
||||||
|
|
||||||
|
# Read the data. check.names = FALSE preserves the original column names, and
# na.strings = "NA" converts only the literal text "NA" to NA, so blank cells
# in character columns stay as empty strings.
df <- read.csv(
  "eohi3_raw.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# Populate `citizenship` from taq_cit_1 and taq_cit_2:
#   - both have a value    -> "Both"
#   - exactly one has one  -> that value
#   - neither has one      -> citizenship keeps its existing value

# Make sure the target column exists. rep() also works for a zero-row frame,
# where assigning a length-one "" would fail.
if (!"citizenship" %in% names(df)) {
  df$citizenship <- rep("", nrow(df))
}

# Convert NA to empty string so both source columns are handled uniformly.
df$taq_cit_1[is.na(df$taq_cit_1)] <- ""
df$taq_cit_2[is.na(df$taq_cit_2)] <- ""

# Decide emptiness on trimmed values so a whitespace-only cell counts as
# empty, matching how empty cells are treated by is_empty() in datap 01.
cit_1 <- trimws(df$taq_cit_1)
cit_2 <- trimws(df$taq_cit_2)
has_cit_1 <- cit_1 != ""
has_cit_2 <- cit_2 != ""

both_have_values <- has_cit_1 & has_cit_2
only_cit1 <- has_cit_1 & !has_cit_2
only_cit2 <- has_cit_2 & !has_cit_1

df$citizenship[both_have_values] <- "Both"
df$citizenship[only_cit1] <- cit_1[only_cit1]
df$citizenship[only_cit2] <- cit_2[only_cit2]

# NOTE: this writes back over the input file in place.
write.csv(df, "eohi3_raw.csv", row.names = FALSE, na = "", quote = TRUE)
|
||||||
130
eohi3/dataREVIEW-JAN21/datap 03 - quotas.r
Normal file
130
eohi3/dataREVIEW-JAN21/datap 03 - quotas.r
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
library(dplyr)
|
||||||
|
|
||||||
|
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")
|
||||||
|
|
||||||
|
# Read the data. check.names = FALSE preserves the original column names, and
# na.strings = "NA" converts only the literal text "NA" to NA.
df <- read.csv(
  "eohi3_raw.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = "NA"
)

# Keep only columns that have a usable name (dplyr requires every column to
# be named).
has_name <- !is.na(names(df)) & names(df) != ""
if (!all(has_name)) {
  df <- df[, has_name, drop = FALSE]
}

# Set to TRUE to save all distributions to a document file
save_to_doc <- TRUE
doc_filename <- "eohi3_quotas.txt"

# Single variable distributions ----
# Each table counts rows per level, most frequent level first.

dist_age <- df %>%
  count(taq_age, sort = TRUE)
print(dist_age)

dist_sex <- df %>%
  count(taq_sex, sort = TRUE)
print(dist_sex)

dist_citizenship <- df %>%
  count(citizenship, sort = TRUE)
print(dist_citizenship)

dist_group <- df %>%
  count(group, sort = TRUE)
print(dist_group)

dist_temporalDO <- df %>%
  count(temporalDO, sort = TRUE)
print(dist_temporalDO)

dist_perspective <- df %>%
  count(perspective, sort = TRUE)
print(dist_perspective)

# Nested distributions ----
# Each table counts rows per pair of levels, ordered by the outer variable
# and then by the inner one.

dist_age_citizenship <- df %>%
  count(citizenship, taq_age) %>%
  arrange(citizenship, taq_age)
print(dist_age_citizenship)

dist_sex_citizenship <- df %>%
  count(citizenship, taq_sex) %>%
  arrange(citizenship, taq_sex)
print(dist_sex_citizenship)

dist_age_temporalDO <- df %>%
  count(temporalDO, taq_age) %>%
  arrange(temporalDO, taq_age)
print(dist_age_temporalDO)

dist_age_perspective <- df %>%
  count(perspective, taq_age) %>%
  arrange(perspective, taq_age)
print(dist_age_perspective)

dist_sex_temporalDO <- df %>%
  count(temporalDO, taq_sex) %>%
  arrange(temporalDO, taq_sex)
print(dist_sex_temporalDO)

dist_sex_perspective <- df %>%
  count(perspective, taq_sex) %>%
  arrange(perspective, taq_sex)
print(dist_sex_perspective)
|
||||||
|
|
||||||
|
# =============================================================================
# OPTIONAL: SAVE ALL DISTRIBUTIONS TO DOCUMENT
# =============================================================================
# When save_to_doc is TRUE, the printed form of every distribution table is
# written to doc_filename under two section headers.

if (save_to_doc) {
  # Heading text -> table, in the order they appear in the report.
  single_tables <- list(
    "Distribution of taq_age:" = dist_age,
    "Distribution of taq_sex:" = dist_sex,
    "Distribution of citizenship:" = dist_citizenship,
    "Distribution of group:" = dist_group,
    "Distribution of temporalDO:" = dist_temporalDO,
    "Distribution of perspective:" = dist_perspective
  )
  nested_tables <- list(
    "Age within Citizenship:" = dist_age_citizenship,
    "Sex within Citizenship:" = dist_sex_citizenship,
    "Age within temporalDO:" = dist_age_temporalDO,
    "Age within perspective:" = dist_age_perspective,
    "Sex within temporalDO:" = dist_sex_temporalDO,
    "Sex within perspective:" = dist_sex_perspective
  )

  # Print each table under its heading. Tables are separated by a blank-line
  # gap ("\n\n"); `last_gap` is what follows the final table of the group.
  write_tables <- function(tables, last_gap) {
    headings <- names(tables)
    for (k in seq_along(tables)) {
      cat(headings[k], "\n", sep = "")
      print(tables[[k]])
      cat(if (k == length(tables)) last_gap else "\n\n")
    }
  }

  sink(doc_filename)
  # `finally` restores console output even if a print fails midway;
  # otherwise all later output would keep going into the file.
  tryCatch(
    {
      cat("DISTRIBUTION REPORT\n")
      cat("==================\n\n")

      cat("SINGLE VARIABLE DISTRIBUTIONS\n")
      cat("------------------------------\n\n")
      write_tables(single_tables, last_gap = "\n\n")

      cat("NESTED DISTRIBUTIONS\n")
      cat("---------------------\n\n")
      write_tables(nested_tables, last_gap = "\n")
    },
    finally = sink()
  )

  cat("Distributions saved to:", doc_filename, "\n")
}
|
||||||
177
eohi3/dataREVIEW-JAN21/eohi3_quotas.txt
Normal file
177
eohi3/dataREVIEW-JAN21/eohi3_quotas.txt
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
DISTRIBUTION REPORT
|
||||||
|
|
||||||
|
==================
|
||||||
|
|
||||||
|
|
||||||
|
SINGLE VARIABLE DISTRIBUTIONS
|
||||||
|
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
Distribution of taq_age:
|
||||||
|
|
||||||
|
taq_age n
|
||||||
|
1 18 - 24 73
|
||||||
|
2 53 - 59 67
|
||||||
|
3 60 - 66 67
|
||||||
|
4 67 - 73 65
|
||||||
|
5 39 - 45 64
|
||||||
|
6 46 - 52 63
|
||||||
|
7 25 - 31 62
|
||||||
|
8 32 - 38 61
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Distribution of taq_sex:
|
||||||
|
|
||||||
|
taq_sex n
|
||||||
|
1 Female 260
|
||||||
|
2 Male 257
|
||||||
|
3 Prefer not to say 5
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Distribution of citizenship:
|
||||||
|
|
||||||
|
citizenship n
|
||||||
|
1 American 262
|
||||||
|
2 Canadian 258
|
||||||
|
3 Both 2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Distribution of group:
|
||||||
|
|
||||||
|
group n
|
||||||
|
1 01FPV 177
|
||||||
|
2 03VFP 174
|
||||||
|
3 02PVF 171
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Distribution of temporalDO:
|
||||||
|
|
||||||
|
temporalDO n
|
||||||
|
1 past 262
|
||||||
|
2 future 260
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Distribution of perspective:
|
||||||
|
|
||||||
|
perspective n
|
||||||
|
1 other 261
|
||||||
|
2 self 261
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
NESTED DISTRIBUTIONS
|
||||||
|
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
|
||||||
|
Age within Citizenship:
|
||||||
|
|
||||||
|
citizenship taq_age n
|
||||||
|
1 American 18 - 24 38
|
||||||
|
2 American 25 - 31 30
|
||||||
|
3 American 32 - 38 29
|
||||||
|
4 American 39 - 45 33
|
||||||
|
5 American 46 - 52 31
|
||||||
|
6 American 53 - 59 34
|
||||||
|
7 American 60 - 66 34
|
||||||
|
8 American 67 - 73 33
|
||||||
|
9 Both 32 - 38 1
|
||||||
|
10 Both 46 - 52 1
|
||||||
|
11 Canadian 18 - 24 35
|
||||||
|
12 Canadian 25 - 31 32
|
||||||
|
13 Canadian 32 - 38 31
|
||||||
|
14 Canadian 39 - 45 31
|
||||||
|
15 Canadian 46 - 52 31
|
||||||
|
16 Canadian 53 - 59 33
|
||||||
|
17 Canadian 60 - 66 33
|
||||||
|
18 Canadian 67 - 73 32
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Sex within Citizenship:
|
||||||
|
|
||||||
|
citizenship taq_sex n
|
||||||
|
1 American Female 130
|
||||||
|
2 American Male 129
|
||||||
|
3 American Prefer not to say 3
|
||||||
|
4 Both Female 1
|
||||||
|
5 Both Male 1
|
||||||
|
6 Canadian Female 129
|
||||||
|
7 Canadian Male 127
|
||||||
|
8 Canadian Prefer not to say 2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Age within temporalDO:
|
||||||
|
|
||||||
|
temporalDO taq_age n
|
||||||
|
1 future 18 - 24 38
|
||||||
|
2 future 25 - 31 31
|
||||||
|
3 future 32 - 38 29
|
||||||
|
4 future 39 - 45 34
|
||||||
|
5 future 46 - 52 35
|
||||||
|
6 future 53 - 59 36
|
||||||
|
7 future 60 - 66 29
|
||||||
|
8 future 67 - 73 28
|
||||||
|
9 past 18 - 24 35
|
||||||
|
10 past 25 - 31 31
|
||||||
|
11 past 32 - 38 32
|
||||||
|
12 past 39 - 45 30
|
||||||
|
13 past 46 - 52 28
|
||||||
|
14 past 53 - 59 31
|
||||||
|
15 past 60 - 66 38
|
||||||
|
16 past 67 - 73 37
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Age within perspective:
|
||||||
|
|
||||||
|
perspective taq_age n
|
||||||
|
1 other 18 - 24 41
|
||||||
|
2 other 25 - 31 36
|
||||||
|
3 other 32 - 38 28
|
||||||
|
4 other 39 - 45 32
|
||||||
|
5 other 46 - 52 28
|
||||||
|
6 other 53 - 59 33
|
||||||
|
7 other 60 - 66 30
|
||||||
|
8 other 67 - 73 33
|
||||||
|
9 self 18 - 24 32
|
||||||
|
10 self 25 - 31 26
|
||||||
|
11 self 32 - 38 33
|
||||||
|
12 self 39 - 45 32
|
||||||
|
13 self 46 - 52 35
|
||||||
|
14 self 53 - 59 34
|
||||||
|
15 self 60 - 66 37
|
||||||
|
16 self 67 - 73 32
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Sex within temporalDO:
|
||||||
|
|
||||||
|
temporalDO taq_sex n
|
||||||
|
1 future Female 130
|
||||||
|
2 future Male 129
|
||||||
|
3 future Prefer not to say 1
|
||||||
|
4 past Female 130
|
||||||
|
5 past Male 128
|
||||||
|
6 past Prefer not to say 4
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Sex within perspective:
|
||||||
|
|
||||||
|
perspective taq_sex n
|
||||||
|
1 other Female 130
|
||||||
|
2 other Male 128
|
||||||
|
3 other Prefer not to say 3
|
||||||
|
4 self Female 130
|
||||||
|
5 self Male 129
|
||||||
|
6 self Prefer not to say 2
|
||||||
|
|
||||||
|
|
||||||
531
eohi3/dataREVIEW-JAN21/eohi3_raw.csv
Normal file
531
eohi3/dataREVIEW-JAN21/eohi3_raw.csv
Normal file
File diff suppressed because one or more lines are too long
532
eohi3/dataREVIEW-JAN21/eohi3_raw2.csv
Normal file
532
eohi3/dataREVIEW-JAN21/eohi3_raw2.csv
Normal file
File diff suppressed because one or more lines are too long
586
eohi3/dataREVIEW-JAN21/eohi3_unprocessed_final.csv
Normal file
586
eohi3/dataREVIEW-JAN21/eohi3_unprocessed_final.csv
Normal file
File diff suppressed because one or more lines are too long
523
eohi3/eohi3.csv
Normal file
523
eohi3/eohi3.csv
Normal file
File diff suppressed because one or more lines are too long
61
eohi3/test-DEC29/eohi3-test.csv
Normal file
61
eohi3/test-DEC29/eohi3-test.csv
Normal file
File diff suppressed because one or more lines are too long
BIN
lit review/Diener_lifeScale.pdf
Normal file
BIN
lit review/Diener_lifeScale.pdf
Normal file
Binary file not shown.
BIN
lit review/brietzke_ehi2.pdf
Normal file
BIN
lit review/brietzke_ehi2.pdf
Normal file
Binary file not shown.
BIN
lit review/carmen_ehi2.pdf
Normal file
BIN
lit review/carmen_ehi2.pdf
Normal file
Binary file not shown.
BIN
lit review/fleming_pro-retro.pdf
Normal file
BIN
lit review/fleming_pro-retro.pdf
Normal file
Binary file not shown.
BIN
lit review/guo_ehi2.pdf
Normal file
BIN
lit review/guo_ehi2.pdf
Normal file
Binary file not shown.
BIN
lit review/gutral_ehi2.pdf
Normal file
BIN
lit review/gutral_ehi2.pdf
Normal file
Binary file not shown.
BIN
lit review/haas_ehi1.pdf
Normal file
BIN
lit review/haas_ehi1.pdf
Normal file
Binary file not shown.
BIN
lit review/haddock_futResearch.pdf
Normal file
BIN
lit review/haddock_futResearch.pdf
Normal file
Binary file not shown.
BIN
lit review/harris_ehi1.pdf
Normal file
BIN
lit review/harris_ehi1.pdf
Normal file
Binary file not shown.
BIN
lit review/hershfield_FUTforecast.pdf
Normal file
BIN
lit review/hershfield_FUTforecast.pdf
Normal file
Binary file not shown.
BIN
lit review/lechner_valueScale.pdf
Normal file
BIN
lit review/lechner_valueScale.pdf
Normal file
Binary file not shown.
128
lit review/pdf_to_txt.py
Normal file
128
lit review/pdf_to_txt.py
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
#!/home/ladmin/miniconda3/envs/nlp/bin/python
|
||||||
|
"""
|
||||||
|
PDF to Text Converter
|
||||||
|
Converts PDF files to plain text files.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python pdf_to_txt.py <input.pdf> # Creates input.txt
|
||||||
|
python pdf_to_txt.py <input.pdf> <output.txt> # Custom output name
|
||||||
|
python pdf_to_txt.py --all # Convert all PDFs in current directory
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
pip install pypdf
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
try:
|
||||||
|
from pypdf import PdfReader
|
||||||
|
except ImportError:
|
||||||
|
print("Error: pypdf library not found.")
|
||||||
|
print("Please install it with: pip install pypdf")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_to_text(pdf_path, output_path=None):
    """
    Convert a PDF file to a plain-text file.

    Text is extracted page by page. Each page that yields text is written
    under a "--- Page N ---" header; pages with no extractable text are
    skipped. Progress and errors are reported with print().

    Args:
        pdf_path: Path to the PDF file (str or Path).
        output_path: Path to the output text file. Defaults to the PDF path
            with its suffix replaced by ".txt".

    Returns:
        True if the text file was written, False if the input file does not
        exist or any error occurred. Errors are reported, never raised.
    """
    # Label for messages. It is set before the try block so the error handler
    # has something printable even when Path(pdf_path) itself fails
    # (e.g. pdf_path is None).
    display_name = str(pdf_path)

    try:
        pdf_path = Path(pdf_path)
        display_name = pdf_path.name

        if not pdf_path.exists():
            print(f"Error: File not found: {pdf_path}")
            return False

        # Determine output path
        if output_path is None:
            output_path = pdf_path.with_suffix('.txt')
        else:
            output_path = Path(output_path)

        print(f"Converting: {pdf_path.name}")

        reader = PdfReader(str(pdf_path))

        # Collect one chunk per page that has text; page numbers are 1-based.
        text_content = []
        for page_number, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                text_content.append(f"--- Page {page_number} ---\n{page_text}\n")

        full_text = "\n".join(text_content)
        output_path.write_text(full_text, encoding='utf-8')

        print(f"✓ Created: {output_path.name} ({len(reader.pages)} pages, {len(full_text):,} characters)")
        return True

    except Exception as e:
        print(f"✗ Error processing {display_name}: {str(e)}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def convert_all_pdfs():
    """
    Run pdf_to_text() on every "*.pdf" file in the current working directory.

    Prints how many files were found, then a final tally of successful and
    failed conversions. Returns None.
    """
    pdf_files = [*Path.cwd().glob("*.pdf")]

    if len(pdf_files) == 0:
        print("No PDF files found in the current directory.")
        return

    print(f"Found {len(pdf_files)} PDF file(s) to convert.\n")

    # Convert every file first, then tally the True/False outcomes.
    outcomes = [pdf_to_text(pdf_file) for pdf_file in pdf_files]
    successful = sum(1 for ok in outcomes if ok)
    failed = len(outcomes) - successful

    print(f"\n{'='*60}")
    print(f"Conversion complete: {successful} successful, {failed} failed")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Command-line entry point.

    No arguments            -> print the module usage text, exit status 1.
    "--all" as first arg    -> convert every PDF in the current directory.
    <input.pdf>             -> convert one PDF, output name derived from it.
    <input.pdf> <out.txt>   -> convert one PDF to the given output path.
    More than two arguments -> print an error and the usage text, exit 1.
    """
    args = sys.argv[1:]

    if not args:
        print(__doc__)
        sys.exit(1)

    # Convert all PDFs in directory
    if args[0] == "--all":
        convert_all_pdfs()
        return

    if len(args) > 2:
        print("Error: Too many arguments")
        print(__doc__)
        sys.exit(1)

    # One argument: input only. Two arguments: input plus custom output name.
    pdf_to_text(*args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
||||||
BIN
lit review/quoidbach.sm.pdf
Normal file
BIN
lit review/quoidbach.sm.pdf
Normal file
Binary file not shown.
BIN
lit review/quoidbach_ehi1.pdf
Normal file
BIN
lit review/quoidbach_ehi1.pdf
Normal file
Binary file not shown.
BIN
lit review/reiff_ehi2.pdf
Normal file
BIN
lit review/reiff_ehi2.pdf
Normal file
Binary file not shown.
BIN
lit review/rutt_ehi2.pdf
Normal file
BIN
lit review/rutt_ehi2.pdf
Normal file
Binary file not shown.
BIN
lit review/sachi_ehi1.pdf
Normal file
BIN
lit review/sachi_ehi1.pdf
Normal file
Binary file not shown.
BIN
lit review/siedlecka_pro-retro.pdf
Normal file
BIN
lit review/siedlecka_pro-retro.pdf
Normal file
Binary file not shown.
BIN
lit review/vanRyzin_ehi1.pdf
Normal file
BIN
lit review/vanRyzin_ehi1.pdf
Normal file
Binary file not shown.
BIN
lit review/verner_FUTforecast.pdf
Normal file
BIN
lit review/verner_FUTforecast.pdf
Normal file
Binary file not shown.
BIN
lit review/wilson_FUTforecast.pdf
Normal file
BIN
lit review/wilson_FUTforecast.pdf
Normal file
Binary file not shown.
BIN
lit review/yue_ehi2.pdf
Normal file
BIN
lit review/yue_ehi2.pdf
Normal file
Binary file not shown.
BIN
manuscript/EOHI - M&Methods - 2024APR28.docx
Normal file
BIN
manuscript/EOHI - M&Methods - 2024APR28.docx
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user