eohi3-updates (#3)

updating eohi folder w/ third eohi exp.

Reviewed-on: #3
Co-authored-by: Irina Levit <irina.levit.rn@gmail.com>
Co-committed-by: Irina Levit <irina.levit.rn@gmail.com>
This commit is contained in:
Irina Levit 2026-01-26 16:30:09 -05:00 committed by ira
parent 5e7ad6be15
commit ba54687da2
38 changed files with 4967 additions and 0 deletions

55
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,55 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "R-Debugger",
"name": "Launch R-Workspace",
"request": "launch",
"debugMode": "workspace",
"workingDirectory": "${workspaceFolder}",
"splitOverwrittenOutput": true
},
{
"type": "R-Debugger",
"name": "Debug R-File",
"request": "launch",
"debugMode": "file",
"workingDirectory": "${workspaceFolder}",
"file": "${file}",
"splitOverwrittenOutput": true,
"stopOnEntry": false
},
{
"type": "R-Debugger",
"name": "Debug R-Function",
"request": "launch",
"debugMode": "function",
"workingDirectory": "${workspaceFolder}",
"file": "${file}",
"mainFunction": "main",
"allowGlobalDebugging": false,
"splitOverwrittenOutput": true
},
{
"type": "R-Debugger",
"name": "Debug R-Package",
"request": "launch",
"debugMode": "workspace",
"workingDirectory": "${workspaceFolder}",
"includePackageScopes": true,
"loadPackages": [
"."
],
"splitOverwrittenOutput": true
},
{
"type": "R-Debugger",
"request": "attach",
"name": "Attach to R process",
"splitOverwrittenOutput": true
}
]
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,68 @@
ResponseId,RATIONALE
R_12EXYt8gHauPaCb,duration
R_142iZtlDp1Vam14,duration
R_16eRiaoFPG5CpE4,duration
R_1aK2JWzCFkpefUg,duration
R_1FEuEk6VzuwxZby,duration
R_1IsHUv4sb6oOphv,duration
R_1J2cryciskOYjOV,duration
R_1JFsZ1GXM7jDWmh,duration
R_1JlV9H7AJKtNZ8g,duration
R_1kgjhkT4sJwhfuV,duration
R_1MAMwGkBHTTSyAh,duration
R_1O6dV9hTlqpsYjP,duration
R_1qatgZwcLPGctnd,age mismatch
R_1QE5KaKNkt66Cer,duration
R_1QsYazd3eOH62js,duration
R_1vwOg7l0kSLHGRX,duration
R_1YJ2G01dpxYqKAm,duration
R_1YoddNWqybPbaNN,feedback in french
R_1ZOjQ97Ph1VtRwp,duration
R_347ABt6LFPUeVZS,duration
R_34Ain6V2NbEDeQm,duration
R_38J0VDB8JE8Dd0o,duration
R_3DptQmS26X0Z8Wu,IP duplicate
R_3Foc2aYGpXFrbnX,age mismatch + duration
R_3HLz0FyaULkIPKu,IP duplicate
R_3jUhefm4hAEQ6PC,duration
R_3n8b0ndM4habNjB,age mismatch
R_3nTLzs9jMwDHbFy,duration
R_3rGudTtAd2oVze3,duration
R_3t6giyCy5IwZgom,duration
R_3WwXkl4IatPYDZ0,age mismatch
R_5ByssDsdjMcQgUV,duration
R_5cNBH4nxBlH8OSB,duration
R_5FkttTgBeMePzhk,sex mismatch
R_5FyLW7dHpyFojo5,duration
R_5M3urkuYhhSG06E,duration
R_5MRp7eFKMm59t14,feedback in french
R_5n6H7xuYTQgvFEf,duration
R_5rrbHXjKol6Zl9U,duration
R_5youAGSa5hLGkuZ,age mismatch + duration
R_5z5DYfTnai5Pj3j,duration
R_64nOi2TWI4XCYkt,duration
R_6BcdSiP0Nibxx1D,duration
R_6C4v9kRnGm9Iqyj,IP duplicate
R_6CpjN5tJoj8dYuB,duration
R_6cwKXrr8R99m5ez,duration
R_6F4ld4gRlKjsb06,age mismatch + duration
R_6GqjTqXrehkbG0x,duration
R_6HCtgHyy16nNMQ4,age mismatch
R_6hQN1DUFkxGpDGD,IP duplicate
R_6JKscJDUeAt7k1y,age mismatch
R_6lKqtees5Z1hj2L,duration
R_6m1NYZLedxbAxui,duration
R_6pM4ierZhbT1FEb,duration
R_6rQCiwlJHKrWWKB,duration
R_7AwVrmL8AM0KLKx,duration
R_7bH15XzvHpDCZO1,duration
R_7Cl7KFkEiuYwdZn,duration
R_7EfALTPED13tduG,duration
R_7flJBV9qf88XSM5,duration
R_7H0dTzsyEC1Pzyh,duration
R_7HM0FXjrAoTeGqt,duration
R_7HRMvwMPw3OBE7g,duration
R_7o7FORJHlgWAahS,age mismatch
R_7sTsQ9AI42QQgSV,duration
R_7VJCRyovK5KAddn,duration
R_7w4ggvRoPBkyTle,duration
1 ResponseId RATIONALE
2 R_12EXYt8gHauPaCb duration
3 R_142iZtlDp1Vam14 duration
4 R_16eRiaoFPG5CpE4 duration
5 R_1aK2JWzCFkpefUg duration
6 R_1FEuEk6VzuwxZby duration
7 R_1IsHUv4sb6oOphv duration
8 R_1J2cryciskOYjOV duration
9 R_1JFsZ1GXM7jDWmh duration
10 R_1JlV9H7AJKtNZ8g duration
11 R_1kgjhkT4sJwhfuV duration
12 R_1MAMwGkBHTTSyAh duration
13 R_1O6dV9hTlqpsYjP duration
14 R_1qatgZwcLPGctnd age mismatch
15 R_1QE5KaKNkt66Cer duration
16 R_1QsYazd3eOH62js duration
17 R_1vwOg7l0kSLHGRX duration
18 R_1YJ2G01dpxYqKAm duration
19 R_1YoddNWqybPbaNN feedback in french
20 R_1ZOjQ97Ph1VtRwp duration
21 R_347ABt6LFPUeVZS duration
22 R_34Ain6V2NbEDeQm duration
23 R_38J0VDB8JE8Dd0o duration
24 R_3DptQmS26X0Z8Wu IP duplicate
25 R_3Foc2aYGpXFrbnX age mismatch + duration
26 R_3HLz0FyaULkIPKu IP duplicate
27 R_3jUhefm4hAEQ6PC duration
28 R_3n8b0ndM4habNjB age mismatch
29 R_3nTLzs9jMwDHbFy duration
30 R_3rGudTtAd2oVze3 duration
31 R_3t6giyCy5IwZgom duration
32 R_3WwXkl4IatPYDZ0 age mismatch
33 R_5ByssDsdjMcQgUV duration
34 R_5cNBH4nxBlH8OSB duration
35 R_5FkttTgBeMePzhk sex mismatch
36 R_5FyLW7dHpyFojo5 duration
37 R_5M3urkuYhhSG06E duration
38 R_5MRp7eFKMm59t14 feedback in french
39 R_5n6H7xuYTQgvFEf duration
40 R_5rrbHXjKol6Zl9U duration
41 R_5youAGSa5hLGkuZ age mismatch + duration
42 R_5z5DYfTnai5Pj3j duration
43 R_64nOi2TWI4XCYkt duration
44 R_6BcdSiP0Nibxx1D duration
45 R_6C4v9kRnGm9Iqyj IP duplicate
46 R_6CpjN5tJoj8dYuB duration
47 R_6cwKXrr8R99m5ez duration
48 R_6F4ld4gRlKjsb06 age mismatch + duration
49 R_6GqjTqXrehkbG0x duration
50 R_6HCtgHyy16nNMQ4 age mismatch
51 R_6hQN1DUFkxGpDGD IP duplicate
52 R_6JKscJDUeAt7k1y age mismatch
53 R_6lKqtees5Z1hj2L duration
54 R_6m1NYZLedxbAxui duration
55 R_6pM4ierZhbT1FEb duration
56 R_6rQCiwlJHKrWWKB duration
57 R_7AwVrmL8AM0KLKx duration
58 R_7bH15XzvHpDCZO1 duration
59 R_7Cl7KFkEiuYwdZn duration
60 R_7EfALTPED13tduG duration
61 R_7flJBV9qf88XSM5 duration
62 R_7H0dTzsyEC1Pzyh duration
63 R_7HM0FXjrAoTeGqt duration
64 R_7HRMvwMPw3OBE7g duration
65 R_7o7FORJHlgWAahS age mismatch
66 R_7sTsQ9AI42QQgSV duration
67 R_7VJCRyovK5KAddn duration
68 R_7w4ggvRoPBkyTle duration

View File

@ -0,0 +1,189 @@
library(dplyr)
# Work inside the data-review folder so the relative CSV paths below resolve.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")

# Read the raw export.
# - check.names = FALSE keeps the original column names untouched.
# - na.strings = "NA" means only the literal text "NA" is read as NA in
#   character columns; blank cells in those columns stay as empty strings.
df <- read.csv("eohi3_raw.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = "NA")

# RATIONALE is expected to be present in the CSV. If it is not, create it as
# an empty character column instead of failing on the assignments below
# (df$RATIONALE would be NULL and cannot be coerced into a column).
if (!"RATIONALE" %in% names(df)) {
  df$RATIONALE <- rep("", nrow(df))
}

# Normalise RATIONALE to a character column that uses "" rather than NA.
if (!is.character(df$RATIONALE)) {
  df$RATIONALE <- as.character(df$RATIONALE)
}
df$RATIONALE[is.na(df$RATIONALE)] <- ""
# Check whether an age lies inside a textual age bracket.
#
# Args:
#   age_num:       a single age (numeric, or text that parses as a number).
#   age_range_str: a single bracket label of the form "<min> - <max>",
#                  e.g. "46 - 52" or "25 - 31".
#
# Returns:
#   TRUE when min <= age <= max (bounds inclusive), FALSE when the age is
#   outside the bracket, and NULL when the check cannot be performed
#   (missing, non-scalar or unparseable input) so that callers can tell
#   "skipped" apart from "mismatch".
check_age_range <- function(age_num, age_range_str) {
  # NULL / non-scalar input has to be rejected before is.na(): is.na(NULL)
  # is logical(0), which is not a valid operand for `||`.
  if (is.null(age_num) || is.null(age_range_str) ||
      length(age_num) != 1 || length(age_range_str) != 1) {
    return(NULL)
  }
  if (is.na(age_num) || is.na(age_range_str)) {
    return(NULL)
  }

  range_label <- trimws(age_range_str)
  if (range_label == "") {
    return(NULL)
  }

  # Coerce explicitly so that an age read as text is compared as a number
  # (as strings, "9" would sort after "10").
  age_value <- suppressWarnings(as.numeric(age_num))
  if (is.na(age_value)) {
    return(NULL)
  }

  # Split "min - max" on the dash, tolerating any whitespace around it.
  range_parts <- strsplit(range_label, "\\s*-\\s*")[[1]]
  if (length(range_parts) != 2) {
    return(NULL) # Not a "<min> - <max>" label
  }
  bounds <- suppressWarnings(as.numeric(trimws(range_parts)))
  if (anyNA(bounds)) {
    return(NULL) # One of the bounds is not a number
  }

  age_value >= bounds[1] && age_value <= bounds[2]
}
# Report which values are "empty": NA, or (for character input) a string
# that is blank after trimming whitespace.
#
# Args:
#   x: NULL, a single value, or an atomic vector.
#
# Returns:
#   TRUE for NULL. Otherwise a logical vector of the same length as x
#   (a single TRUE/FALSE for a single value, logical(0) for zero-length
#   input). Non-character values are only empty when they are NA.
is_empty <- function(x) {
  if (is.null(x)) return(TRUE)

  empty <- unname(is.na(x))
  if (is.character(x)) {
    # `!is.na(x) &` keeps NA out of the comparison so the result never
    # contains NA; trimws("") is "", so this also covers plain "".
    empty <- empty | (!is.na(x) & unname(trimws(x)) == "")
  }
  empty
}
# ---- 1. Sex: demo_sex vs. taq_sex ----
# A row is flagged only when both answers are present and they differ after
# trimming whitespace and ignoring case. NA counts as "not present".
demo_sex_clean <- ifelse(is.na(df$demo_sex), "", trimws(df$demo_sex))
taq_sex_clean <- ifelse(is.na(df$taq_sex), "", trimws(df$taq_sex))
both_sex_present <- demo_sex_clean != "" & taq_sex_clean != ""
sex_mismatch <- both_sex_present & tolower(demo_sex_clean) != tolower(taq_sex_clean)

# ---- 2. Age: demo_age must fall inside the taq_age bracket ----
# Rows with a missing demo_age or an empty taq_age are skipped. A row is
# flagged only when check_age_range() gives a definite FALSE; its NULL
# ("could not check") result is treated as "no mismatch".
age_mismatch <- vapply(seq_len(nrow(df)), function(row) {
  if (is.na(df$demo_age[row]) || is_empty(df$taq_age[row])) {
    return(FALSE)
  }
  verdict <- check_age_range(df$demo_age[row], df$taq_age[row])
  !is.null(verdict) && !verdict
}, logical(1))

# ---- 3. Citizenship: neither taq_cit_1 nor taq_cit_2 answered ----
cit1_missing <- is_empty(df$taq_cit_1)
cit2_missing <- is_empty(df$taq_cit_2)
no_cit <- cit1_missing & cit2_missing

# ---- 4. IP address: non-empty addresses that occur on more than one row ----
# Every row sharing a repeated address is flagged (first occurrence included).
# The check is skipped entirely when the file has no IPAddress column.
ip_duplicate <- rep(FALSE, nrow(df))
if ("IPAddress" %in% colnames(df)) {
  ip_addresses <- ifelse(is.na(df$IPAddress), "", trimws(df$IPAddress))
  seen_elsewhere <- duplicated(ip_addresses) | duplicated(ip_addresses, fromLast = TRUE)
  ip_duplicate <- seen_elsewhere & ip_addresses != ""
}
# ---- Assemble the RATIONALE text for each row ----
# Each flagged check contributes its label; several labels on one row are
# joined with "; " in this fixed order. Rows with no flag keep "".
issue_flags <- list(
  "sex mismatch" = sex_mismatch,
  "age mismatch" = age_mismatch,
  "no cit"       = no_cit,
  "IP duplicate" = ip_duplicate
)

rationale_parts <- rep("", nrow(df))
for (issue_label in names(issue_flags)) {
  flagged_rows <- which(issue_flags[[issue_label]])
  if (length(flagged_rows) == 0) next

  so_far <- rationale_parts[flagged_rows]
  # First label for the row is used as-is; later ones are appended.
  rationale_parts[flagged_rows] <- ifelse(
    so_far == "",
    issue_label,
    paste(so_far, issue_label, sep = "; ")
  )
}

# Write the assembled text into RATIONALE only for rows that have at least
# one issue; every other row keeps whatever RATIONALE it already had.
rows_with_issues <- which(rationale_parts != "")
df$RATIONALE[rows_with_issues] <- rationale_parts[rows_with_issues]
# ---- Summary ----
# The same figures are emitted twice on purpose: once through message() and
# once through cat() on stdout, so they show up in whichever console is used.
issue_counts <- c(
  "Sex mismatches" = sum(sex_mismatch),
  "Age mismatches" = sum(age_mismatch),
  "No citizenship" = sum(no_cit),
  "IP duplicates" = sum(ip_duplicate),
  "Total rows with issues" = sum(rationale_parts != "")
)

message("Validation Summary:")
for (count_label in names(issue_counts)) {
  message(count_label, ": ", issue_counts[[count_label]])
}

cat("Validation Summary:\n", file = stdout())
for (count_label in names(issue_counts)) {
  cat(paste0(count_label, ":"), issue_counts[[count_label]], "\n", file = stdout())
}
flush(stdout())

# ---- Write the updated data ----
# Replace NA with "" in every character column before writing, so those
# cells are written as empty strings.
for (col_name in names(df)) {
  column <- df[[col_name]]
  if (!is.character(column)) next
  column[is.na(column)] <- ""
  df[[col_name]] <- column
}
write.csv(df, "eohi3_raw2.csv", row.names = FALSE, na = "", quote = TRUE)

View File

@ -0,0 +1,39 @@
library(dplyr)
# Work inside the data-review folder so the relative CSV paths below resolve.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")

# Read the data.
# - check.names = FALSE keeps the original column names untouched.
# - na.strings = "NA" means only the literal text "NA" is read as NA in
#   character columns; blank cells in those columns stay as empty strings.
df <- read.csv("eohi3_raw.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = "NA")

# Populate `citizenship` from taq_cit_1 and taq_cit_2:
#   - both answered      -> "Both"
#   - exactly one answered -> that answer
#   - neither answered   -> citizenship keeps its existing value

# Make sure the target column exists (rep() also works for a zero-row file).
if (!"citizenship" %in% names(df)) {
  df$citizenship <- rep("", nrow(df))
}

# Use "" rather than NA in the source columns so the comparisons below never
# produce NA.
df$taq_cit_1[is.na(df$taq_cit_1)] <- ""
df$taq_cit_2[is.na(df$taq_cit_2)] <- ""

# Decide "answered" on the trimmed text, so a whitespace-only cell is not
# mistaken for an answer (which would otherwise yield a spurious "Both").
cit1_value <- trimws(df$taq_cit_1)
cit2_value <- trimws(df$taq_cit_2)
has_cit1 <- cit1_value != ""
has_cit2 <- cit2_value != ""

both_have_values <- has_cit1 & has_cit2
only_cit1 <- has_cit1 & !has_cit2
only_cit2 <- has_cit2 & !has_cit1

df$citizenship[both_have_values] <- "Both"
df$citizenship[only_cit1] <- cit1_value[only_cit1]
df$citizenship[only_cit2] <- cit2_value[only_cit2]

# NOTE(review): this overwrites the input file in place - confirm that no
# untouched copy of the raw export needs to be kept alongside it.
write.csv(df, "eohi3_raw.csv", row.names = FALSE, na = "", quote = TRUE)

View File

@ -0,0 +1,130 @@
library(dplyr)
# Work inside the data-review folder so the relative file paths below resolve.
setwd("/home/ladmin/Documents/DND/EOHI/eohi3/dataREVIEW-JAN21")

# Read the data.
# - check.names = FALSE keeps the original column names untouched.
# - na.strings = "NA" means only the literal text "NA" is read as NA in
#   character columns; blank cells in those columns stay as empty strings.
df <- read.csv("eohi3_raw.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = "NA")

# Drop every column whose name is empty or NA (dplyr requires all columns to
# have names).
unnamed_cols <- is.na(names(df)) | names(df) == ""
if (any(unnamed_cols)) {
  df <- df[, !unnamed_cols, drop = FALSE]
}

# Report settings: when save_to_doc is TRUE, all distributions are also
# written to doc_filename.
save_to_doc <- TRUE
doc_filename <- "eohi3_quotas.txt"
# =============================================================================
# SINGLE VARIABLE DISTRIBUTIONS
# One row per distinct value, with its row count `n`, most frequent first.
# =============================================================================
dist_age <- count(df, taq_age, sort = TRUE)
print(dist_age)

dist_sex <- count(df, taq_sex, sort = TRUE)
print(dist_sex)

dist_citizenship <- count(df, citizenship, sort = TRUE)
print(dist_citizenship)

dist_group <- count(df, group, sort = TRUE)
print(dist_group)

dist_temporalDO <- count(df, temporalDO, sort = TRUE)
print(dist_temporalDO)

dist_perspective <- count(df, perspective, sort = TRUE)
print(dist_perspective)

# =============================================================================
# NESTED DISTRIBUTIONS
# Counts for each pair of values, ordered by the outer variable and then by
# the inner one.
# =============================================================================
dist_age_citizenship <- arrange(count(df, citizenship, taq_age), citizenship, taq_age)
print(dist_age_citizenship)

dist_sex_citizenship <- arrange(count(df, citizenship, taq_sex), citizenship, taq_sex)
print(dist_sex_citizenship)

dist_age_temporalDO <- arrange(count(df, temporalDO, taq_age), temporalDO, taq_age)
print(dist_age_temporalDO)

dist_age_perspective <- arrange(count(df, perspective, taq_age), perspective, taq_age)
print(dist_age_perspective)

dist_sex_temporalDO <- arrange(count(df, temporalDO, taq_sex), temporalDO, taq_sex)
print(dist_sex_temporalDO)

dist_sex_perspective <- arrange(count(df, perspective, taq_sex), perspective, taq_sex)
print(dist_sex_perspective)
# =============================================================================
# OPTIONAL: SAVE ALL DISTRIBUTIONS TO DOCUMENT
# Writes the tables computed above to doc_filename as a plain-text report.
# =============================================================================
if (save_to_doc) {
  # Heading -> table, in the order the sections appear in the report.
  single_sections <- list(
    "Distribution of taq_age:" = dist_age,
    "Distribution of taq_sex:" = dist_sex,
    "Distribution of citizenship:" = dist_citizenship,
    "Distribution of group:" = dist_group,
    "Distribution of temporalDO:" = dist_temporalDO,
    "Distribution of perspective:" = dist_perspective
  )
  nested_sections <- list(
    "Age within Citizenship:" = dist_age_citizenship,
    "Sex within Citizenship:" = dist_sex_citizenship,
    "Age within temporalDO:" = dist_age_temporalDO,
    "Age within perspective:" = dist_age_perspective,
    "Sex within temporalDO:" = dist_sex_temporalDO,
    "Sex within perspective:" = dist_sex_perspective
  )

  # Print each table under its heading. `gap` is written after every
  # section except the last one, which is followed by `last_gap`.
  write_sections <- function(sections, gap = "\n\n", last_gap = gap) {
    for (k in seq_along(sections)) {
      cat(names(sections)[k], "\n", sep = "")
      print(sections[[k]])
      cat(if (k == length(sections)) last_gap else gap)
    }
  }

  # Divert output to the report file. `finally` guarantees the diversion is
  # closed even if a print fails; otherwise all later console output would
  # keep going into the file.
  sink(doc_filename)
  tryCatch({
    cat("DISTRIBUTION REPORT\n")
    cat("==================\n\n")
    cat("SINGLE VARIABLE DISTRIBUTIONS\n")
    cat("------------------------------\n\n")
    write_sections(single_sections)
    cat("NESTED DISTRIBUTIONS\n")
    cat("---------------------\n\n")
    write_sections(nested_sections, last_gap = "\n")
  }, finally = sink())

  cat("Distributions saved to:", doc_filename, "\n")
}

View File

@ -0,0 +1,177 @@
DISTRIBUTION REPORT
==================
SINGLE VARIABLE DISTRIBUTIONS
------------------------------
Distribution of taq_age:
taq_age n
1 18 - 24 73
2 53 - 59 67
3 60 - 66 67
4 67 - 73 65
5 39 - 45 64
6 46 - 52 63
7 25 - 31 62
8 32 - 38 61
Distribution of taq_sex:
taq_sex n
1 Female 260
2 Male 257
3 Prefer not to say 5
Distribution of citizenship:
citizenship n
1 American 262
2 Canadian 258
3 Both 2
Distribution of group:
group n
1 01FPV 177
2 03VFP 174
3 02PVF 171
Distribution of temporalDO:
temporalDO n
1 past 262
2 future 260
Distribution of perspective:
perspective n
1 other 261
2 self 261
NESTED DISTRIBUTIONS
---------------------
Age within Citizenship:
citizenship taq_age n
1 American 18 - 24 38
2 American 25 - 31 30
3 American 32 - 38 29
4 American 39 - 45 33
5 American 46 - 52 31
6 American 53 - 59 34
7 American 60 - 66 34
8 American 67 - 73 33
9 Both 32 - 38 1
10 Both 46 - 52 1
11 Canadian 18 - 24 35
12 Canadian 25 - 31 32
13 Canadian 32 - 38 31
14 Canadian 39 - 45 31
15 Canadian 46 - 52 31
16 Canadian 53 - 59 33
17 Canadian 60 - 66 33
18 Canadian 67 - 73 32
Sex within Citizenship:
citizenship taq_sex n
1 American Female 130
2 American Male 129
3 American Prefer not to say 3
4 Both Female 1
5 Both Male 1
6 Canadian Female 129
7 Canadian Male 127
8 Canadian Prefer not to say 2
Age within temporalDO:
temporalDO taq_age n
1 future 18 - 24 38
2 future 25 - 31 31
3 future 32 - 38 29
4 future 39 - 45 34
5 future 46 - 52 35
6 future 53 - 59 36
7 future 60 - 66 29
8 future 67 - 73 28
9 past 18 - 24 35
10 past 25 - 31 31
11 past 32 - 38 32
12 past 39 - 45 30
13 past 46 - 52 28
14 past 53 - 59 31
15 past 60 - 66 38
16 past 67 - 73 37
Age within perspective:
perspective taq_age n
1 other 18 - 24 41
2 other 25 - 31 36
3 other 32 - 38 28
4 other 39 - 45 32
5 other 46 - 52 28
6 other 53 - 59 33
7 other 60 - 66 30
8 other 67 - 73 33
9 self 18 - 24 32
10 self 25 - 31 26
11 self 32 - 38 33
12 self 39 - 45 32
13 self 46 - 52 35
14 self 53 - 59 34
15 self 60 - 66 37
16 self 67 - 73 32
Sex within temporalDO:
temporalDO taq_sex n
1 future Female 130
2 future Male 129
3 future Prefer not to say 1
4 past Female 130
5 past Male 128
6 past Prefer not to say 4
Sex within perspective:
perspective taq_sex n
1 other Female 130
2 other Male 128
3 other Prefer not to say 3
4 self Female 130
5 self Male 129
6 self Prefer not to say 2

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

523
eohi3/eohi3.csv Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

BIN
lit review/carmen_ehi2.pdf Normal file

Binary file not shown.

Binary file not shown.

BIN
lit review/guo_ehi2.pdf Normal file

Binary file not shown.

BIN
lit review/gutral_ehi2.pdf Normal file

Binary file not shown.

BIN
lit review/haas_ehi1.pdf Normal file

Binary file not shown.

Binary file not shown.

BIN
lit review/harris_ehi1.pdf Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

128
lit review/pdf_to_txt.py Normal file
View File

@ -0,0 +1,128 @@
#!/home/ladmin/miniconda3/envs/nlp/bin/python
"""
PDF to Text Converter
Converts PDF files to plain text files.
Usage:
python pdf_to_txt.py <input.pdf> # Creates input.txt
python pdf_to_txt.py <input.pdf> <output.txt> # Custom output name
python pdf_to_txt.py --all # Convert all PDFs in current directory
Requirements:
pip install pypdf
"""
import sys
import os
from pathlib import Path
try:
from pypdf import PdfReader
except ImportError:
print("Error: pypdf library not found.")
print("Please install it with: pip install pypdf")
sys.exit(1)
def pdf_to_text(pdf_path, output_path=None):
    """Convert a PDF file to a plain-text file.

    Text is extracted page by page. Every page that yields text is written
    under a ``--- Page N ---`` header; pages with no extractable text are
    left out of the output.

    Args:
        pdf_path: Path to the PDF file.
        output_path: Path to the output text file (optional). Defaults to
            ``pdf_path`` with its suffix replaced by ``.txt``.

    Returns:
        True if successful, False otherwise. Failures (missing file, or any
        error raised while reading or writing) are reported on stdout
        rather than raised.
    """
    # Label for the error message. It is tracked separately from
    # ``pdf_path`` so the handler below still works when the Path()
    # conversion itself is what failed.
    display_name = str(pdf_path)
    try:
        pdf_path = Path(pdf_path)
        display_name = pdf_path.name

        if not pdf_path.exists():
            print(f"Error: File not found: {pdf_path}")
            return False

        # Determine output path
        if output_path is None:
            output_path = pdf_path.with_suffix('.txt')
        else:
            output_path = Path(output_path)

        print(f"Converting: {pdf_path.name}")

        # Read the PDF
        reader = PdfReader(str(pdf_path))

        # Extract text from all pages (page numbers are 1-based)
        text_content = []
        for i, page in enumerate(reader.pages, 1):
            text = page.extract_text()
            if text:
                text_content.append(f"--- Page {i} ---\n{text}\n")

        # Write to text file
        full_text = "\n".join(text_content)
        output_path.write_text(full_text, encoding='utf-8')

        print(f"✓ Created: {output_path.name} ({len(reader.pages)} pages, {len(full_text):,} characters)")
        return True

    except Exception as e:
        # Deliberately broad: this is the per-file boundary, and one bad
        # file must not abort a batch run. The error is reported, not hidden.
        print(f"✗ Error processing {display_name}: {e}")
        return False
def convert_all_pdfs():
    """Convert every ``*.pdf`` file in the current working directory.

    Each file is passed to ``pdf_to_text`` with the default output name.
    Prints how many files were found and, at the end, how many conversions
    succeeded and failed. Returns None.
    """
    pdf_files = list(Path.cwd().glob("*.pdf"))

    if not pdf_files:
        print("No PDF files found in the current directory.")
        return

    print(f"Found {len(pdf_files)} PDF file(s) to convert.\n")

    # One True/False outcome per file, in the order the files were found.
    outcomes = [bool(pdf_to_text(pdf_file)) for pdf_file in pdf_files]
    successful = sum(outcomes)
    failed = len(outcomes) - successful

    print(f"\n{'='*60}")
    print(f"Conversion complete: {successful} successful, {failed} failed")
def main():
    """Command-line entry point.

    Accepted forms (arguments are read from ``sys.argv``):
        <input.pdf>               convert one file, default output name
        <input.pdf> <output.txt>  convert one file, custom output name
        --all                     convert all PDFs in the current directory

    Exits with status 1 when called without arguments, with too many
    arguments, or when a single-file conversion fails.
    """
    args = sys.argv[1:]

    if not args:
        print(__doc__)
        sys.exit(1)

    # Convert all PDFs in directory
    if args[0] == "--all":
        convert_all_pdfs()
        return

    if len(args) > 2:
        print("Error: Too many arguments")
        print(__doc__)
        sys.exit(1)

    # Convert single PDF; the optional second argument is the output name.
    # Propagate a failure as a non-zero exit status so shells and scripts
    # can detect it.
    if not pdf_to_text(*args):
        sys.exit(1)


if __name__ == "__main__":
    main()

BIN
lit review/quoidbach.sm.pdf Normal file

Binary file not shown.

Binary file not shown.

BIN
lit review/reiff_ehi2.pdf Normal file

Binary file not shown.

BIN
lit review/rutt_ehi2.pdf Normal file

Binary file not shown.

BIN
lit review/sachi_ehi1.pdf Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
lit review/yue_ehi2.pdf Normal file

Binary file not shown.

Binary file not shown.