diff --git a/extractors/jobspy/src/run.ts b/extractors/jobspy/src/run.ts index 7bd6eac..f6215a7 100644 --- a/extractors/jobspy/src/run.ts +++ b/extractors/jobspy/src/run.ts @@ -6,6 +6,7 @@ import { createInterface } from "node:readline"; import { fileURLToPath } from "node:url"; import { resolveSearchCities } from "@shared/search-cities.js"; import type { CreateJobInput, JobSource } from "@shared/types/jobs"; +import { normalizeIsRemote } from "@shared/work-arrangement.js"; import { toNumberOrNull, toStringOrNull, @@ -374,27 +375,41 @@ function mapJobSpyRows( const salary = formatSalary({ minAmount, maxAmount, currency, interval }); const jobUrlDirect = toStringOrNull(row.job_url_direct); + const title = toStringOrNull(row.title) ?? "Unknown Title"; + const jobDescription = toStringOrNull(row.description) ?? undefined; + const location = toStringOrNull(row.location) ?? undefined; + const jobType = toStringOrNull(row.job_type) ?? undefined; + const workFromHomeType = + toStringOrNull(row.work_from_home_type) ?? undefined; jobs.push({ source, sourceJobId: toStringOrNull(row.id) ?? undefined, jobUrlDirect: jobUrlDirect ?? undefined, datePosted: toStringOrNull(row.date_posted) ?? undefined, - title: toStringOrNull(row.title) ?? "Unknown Title", + title, employer: toStringOrNull(row.company) ?? "Unknown Employer", employerUrl: toStringOrNull(row.company_url) ?? undefined, jobUrl, applicationLink: jobUrlDirect ?? jobUrl, - location: toStringOrNull(row.location) ?? undefined, - jobDescription: toStringOrNull(row.description) ?? undefined, + location, + jobDescription, salary: salary ?? undefined, - jobType: toStringOrNull(row.job_type) ?? undefined, + jobType, salarySource: toStringOrNull(row.salary_source) ?? undefined, salaryInterval: interval ?? undefined, salaryMinAmount: minAmount ?? undefined, salaryMaxAmount: maxAmount ?? undefined, salaryCurrency: currency ?? undefined, - isRemote: toBooleanOrNull(row.is_remote) ?? 
undefined, + isRemote: + normalizeIsRemote({ + title, + jobDescription, + location, + jobType, + workFromHomeType, + isRemote: toBooleanOrNull(row.is_remote) ?? undefined, + }) ?? undefined, jobLevel: toStringOrNull(row.job_level) ?? undefined, jobFunction: toStringOrNull(row.job_function) ?? undefined, listingType: toStringOrNull(row.listing_type) ?? undefined, @@ -413,7 +428,7 @@ function mapJobSpyRows( companyReviewsCount: toNumberOrNull(row.company_reviews_count) ?? undefined, vacancyCount: toNumberOrNull(row.vacancy_count) ?? undefined, - workFromHomeType: toStringOrNull(row.work_from_home_type) ?? undefined, + workFromHomeType, }); } diff --git a/extractors/jobspy/tests/run.test.ts b/extractors/jobspy/tests/run.test.ts index 0ade945..1a0db18 100644 --- a/extractors/jobspy/tests/run.test.ts +++ b/extractors/jobspy/tests/run.test.ts @@ -1,3 +1,4 @@ +import { normalizeIsRemote } from "@shared/work-arrangement.js"; import { describe, expect, it } from "vitest"; import { deriveIsRemoteFlag, parseJobSpyProgressLine } from "../src/run"; @@ -49,3 +50,15 @@ describe("parseJobSpyProgressLine", () => { expect(deriveIsRemoteFlag(["remote", "hybrid", "onsite"])).toBeUndefined(); }); }); + +describe("normalizeIsRemote (JobSpy ingest)", () => { + it("rejects hybrid postings that JobSpy marks remote", () => { + expect( + normalizeIsRemote({ + title: "Automation Test Engineer (SDET)", + jobDescription: "Job Type: Hybrid (3 days remote)", + isRemote: true, + }), + ).toBe(false); + }); +}); diff --git a/orchestrator/src/server/repositories/jobs.ts b/orchestrator/src/server/repositories/jobs.ts index d3b17d0..ee03cc9 100644 --- a/orchestrator/src/server/repositories/jobs.ts +++ b/orchestrator/src/server/repositories/jobs.ts @@ -7,6 +7,7 @@ import { getJobOwnerProfileId } from "@infra/request-context"; import { DEFAULT_JOB_OWNER_PROFILE_ID } from "@server/infra/job-owner-context"; import { buildJobContentFingerprint } from "@shared/job-fingerprint"; import { canonicalizeJobUrl } 
from "@shared/job-url-canonical"; +import { normalizeIsRemote } from "@shared/work-arrangement"; import type { CreateJobInput, Job, @@ -400,7 +401,15 @@ async function insertJob(input: CreateJobInput): Promise { salaryMinAmount: input.salaryMinAmount ?? null, salaryMaxAmount: input.salaryMaxAmount ?? null, salaryCurrency: input.salaryCurrency ?? null, - isRemote: input.isRemote ?? null, + isRemote: + normalizeIsRemote({ + title: input.title, + jobDescription: input.jobDescription, + location: input.location, + workFromHomeType: input.workFromHomeType, + jobType: input.jobType, + isRemote: input.isRemote, + }) ?? null, jobLevel: input.jobLevel ?? null, jobFunction: input.jobFunction ?? null, listingType: input.listingType ?? null, diff --git a/orchestrator/src/server/services/scorer.ts b/orchestrator/src/server/services/scorer.ts index 5841fb7..8522809 100644 --- a/orchestrator/src/server/services/scorer.ts +++ b/orchestrator/src/server/services/scorer.ts @@ -4,6 +4,7 @@ import { logger } from "@infra/logger"; import type { Job, JobSearchProfile, SuitabilityAnalysis } from "@shared/types"; +import { jobLikelyRequiresOfficePresence } from "@shared/work-arrangement"; import { LlmService } from "./llm/service"; import type { JsonSchemaDefinition } from "./llm/types"; import { stripMarkdownCodeFences } from "./llm/utils/json"; @@ -326,61 +327,6 @@ function candidateWantsRemoteOnly(p: JobSearchProfile): boolean { return true; } -/** - * Job text / metadata suggests hybrid or mandatory office presence (not remote-only). - */ -function jobSignalsHybridOrOnsite(job: Job): boolean { - const blob = [ - job.title, - job.jobDescription ?? "", - job.location ?? "", - job.workFromHomeType ?? "", - job.jobType ?? 
"", - ] - .filter(Boolean) - .join("\n") - .toLowerCase(); - - const strongRemoteOnly = - /\b100%\s*remote\b|\bfully\s+remote\b|\bremote[\s-]only\b|\bcompletely\s+remote\b|\bwork\s+from\s+anywhere\b|\banywhere\s+in\s+the\s+(us|usa|uk|world)\b/.test( - blob, - ); - - const hybridOrOffice = - /\bhybrid\b/.test(blob) || - /\bremote[\s-]?hybrid\b/.test(blob) || - /\bhybrid[\s-]?remote\b/.test(blob) || - /\b\d[\d]?\s+days?\s+(a|per)\s+week\b.*\b(in[\s-]?office|on[\s-]?site|onsite|at\s+the\s+office)\b/.test( - blob, - ) || - /\b(in[\s-]?office|on[\s-]?site|onsite|at\s+the\s+office)\b.*\b\d[\d]?\s+days?\b/.test( - blob, - ) || - /\b(one|two|three|four|five|six|seven|eight|nine|ten)\s+days?\b.*\b(in[\s-]?office|on[\s-]?site|onsite)\b/.test( - blob, - ) || - /\b(in[\s-]?office|on[\s-]?site|onsite)\b.*\b(one|two|three|four|five|six|seven|eight|nine|ten)\s+days?\b/.test( - blob, - ) || - /\boffice[\s-]based\b/.test(blob) || - /\bon[\s-]?site\s+(role|position|required|mandatory)\b/.test(blob) || - /\b(required|must)\b.*\b(in[\s-]?office|on[\s-]?site|onsite|in[\s-]?person)\b/.test( - blob, - ); - - const wfh = (job.workFromHomeType ?? "").toLowerCase(); - if (wfh.includes("hybrid")) return true; - - if (job.isRemote === false) { - if (strongRemoteOnly && !hybridOrOffice) return false; - return true; - } - - if (hybridOrOffice) return true; - - return false; -} - /** * Cap score when candidate wants remote-only but the job is hybrid / on-site, or * when the model admits a poor work-arrangement fit but still scores high. @@ -396,7 +342,7 @@ function applyRemoteOfficeMismatchCap( return { score, reason }; } - const officeLikely = jobSignalsHybridOrOnsite(job); + const officeLikely = jobLikelyRequiresOfficePresence(job); const wam = typeof data.workArrangementMatch === "number" ? 
# Apply optional JOBBER_CRON_* overrides to the server settings before a run.
#
# Builds one JSON patch from whichever of these variables are non-empty and
# sends it with PATCH ${BASE}/api/settings:
#   JOBBER_CRON_SEARCH_CITIES   -> searchCities        (string, sent as-is)
#   JOBBER_CRON_JOBSPY_COUNTRY  -> jobspyCountryIndeed (string, sent as-is)
#   JOBBER_CRON_WORKPLACE_TYPES -> workplaceTypes      (comma-separated list)
#   JOBBER_CRON_SEARCH_TERMS    -> searchTerms         (pipe-separated list)
#
# List items are trimmed and empty items are dropped. A list that ends up with
# no items (e.g. the variable is just " , ") is left out of the patch so a
# stray separator cannot overwrite the stored setting with an empty array.
#
# Returns 0 without issuing a request when there is nothing to patch. When the
# response does not contain `.ok == true`, the failure is reported through
# send_tg_html and the script exits with status 1.
#
# Relies on AUTH, BASE, send_tg_html and tg_html_escape from the enclosing script.
apply_cron_settings() {
  local patch
  # -n: no stdin input; -c: compact output, so an empty patch is exactly "{}".
  patch="$(jq -n -c \
    --arg cities "${JOBBER_CRON_SEARCH_CITIES:-}" \
    --arg country "${JOBBER_CRON_JOBSPY_COUNTRY:-}" \
    --arg workplace "${JOBBER_CRON_WORKPLACE_TYPES:-}" \
    --arg terms "${JOBBER_CRON_SEARCH_TERMS:-}" '
      def items(sep): split(sep) | map(gsub("^\\s+|\\s+$";"")) | map(select(. != ""));
      {}
      + (if $cities != "" then {searchCities: $cities} else {} end)
      + (if $country != "" then {jobspyCountryIndeed: $country} else {} end)
      + (($workplace | items(",")) as $w
          | if ($w | length) > 0 then {workplaceTypes: $w} else {} end)
      + (($terms | items("|")) as $t
          | if ($t | length) > 0 then {searchTerms: $t} else {} end)
    ')"

  # Nothing configured (or only empty lists): leave server settings untouched.
  if [[ "$patch" == "{}" ]]; then
    return 0
  fi

  local resp
  resp="$(curl -sS --compressed "${AUTH[@]}" -X PATCH "${BASE}/api/settings" \
    -H "Accept: application/json" -H "Content-Type: application/json" \
    -d "$patch")"

  # Anything other than {"ok": true, ...} (including non-JSON output) is a failure.
  if ! echo "$resp" | jq -e '.ok == true' >/dev/null 2>&1; then
    send_tg_html "Jobber: PATCH /api/settings failed before cron run: $(tg_html_escape "$(echo "$resp" | jq -c . 2>/dev/null || echo "$resp")")"
    exit 1
  fi
}
run_body='{}' diff --git a/shared/src/index.ts b/shared/src/index.ts index 6ab1295..a82b7e4 100644 --- a/shared/src/index.ts +++ b/shared/src/index.ts @@ -2,5 +2,6 @@ export * from "./extractors"; export * from "./job-fingerprint"; export * from "./job-url-canonical"; export * from "./location-support"; +export * from "./work-arrangement"; export * from "./types"; export * from "./utils/type-conversion"; diff --git a/shared/src/work-arrangement.test.ts b/shared/src/work-arrangement.test.ts new file mode 100644 index 0000000..c1beb3f --- /dev/null +++ b/shared/src/work-arrangement.test.ts @@ -0,0 +1,75 @@ +import { describe, expect, it } from "vitest"; +import { + jobSignalsHybridOrOnsite, + normalizeIsRemote, +} from "./work-arrangement.js"; + +describe("jobSignalsHybridOrOnsite", () => { + it("detects hybrid in description", () => { + expect( + jobSignalsHybridOrOnsite({ + title: "Automation Test Engineer (SDET)", + jobDescription: + "Job Type: Hybrid (3 days remote)\nJob Location: Vancouver, BC", + isRemote: true, + }), + ).toBe(true); + }); + + it("detects N days in office", () => { + expect( + jobSignalsHybridOrOnsite({ + jobDescription: "3 days per week in the office, 2 days remote", + }), + ).toBe(true); + }); + + it("does not flag fully remote postings", () => { + expect( + jobSignalsHybridOrOnsite({ + jobDescription: "100% remote. Work from anywhere in Canada.", + isRemote: true, + }), + ).toBe(false); + }); +}); + +describe("normalizeIsRemote", () => { + it("downgrades JobSpy false positive when hybrid is mentioned", () => { + expect( + normalizeIsRemote({ + title: "Automation Test Engineer (SDET)", + jobDescription: "Job Type: Hybrid (3 days remote)", + isRemote: true, + }), + ).toBe(false); + }); + + it("keeps true when upstream says remote and text is fully remote", () => { + expect( + normalizeIsRemote({ + jobDescription: "Fully remote role. 
/**
 * Work-arrangement detection for ingest and scoring.
 * `isRemote` means 100% remote — hybrid or regular office presence disqualifies.
 */

/**
 * Posting fields inspected for work-arrangement hints.
 * Every field is optional; missing, `null` and empty values are ignored.
 */
export interface WorkArrangementSignals {
  title?: string | null;
  jobDescription?: string | null;
  location?: string | null;
  workFromHomeType?: string | null;
  jobType?: string | null;
  /** Upstream remote flag as reported by the source, if any. */
  isRemote?: boolean | null;
}

/**
 * Ways a posting refers to being physically at the office.
 * Includes "in the office", which the plain `in[\s-]?office` form misses.
 */
const OFFICE_PLACE = String.raw`(?:in[\s-]?(?:the\s+)?office|at\s+the\s+office|on[\s-]?site|onsite)`;

/** Spelled-out day counts ("three days"). */
const NUMBER_WORD = "(?:one|two|three|four|five|six|seven|eight|nine|ten)";

/** Phrases that state the role has no office days at all. */
const STRONG_REMOTE_ONLY_PATTERNS: readonly RegExp[] = [
  /\b100%\s*remote\b/,
  /\bfully\s+remote\b/,
  /\bremote[\s-]only\b/,
  /\bcompletely\s+remote\b/,
  /\bwork\s+from\s+anywhere\b/,
  /\banywhere\s+in\s+the\s+(?:us|usa|uk|world)\b/,
];

/**
 * Phrases that indicate hybrid work or required office presence.
 * `.` does not cross newlines and fields are joined with "\n", so the
 * "X ... Y" patterns never pair words from two different fields.
 */
const HYBRID_OR_ONSITE_PATTERNS: readonly RegExp[] = [
  // "hybrid" as a work arrangement. Technical uses ("hybrid cloud",
  // "hybrid automation framework", "hybrid mobile apps") are skipped so they
  // cannot override an explicit fully-remote posting.
  /\bhybrid\b(?![\s-]+(?:cloud|(?:(?:automation|test|testing)\s+)?frameworks?|apps?|applications?|mobile|infrastructure|architectures?)\b)/,
  /\bremote[\s-]?hybrid\b/,
  /\bhybrid[\s-]?remote\b/,
  /\bpartial(?:ly)?\s+remote\b/,
  /\b\d{1,2}\s+days?\s+remote\b/,
  /\bremote\s+\d{1,2}\s+days?\b/,
  // "3 days in office" and "3 days a/per week ... in the office".
  new RegExp(
    String.raw`\b\d{1,2}\s+days?\s+(?:(?:a|per)\s+week\b.*)?\b${OFFICE_PLACE}\b`,
  ),
  new RegExp(String.raw`\b${OFFICE_PLACE}\b.*\b\d{1,2}\s+days?\b`),
  new RegExp(String.raw`\b${NUMBER_WORD}\s+days?\b.*\b${OFFICE_PLACE}\b`),
  new RegExp(String.raw`\b${OFFICE_PLACE}\b.*\b${NUMBER_WORD}\s+days?\b`),
  /\boffice[\s-]based\b/,
  /\bon[\s-]?site\s+(?:role|position|required|mandatory)\b/,
  new RegExp(
    String.raw`\b(?:required|must)\b.*\b(?:${OFFICE_PLACE}|in[\s-]?person)\b`,
  ),
];

/** Values of the structured work-from-home field that rule out 100% remote. */
const WFH_TYPE_OFFICE_PATTERN = /hybrid|on[\s-]?site|in[\s-]?office/;

/** Joins the non-empty text fields, one per line, lowercased for matching. */
function buildBlob(signals: WorkArrangementSignals): string {
  return [
    signals.title,
    signals.jobDescription,
    signals.location,
    signals.workFromHomeType,
    signals.jobType,
  ]
    .filter(Boolean)
    .join("\n")
    .toLowerCase();
}

/** Posting text strongly indicates fully remote (no office days). */
export function jobSignalsStrongRemoteOnly(
  signals: WorkArrangementSignals,
): boolean {
  const blob = buildBlob(signals);
  return STRONG_REMOTE_ONLY_PATTERNS.some((pattern) => pattern.test(blob));
}

/**
 * Hybrid, partial-remote, or on-site/office requirements — not 100% remote.
 *
 * Checks the structured `workFromHomeType` first, then the combined posting
 * text. The upstream `isRemote` flag is deliberately not consulted here.
 */
export function jobSignalsHybridOrOnsite(
  signals: WorkArrangementSignals,
): boolean {
  const wfh = (signals.workFromHomeType ?? "").toLowerCase();
  if (WFH_TYPE_OFFICE_PATTERN.test(wfh)) {
    return true;
  }

  const blob = buildBlob(signals);
  return HYBRID_OR_ONSITE_PATTERNS.some((pattern) => pattern.test(blob));
}

/**
 * Normalize upstream `isRemote` to 100% remote only.
 *
 * Precedence:
 * 1. Hybrid / office-day language forces `false`, even if upstream said `true`.
 * 2. Otherwise an explicit upstream `true` / `false` is kept.
 * 3. With no upstream value, strong remote-only text promotes to `true`.
 *
 * @returns `true`, `false`, or `null` when the arrangement cannot be determined.
 */
export function normalizeIsRemote(
  signals: WorkArrangementSignals,
): boolean | null {
  if (jobSignalsHybridOrOnsite(signals)) {
    return false;
  }
  if (signals.isRemote === true) {
    return true;
  }
  if (signals.isRemote === false) {
    return false;
  }
  if (jobSignalsStrongRemoteOnly(signals)) {
    return true;
  }
  return null;
}

/**
 * Job likely requires office presence (for scoring caps when candidate is remote-only).
 *
 * Hybrid/on-site signals always count. An upstream `isRemote === false` also
 * counts unless the posting text is strongly remote-only, in which case the
 * text wins over the flag.
 */
export function jobLikelyRequiresOfficePresence(
  signals: WorkArrangementSignals,
): boolean {
  if (jobSignalsHybridOrOnsite(signals)) {
    return true;
  }
  if (signals.isRemote === false) {
    return !jobSignalsStrongRemoteOnly(signals);
  }
  return false;
}