From 0a633161000265cb994a938a1d28cb6e2784ba2c Mon Sep 17 00:00:00 2001 From: ilia Date: Sat, 16 May 2026 17:15:18 -0400 Subject: [PATCH] fix(discovery): block countries in vague locations via job description QAJobsBoard and similar feeds often store Worldwide/Remote while the real country is only in the description. Scan title and description when location is vague, and prefer concrete locations from QAJobsBoard postings. Co-authored-by: Cursor --- extractors/qajobsboard/manifest.ts | 39 +++++++++- .../pipeline/steps/discover-jobs.test.ts | 54 +++++++++++++ .../server/pipeline/steps/discover-jobs.ts | 10 ++- shared/src/blocked-countries.test.ts | 39 ++++++++++ shared/src/blocked-countries.ts | 78 ++++++++++++++++++- 5 files changed, 213 insertions(+), 7 deletions(-) diff --git a/extractors/qajobsboard/manifest.ts b/extractors/qajobsboard/manifest.ts index 5b6275d..ff19205 100644 --- a/extractors/qajobsboard/manifest.ts +++ b/extractors/qajobsboard/manifest.ts @@ -81,14 +81,47 @@ function salaryLabel(raw: SalaryBand | undefined): string | undefined { return schedule.trim() || undefined; } +const VAGUE_LOCATION_LABELS = new Set([ + "worldwide", + "global", + "anywhere", + "remote", + "unknown", +]); + +function isVagueLocationLabel(value: string): boolean { + return VAGUE_LOCATION_LABELS.has(value.trim().toLowerCase()); +} + +function extractJobLocationFromDescription( + html: string | undefined, +): string | undefined { + if (!html) return undefined; + const text = stripHtml(html); + const match = text.match( + /\bjob\s+location\s*:\s*([^\n.]{2,120})/i, + ); + if (!match?.[1]) return undefined; + const extracted = match[1].trim(); + return extracted || undefined; +} + function locationLabel(job: QaJobBoardlyJob): string { const limits = Array.isArray(job.location_limits) - ? job.location_limits.filter( - (v): v is string => typeof v === "string" && v.trim().length > 0, - ) + ? job.location_limits + .map((v) => (typeof v === "string" ? 
v.trim() : "")) + .filter((v) => v.length > 0 && !isVagueLocationLabel(v)) : []; if (limits.length > 0) return limits.join(", "); + const loc = asString(job.location); + if (loc && !isVagueLocationLabel(loc)) return loc; + + const fromDescription = extractJobLocationFromDescription( + job.description?.html, + ); + if (fromDescription) return fromDescription; + if (loc) return loc; return "Unknown"; } diff --git a/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts b/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts index c12322b..1388cfb 100644 --- a/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts +++ b/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts @@ -394,6 +394,60 @@ describe("discoverJobsStep", () => { ]); }); + it("drops jobs with blocked country in description when location is worldwide", async () => { + const settingsRepo = await import("@server/repositories/settings"); + const registryModule = await import("@server/extractors/registry"); + + const qaManifest = { + id: "qajobsboard", + displayName: "QAJobsBoard", + providesSources: ["qajobsboard"], + run: vi.fn().mockResolvedValue({ + success: true, + jobs: [ + { + source: "qajobsboard", + title: "Sr. QA Automation Engineer", + employer: "Harrier", + location: "Worldwide", + jobDescription: + "Job Location: Mumbai/Nagpur. 
Open to candidates in India.", + jobUrl: "https://example.com/job-in", + }, + { + source: "qajobsboard", + title: "SDET", + employer: "Contoso", + location: "Toronto, ON, Canada", + jobUrl: "https://example.com/job-ca", + }, + ], + }), + }; + + vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({ + searchTerms: JSON.stringify(["sdet"]), + blockedCountries: JSON.stringify(["india"]), + searchCities: "Canada", + } as any); + + vi.mocked(registryModule.getExtractorRegistry).mockResolvedValue({ + manifests: new Map([["qajobsboard", qaManifest as any]]), + manifestBySource: new Map([["qajobsboard", qaManifest as any]]), + availableSources: ["qajobsboard"], + } as any); + + const result = await discoverJobsStep({ + mergedConfig: { + ...baseConfig, + sources: ["qajobsboard"], + }, + }); + + expect(result.discoveredJobs).toHaveLength(1); + expect(result.discoveredJobs[0]?.jobUrl).toBe("https://example.com/job-ca"); + }); + it("applies shared city filtering for sources without native city filtering", async () => { const settingsRepo = await import("@server/repositories/settings"); const registryModule = await import("@server/extractors/registry"); diff --git a/orchestrator/src/server/pipeline/steps/discover-jobs.ts b/orchestrator/src/server/pipeline/steps/discover-jobs.ts index 10a55f0..e3c506c 100644 --- a/orchestrator/src/server/pipeline/steps/discover-jobs.ts +++ b/orchestrator/src/server/pipeline/steps/discover-jobs.ts @@ -553,7 +553,15 @@ export async function discoverJobsStep(args: { settings.blockedCountries, ); const filteredDiscoveredJobs = afterCompanyFilter.filter( - (job) => !jobMatchesBlockedCountries(job.location, blockedCountryKeys), + (job) => + !jobMatchesBlockedCountries( + { + location: job.location, + jobDescription: job.jobDescription, + title: job.title, + }, + blockedCountryKeys, + ), ); const countryDroppedCount = afterCompanyFilter.length - filteredDiscoveredJobs.length; diff --git a/shared/src/blocked-countries.test.ts 
b/shared/src/blocked-countries.test.ts index 876b372..1746b46 100644 --- a/shared/src/blocked-countries.test.ts +++ b/shared/src/blocked-countries.test.ts @@ -1,5 +1,7 @@ import { describe, expect, it } from "vitest"; import { + inferCountryKeysFromJobText, + isVagueJobLocation, jobMatchesBlockedCountries, normalizeBlockedCountryTokens, resolveBlockedCountriesFromStoredString, @@ -34,4 +36,41 @@ describe("blocked-countries", () => { expect(jobMatchesBlockedCountries("Remote", blocked)).toBe(false); expect(jobMatchesBlockedCountries(null, blocked)).toBe(false); }); + + it("treats worldwide and remote-only locations as vague", () => { + expect(isVagueJobLocation("Worldwide")).toBe(true); + expect(isVagueJobLocation("Remote")).toBe(true); + expect(isVagueJobLocation("Toronto, Canada")).toBe(false); + }); + + it("finds blocked countries in description when location is worldwide", () => { + const blocked = resolveBlockedCountriesFromStoredString('["india"]'); + expect( + jobMatchesBlockedCountries( + { + location: "Worldwide", + jobDescription: + "Job Location: Mumbai/Nagpur. We are hiring in India for this role.", + }, + blocked, + ), + ).toBe(true); + expect( + jobMatchesBlockedCountries( + { + location: "Worldwide", + jobDescription: "Fully remote team across North America.", + }, + blocked, + ), + ).toBe(false); + }); + + it("infers country names embedded in free text", () => { + expect( + inferCountryKeysFromJobText( + "Harrier is hiring in India. Job Location: Mumbai/Nagpur", + ), + ).toContain("india"); + }); }); diff --git a/shared/src/blocked-countries.ts b/shared/src/blocked-countries.ts index cb498ec..da68b13 100644 --- a/shared/src/blocked-countries.ts +++ b/shared/src/blocked-countries.ts @@ -7,6 +7,27 @@ import { inferCountryKeysFromJobLocation } from "./search-cities.js"; const supportedCountryKeySet = new Set(SUPPORTED_COUNTRY_KEYS); +/** Location strings that do not pin a hiring country (check description too). 
*/ +const VAGUE_LOCATION_VALUES = new Set([ + "worldwide", + "global", + "anywhere", + "remote", + "wfh", + "work from home", + "unknown", + "multiple locations", + "multiple countries", +]); + +const VAGUE_COUNTRY_KEYS = new Set(["worldwide", "global"]); + +export interface JobBlockedCountrySignals { + location?: string | null; + jobDescription?: string | null; + title?: string | null; +} + /** * Parse stored settings value for blocked countries. * Accepts JSON string array (normal) or legacy plain comma/newline-separated text. @@ -43,14 +64,65 @@ export function normalizeBlockedCountryTokens(tokens: string[]): string[] { return [...keys]; } -/** True when the job location mentions a blocked country (unknown location is kept). */ +export function isVagueJobLocation(location: string | null | undefined): boolean { + if (!location?.trim()) return true; + const normalized = location.trim().toLowerCase(); + if (VAGUE_LOCATION_VALUES.has(normalized)) return true; + const keys = inferCountryKeysFromJobLocation(location); + if (keys.length === 0) return true; + return keys.every((key) => VAGUE_COUNTRY_KEYS.has(key)); +} + +/** + * Infer supported country keys mentioned anywhere in free text (title, description). 
+ */ +export function inferCountryKeysFromJobText( + text: string | null | undefined, +): string[] { + if (!text?.trim()) return []; + const keys = new Set(inferCountryKeysFromJobLocation(text)); + const lower = text.toLowerCase(); + for (const countryKey of SUPPORTED_COUNTRY_KEYS) { + if (VAGUE_COUNTRY_KEYS.has(countryKey)) continue; + const pattern = countryKey.replace(/\s+/g, "\\s+"); + if (new RegExp(`\\b${pattern}\\b`, "i").test(lower)) { + keys.add(countryKey); + } + } + return [...keys]; +} + +function collectJobCountryKeys(signals: JobBlockedCountrySignals): string[] { + const keys = new Set<string>(); + for (const key of inferCountryKeysFromJobLocation(signals.location)) { + keys.add(key); + } + if (isVagueJobLocation(signals.location)) { + const blob = [signals.title, signals.jobDescription] + .filter(Boolean) + .join("\n"); + for (const key of inferCountryKeysFromJobText(blob)) { + keys.add(key); + } + } + return [...keys]; +} + +/** + * True when the job mentions a blocked country in location and/or (when location + * is vague) title/description. Unknown location with no country in text is kept. + */ export function jobMatchesBlockedCountries( - location: string | null | undefined, + locationOrSignals: string | null | undefined | JobBlockedCountrySignals, blockedCountryKeys: readonly string[], ): boolean { if (blockedCountryKeys.length === 0) return false; const blocked = new Set(blockedCountryKeys); - const jobCountries = inferCountryKeysFromJobLocation(location); + const signals: JobBlockedCountrySignals = + typeof locationOrSignals === "object" && locationOrSignals !== null + ? locationOrSignals + : { location: locationOrSignals }; + const jobCountries = collectJobCountryKeys(signals); if (jobCountries.length === 0) return false; return jobCountries.some((key) => blocked.has(key)); }