fix(discovery): block countries in vague locations via job description
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 5m22s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m14s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m11s
CI / Type Check (orchestrator) (push) Successful in 1m28s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m13s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m12s
CI / Documentation (push) Successful in 2m0s

QAJobsBoard and similar feeds often store Worldwide/Remote while the real
country is only in the description. Scan title and description when location
is vague, and prefer concrete locations from QAJobsBoard postings.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ilia 2026-05-16 17:15:18 -04:00
parent f0261711c6
commit 0a63316100
5 changed files with 213 additions and 7 deletions

View File

@ -81,14 +81,47 @@ function salaryLabel(raw: SalaryBand | undefined): string | undefined {
return schedule.trim() || undefined;
}
/**
 * Lower-cased location labels that do not identify a concrete place.
 *
 * Includes the remote-work and "multiple ..." markers that the blocked-country
 * filter also treats as vague, so a label considered vague downstream is not
 * chosen here in preference to a concrete location found elsewhere.
 */
const VAGUE_LOCATION_LABELS = new Set([
  "worldwide",
  "global",
  "anywhere",
  "remote",
  "wfh",
  "work from home",
  "unknown",
  "multiple locations",
  "multiple countries",
]);

/**
 * Whether a location label is a vague placeholder rather than a real place.
 *
 * @param value - Raw label; surrounding whitespace and letter case are ignored.
 * @returns `true` when the normalized label is in `VAGUE_LOCATION_LABELS`.
 */
function isVagueLocationLabel(value: string): boolean {
  return VAGUE_LOCATION_LABELS.has(value.trim().toLowerCase());
}
/**
 * Pull the value of a "Job Location: ..." line out of a description.
 *
 * The HTML is first passed through `stripHtml`; the label is then matched
 * case-insensitively and the capture runs for 2-120 characters, stopping at
 * the first newline or period.
 *
 * @param html - Description markup; `undefined` or empty yields `undefined`.
 * @returns The trimmed captured text, or `undefined` when the label is absent
 *   or the capture is blank after trimming.
 */
function extractJobLocationFromDescription(
  html: string | undefined,
): string | undefined {
  if (!html) return undefined;
  const captured = /\bjob\s+location\s*:\s*([^\n.]{2,120})/i.exec(
    stripHtml(html),
  )?.[1];
  // A whitespace-only capture trims to "" and is reported as "not found".
  return captured?.trim() || undefined;
}
/**
 * Choose the location string to report for a job.
 *
 * Order of preference, following the statements below:
 *   1. trimmed, non-vague string entries of `job.location_limits`, joined by ", ";
 *   2. `job.location` (via `asString`) when it is not a vague label;
 *   3. a location extracted from `job.description?.html`;
 *   4. `job.location` even though it is vague;
 *   5. the literal "Unknown".
 *
 * NOTE(review): the `limits` expression below carries two consecutive `?`
 * branches. They look like the pre-change (`.filter(...)` only) and post-change
 * (`.map(...).filter(...)`) forms of the same expression shown together; only
 * the `.map(...).filter(...)` form drops vague labels. Confirm against the
 * real file.
 */
function locationLabel(job: QaJobBoardlyJob): string {
const limits = Array.isArray(job.location_limits)
? job.location_limits.filter(
(v): v is string => typeof v === "string" && v.trim().length > 0,
)
? job.location_limits
.map((v) => (typeof v === "string" ? v.trim() : ""))
.filter((v) => v.length > 0 && !isVagueLocationLabel(v))
: [];
// Concrete location limits win over every other signal.
if (limits.length > 0) return limits.join(", ");
const loc = asString(job.location);
if (loc && !isVagueLocationLabel(loc)) return loc;
// The location is missing or vague: look for a location in the description.
const fromDescription = extractJobLocationFromDescription(
job.description?.html,
);
if (fromDescription) return fromDescription;
// Nothing better was found; a vague label is still preferred over "Unknown".
if (loc) return loc;
return "Unknown";
}

View File

@ -394,6 +394,60 @@ describe("discoverJobsStep", () => {
]);
});
it("drops jobs with blocked country in description when location is worldwide", async () => {
  const settingsRepo = await import("@server/repositories/settings");
  const registryModule = await import("@server/extractors/registry");

  // Vague location; the blocked country only appears in the description.
  const worldwideJobInIndia = {
    source: "qajobsboard",
    title: "Sr. QA Automation Engineer",
    employer: "Harrier",
    location: "Worldwide",
    jobDescription:
      "Job Location: Mumbai/Nagpur. Open to candidates in India.",
    jobUrl: "https://example.com/job-in",
  };
  // Concrete location in a country that is not blocked.
  const canadianJob = {
    source: "qajobsboard",
    title: "SDET",
    employer: "Contoso",
    location: "Toronto, ON, Canada",
    jobUrl: "https://example.com/job-ca",
  };

  const qaManifest = {
    id: "qajobsboard",
    displayName: "QAJobsBoard",
    providesSources: ["qajobsboard"],
    run: vi.fn().mockResolvedValue({
      success: true,
      jobs: [worldwideJobInIndia, canadianJob],
    }),
  };

  vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
    searchTerms: JSON.stringify(["sdet"]),
    blockedCountries: JSON.stringify(["india"]),
    searchCities: "Canada",
  } as any);
  vi.mocked(registryModule.getExtractorRegistry).mockResolvedValue({
    manifests: new Map([["qajobsboard", qaManifest as any]]),
    manifestBySource: new Map([["qajobsboard", qaManifest as any]]),
    availableSources: ["qajobsboard"],
  } as any);

  const { discoveredJobs } = await discoverJobsStep({
    mergedConfig: { ...baseConfig, sources: ["qajobsboard"] },
  });

  // Only the Canadian posting survives the blocked-country filter.
  expect(discoveredJobs).toHaveLength(1);
  expect(discoveredJobs[0]?.jobUrl).toBe("https://example.com/job-ca");
});
it("applies shared city filtering for sources without native city filtering", async () => {
const settingsRepo = await import("@server/repositories/settings");
const registryModule = await import("@server/extractors/registry");

View File

@ -553,7 +553,15 @@ export async function discoverJobsStep(args: {
settings.blockedCountries,
);
const filteredDiscoveredJobs = afterCompanyFilter.filter(
(job) => !jobMatchesBlockedCountries(job.location, blockedCountryKeys),
(job) =>
!jobMatchesBlockedCountries(
{
location: job.location,
jobDescription: job.jobDescription,
title: job.title,
},
blockedCountryKeys,
),
);
const countryDroppedCount =
afterCompanyFilter.length - filteredDiscoveredJobs.length;

View File

@ -1,5 +1,7 @@
import { describe, expect, it } from "vitest";
import {
inferCountryKeysFromJobText,
isVagueJobLocation,
jobMatchesBlockedCountries,
normalizeBlockedCountryTokens,
resolveBlockedCountriesFromStoredString,
@ -34,4 +36,41 @@ describe("blocked-countries", () => {
expect(jobMatchesBlockedCountries("Remote", blocked)).toBe(false);
expect(jobMatchesBlockedCountries(null, blocked)).toBe(false);
});
it("treats worldwide and remote-only locations as vague", () => {
  // Placeholder labels carry no country information.
  for (const placeholder of ["Worldwide", "Remote"]) {
    expect(isVagueJobLocation(placeholder)).toBe(true);
  }
  // A city plus country is concrete.
  expect(isVagueJobLocation("Toronto, Canada")).toBe(false);
});
it("finds blocked countries in description when location is worldwide", () => {
  const blocked = resolveBlockedCountriesFromStoredString('["india"]');
  // Every case shares the same vague location; only the description varies.
  const isBlockedWith = (jobDescription: string) =>
    jobMatchesBlockedCountries(
      { location: "Worldwide", jobDescription },
      blocked,
    );

  expect(
    isBlockedWith(
      "Job Location: Mumbai/Nagpur. We are hiring in India for this role.",
    ),
  ).toBe(true);
  expect(isBlockedWith("Fully remote team across North America.")).toBe(
    false,
  );
});
it("infers country names embedded in free text", () => {
  // The country name appears mid-sentence rather than in a location field.
  const freeText = "Harrier is hiring in India. Job Location: Mumbai/Nagpur";
  const inferred = inferCountryKeysFromJobText(freeText);
  expect(inferred).toContain("india");
});
});

View File

@ -7,6 +7,27 @@ import { inferCountryKeysFromJobLocation } from "./search-cities.js";
const supportedCountryKeySet = new Set(SUPPORTED_COUNTRY_KEYS);
/**
 * Lower-cased location strings that do not pin a hiring country. When a job's
 * location is one of these, the title and description are checked as well.
 */
const VAGUE_LOCATION_VALUES = new Set<string>([
  // No geographic restriction stated.
  "worldwide",
  "global",
  "anywhere",
  // Remote-work markers.
  "remote",
  "wfh",
  "work from home",
  // Missing or unspecific.
  "unknown",
  "multiple locations",
  "multiple countries",
]);

/** Inferred "country" keys that are placeholders rather than real countries. */
const VAGUE_COUNTRY_KEYS = new Set<string>(["worldwide", "global"]);
/**
 * Text fields of a job that are inspected for country mentions when applying
 * the blocked-country filter. Every field is optional and may be null.
 */
export interface JobBlockedCountrySignals {
/** Location string reported for the job; always checked for country keys. */
location?: string | null;
/** Description text; scanned only when `location` is vague. */
jobDescription?: string | null;
/** Job title; scanned together with the description when `location` is vague. */
title?: string | null;
}
/**
* Parse stored settings value for blocked countries.
* Accepts JSON string array (normal) or legacy plain comma/newline-separated text.
@ -43,14 +64,65 @@ export function normalizeBlockedCountryTokens(tokens: string[]): string[] {
return [...keys];
}
/**
 * Whether a location string fails to pin down a hiring country.
 *
 * A location counts as vague when it is missing or blank, when its trimmed,
 * lower-cased form is one of `VAGUE_LOCATION_VALUES`, or when the country keys
 * inferred from it are empty or consist solely of placeholder keys
 * (`VAGUE_COUNTRY_KEYS`).
 *
 * @param location - Raw location string; may be null or undefined.
 * @returns `true` when free text should be consulted for the country.
 */
export function isVagueJobLocation(
  location: string | null | undefined,
): boolean {
  const trimmed = location?.trim();
  if (!trimmed) return true;
  if (VAGUE_LOCATION_VALUES.has(trimmed.toLowerCase())) return true;
  // `every` is true for an empty list, which covers "no country inferred".
  const inferred = inferCountryKeysFromJobLocation(location);
  return inferred.every((key) => VAGUE_COUNTRY_KEYS.has(key));
}
/** Whole-word patterns per country key, compiled once and reused across calls. */
const countryKeyPatternCache = new Map<string, RegExp>();

/**
 * Build (or reuse) the case-insensitive whole-word pattern for a country key.
 * Regex metacharacters in the key are escaped so the key is matched literally;
 * each whitespace run in the key matches any whitespace run in the text.
 */
function countryKeyPattern(countryKey: string): RegExp {
  let pattern = countryKeyPatternCache.get(countryKey);
  if (pattern === undefined) {
    const source = countryKey
      .replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
      .replace(/\s+/g, "\\s+");
    // No `g` flag: `test` keeps no `lastIndex` state, so sharing is safe.
    pattern = new RegExp(`\\b${source}\\b`, "i");
    countryKeyPatternCache.set(countryKey, pattern);
  }
  return pattern;
}

/**
 * Infer supported country keys mentioned anywhere in free text (title, description).
 *
 * Combines the keys found by `inferCountryKeysFromJobLocation` with every
 * supported country key that appears as a whole word in the text. Placeholder
 * keys in `VAGUE_COUNTRY_KEYS` are never added by the whole-word scan.
 *
 * @param text - Free text to scan; null, undefined or blank yields `[]`.
 * @returns De-duplicated country keys found in the text.
 */
export function inferCountryKeysFromJobText(
  text: string | null | undefined,
): string[] {
  if (!text?.trim()) return [];
  const keys = new Set(inferCountryKeysFromJobLocation(text));
  const lower = text.toLowerCase();
  for (const countryKey of SUPPORTED_COUNTRY_KEYS) {
    if (VAGUE_COUNTRY_KEYS.has(countryKey)) continue;
    if (countryKeyPattern(countryKey).test(lower)) {
      keys.add(countryKey);
    }
  }
  return [...keys];
}
/**
 * Gather every country key associated with a job.
 *
 * Keys inferred from the location are always included. Title and description
 * (joined with a newline, skipping empty values) are scanned only when the
 * location is vague.
 */
function collectJobCountryKeys(signals: JobBlockedCountrySignals): string[] {
  const { location, title, jobDescription } = signals;
  const collected = new Set<string>(inferCountryKeysFromJobLocation(location));
  if (isVagueJobLocation(location)) {
    const freeText = [title, jobDescription].filter(Boolean).join("\n");
    for (const key of inferCountryKeysFromJobText(freeText)) {
      collected.add(key);
    }
  }
  return [...collected];
}
/**
 * True when the job mentions a blocked country in location and/or (when location
 * is vague) title/description. Unknown location with no country in text is kept.
 *
 * @param locationOrSignals - Either a bare location string (or null/undefined),
 *   or a `JobBlockedCountrySignals` object carrying location, description and title.
 * @param blockedCountryKeys - Country keys to block; an empty list never matches.
 * @returns `true` when at least one country key collected for the job is blocked.
 *
 * NOTE(review): the parameter list and body below each contain what look like
 * the pre-change and post-change lines of the same statement (`location` vs.
 * `locationOrSignals`, and two `jobCountries` declarations) shown together.
 * The documentation above describes the `locationOrSignals` form; confirm
 * against the real file.
 */
export function jobMatchesBlockedCountries(
location: string | null | undefined,
locationOrSignals: string | null | undefined | JobBlockedCountrySignals,
blockedCountryKeys: readonly string[],
): boolean {
// Nothing is blocked, so no job can match.
if (blockedCountryKeys.length === 0) return false;
const blocked = new Set(blockedCountryKeys);
const jobCountries = inferCountryKeysFromJobLocation(location);
// Accept the bare-string call shape by wrapping it into a signals object.
const signals: JobBlockedCountrySignals =
typeof locationOrSignals === "object" && locationOrSignals !== null
? locationOrSignals
: { location: locationOrSignals };
const jobCountries = collectJobCountryKeys(signals);
// No country could be determined for the job: keep it.
if (jobCountries.length === 0) return false;
return jobCountries.some((key) => blocked.has(key));
}