fix(discovery): block countries in vague locations via job description
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 5m22s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m14s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m11s
CI / Type Check (orchestrator) (push) Successful in 1m28s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m13s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m12s
CI / Documentation (push) Successful in 2m0s
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 5m22s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m14s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m11s
CI / Type Check (orchestrator) (push) Successful in 1m28s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m13s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m12s
CI / Documentation (push) Successful in 2m0s
QAJobsBoard and similar feeds often store "Worldwide" or "Remote" as the location while the real hiring country appears only in the description. When the location is vague, scan the title and description for country names, and prefer the concrete location stated in QAJobsBoard postings.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
f0261711c6
commit
0a63316100
@ -81,14 +81,47 @@ function salaryLabel(raw: SalaryBand | undefined): string | undefined {
|
||||
return schedule.trim() || undefined;
|
||||
}
|
||||
|
||||
/**
 * Location labels that say nothing about the hiring country.
 *
 * Mirrors the vague-location values used by the blocked-country filter so the
 * extractor and the filter agree on what counts as "no real location".
 */
const VAGUE_LOCATION_LABELS = new Set<string>([
  "worldwide",
  "global",
  "anywhere",
  "remote",
  "wfh",
  "work from home",
  "unknown",
  "multiple locations",
  "multiple countries",
]);

/**
 * Reports whether a location label is only a placeholder such as "Worldwide".
 *
 * @param value Raw label taken from a posting.
 * @returns `true` when the label, compared case-insensitively with outer
 *   whitespace trimmed and inner whitespace collapsed, is a known placeholder.
 */
function isVagueLocationLabel(value: string): boolean {
  // Collapse runs of whitespace so "Work  from   home" still matches.
  const normalized = value.trim().toLowerCase().replace(/\s+/g, " ");
  return VAGUE_LOCATION_LABELS.has(normalized);
}
|
||||
|
||||
/**
 * Pulls the value of a "Job Location: ..." line out of a posting description.
 *
 * The HTML is passed through `stripHtml` first; the captured value stops at the
 * first newline or period and is limited to 120 characters.
 *
 * @param html Description markup, if the posting has one.
 * @returns The trimmed location text, or `undefined` when there is no
 *   description, no "Job Location:" marker, or the captured text is blank.
 */
function extractJobLocationFromDescription(
  html: string | undefined,
): string | undefined {
  if (!html) return undefined;

  const captured = /\bjob\s+location\s*:\s*([^\n.]{2,120})/i.exec(
    stripHtml(html),
  )?.[1];
  if (!captured) return undefined;

  const label = captured.trim();
  return label.length > 0 ? label : undefined;
}
|
||||
|
||||
/**
 * Chooses the most concrete location label available for a posting.
 *
 * Order of preference:
 *  1. non-vague entries of `location_limits`, joined with ", ";
 *  2. `location`, when it is not a vague placeholder;
 *  3. a "Job Location: ..." value found in the description;
 *  4. the vague `location` itself, if present;
 *  5. the literal "Unknown".
 *
 * @param job Posting as returned by the feed.
 * @returns A display label; never empty.
 */
function locationLabel(job: QaJobBoardlyJob): string {
  // Non-string entries become "" and are dropped along with vague placeholders.
  const limits = Array.isArray(job.location_limits)
    ? job.location_limits
        .map((v) => (typeof v === "string" ? v.trim() : ""))
        .filter((v) => v.length > 0 && !isVagueLocationLabel(v))
    : [];
  if (limits.length > 0) return limits.join(", ");

  const loc = asString(job.location);
  if (loc && !isVagueLocationLabel(loc)) return loc;

  // Location is missing or vague: look for an explicit statement in the body.
  const fromDescription = extractJobLocationFromDescription(
    job.description?.html,
  );
  if (fromDescription) return fromDescription;

  // Fall back to the vague label rather than losing it entirely.
  if (loc) return loc;
  return "Unknown";
}
|
||||
|
||||
@ -394,6 +394,60 @@ describe("discoverJobsStep", () => {
|
||||
]);
|
||||
});
|
||||
|
||||
it("drops jobs with blocked country in description when location is worldwide", async () => {
|
||||
const settingsRepo = await import("@server/repositories/settings");
|
||||
const registryModule = await import("@server/extractors/registry");
|
||||
|
||||
const qaManifest = {
|
||||
id: "qajobsboard",
|
||||
displayName: "QAJobsBoard",
|
||||
providesSources: ["qajobsboard"],
|
||||
run: vi.fn().mockResolvedValue({
|
||||
success: true,
|
||||
jobs: [
|
||||
{
|
||||
source: "qajobsboard",
|
||||
title: "Sr. QA Automation Engineer",
|
||||
employer: "Harrier",
|
||||
location: "Worldwide",
|
||||
jobDescription:
|
||||
"Job Location: Mumbai/Nagpur. Open to candidates in India.",
|
||||
jobUrl: "https://example.com/job-in",
|
||||
},
|
||||
{
|
||||
source: "qajobsboard",
|
||||
title: "SDET",
|
||||
employer: "Contoso",
|
||||
location: "Toronto, ON, Canada",
|
||||
jobUrl: "https://example.com/job-ca",
|
||||
},
|
||||
],
|
||||
}),
|
||||
};
|
||||
|
||||
vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
|
||||
searchTerms: JSON.stringify(["sdet"]),
|
||||
blockedCountries: JSON.stringify(["india"]),
|
||||
searchCities: "Canada",
|
||||
} as any);
|
||||
|
||||
vi.mocked(registryModule.getExtractorRegistry).mockResolvedValue({
|
||||
manifests: new Map([["qajobsboard", qaManifest as any]]),
|
||||
manifestBySource: new Map([["qajobsboard", qaManifest as any]]),
|
||||
availableSources: ["qajobsboard"],
|
||||
} as any);
|
||||
|
||||
const result = await discoverJobsStep({
|
||||
mergedConfig: {
|
||||
...baseConfig,
|
||||
sources: ["qajobsboard"],
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.discoveredJobs).toHaveLength(1);
|
||||
expect(result.discoveredJobs[0]?.jobUrl).toBe("https://example.com/job-ca");
|
||||
});
|
||||
|
||||
it("applies shared city filtering for sources without native city filtering", async () => {
|
||||
const settingsRepo = await import("@server/repositories/settings");
|
||||
const registryModule = await import("@server/extractors/registry");
|
||||
|
||||
@ -553,7 +553,15 @@ export async function discoverJobsStep(args: {
|
||||
settings.blockedCountries,
|
||||
);
|
||||
const filteredDiscoveredJobs = afterCompanyFilter.filter(
|
||||
(job) => !jobMatchesBlockedCountries(job.location, blockedCountryKeys),
|
||||
(job) =>
|
||||
!jobMatchesBlockedCountries(
|
||||
{
|
||||
location: job.location,
|
||||
jobDescription: job.jobDescription,
|
||||
title: job.title,
|
||||
},
|
||||
blockedCountryKeys,
|
||||
),
|
||||
);
|
||||
const countryDroppedCount =
|
||||
afterCompanyFilter.length - filteredDiscoveredJobs.length;
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
inferCountryKeysFromJobText,
|
||||
isVagueJobLocation,
|
||||
jobMatchesBlockedCountries,
|
||||
normalizeBlockedCountryTokens,
|
||||
resolveBlockedCountriesFromStoredString,
|
||||
@ -34,4 +36,41 @@ describe("blocked-countries", () => {
|
||||
expect(jobMatchesBlockedCountries("Remote", blocked)).toBe(false);
|
||||
expect(jobMatchesBlockedCountries(null, blocked)).toBe(false);
|
||||
});
|
||||
|
||||
it("treats worldwide and remote-only locations as vague", () => {
|
||||
expect(isVagueJobLocation("Worldwide")).toBe(true);
|
||||
expect(isVagueJobLocation("Remote")).toBe(true);
|
||||
expect(isVagueJobLocation("Toronto, Canada")).toBe(false);
|
||||
});
|
||||
|
||||
it("finds blocked countries in description when location is worldwide", () => {
|
||||
const blocked = resolveBlockedCountriesFromStoredString('["india"]');
|
||||
expect(
|
||||
jobMatchesBlockedCountries(
|
||||
{
|
||||
location: "Worldwide",
|
||||
jobDescription:
|
||||
"Job Location: Mumbai/Nagpur. We are hiring in India for this role.",
|
||||
},
|
||||
blocked,
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
jobMatchesBlockedCountries(
|
||||
{
|
||||
location: "Worldwide",
|
||||
jobDescription: "Fully remote team across North America.",
|
||||
},
|
||||
blocked,
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it("infers country names embedded in free text", () => {
|
||||
expect(
|
||||
inferCountryKeysFromJobText(
|
||||
"Harrier is hiring in India. Job Location: Mumbai/Nagpur",
|
||||
),
|
||||
).toContain("india");
|
||||
});
|
||||
});
|
||||
|
||||
@ -7,6 +7,27 @@ import { inferCountryKeysFromJobLocation } from "./search-cities.js";
|
||||
|
||||
const supportedCountryKeySet = new Set(SUPPORTED_COUNTRY_KEYS);
|
||||
|
||||
/**
 * Lower-cased location strings that do not pin down a hiring country.
 * When a job's location is one of these, the title and description are
 * inspected as well.
 */
const VAGUE_LOCATION_VALUES = new Set<string>([
  "worldwide",
  "global",
  "anywhere",
  "remote",
  "wfh",
  "work from home",
  "unknown",
  "multiple locations",
  "multiple countries",
]);

/** Inferred country keys that stand for "no specific country". */
const VAGUE_COUNTRY_KEYS = new Set<string>(["worldwide", "global"]);

/** Text fields of a job that may reveal where it is actually based. */
export interface JobBlockedCountrySignals {
  /** Location as reported by the source; may be vague or missing. */
  location?: string | null;
  /** Free-text description, consulted when the location is vague. */
  jobDescription?: string | null;
  /** Job title, consulted when the location is vague. */
  title?: string | null;
}
|
||||
|
||||
/**
|
||||
* Parse stored settings value for blocked countries.
|
||||
* Accepts JSON string array (normal) or legacy plain comma/newline-separated text.
|
||||
@ -43,14 +64,65 @@ export function normalizeBlockedCountryTokens(tokens: string[]): string[] {
|
||||
return [...keys];
|
||||
}
|
||||
|
||||
/**
 * True when the location does not identify a concrete country.
 *
 * A location is vague when it is empty/blank, when it is one of
 * `VAGUE_LOCATION_VALUES`, when no country key can be inferred from it, or
 * when every inferred key is itself a vague one (see `VAGUE_COUNTRY_KEYS`).
 *
 * @param location Location string from the job, possibly null or undefined.
 * @returns `true` if other fields should be consulted to find the country.
 */
export function isVagueJobLocation(location: string | null | undefined): boolean {
  const trimmed = location?.trim();
  if (!trimmed) return true;
  if (VAGUE_LOCATION_VALUES.has(trimmed.toLowerCase())) return true;

  // Inference receives the original, untrimmed string.
  const keys = inferCountryKeysFromJobLocation(location);
  if (keys.length === 0) return true;
  return keys.every((key) => VAGUE_COUNTRY_KEYS.has(key));
}
|
||||
|
||||
/** Whole-word matchers per country key, compiled on first use and reused. */
const countryKeyMatchers = new Map<string, RegExp>();

/**
 * Builds (or fetches) the whole-word, case-insensitive matcher for a key.
 * Whitespace inside the key matches any run of whitespace; every other
 * character is matched literally, so regex metacharacters in a key are safe.
 */
function countryKeyMatcher(countryKey: string): RegExp {
  let matcher = countryKeyMatchers.get(countryKey);
  if (!matcher) {
    const source = countryKey
      .split(/\s+/)
      .map((part) => part.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
      .join("\\s+");
    matcher = new RegExp(`\\b${source}\\b`, "i");
    countryKeyMatchers.set(countryKey, matcher);
  }
  return matcher;
}

/**
 * Infer supported country keys mentioned anywhere in free text (title, description).
 *
 * Combines the keys found by `inferCountryKeysFromJobLocation` on the whole
 * text with every non-vague entry of `SUPPORTED_COUNTRY_KEYS` that appears in
 * the text as a whole word.
 *
 * NOTE(review): whole-word matching cannot tell a country from a homonym
 * elsewhere in the text; confirm this is acceptable for the supported keys.
 *
 * @param text Free text to scan; null, undefined, or blank yields `[]`.
 * @returns De-duplicated country keys.
 */
export function inferCountryKeysFromJobText(
  text: string | null | undefined,
): string[] {
  if (!text?.trim()) return [];

  const keys = new Set(inferCountryKeysFromJobLocation(text));
  const lower = text.toLowerCase();
  for (const countryKey of SUPPORTED_COUNTRY_KEYS) {
    if (VAGUE_COUNTRY_KEYS.has(countryKey)) continue;
    if (countryKeyMatcher(countryKey).test(lower)) {
      keys.add(countryKey);
    }
  }
  return [...keys];
}
|
||||
|
||||
/**
 * Gathers every country key a job can be tied to.
 *
 * Keys inferred from the location always count. Title and description are
 * scanned only when the location is vague.
 *
 * @param signals Location, title, and description of the job.
 * @returns De-duplicated keys, location-derived ones first.
 */
function collectJobCountryKeys(signals: JobBlockedCountrySignals): string[] {
  const { location, title, jobDescription } = signals;
  const found = new Set<string>(inferCountryKeysFromJobLocation(location));

  // A concrete location is authoritative; skip the free-text scan.
  if (!isVagueJobLocation(location)) return [...found];

  const freeText = [title, jobDescription].filter(Boolean).join("\n");
  inferCountryKeysFromJobText(freeText).forEach((key) => found.add(key));
  return [...found];
}
|
||||
|
||||
/**
 * True when the job mentions a blocked country in location and/or (when location
 * is vague) title/description. Unknown location with no country in text is kept.
 *
 * @param locationOrSignals Either a bare location string (or null/undefined),
 *   or an object carrying location, title, and description.
 * @param blockedCountryKeys Country keys to reject; an empty list blocks nothing.
 * @returns `true` if at least one country tied to the job is blocked.
 */
export function jobMatchesBlockedCountries(
  locationOrSignals: string | null | undefined | JobBlockedCountrySignals,
  blockedCountryKeys: readonly string[],
): boolean {
  if (blockedCountryKeys.length === 0) return false;
  const blocked = new Set(blockedCountryKeys);

  // Wrap a bare location so string-only callers keep working.
  const signals: JobBlockedCountrySignals =
    typeof locationOrSignals === "object" && locationOrSignals !== null
      ? locationOrSignals
      : { location: locationOrSignals };

  const jobCountries = collectJobCountryKeys(signals);
  if (jobCountries.length === 0) return false;
  return jobCountries.some((key) => blocked.has(key));
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user