fix(discovery): block countries in vague locations via job description
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 5m22s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m14s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m11s
CI / Type Check (orchestrator) (push) Successful in 1m28s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m13s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m12s
CI / Documentation (push) Successful in 2m0s
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 5m22s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m14s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m11s
CI / Type Check (orchestrator) (push) Successful in 1m28s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m13s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m12s
CI / Documentation (push) Successful in 2m0s
QAJobsBoard and similar feeds often store Worldwide/Remote as the location while the real country appears only in the description. Scan the title and description when the location is vague, and prefer concrete locations from QAJobsBoard postings.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
f0261711c6
commit
0a63316100
@ -81,14 +81,47 @@ function salaryLabel(raw: SalaryBand | undefined): string | undefined {
|
|||||||
return schedule.trim() || undefined;
|
return schedule.trim() || undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Lowercase location labels that name no specific place.
 *
 * `isVagueLocationLabel` compares trimmed, lowercased feed values against this
 * set. The entries mirror the vague-location list used by the shared
 * blocked-country filter ("wfh", "work from home", "multiple locations" and
 * "multiple countries" included) so both layers agree on what counts as vague.
 */
const VAGUE_LOCATION_LABELS = new Set([
  "worldwide",
  "global",
  "anywhere",
  "remote",
  "wfh",
  "work from home",
  "unknown",
  "multiple locations",
  "multiple countries",
]);
|
||||||
|
|
||||||
|
/**
 * Reports whether a location label is one of the placeholder values listed in
 * `VAGUE_LOCATION_LABELS`.
 *
 * Matching ignores surrounding whitespace and letter case.
 *
 * @param value - Location label to classify.
 * @returns `true` when the normalized label is in the vague-label set.
 */
function isVagueLocationLabel(value: string): boolean {
  const normalized = value.trim().toLowerCase();
  return VAGUE_LOCATION_LABELS.has(normalized);
}
|
||||||
|
|
||||||
|
/**
 * Pulls the value that follows a "Job Location:" marker out of a posting's
 * description.
 *
 * The description is first passed through `stripHtml` (presumably yielding
 * plain text — confirm against that helper). The marker is matched
 * case-insensitively and the captured value is 2–120 characters long, ending
 * at the first newline or period.
 *
 * @param html - Description markup; may be undefined or empty.
 * @returns The trimmed captured location, or `undefined` when there is no
 *   description, no marker, or only whitespace after the marker.
 */
function extractJobLocationFromDescription(
  html: string | undefined,
): string | undefined {
  if (!html) return undefined;

  const markerPattern = /\bjob\s+location\s*:\s*([^\n.]{2,120})/i;
  const captured = markerPattern.exec(stripHtml(html))?.[1];
  if (!captured) return undefined;

  const label = captured.trim();
  return label ? label : undefined;
}
|
||||||
|
|
||||||
/**
 * Chooses the most specific location label available for a QAJobsBoard job.
 *
 * Order of preference:
 *  1. non-vague entries of `location_limits`, trimmed and joined with ", ";
 *  2. `location`, when it is not a vague label;
 *  3. a "Job Location: …" value found in the description;
 *  4. `location` even though it is vague;
 *  5. the vague `location_limits` entries, so a label such as "Worldwide" is
 *     not discarded in favour of "Unknown";
 *  6. the literal "Unknown".
 *
 * @param job - Raw job record from the feed.
 * @returns A non-empty label for display and downstream filtering.
 */
function locationLabel(job: QaJobBoardlyJob): string {
  // Keep every non-empty limit: the vague ones are still useful as a last resort.
  const rawLimits = Array.isArray(job.location_limits)
    ? job.location_limits
        .map((v) => (typeof v === "string" ? v.trim() : ""))
        .filter((v) => v.length > 0)
    : [];
  const concreteLimits = rawLimits.filter((v) => !isVagueLocationLabel(v));
  if (concreteLimits.length > 0) return concreteLimits.join(", ");

  const loc = asString(job.location);
  if (loc && !isVagueLocationLabel(loc)) return loc;

  // Only consult the description once the structured fields proved vague.
  const fromDescription = extractJobLocationFromDescription(
    job.description?.html,
  );
  if (fromDescription) return fromDescription;

  if (loc) return loc;
  // Without this, a job whose only signal is e.g. ["Worldwide"] became "Unknown".
  if (rawLimits.length > 0) return rawLimits.join(", ");
  return "Unknown";
}
|
||||||
|
|||||||
@ -394,6 +394,60 @@ describe("discoverJobsStep", () => {
|
|||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("drops jobs with blocked country in description when location is worldwide", async () => {
|
||||||
|
const settingsRepo = await import("@server/repositories/settings");
|
||||||
|
const registryModule = await import("@server/extractors/registry");
|
||||||
|
|
||||||
|
const qaManifest = {
|
||||||
|
id: "qajobsboard",
|
||||||
|
displayName: "QAJobsBoard",
|
||||||
|
providesSources: ["qajobsboard"],
|
||||||
|
run: vi.fn().mockResolvedValue({
|
||||||
|
success: true,
|
||||||
|
jobs: [
|
||||||
|
{
|
||||||
|
source: "qajobsboard",
|
||||||
|
title: "Sr. QA Automation Engineer",
|
||||||
|
employer: "Harrier",
|
||||||
|
location: "Worldwide",
|
||||||
|
jobDescription:
|
||||||
|
"Job Location: Mumbai/Nagpur. Open to candidates in India.",
|
||||||
|
jobUrl: "https://example.com/job-in",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
source: "qajobsboard",
|
||||||
|
title: "SDET",
|
||||||
|
employer: "Contoso",
|
||||||
|
location: "Toronto, ON, Canada",
|
||||||
|
jobUrl: "https://example.com/job-ca",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}),
|
||||||
|
};
|
||||||
|
|
||||||
|
vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
|
||||||
|
searchTerms: JSON.stringify(["sdet"]),
|
||||||
|
blockedCountries: JSON.stringify(["india"]),
|
||||||
|
searchCities: "Canada",
|
||||||
|
} as any);
|
||||||
|
|
||||||
|
vi.mocked(registryModule.getExtractorRegistry).mockResolvedValue({
|
||||||
|
manifests: new Map([["qajobsboard", qaManifest as any]]),
|
||||||
|
manifestBySource: new Map([["qajobsboard", qaManifest as any]]),
|
||||||
|
availableSources: ["qajobsboard"],
|
||||||
|
} as any);
|
||||||
|
|
||||||
|
const result = await discoverJobsStep({
|
||||||
|
mergedConfig: {
|
||||||
|
...baseConfig,
|
||||||
|
sources: ["qajobsboard"],
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.discoveredJobs).toHaveLength(1);
|
||||||
|
expect(result.discoveredJobs[0]?.jobUrl).toBe("https://example.com/job-ca");
|
||||||
|
});
|
||||||
|
|
||||||
it("applies shared city filtering for sources without native city filtering", async () => {
|
it("applies shared city filtering for sources without native city filtering", async () => {
|
||||||
const settingsRepo = await import("@server/repositories/settings");
|
const settingsRepo = await import("@server/repositories/settings");
|
||||||
const registryModule = await import("@server/extractors/registry");
|
const registryModule = await import("@server/extractors/registry");
|
||||||
|
|||||||
@ -553,7 +553,15 @@ export async function discoverJobsStep(args: {
|
|||||||
settings.blockedCountries,
|
settings.blockedCountries,
|
||||||
);
|
);
|
||||||
const filteredDiscoveredJobs = afterCompanyFilter.filter(
|
const filteredDiscoveredJobs = afterCompanyFilter.filter(
|
||||||
(job) => !jobMatchesBlockedCountries(job.location, blockedCountryKeys),
|
(job) =>
|
||||||
|
!jobMatchesBlockedCountries(
|
||||||
|
{
|
||||||
|
location: job.location,
|
||||||
|
jobDescription: job.jobDescription,
|
||||||
|
title: job.title,
|
||||||
|
},
|
||||||
|
blockedCountryKeys,
|
||||||
|
),
|
||||||
);
|
);
|
||||||
const countryDroppedCount =
|
const countryDroppedCount =
|
||||||
afterCompanyFilter.length - filteredDiscoveredJobs.length;
|
afterCompanyFilter.length - filteredDiscoveredJobs.length;
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
import { describe, expect, it } from "vitest";
|
import { describe, expect, it } from "vitest";
|
||||||
import {
|
import {
|
||||||
|
inferCountryKeysFromJobText,
|
||||||
|
isVagueJobLocation,
|
||||||
jobMatchesBlockedCountries,
|
jobMatchesBlockedCountries,
|
||||||
normalizeBlockedCountryTokens,
|
normalizeBlockedCountryTokens,
|
||||||
resolveBlockedCountriesFromStoredString,
|
resolveBlockedCountriesFromStoredString,
|
||||||
@ -34,4 +36,41 @@ describe("blocked-countries", () => {
|
|||||||
expect(jobMatchesBlockedCountries("Remote", blocked)).toBe(false);
|
expect(jobMatchesBlockedCountries("Remote", blocked)).toBe(false);
|
||||||
expect(jobMatchesBlockedCountries(null, blocked)).toBe(false);
|
expect(jobMatchesBlockedCountries(null, blocked)).toBe(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("treats worldwide and remote-only locations as vague", () => {
|
||||||
|
expect(isVagueJobLocation("Worldwide")).toBe(true);
|
||||||
|
expect(isVagueJobLocation("Remote")).toBe(true);
|
||||||
|
expect(isVagueJobLocation("Toronto, Canada")).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("finds blocked countries in description when location is worldwide", () => {
|
||||||
|
const blocked = resolveBlockedCountriesFromStoredString('["india"]');
|
||||||
|
expect(
|
||||||
|
jobMatchesBlockedCountries(
|
||||||
|
{
|
||||||
|
location: "Worldwide",
|
||||||
|
jobDescription:
|
||||||
|
"Job Location: Mumbai/Nagpur. We are hiring in India for this role.",
|
||||||
|
},
|
||||||
|
blocked,
|
||||||
|
),
|
||||||
|
).toBe(true);
|
||||||
|
expect(
|
||||||
|
jobMatchesBlockedCountries(
|
||||||
|
{
|
||||||
|
location: "Worldwide",
|
||||||
|
jobDescription: "Fully remote team across North America.",
|
||||||
|
},
|
||||||
|
blocked,
|
||||||
|
),
|
||||||
|
).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("infers country names embedded in free text", () => {
|
||||||
|
expect(
|
||||||
|
inferCountryKeysFromJobText(
|
||||||
|
"Harrier is hiring in India. Job Location: Mumbai/Nagpur",
|
||||||
|
),
|
||||||
|
).toContain("india");
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@ -7,6 +7,27 @@ import { inferCountryKeysFromJobLocation } from "./search-cities.js";
|
|||||||
|
|
||||||
const supportedCountryKeySet = new Set(SUPPORTED_COUNTRY_KEYS);
|
const supportedCountryKeySet = new Set(SUPPORTED_COUNTRY_KEYS);
|
||||||
|
|
||||||
|
/**
 * Lowercase location strings that do not pin a hiring country.
 *
 * When a job's trimmed, lowercased location equals one of these, the title and
 * description are checked for country mentions as well.
 */
const VAGUE_LOCATION_VALUES = new Set<string>([
  "worldwide",
  "global",
  "anywhere",
  "remote",
  "wfh",
  "work from home",
  "unknown",
  "multiple locations",
  "multiple countries",
]);
|
||||||
|
|
||||||
|
/**
 * Country keys that stand for "no particular country". A location that
 * resolves only to these keys is treated as vague, and they are skipped when
 * scanning free text for country names.
 */
const VAGUE_COUNTRY_KEYS = new Set<string>(["worldwide", "global"]);
|
||||||
|
|
||||||
|
/**
 * Text fields of a job that may reveal its hiring country.
 *
 * Every field is optional and may be `null`. `location` is always inspected;
 * `title` and `jobDescription` are only scanned when the location is vague.
 */
export interface JobBlockedCountrySignals {
  /** Location string as stored on the job. */
  location?: string | null;
  /** Description text, scanned for country names when the location is vague. */
  jobDescription?: string | null;
  /** Job title, scanned together with the description. */
  title?: string | null;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse stored settings value for blocked countries.
|
* Parse stored settings value for blocked countries.
|
||||||
* Accepts JSON string array (normal) or legacy plain comma/newline-separated text.
|
* Accepts JSON string array (normal) or legacy plain comma/newline-separated text.
|
||||||
@ -43,14 +64,65 @@ export function normalizeBlockedCountryTokens(tokens: string[]): string[] {
|
|||||||
return [...keys];
|
return [...keys];
|
||||||
}
|
}
|
||||||
|
|
||||||
/** True when the job location mentions a blocked country (unknown location is kept). */
|
/**
 * Reports whether a location string fails to pin down a hiring country.
 *
 * A location is vague when it is missing or blank, when its trimmed lowercase
 * form is listed in `VAGUE_LOCATION_VALUES`, or when every country key inferred
 * from it (possibly none) belongs to `VAGUE_COUNTRY_KEYS`.
 *
 * @param location - Location string as stored on the job.
 * @returns `true` when callers should look beyond the location for a country.
 */
export function isVagueJobLocation(
  location: string | null | undefined,
): boolean {
  const trimmed = location?.trim();
  if (!trimmed) return true;
  if (VAGUE_LOCATION_VALUES.has(trimmed.toLowerCase())) return true;

  // `every` is true for an empty list, which covers "no country inferred".
  const countryKeys = inferCountryKeysFromJobLocation(location);
  return countryKeys.every((key) => VAGUE_COUNTRY_KEYS.has(key));
}
|
||||||
|
|
||||||
|
/**
 * Infer supported country keys mentioned anywhere in free text (title, description).
 *
 * Combines the keys `inferCountryKeysFromJobLocation` finds in the text with a
 * whole-word, case-insensitive search for every supported country key that is
 * not a vague placeholder. Whitespace inside a key matches any run of
 * whitespace in the text.
 *
 * NOTE(review): whole-word matching still hits phrases that merely contain a
 * country name (e.g. a place name ending in one) — confirm this is acceptable.
 *
 * @param text - Free text to scan; `null`, `undefined` and blank yield `[]`.
 * @returns Distinct country keys found in the text.
 */
export function inferCountryKeysFromJobText(
  text: string | null | undefined,
): string[] {
  if (!text?.trim()) return [];
  const keys = new Set(inferCountryKeysFromJobLocation(text));
  const lower = text.toLowerCase();
  for (const countryKey of SUPPORTED_COUNTRY_KEYS) {
    if (VAGUE_COUNTRY_KEYS.has(countryKey)) continue;
    // Escape regex metacharacters first so a key containing ".", "(" etc. is
    // matched literally instead of altering (or breaking) the pattern.
    const pattern = countryKey
      .replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
      .replace(/\s+/g, "\\s+");
    if (new RegExp(`\\b${pattern}\\b`, "i").test(lower)) {
      keys.add(countryKey);
    }
  }
  return [...keys];
}
|
||||||
|
|
||||||
|
/**
 * Gathers the distinct country keys a job points at.
 *
 * Keys inferred from the location are always included. Keys inferred from the
 * title and description (joined by a newline) are added only when the location
 * is vague.
 */
function collectJobCountryKeys(signals: JobBlockedCountrySignals): string[] {
  const found = new Set<string>(
    inferCountryKeysFromJobLocation(signals.location),
  );
  if (!isVagueJobLocation(signals.location)) return [...found];

  const freeText = [signals.title, signals.jobDescription]
    .filter(Boolean)
    .join("\n");
  inferCountryKeysFromJobText(freeText).forEach((key) => found.add(key));
  return [...found];
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a job should be dropped because it points at a blocked country.
 *
 * Accepts either a bare location string (the older call shape) or a
 * `JobBlockedCountrySignals` object. Countries come from the location and,
 * when the location is vague, from the title and description too. A job with
 * no detectable country is kept.
 *
 * @param locationOrSignals - Location string, or an object of job text signals.
 * @param blockedCountryKeys - Country keys to block; an empty list blocks nothing.
 * @returns `true` when at least one detected country key is blocked.
 */
export function jobMatchesBlockedCountries(
  locationOrSignals: string | null | undefined | JobBlockedCountrySignals,
  blockedCountryKeys: readonly string[],
): boolean {
  if (blockedCountryKeys.length === 0) return false;

  // Normalize the legacy string form into a signals object.
  let signals: JobBlockedCountrySignals;
  if (typeof locationOrSignals === "object" && locationOrSignals !== null) {
    signals = locationOrSignals;
  } else {
    signals = { location: locationOrSignals };
  }

  const blocked = new Set(blockedCountryKeys);
  // `some` is false for an empty list, so jobs with no detected country are kept.
  return collectJobCountryKeys(signals).some((key) => blocked.has(key));
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user