ilia 0a63316100
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 5m22s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m14s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m11s
CI / Type Check (orchestrator) (push) Successful in 1m28s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m13s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m12s
CI / Documentation (push) Successful in 2m0s
fix(discovery): block countries in vague locations via job description
QAJobsBoard and similar feeds often store Worldwide/Remote while the real
country is only in the description. Scan title and description when location
is vague, and prefer concrete locations from QAJobsBoard postings.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-16 17:15:18 -04:00

634 lines
19 KiB
TypeScript

import { logger } from "@infra/logger";
import { sanitizeUnknown } from "@infra/sanitize";
import { getExtractorRegistry } from "@server/extractors/registry";
import { DEFAULT_JOB_OWNER_PROFILE_ID } from "@server/infra/job-owner-context";
import { getAllJobUrls } from "@server/repositories/jobs";
import { getProfileById } from "@server/repositories/profiles";
import * as settingsRepo from "@server/repositories/settings";
import { asyncPool } from "@server/utils/async-pool";
import {
jobMatchesBlockedCountries,
resolveBlockedCountriesFromStoredString,
} from "@shared/blocked-countries.js";
import {
formatCountryLabel,
isSourceAllowedForCountry,
normalizeCountryKey,
} from "@shared/location-support.js";
import { resolveBlockedCompanyKeywordsFromStoredString } from "@shared/resolve-blocked-company-keywords.js";
import {
inferCountryKeyFromSearchGeography,
matchesRequestedCity,
resolveSearchCities,
shouldApplyStrictCityFilter,
} from "@shared/search-cities.js";
import { jobSearchProfileSchema } from "@shared/settings-registry.js";
import type { CreateJobInput, PipelineConfig } from "@shared/types";
import { type CrawlSource, progressHelpers, updateProgress } from "../progress";
/** Passed as the `concurrency` option to `asyncPool` when running source tasks. */
const DISCOVERY_CONCURRENCY = 3;
/** Outcome of one discovery source task: the jobs it yielded and any error messages. */
type DiscoveryTaskResult = {
discoveredJobs: CreateJobInput[];
sourceErrors: string[];
};
/** One unit of discovery work; one is built per extractor manifest. */
type DiscoverySourceTask = {
// Identifier handed to the progress helpers and used in failure messages.
source: CrawlSource;
// Search-term count forwarded to progress reporting when the task starts.
termsTotal?: number;
// Human-readable progress text forwarded when the task starts.
detail: string;
// Runs the extractor and resolves with its jobs and error messages.
run: () => Promise<DiscoveryTaskResult>;
};
/**
 * Reports whether an employer name contains any blocked keyword.
 *
 * Matching is a substring test against the lower-cased employer name; the
 * keywords are expected to be lower-cased already by the caller.
 *
 * @param employer - Employer name from the posting; nullish or empty never matches.
 * @param blockedKeywordsLowerCase - Blocked keywords, already lower-cased.
 * @returns `true` when at least one non-blank keyword occurs in the employer name.
 */
function isBlockedEmployer(
  employer: string | null | undefined,
  blockedKeywordsLowerCase: string[],
): boolean {
  if (!employer || blockedKeywordsLowerCase.length === 0) return false;

  const normalizedEmployer = employer.toLowerCase();
  // A blank keyword is a substring of (almost) every name; skip it so a stray
  // "" or " " in the stored list cannot block every employer.
  return blockedKeywordsLowerCase.some(
    (keyword) => keyword.trim() !== "" && normalizedEmployer.includes(keyword),
  );
}
/**
 * Keeps the jobs that satisfy at least one requested city.
 *
 * A job satisfies a requested city when `shouldApplyStrictCityFilter` says no
 * strict filtering applies for that city/country pair, or when
 * `matchesRequestedCity` accepts the job's location for that city.
 *
 * @param args.jobs - Candidate jobs.
 * @param args.selectedCountry - Country key passed to the strict-filter check.
 * @param args.requestedCities - Requested cities; an empty list disables filtering.
 * @returns The input array itself when no cities are requested, otherwise a new filtered array.
 */
function filterJobsByRequestedCities(args: {
  jobs: CreateJobInput[];
  selectedCountry: string;
  requestedCities: string[];
}): CreateJobInput[] {
  if (args.requestedCities.length === 0) return args.jobs;

  const kept: CreateJobInput[] = [];
  for (const job of args.jobs) {
    for (const city of args.requestedCities) {
      const needsCityMatch = shouldApplyStrictCityFilter(
        city,
        args.selectedCountry,
      );
      // The first city that accepts the job is enough; stop checking the rest.
      if (!needsCityMatch || matchesRequestedCity(job.location, city)) {
        kept.push(job);
        break;
      }
    }
  }
  return kept;
}
/**
 * Single-word tokens that are skipped when role phrases are split into
 * token matchers.
 */
const ROLE_TOKEN_STOPWORDS = new Set<string>([
  // Articles, conjunctions and prepositions.
  ...["a", "an", "and", "the", "of", "to", "for", "in", "on", "with"],
  ...["at", "by", "from"],
  // Seniority markers.
  ...["senior", "sr", "jr", "junior", "lead", "principal", "staff"],
  // Roman-numeral suffixes.
  ...["i", "ii", "iii", "iv", "v"],
  // Work-arrangement words.
  ...["remote", "hybrid", "onsite"],
  // These are too generic and cause massive false positives.
  ...["software", "development", "developer", "engineer", "engineering"],
]);
/**
 * Lower-cases a string, collapses every whitespace run to one space and trims
 * the ends. `null` and `undefined` become the empty string.
 */
function normalizeText(value: string | null | undefined): string {
  if (value == null) return "";
  const lowered = value.toLowerCase();
  const singleSpaced = lowered.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
/**
 * Turns role phrases into two needle lists for substring matching.
 *
 * @param phrases - Raw role phrases.
 * @returns `phraseMatchers`: the normalized, non-empty phrases.
 *          `tokenMatchers`: the distinct tokens of those phrases that are at
 *          least two characters long and not in `ROLE_TOKEN_STOPWORDS`,
 *          followed by a fixed set of QA tokens that is always included.
 */
function buildRoleMatchers(phrases: string[]): {
  phraseMatchers: string[];
  tokenMatchers: string[];
} {
  const phraseMatchers: string[] = [];
  for (const rawPhrase of phrases) {
    const normalized = normalizeText(rawPhrase);
    if (normalized) phraseMatchers.push(normalized);
  }

  const uniqueTokens = new Set<string>();
  for (const phrase of phraseMatchers) {
    // Split on anything that is not a letter, digit, "+", "." or "#".
    const pieces = phrase.split(/[^a-z0-9+.#]+/g);
    for (const piece of pieces) {
      const candidate = piece.trim();
      const usable =
        candidate.length >= 2 && !ROLE_TOKEN_STOPWORDS.has(candidate);
      if (usable) uniqueTokens.add(candidate);
    }
  }

  // Ensure common QA acronyms remain even if user only typed long-form roles.
  const alwaysIncluded = ["qa", "sdet", "test", "testing", "automation"];
  for (const token of alwaysIncluded) uniqueTokens.add(token);

  return { phraseMatchers, tokenMatchers: Array.from(uniqueTokens) };
}
/**
 * Reports whether `text` contains at least one of `needles` as a substring.
 * Empty text never matches, and empty needles are ignored.
 */
function matchesAny(text: string, needles: string[]): boolean {
  if (!text) return false;
  return needles.some((needle) => Boolean(needle) && text.includes(needle));
}
/**
 * Applies the search-profile rules to a list of jobs.
 *
 * A job is dropped when, after normalizing its title and description:
 *  - any deal breaker occurs in the title or description, or
 *  - role phrases were supplied and no role phrase or role token occurs in
 *    the title or description, or
 *  - must-have skills were supplied and none of them occurs in the title or
 *    description.
 *
 * @param args.jobs - Candidate jobs.
 * @param args.targetRolePhrases - Phrases fed to `buildRoleMatchers`.
 * @param args.mustHaveSkills - Skills of which at least one must appear.
 * @param args.dealBreakers - Terms that disqualify a job when present.
 * @returns The surviving jobs and the number that were dropped.
 */
function filterJobsBySearchProfile(args: {
  jobs: CreateJobInput[];
  targetRolePhrases: string[];
  mustHaveSkills: string[];
  dealBreakers: string[];
}): { jobs: CreateJobInput[]; dropped: number } {
  const roleMatchers = buildRoleMatchers(args.targetRolePhrases);
  const requiredSkills = args.mustHaveSkills.map(normalizeText).filter(Boolean);
  const bannedTerms = args.dealBreakers.map(normalizeText).filter(Boolean);

  // If the user specified target roles, enforce a strict role match so we
  // don't surface irrelevant jobs (e.g. legal/sales/finance) in Discovered.
  const enforceRoleMatch = roleMatchers.phraseMatchers.length > 0;
  const roleNeedles = [
    ...roleMatchers.phraseMatchers,
    ...roleMatchers.tokenMatchers,
  ];

  const keepJob = (job: CreateJobInput): boolean => {
    const title = normalizeText(job.title);
    const body = normalizeText(job.jobDescription);
    const haystack = `${title}\n${body}`;

    // An empty banned list can never match, so no separate length check.
    if (matchesAny(haystack, bannedTerms)) return false;

    if (enforceRoleMatch) {
      const roleFound =
        matchesAny(title, roleNeedles) || matchesAny(body, roleNeedles);
      if (!roleFound) return false;
    }

    const skillsRequired = requiredSkills.length > 0;
    if (skillsRequired && !matchesAny(haystack, requiredSkills)) return false;

    return true;
  };

  const kept = args.jobs.filter(keepJob);
  return { jobs: kept, dropped: args.jobs.length - kept.length };
}
/**
 * Runs the discovery step of the pipeline: resolves search terms and the
 * target country, runs every compatible extractor, then filters the combined
 * results by requested cities, blocked company keywords, blocked countries and
 * the job-search profile.
 *
 * @param args.mergedConfig - Pipeline configuration; `sources` and
 *   `ownerProfileId` are read here.
 * @param args.shouldCancel - Optional cancellation probe. It is checked before
 *   the source tasks start and again before profile filtering, and is also
 *   forwarded to `asyncPool` and to each extractor.
 * @returns The surviving jobs (stamped with `ownerProfileId` on the normal
 *   completion path) and the collected per-source error messages.
 * @throws Error when sources were requested but none is compatible with the
 *   selected country.
 * @throws Error when no job survives filtering and at least one source error
 *   was recorded.
 */
export async function discoverJobsStep(args: {
  mergedConfig: PipelineConfig;
  shouldCancel?: () => boolean;
}): Promise<{
  discoveredJobs: CreateJobInput[];
  sourceErrors: string[];
}> {
  logger.info("Running discovery step");
  const discoveredJobs: CreateJobInput[] = [];
  const sourceErrors: string[] = [];
  const settings = await settingsRepo.getAllSettings();
  const registry = await getExtractorRegistry();

  // --- Search terms -------------------------------------------------------
  const searchTermsSetting = settings.searchTerms;
  const readDefaultSearchTerms = (): string[] =>
    (process.env.JOBSPY_SEARCH_TERMS || "web developer")
      .split("|")
      .map((term) => term.trim())
      .filter(Boolean);

  // The stored value is untrusted JSON: a malformed or non-array value must not
  // abort the whole step, and non-string entries would break the
  // `.toLowerCase()` calls further down. Fall back to the defaults instead.
  let storedSearchTerms: string[] | null = null;
  if (searchTermsSetting) {
    try {
      const parsedTerms: unknown = JSON.parse(searchTermsSetting);
      if (Array.isArray(parsedTerms)) {
        storedSearchTerms = parsedTerms.filter(
          (term): term is string => typeof term === "string",
        );
      } else {
        logger.warn(
          "Stored search terms are not a JSON array; using default search terms",
          { step: "discover-jobs" },
        );
      }
    } catch (error) {
      logger.warn(
        "Stored search terms are not valid JSON; using default search terms",
        { step: "discover-jobs", error: sanitizeUnknown(error) },
      );
    }
  }
  const searchTerms: string[] = storedSearchTerms ?? readDefaultSearchTerms();

  // --- Job-search profile ---------------------------------------------------
  const ownerProfileId =
    args.mergedConfig.ownerProfileId ?? DEFAULT_JOB_OWNER_PROFILE_ID;
  let searchProfileTargetRoles: string[] = [];
  let searchProfileMustHaveSkills: string[] = [];
  let searchProfileDealBreakers: string[] = [];

  // Appends target roles that are not already present (case-insensitively) to
  // the search terms.
  const mergeTargetRoles = (targetRoles: unknown) => {
    if (!Array.isArray(targetRoles) || targetRoles.length === 0) return;
    const existingLower = new Set(searchTerms.map((t) => t.toLowerCase()));
    let addedRoles = 0;
    for (const role of targetRoles) {
      if (typeof role !== "string") continue;
      const trimmedRole = role.trim();
      if (!trimmedRole) continue;
      const roleKey = trimmedRole.toLowerCase();
      if (existingLower.has(roleKey)) continue;
      searchTerms.push(trimmedRole);
      existingLower.add(roleKey);
      addedRoles += 1;
    }
    // Report the roles actually appended, not the number of candidates.
    logger.info("Augmented search terms with profile target roles", {
      addedRoles,
      totalTerms: searchTerms.length,
    });
  };

  if (ownerProfileId && ownerProfileId !== DEFAULT_JOB_OWNER_PROFILE_ID) {
    // Non-default owner: the profile comes from the profiles repository.
    const row = await getProfileById(ownerProfileId);
    if (row?.data) {
      const parsed = jobSearchProfileSchema.safeParse(row.data);
      if (parsed.success) {
        searchProfileTargetRoles = parsed.data.targetRoles ?? [];
        searchProfileMustHaveSkills = parsed.data.mustHaveSkills ?? [];
        searchProfileDealBreakers = parsed.data.dealBreakers ?? [];
        if (searchProfileTargetRoles.length > 0) {
          mergeTargetRoles(searchProfileTargetRoles);
        }
      } else if (row.data.targetRoles?.length) {
        // Legacy profile shapes: keep augmenting terms but we won't enforce strict filtering.
        mergeTargetRoles(row.data.targetRoles);
      }
    }
  } else {
    // Default owner: the profile is stored as JSON in the settings.
    const profileSetting = settings.jobSearchProfile;
    if (profileSetting) {
      try {
        const profile = JSON.parse(profileSetting);
        const parsed = jobSearchProfileSchema.safeParse(profile);
        if (parsed.success) {
          searchProfileTargetRoles = parsed.data.targetRoles ?? [];
          searchProfileMustHaveSkills = parsed.data.mustHaveSkills ?? [];
          searchProfileDealBreakers = parsed.data.dealBreakers ?? [];
          if (searchProfileTargetRoles.length > 0) {
            mergeTargetRoles(searchProfileTargetRoles);
          }
        } else if (
          Array.isArray((profile as { targetRoles?: unknown }).targetRoles) &&
          (profile as { targetRoles: unknown[] }).targetRoles.length > 0
        ) {
          mergeTargetRoles((profile as { targetRoles: unknown }).targetRoles);
        }
      } catch {
        // malformed profile JSON, continue with existing terms
      }
    }
  }

  // --- Country resolution ---------------------------------------------------
  const geographyCountryKey = inferCountryKeyFromSearchGeography(
    settings.searchCities,
    settings.jobspyLocation,
  );
  const configuredIndeedKey = settings.jobspyCountryIndeed?.trim()
    ? normalizeCountryKey(settings.jobspyCountryIndeed)
    : null;
  if (
    geographyCountryKey &&
    configuredIndeedKey &&
    geographyCountryKey !== configuredIndeedKey
  ) {
    logger.warn(
      "Indeed country setting disagrees with country-level search geography; aligning JobSpy and source routing to geography",
      {
        step: "discover-jobs",
        geographyCountryKey,
        jobspyCountryIndeed: configuredIndeedKey,
      },
    );
  }
  // The geography-derived country wins over the explicit Indeed setting.
  const selectedCountry = normalizeCountryKey(
    geographyCountryKey ??
      settings.jobspyCountryIndeed ??
      settings.searchCities ??
      settings.jobspyLocation ??
      "united kingdom",
  );
  const effectiveJobspyCountryIndeed =
    geographyCountryKey ?? settings.jobspyCountryIndeed;

  // --- Source selection -----------------------------------------------------
  const compatibleSources = args.mergedConfig.sources.filter((source) =>
    isSourceAllowedForCountry(source, selectedCountry),
  );

  // Fetch the existing job URLs at most once; every extractor shares the
  // same promise.
  let existingJobUrlsPromise: Promise<string[]> | null = null;
  const getExistingJobUrls = (): Promise<string[]> => {
    if (!existingJobUrlsPromise) {
      existingJobUrlsPromise = getAllJobUrls(ownerProfileId);
    }
    return existingJobUrlsPromise;
  };

  const skippedSources = args.mergedConfig.sources.filter(
    (source) => !compatibleSources.includes(source),
  );
  if (skippedSources.length > 0) {
    logger.info("Skipping incompatible sources for selected country", {
      step: "discover-jobs",
      country: selectedCountry,
      countryLabel: formatCountryLabel(selectedCountry),
      requestedSources: args.mergedConfig.sources,
      skippedSources,
    });
  }
  if (args.mergedConfig.sources.length > 0 && compatibleSources.length === 0) {
    throw new Error(
      `No compatible sources for selected country: ${formatCountryLabel(selectedCountry)}`,
    );
  }

  // Several sources may map to the same extractor manifest; group them so
  // each manifest runs once with all of its sources.
  const groupedByManifest = new Map<
    string,
    { sources: string[]; detail: string; termsTotal?: number }
  >();
  for (const source of compatibleSources) {
    const manifest = registry.manifestBySource.get(source);
    if (!manifest) {
      sourceErrors.push(`${source}: extractor manifest not registered`);
      continue;
    }
    const existing = groupedByManifest.get(manifest.id);
    if (existing) {
      existing.sources.push(source);
      continue;
    }
    groupedByManifest.set(manifest.id, {
      sources: [source],
      termsTotal: searchTerms.length,
      detail: `${manifest.displayName}: fetching jobs...`,
    });
  }

  const sourceTasks: DiscoverySourceTask[] = [];
  for (const [manifestId, grouped] of groupedByManifest) {
    const manifest = registry.manifests.get(manifestId);
    if (!manifest) continue;
    sourceTasks.push({
      source: manifest.id,
      termsTotal: grouped.termsTotal,
      detail:
        grouped.sources.length > 1
          ? `${manifest.displayName}: ${grouped.sources.join(", ")}...`
          : grouped.detail,
      run: async () => {
        // Extractors only receive string (or undefined) settings.
        const filteredSettings = Object.fromEntries(
          Object.entries(settings).filter(
            ([, value]) =>
              typeof value === "string" || typeof value === "undefined",
          ),
        ) as Record<string, string | undefined>;
        if (effectiveJobspyCountryIndeed !== undefined) {
          filteredSettings.jobspyCountryIndeed = effectiveJobspyCountryIndeed;
        }
        const result = await manifest.run({
          source: grouped.sources[0],
          selectedSources: grouped.sources,
          settings: filteredSettings,
          searchTerms,
          selectedCountry,
          getExistingJobUrls,
          shouldCancel: args.shouldCancel,
          onProgress: (event) => {
            progressHelpers.crawlingUpdate({
              source: manifest.id,
              termsProcessed: event.termsProcessed,
              termsTotal: event.termsTotal,
              listPagesProcessed: event.listPagesProcessed,
              listPagesTotal: event.listPagesTotal,
              jobCardsFound: event.jobCardsFound,
              jobPagesEnqueued: event.jobPagesEnqueued,
              jobPagesSkipped: event.jobPagesSkipped,
              jobPagesProcessed: event.jobPagesProcessed,
              phase: event.phase,
              currentUrl: event.currentUrl,
            });
            if (event.detail) {
              updateProgress({
                step: "crawling",
                detail: event.detail,
              });
            }
          },
        });
        if (!result.success) {
          return {
            discoveredJobs: [],
            sourceErrors: [
              `${manifest.displayName || manifest.id}: ${result.error ?? "unknown error"} (sources: ${grouped.sources.join(",")})`,
            ],
          };
        }
        return {
          discoveredJobs: result.jobs,
          sourceErrors: [],
        };
      },
    });
  }

  // --- Run the source tasks -------------------------------------------------
  const totalSources = sourceTasks.length;
  let completedSources = 0;
  progressHelpers.startCrawling(totalSources);
  if (args.shouldCancel?.()) {
    return { discoveredJobs, sourceErrors };
  }
  const sourceResults = await asyncPool({
    items: sourceTasks,
    concurrency: DISCOVERY_CONCURRENCY,
    shouldStop: args.shouldCancel,
    onTaskStarted: (sourceTask) => {
      progressHelpers.startSource(
        sourceTask.source,
        completedSources,
        totalSources,
        {
          termsTotal: sourceTask.termsTotal,
          detail: sourceTask.detail,
        },
      );
    },
    onTaskSettled: () => {
      completedSources += 1;
      progressHelpers.completeSource(completedSources, totalSources);
    },
    task: async (sourceTask) => {
      // A throwing extractor becomes a source error so the other sources
      // can still contribute results.
      try {
        return await sourceTask.run();
      } catch (error) {
        logger.warn("Discovery source task failed", {
          sourceTask: sourceTask.source,
          error: sanitizeUnknown(error),
        });
        return {
          discoveredJobs: [],
          sourceErrors: [
            `${sourceTask.source}: ${error instanceof Error ? error.message : "unknown error"}`,
          ],
        };
      }
    },
  });
  for (const sourceResult of sourceResults) {
    discoveredJobs.push(...sourceResult.discoveredJobs);
    sourceErrors.push(...sourceResult.sourceErrors);
  }

  // --- City filter ----------------------------------------------------------
  const requestedCities = resolveSearchCities({
    single: settings.searchCities ?? settings.jobspyLocation,
  });
  const cityFilteredJobs = filterJobsByRequestedCities({
    jobs: discoveredJobs,
    selectedCountry,
    requestedCities,
  });
  const cityFilteredOutCount = discoveredJobs.length - cityFilteredJobs.length;
  if (cityFilteredOutCount > 0) {
    logger.info("Dropped discovered jobs that did not match requested cities", {
      step: "discover-jobs",
      droppedCount: cityFilteredOutCount,
      requestedCities,
      selectedCountry,
    });
  }

  // --- Blocked-company filter -------------------------------------------------
  const blockedCompanyKeywords = resolveBlockedCompanyKeywordsFromStoredString(
    settings.blockedCompanyKeywords,
  );
  const blockedKeywordsLowerCase = blockedCompanyKeywords.map((value) =>
    value.toLowerCase(),
  );
  const afterCompanyFilter = cityFilteredJobs.filter(
    (job) => !isBlockedEmployer(job.employer, blockedKeywordsLowerCase),
  );
  const companyDroppedCount =
    cityFilteredJobs.length - afterCompanyFilter.length;
  if (companyDroppedCount > 0) {
    // Info level carries only a short preview; the full list goes to debug.
    const blockedCompanyKeywordsPreview = blockedCompanyKeywords.slice(0, 10);
    const blockedCompanyKeywordsTruncated =
      blockedCompanyKeywordsPreview.length < blockedCompanyKeywords.length;
    logger.info("Dropped discovered jobs matching blocked company keywords", {
      step: "discover-jobs",
      droppedCount: companyDroppedCount,
      blockedKeywordCount: blockedCompanyKeywords.length,
      blockedCompanyKeywordsPreview,
      blockedCompanyKeywordsTruncated,
    });
    logger.debug("Full blocked company keywords used for filtering", {
      step: "discover-jobs",
      blockedCompanyKeywords,
    });
  }

  // --- Blocked-country filter -------------------------------------------------
  const blockedCountryKeys = resolveBlockedCountriesFromStoredString(
    settings.blockedCountries,
  );
  // Location, description and title are all handed to the matcher.
  const filteredDiscoveredJobs = afterCompanyFilter.filter(
    (job) =>
      !jobMatchesBlockedCountries(
        {
          location: job.location,
          jobDescription: job.jobDescription,
          title: job.title,
        },
        blockedCountryKeys,
      ),
  );
  const countryDroppedCount =
    afterCompanyFilter.length - filteredDiscoveredJobs.length;
  if (countryDroppedCount > 0) {
    const blockedCountriesPreview = blockedCountryKeys.slice(0, 10);
    const blockedCountriesTruncated =
      blockedCountriesPreview.length < blockedCountryKeys.length;
    logger.info("Dropped discovered jobs in blocked countries", {
      step: "discover-jobs",
      droppedCount: countryDroppedCount,
      blockedCountryCount: blockedCountryKeys.length,
      blockedCountriesPreview,
      blockedCountriesTruncated,
    });
    logger.debug("Full blocked countries used for filtering", {
      step: "discover-jobs",
      blockedCountryKeys,
    });
  }
  if (args.shouldCancel?.()) {
    return { discoveredJobs: filteredDiscoveredJobs, sourceErrors };
  }

  // --- Search-profile filter --------------------------------------------------
  const strictProfileFilteringEnabled =
    searchProfileTargetRoles.length > 0 ||
    searchProfileMustHaveSkills.length > 0 ||
    searchProfileDealBreakers.length > 0;
  // When the profile has no target roles, the search terms stand in as the
  // role phrases.
  const profileFiltered = strictProfileFilteringEnabled
    ? filterJobsBySearchProfile({
        jobs: filteredDiscoveredJobs,
        targetRolePhrases: searchProfileTargetRoles.length
          ? searchProfileTargetRoles
          : searchTerms,
        mustHaveSkills: searchProfileMustHaveSkills,
        dealBreakers: searchProfileDealBreakers,
      })
    : { jobs: filteredDiscoveredJobs, dropped: 0 };
  if (profileFiltered.dropped > 0) {
    logger.info("Dropped discovered jobs that didn't match search profile", {
      step: "discover-jobs",
      droppedCount: profileFiltered.dropped,
      targetRolesCount: searchProfileTargetRoles.length,
      mustHaveSkillsCount: searchProfileMustHaveSkills.length,
      dealBreakersCount: searchProfileDealBreakers.length,
    });
  }

  // NOTE(review): this also fires when some sources succeeded but every job
  // was removed by the filters above, so "All sources failed" can be
  // inaccurate. Confirm whether that is intended.
  if (profileFiltered.jobs.length === 0 && sourceErrors.length > 0) {
    throw new Error(`All sources failed: ${sourceErrors.join("; ")}`);
  }
  if (sourceErrors.length > 0) {
    logger.warn("Some discovery sources failed", { sourceErrors });
  }
  progressHelpers.crawlingComplete(profileFiltered.jobs.length);

  // Stamp each surviving job with the owning profile.
  const stamped = profileFiltered.jobs.map((job) => ({
    ...job,
    ownerProfileId,
  }));
  return { discoveredJobs: stamped, sourceErrors };
}