Shaheer Sarfaraz 19266fe5eb
City search (#217)
* wave 1, jobspy only

* combine usa/ca to united states

* strict city location filter

* hide and show based on focus

* UI changes

* allow clicking cross!

* pill animate in

* animate out, uggo fix

* animate out

* framer motion

* animate component height

* adzuna

* hiring cafe implementation

* refactor: centralize shared search-city parsing and matching

* feat: migrate city setting to searchCities with legacy fallback

* docs: update pipeline and extractor city-search wording

* fix(orchestrator): normalize tokenized paste behavior

* fix(shared): tighten city matching semantics

* docs(extractors): document city-location knobs and geocoding note
2026-02-21 00:42:09 +00:00

719 lines
21 KiB
TypeScript

import { mkdir, writeFile } from "node:fs/promises";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import { launchOptions } from "camoufox-js";
import { parseSearchTerms } from "job-ops-shared/utils/search-terms";
import {
toNumberOrNull,
toStringOrNull,
} from "job-ops-shared/utils/type-conversion";
import { firefox, type Page } from "playwright";
import {
normalizeCountryKey,
resolveHiringCafeCountryLocation,
} from "./country-map.js";
import { createDefaultSearchState } from "./default-search-state.js";
const __dirname = dirname(fileURLToPath(import.meta.url));
const BASE_URL = "https://hiring.cafe";
const JOBOPS_PROGRESS_PREFIX = "JOBOPS_PROGRESS ";
const DEFAULT_MAX_JOBS_PER_TERM = 200;
const DEFAULT_SEARCH_TERM = "web developer";
const DEFAULT_DATE_FETCHED_PAST_N_DAYS = 30;
const DEFAULT_LOCATION_RADIUS_MILES = 1;
const PAGE_LIMIT = 50;
type RawHiringCafeJob = Record<string, unknown>;
interface ExtractedJob {
source: "hiringcafe";
sourceJobId?: string;
title: string;
employer: string;
jobUrl: string;
applicationLink: string;
location?: string;
salary?: string;
datePosted?: string;
jobDescription?: string;
jobType?: string;
}
interface BrowserApiResponse {
ok: boolean;
status: number;
statusText: string;
data: unknown;
responseText: string;
}
interface CityLocationContext {
id: string;
city: string;
regionLong: string;
regionShort: string;
countryLong: string;
countryShort: string;
lat: number;
lon: number;
formattedAddress: string;
population: number | null;
radiusMiles: number;
}
interface NominatimResult {
lat?: string;
lon?: string;
display_name?: string;
address?: Record<string, unknown>;
}
function emitProgress(payload: Record<string, unknown>): void {
if (process.env.JOBOPS_EMIT_PROGRESS !== "1") return;
console.log(`${JOBOPS_PROGRESS_PREFIX}${JSON.stringify(payload)}`);
}
function parsePositiveInt(input: string | undefined, fallback: number): number {
const parsed = input ? Number.parseInt(input, 10) : Number.NaN;
if (!Number.isFinite(parsed) || parsed < 1) return fallback;
return parsed;
}
function encodeSearchState(searchState: unknown): string {
const json = JSON.stringify(searchState);
const urlEncodedJson = encodeURIComponent(json);
return Buffer.from(urlEncodedJson, "utf-8").toString("base64");
}
function asRecord(value: unknown): Record<string, unknown> | null {
if (!value || typeof value !== "object" || Array.isArray(value)) return null;
return value as Record<string, unknown>;
}
function asStringArray(value: unknown): string[] {
if (!Array.isArray(value)) return [];
return value
.map((item) => toStringOrNull(item))
.filter((item): item is string => Boolean(item));
}
function firstArrayValue(value: unknown): string | null {
const values = asStringArray(value);
return values.length > 0 ? values[0] : null;
}
function formatCompensation(
processedJobData: Record<string, unknown> | null,
): string | undefined {
if (!processedJobData) return undefined;
const min = toNumberOrNull(processedJobData.yearly_min_compensation);
const max = toNumberOrNull(processedJobData.yearly_max_compensation);
if (min === null && max === null) return undefined;
const currency = toStringOrNull(
processedJobData.listed_compensation_currency,
);
const frequency =
toStringOrNull(processedJobData.listed_compensation_frequency) ?? "Yearly";
const amount = formatCompensationAmount(min, max);
const parts = [currency, amount, frequency ? `/ ${frequency}` : ""]
.filter(Boolean)
.join(" ")
.trim();
return parts || undefined;
}
function formatCompensationAmount(
min: number | null,
max: number | null,
): string {
if (min !== null && max !== null) {
return `${Math.round(min)}-${Math.round(max)}`;
}
if (min !== null) return `${Math.round(min)}+`;
return `${Math.round(max ?? 0)}`;
}
function mapHiringCafeJob(raw: RawHiringCafeJob): ExtractedJob | null {
const jobInformation = asRecord(raw.job_information);
const processed = asRecord(raw.v5_processed_job_data);
const companyInfo = asRecord(jobInformation?.company_info);
const sourceJobId =
toStringOrNull(raw.id) ??
toStringOrNull(raw.objectID) ??
toStringOrNull(raw.original_source_id) ??
toStringOrNull(raw.requisition_id) ??
undefined;
const jobUrl = toStringOrNull(raw.apply_url);
if (!jobUrl) return null;
const title =
toStringOrNull(jobInformation?.title) ??
toStringOrNull(jobInformation?.job_title_raw) ??
toStringOrNull(processed?.core_job_title) ??
"Unknown Title";
const employer =
toStringOrNull(companyInfo?.name) ??
toStringOrNull(processed?.company_name) ??
"Unknown Employer";
const location =
toStringOrNull(processed?.formatted_workplace_location) ??
firstArrayValue(processed?.workplace_cities) ??
firstArrayValue(processed?.workplace_states) ??
firstArrayValue(processed?.workplace_countries) ??
undefined;
const commitments = asStringArray(processed?.commitment);
const jobType = commitments.length > 0 ? commitments.join(", ") : undefined;
return {
source: "hiringcafe",
sourceJobId,
title,
employer,
jobUrl,
applicationLink: jobUrl,
location,
salary: formatCompensation(processed),
datePosted: toStringOrNull(processed?.estimated_publish_date) ?? undefined,
jobDescription: toStringOrNull(jobInformation?.description) ?? undefined,
jobType,
};
}
function extractResultsBatch(payload: unknown): RawHiringCafeJob[] {
if (Array.isArray(payload)) {
return payload.filter(
(item): item is RawHiringCafeJob =>
Boolean(item) && typeof item === "object" && !Array.isArray(item),
);
}
const payloadRecord = asRecord(payload);
const results = payloadRecord?.results;
if (!Array.isArray(results)) return [];
return results.filter(
(item): item is RawHiringCafeJob =>
Boolean(item) && typeof item === "object" && !Array.isArray(item),
);
}
function parseTotalCount(payload: unknown): number | null {
const payloadRecord = asRecord(payload);
if (!payloadRecord) return null;
return toNumberOrNull(payloadRecord.total);
}
function buildCityLocationId(input: string): string {
const normalized = input.trim().toLowerCase().replace(/\s+/g, "_");
return `city_${normalized}`.slice(0, 32);
}
function toRegionShortName(value: string): string {
const compact = value
.replace(/[^a-zA-Z\s]/g, " ")
.trim()
.split(/\s+/)
.filter(Boolean);
if (compact.length === 0) return "REG";
if (compact.length === 1) {
return compact[0].slice(0, 3).toUpperCase();
}
return compact
.slice(0, 3)
.map((part) => part[0]?.toUpperCase() ?? "")
.join("");
}
async function resolveCityLocationContext(args: {
city: string;
countryLong: string;
countryShort: string;
radiusMiles: number;
}): Promise<CityLocationContext | null> {
const query = `${args.city}, ${args.countryLong}`;
const url = new URL("https://nominatim.openstreetmap.org/search");
url.searchParams.set("q", query);
url.searchParams.set("format", "jsonv2");
url.searchParams.set("addressdetails", "1");
url.searchParams.set("limit", "1");
try {
const response = await fetch(url, {
headers: {
Accept: "application/json",
"User-Agent": "job-ops-hiringcafe-extractor/1.0",
},
signal: AbortSignal.timeout(8_000),
});
if (!response.ok) {
throw new Error(`geocode failed (${response.status})`);
}
const payload = (await response.json()) as unknown;
if (!Array.isArray(payload) || payload.length === 0) {
throw new Error("geocode returned no results");
}
const first = payload[0] as NominatimResult;
const lat = Number(first.lat ?? "");
const lon = Number(first.lon ?? "");
if (!Number.isFinite(lat) || !Number.isFinite(lon)) {
throw new Error("invalid geocode coordinates");
}
const address = asRecord(first.address);
const regionLong =
toStringOrNull(address?.state) ??
toStringOrNull(address?.county) ??
toStringOrNull(address?.region) ??
args.countryLong;
const displayName =
toStringOrNull(first.display_name) ??
`${args.city}, ${regionLong}, ${args.countryShort}`;
return {
id: buildCityLocationId(args.city),
city: args.city,
regionLong,
regionShort: toRegionShortName(regionLong),
countryLong: args.countryLong,
countryShort: args.countryShort,
lat,
lon,
formattedAddress: displayName,
population: null,
radiusMiles: args.radiusMiles,
};
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
console.warn(`City geocode failed for '${query}': ${message}`);
return null;
}
}
function createCitySearchState(args: {
searchQuery: string;
dateFetchedPastNDays: number;
context: CityLocationContext;
}): Record<string, unknown> {
return {
locations: [
{
id: args.context.id,
types: ["locality"],
address_components: [
{
long_name: args.context.city,
short_name: args.context.city,
types: ["locality"],
},
{
long_name: args.context.regionLong,
short_name: args.context.regionShort,
types: ["administrative_area_level_1"],
},
{
long_name: args.context.countryLong,
short_name: args.context.countryShort,
types: ["country"],
},
],
geometry: {
location: {
lat: args.context.lat,
lon: args.context.lon,
},
},
formatted_address: args.context.formattedAddress,
population: args.context.population,
workplace_types: [],
options: {
radius: args.context.radiusMiles,
radius_unit: "miles",
ignore_radius: false,
},
},
],
workplaceTypes: ["Remote", "Hybrid", "Onsite"],
defaultToUserLocation: true,
userLocation: null,
physicalEnvironments: [
"Office",
"Outdoor",
"Vehicle",
"Industrial",
"Customer-Facing",
],
physicalLaborIntensity: ["Low", "Medium", "High"],
physicalPositions: ["Sitting", "Standing"],
oralCommunicationLevels: ["Low", "Medium", "High"],
computerUsageLevels: ["Low", "Medium", "High"],
cognitiveDemandLevels: ["Low", "Medium", "High"],
currency: { label: "Any", value: null },
frequency: { label: "Any", value: null },
minCompensationLowEnd: null,
minCompensationHighEnd: null,
maxCompensationLowEnd: null,
maxCompensationHighEnd: null,
restrictJobsToTransparentSalaries: false,
calcFrequency: "Yearly",
commitmentTypes: [
"Full Time",
"Part Time",
"Contract",
"Internship",
"Temporary",
"Seasonal",
"Volunteer",
],
jobTitleQuery: "",
jobDescriptionQuery: "",
associatesDegreeFieldsOfStudy: [],
excludedAssociatesDegreeFieldsOfStudy: [],
bachelorsDegreeFieldsOfStudy: [],
excludedBachelorsDegreeFieldsOfStudy: [],
mastersDegreeFieldsOfStudy: [],
excludedMastersDegreeFieldsOfStudy: [],
doctorateDegreeFieldsOfStudy: [],
excludedDoctorateDegreeFieldsOfStudy: [],
associatesDegreeRequirements: [],
bachelorsDegreeRequirements: [],
mastersDegreeRequirements: [],
doctorateDegreeRequirements: [],
licensesAndCertifications: [],
excludedLicensesAndCertifications: [],
excludeAllLicensesAndCertifications: false,
seniorityLevel: [
"No Prior Experience Required",
"Entry Level",
"Mid Level",
"Senior Level",
],
roleTypes: ["Individual Contributor", "People Manager"],
roleYoeRange: [0, 20],
excludeIfRoleYoeIsNotSpecified: false,
managementYoeRange: [0, 20],
excludeIfManagementYoeIsNotSpecified: false,
securityClearances: [
"None",
"Confidential",
"Secret",
"Top Secret",
"Top Secret/SCI",
"Public Trust",
"Interim Clearances",
"Other",
],
languageRequirements: [],
excludedLanguageRequirements: [],
languageRequirementsOperator: "OR",
excludeJobsWithAdditionalLanguageRequirements: false,
airTravelRequirement: ["None", "Minimal", "Moderate", "Extensive"],
landTravelRequirement: ["None", "Minimal", "Moderate", "Extensive"],
morningShiftWork: [],
eveningShiftWork: [],
overnightShiftWork: [],
weekendAvailabilityRequired: "Doesn't Matter",
holidayAvailabilityRequired: "Doesn't Matter",
overtimeRequired: "Doesn't Matter",
onCallRequirements: [
"None",
"Occasional (once a month or less)",
"Regular (once a week or more)",
],
benefitsAndPerks: [],
applicationFormEase: [],
companyNames: [],
excludedCompanyNames: [],
companyHqCountries: [],
excludedCompanyHqCountries: [],
usaGovPref: null,
industries: [],
excludedIndustries: [],
companyKeywords: [],
companyKeywordsBooleanOperator: "OR",
excludedCompanyKeywords: [],
hideJobTypes: [],
encouragedToApply: [],
searchQuery: args.searchQuery,
dateFetchedPastNDays: args.dateFetchedPastNDays,
hiddenCompanies: [],
user: null,
searchModeSelectedCompany: null,
departments: [],
restrictedSearchAttributes: [],
sortBy: "default",
technologyKeywordsQuery: "",
requirementsKeywordsQuery: "",
companyPublicOrPrivate: "all",
latestInvestmentYearRange: [null, null],
latestInvestmentSeries: [],
latestInvestmentAmount: null,
latestInvestmentCurrency: [],
investors: [],
excludedInvestors: [],
isNonProfit: "all",
organizationTypes: [],
excludedOrganizationTypes: [],
companySizeRanges: [],
minYearFounded: null,
maxYearFounded: null,
excludedLatestInvestmentSeries: [],
};
}
async function callHiringCafeApi(
page: Page,
endpoint: string,
params: Record<string, string>,
): Promise<unknown> {
const response = await page.evaluate(
async ({ endpointArg, paramsArg }) => {
const url = new URL(endpointArg, window.location.origin);
for (const [key, value] of Object.entries(paramsArg)) {
url.searchParams.set(key, value);
}
const res = await fetch(url.toString(), {
method: "GET",
credentials: "include",
headers: {
Accept: "application/json, text/plain, */*",
},
});
const text = await res.text();
let data: unknown = null;
try {
data = JSON.parse(text);
} catch {
// Keep response text for diagnostics.
}
const output: BrowserApiResponse = {
ok: res.ok,
status: res.status,
statusText: res.statusText,
data,
responseText: text,
};
return output;
},
{ endpointArg: endpoint, paramsArg: params },
);
const result = response as BrowserApiResponse;
if (!result.ok) {
const snippet = result.responseText.slice(0, 250);
throw new Error(
`Hiring Cafe API ${endpoint} failed (${result.status} ${result.statusText}): ${snippet}`,
);
}
if (result.data === null) {
const snippet = result.responseText.slice(0, 250);
throw new Error(
`Hiring Cafe API ${endpoint} returned non-JSON response: ${snippet}`,
);
}
return result.data;
}
async function run(): Promise<void> {
const searchTerms = parseSearchTerms(
process.env.HIRING_CAFE_SEARCH_TERMS,
DEFAULT_SEARCH_TERM,
);
const country = normalizeCountryKey(
process.env.HIRING_CAFE_COUNTRY ?? "united kingdom",
);
const maxJobsPerTerm = parsePositiveInt(
process.env.HIRING_CAFE_MAX_JOBS_PER_TERM,
DEFAULT_MAX_JOBS_PER_TERM,
);
const dateFetchedPastNDays = parsePositiveInt(
process.env.HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS,
DEFAULT_DATE_FETCHED_PAST_N_DAYS,
);
const locationQuery = process.env.HIRING_CAFE_LOCATION_QUERY?.trim() ?? "";
const locationRadiusMiles = parsePositiveInt(
process.env.HIRING_CAFE_LOCATION_RADIUS_MILES,
DEFAULT_LOCATION_RADIUS_MILES,
);
const outputPath =
process.env.HIRING_CAFE_OUTPUT_JSON ||
join(__dirname, "../storage/datasets/default/jobs.json");
const headless = process.env.HIRING_CAFE_HEADLESS !== "false";
let browser = await firefox.launch(
await launchOptions({
headless,
humanize: true,
geoip: true,
}),
);
let context = await browser.newContext();
let page = await context.newPage();
const allJobs: ExtractedJob[] = [];
const seen = new Set<string>();
try {
const initializePage = async () => {
await page.goto(BASE_URL, {
waitUntil: "domcontentloaded",
timeout: 60_000,
});
await page.waitForTimeout(2_000);
};
try {
await initializePage();
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
console.warn(
`Camoufox browser startup was unstable, retrying with vanilla Firefox: ${message}`,
);
await browser.close();
browser = await firefox.launch({ headless });
context = await browser.newContext();
page = await context.newPage();
await initializePage();
}
const countryLocation = resolveHiringCafeCountryLocation(country);
const countryLong =
countryLocation?.address_components[0]?.long_name ?? "United Kingdom";
const countryShort =
countryLocation?.address_components[0]?.short_name ?? "GB";
const cityLocationContext = locationQuery
? await resolveCityLocationContext({
city: locationQuery,
countryLong,
countryShort,
radiusMiles: locationRadiusMiles,
})
: null;
for (let i = 0; i < searchTerms.length; i += 1) {
const searchTerm = searchTerms[i];
const termIndex = i + 1;
emitProgress({
event: "term_start",
termIndex,
termTotal: searchTerms.length,
searchTerm,
});
const searchState = cityLocationContext
? createCitySearchState({
searchQuery: searchTerm,
dateFetchedPastNDays,
context: cityLocationContext,
})
: createDefaultSearchState({
searchQuery: searchTerm,
location: countryLocation,
dateFetchedPastNDays,
});
const encodedSearchState = encodeSearchState(searchState);
let totalAvailable: number | null = null;
try {
const countPayload = await callHiringCafeApi(
page,
"/api/search-jobs/get-total-count",
{
s: encodedSearchState,
},
);
totalAvailable = parseTotalCount(countPayload);
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
console.warn(
`Hiring Cafe count request failed for term '${searchTerm}': ${message}`,
);
}
const termTarget =
totalAvailable !== null
? Math.min(maxJobsPerTerm, totalAvailable)
: maxJobsPerTerm;
let pageNo = 0;
let termCollected = 0;
while (termCollected < termTarget && pageNo < PAGE_LIMIT) {
const size = Math.min(1000, termTarget - termCollected);
const jobsPayload = await callHiringCafeApi(page, "/api/search-jobs", {
size: String(size),
page: String(pageNo),
s: encodedSearchState,
});
const batch = extractResultsBatch(jobsPayload);
if (batch.length === 0) break;
let mappedOnPage = 0;
for (const rawJob of batch) {
if (termCollected >= termTarget) break;
const mapped = mapHiringCafeJob(rawJob);
if (!mapped) continue;
const dedupeKey = mapped.sourceJobId || mapped.jobUrl;
if (seen.has(dedupeKey)) continue;
seen.add(dedupeKey);
allJobs.push(mapped);
termCollected += 1;
mappedOnPage += 1;
}
emitProgress({
event: "page_fetched",
termIndex,
termTotal: searchTerms.length,
searchTerm,
pageNo,
resultsOnPage: mappedOnPage,
totalCollected: termCollected,
});
if (batch.length < size) break;
pageNo += 1;
}
emitProgress({
event: "term_complete",
termIndex,
termTotal: searchTerms.length,
searchTerm,
jobsFoundTerm: termCollected,
});
}
} finally {
await browser.close();
}
await mkdir(dirname(outputPath), { recursive: true });
await writeFile(outputPath, `${JSON.stringify(allJobs, null, 2)}\n`, "utf-8");
console.log(`Hiring Cafe extractor wrote ${allJobs.length} jobs`);
}
run().catch((error: unknown) => {
const message = error instanceof Error ? error.message : "Unknown error";
console.error(`Hiring Cafe extractor failed: ${message}`);
process.exitCode = 1;
});