Hiring cafe extractor (#192)

* feat(hiringcafe): register new source across shared/server/client enums

* feat(hiringcafe-extractor): add browser-backed Hiring Cafe dataset extractor

* feat(orchestrator): integrate Hiring Cafe discovery service into pipeline

* feat(orchestrator-ui): add Hiring Cafe to source availability and run estimates

* chore(hiringcafe): wire CI/docker and add extractor documentation

* chore(format): apply biome formatting for Hiring Cafe integration

* add original websites

* comments

* number or null
This commit is contained in:
Shaheer Sarfaraz 2026-02-19 12:51:55 +00:00 committed by GitHub
parent 16dd17ebea
commit d34a9f041b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
31 changed files with 1363 additions and 5 deletions

View File

@ -52,6 +52,7 @@ jobs:
project: project:
- orchestrator - orchestrator
- adzuna-extractor - adzuna-extractor
- hiringcafe-extractor
- gradcracker-extractor - gradcracker-extractor
- ukvisajobs-extractor - ukvisajobs-extractor
steps: steps:

View File

@ -36,6 +36,7 @@ COPY docs-site/package*.json ./docs-site/
COPY shared/package*.json ./shared/ COPY shared/package*.json ./shared/
COPY orchestrator/package*.json ./orchestrator/ COPY orchestrator/package*.json ./orchestrator/
COPY extractors/adzuna/package*.json ./extractors/adzuna/ COPY extractors/adzuna/package*.json ./extractors/adzuna/
COPY extractors/hiringcafe/package*.json ./extractors/hiringcafe/
COPY extractors/gradcracker/package*.json ./extractors/gradcracker/ COPY extractors/gradcracker/package*.json ./extractors/gradcracker/
COPY extractors/ukvisajobs/package*.json ./extractors/ukvisajobs/ COPY extractors/ukvisajobs/package*.json ./extractors/ukvisajobs/
@ -54,6 +55,7 @@ COPY shared ./shared
COPY docs-site ./docs-site COPY docs-site ./docs-site
COPY orchestrator ./orchestrator COPY orchestrator ./orchestrator
COPY extractors/adzuna ./extractors/adzuna COPY extractors/adzuna ./extractors/adzuna
COPY extractors/hiringcafe ./extractors/hiringcafe
COPY extractors/gradcracker ./extractors/gradcracker COPY extractors/gradcracker ./extractors/gradcracker
COPY extractors/jobspy ./extractors/jobspy COPY extractors/jobspy ./extractors/jobspy
COPY extractors/ukvisajobs ./extractors/ukvisajobs COPY extractors/ukvisajobs ./extractors/ukvisajobs
@ -100,6 +102,7 @@ COPY docs-site/package*.json ./docs-site/
COPY shared/package*.json ./shared/ COPY shared/package*.json ./shared/
COPY orchestrator/package*.json ./orchestrator/ COPY orchestrator/package*.json ./orchestrator/
COPY extractors/adzuna/package*.json ./extractors/adzuna/ COPY extractors/adzuna/package*.json ./extractors/adzuna/
COPY extractors/hiringcafe/package*.json ./extractors/hiringcafe/
COPY extractors/gradcracker/package*.json ./extractors/gradcracker/ COPY extractors/gradcracker/package*.json ./extractors/gradcracker/
COPY extractors/ukvisajobs/package*.json ./extractors/ukvisajobs/ COPY extractors/ukvisajobs/package*.json ./extractors/ukvisajobs/
@ -114,6 +117,7 @@ COPY --from=builder /app/docs-site/build ./orchestrator/dist/docs
COPY shared ./shared COPY shared ./shared
COPY orchestrator ./orchestrator COPY orchestrator ./orchestrator
COPY extractors/adzuna ./extractors/adzuna COPY extractors/adzuna ./extractors/adzuna
COPY extractors/hiringcafe ./extractors/hiringcafe
COPY extractors/gradcracker ./extractors/gradcracker COPY extractors/gradcracker ./extractors/gradcracker
COPY extractors/jobspy ./extractors/jobspy COPY extractors/jobspy ./extractors/jobspy
COPY extractors/ukvisajobs ./extractors/ukvisajobs COPY extractors/ukvisajobs ./extractors/ukvisajobs

View File

@ -47,6 +47,9 @@ services:
- path: ./extractors/gradcracker/src - path: ./extractors/gradcracker/src
target: /app/extractors/gradcracker/src target: /app/extractors/gradcracker/src
action: sync+restart action: sync+restart
- path: ./extractors/hiringcafe/src
target: /app/extractors/hiringcafe/src
action: sync+restart
- path: ./extractors/ukvisajobs/src - path: ./extractors/ukvisajobs/src
target: /app/extractors/ukvisajobs/src target: /app/extractors/ukvisajobs/src
action: sync+restart action: sync+restart

View File

@ -7,6 +7,8 @@ sidebar_position: 6
## What it is ## What it is
Original website: [adzuna.com](https://www.adzuna.com)
Adzuna is an API-backed extractor implemented in two lean pieces: Adzuna is an API-backed extractor implemented in two lean pieces:
1. `extractors/adzuna/src/main.ts` fetches paginated Adzuna search results and writes `jobs.json`. 1. `extractors/adzuna/src/main.ts` fetches paginated Adzuna search results and writes `jobs.json`.

View File

@ -7,6 +7,8 @@ sidebar_position: 2
A plain-English walkthrough of the Gradcracker extractor in `extractors/gradcracker`. A plain-English walkthrough of the Gradcracker extractor in `extractors/gradcracker`.
Original website: [gradcracker.com](https://www.gradcracker.com)
## Big picture ## Big picture
The crawler builds search URLs, scrapes listing pages, then opens job details for descriptions and apply URLs. The crawler builds search URLs, scrapes listing pages, then opens job details for descriptions and apply URLs.

View File

@ -0,0 +1,74 @@
---
id: hiring-cafe
title: Hiring Cafe Extractor
description: Browser-backed Hiring Cafe extraction integrated into the pipeline source selector.
sidebar_position: 7
---
## What it is
Original website: [hiring.cafe](https://hiring.cafe)
Special thanks: Initial implementation inspiration came from [umur957/hiring-cafe-job-scraper](https://github.com/umur957/hiring-cafe-job-scraper).
Hiring Cafe is a browser-backed extractor that queries Hiring Cafe search APIs and maps results into the orchestrator `CreateJobInput` shape.
Implementation split:
1. `extractors/hiringcafe/src/main.ts` builds search state, calls Hiring Cafe APIs, and writes dataset JSON.
2. `orchestrator/src/server/services/hiring-cafe.ts` runs the extractor, streams progress events, and maps rows for pipeline import.
## Why it exists
Hiring Cafe adds another non-credentialed source that can be enabled from the existing source picker, without adding new settings UI.
It also supports term-by-term search and country-aware search state using the same pipeline knobs you already set for automatic runs.
## How to use it
1. Open **Run jobs** and choose **Automatic**.
2. **Hiring Cafe** is enabled by default in **Sources** (toggle it off if you do not want it for this run).
3. Set your existing automatic run knobs:
- `searchTerms` drive per-term Hiring Cafe `searchQuery`.
- selected country maps into Hiring Cafe location search state.
- the run budget setting (`jobspyResultsWanted`) is reused as the maximum jobs-per-term cap.
4. Start the run and watch progress in the pipeline progress card.
Defaults and constraints:
- No new Hiring Cafe settings fields were added.
- `worldwide` and `usa/ca` run in broad mode without a strict country location filter.
- Hiring Cafe is enabled by default in source selection.
- `HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS` controls the recency window when running the extractor directly (default `30`).
Local run example:
```bash
HIRING_CAFE_SEARCH_TERMS='["backend engineer"]' \
HIRING_CAFE_COUNTRY='united kingdom' \
HIRING_CAFE_MAX_JOBS_PER_TERM='50' \
npm --workspace hiringcafe-extractor run start
```
## Common problems
### Hiring Cafe returns 429 / Vercel security checkpoint
- The extractor first attempts Camoufox-backed Firefox and falls back to vanilla Firefox startup if Camoufox is unstable locally.
- If upstream blocks continue, retry later or reduce run concurrency at the pipeline level by selecting fewer sources.
### Hiring Cafe does not appear in sources
- Check that the client is running the latest build, which contains the new source list.
- Hiring Cafe is source-only and does not require credentials, so it should appear once the new build is loaded.
### Results are lower than expected
- The cap is tied to the automatic run budget setting (`jobspyResultsWanted`) and the number of search terms.
- Country mapping can narrow results when a strict country location is applied.
## Related pages
- [Extractors Overview](/docs/next/extractors/overview)
- [Pipeline Run](/docs/next/features/pipeline-run)
- [Settings](/docs/next/features/settings)

View File

@ -7,6 +7,11 @@ sidebar_position: 3
A walkthrough of the JobSpy extractor for Indeed, LinkedIn, and Glassdoor. A walkthrough of the JobSpy extractor for Indeed, LinkedIn, and Glassdoor.
Original websites:
- [indeed.com](https://www.indeed.com)
- [linkedin.com/jobs](https://www.linkedin.com/jobs)
- [glassdoor.com](https://www.glassdoor.com)
## Big picture ## Big picture
JobSpy runs as a Python script per search term, writes JSON, then orchestrator ingests and normalizes into internal job shape. JobSpy runs as a Python script per search term, writes JSON, then orchestrator ingests and normalizes into internal job shape.

View File

@ -14,6 +14,7 @@ This page helps you choose the right extractor for your run, understand key cons
| [Gradcracker](/docs/next/extractors/gradcracker) | UK graduate roles from Gradcracker | Crawling stability depends on page structure and anti-bot behavior; tuned for low concurrency | `GRADCRACKER_SEARCH_TERMS`, `GRADCRACKER_MAX_JOBS_PER_TERM`, `JOBOPS_SKIP_APPLY_FOR_EXISTING` | Scrapes listing metadata, then detail pages and apply URL resolution | | [Gradcracker](/docs/next/extractors/gradcracker) | UK graduate roles from Gradcracker | Crawling stability depends on page structure and anti-bot behavior; tuned for low concurrency | `GRADCRACKER_SEARCH_TERMS`, `GRADCRACKER_MAX_JOBS_PER_TERM`, `JOBOPS_SKIP_APPLY_FOR_EXISTING` | Scrapes listing metadata, then detail pages and apply URL resolution |
| [JobSpy](/docs/next/extractors/jobspy) | Multi-source discovery (Indeed, LinkedIn, Glassdoor) | Requires Python wrapper execution per term; source availability and quality vary by site/location | `JOBSPY_SITES`, `JOBSPY_SEARCH_TERMS`, `JOBSPY_RESULTS_WANTED`, `JOBSPY_HOURS_OLD`, `JOBSPY_LINKEDIN_FETCH_DESCRIPTION` | Produces JSON per term, then orchestrator normalizes and de-duplicates by `jobUrl` | | [JobSpy](/docs/next/extractors/jobspy) | Multi-source discovery (Indeed, LinkedIn, Glassdoor) | Requires Python wrapper execution per term; source availability and quality vary by site/location | `JOBSPY_SITES`, `JOBSPY_SEARCH_TERMS`, `JOBSPY_RESULTS_WANTED`, `JOBSPY_HOURS_OLD`, `JOBSPY_LINKEDIN_FETCH_DESCRIPTION` | Produces JSON per term, then orchestrator normalizes and de-duplicates by `jobUrl` |
| [Adzuna](/docs/next/extractors/adzuna) | API-based multi-country discovery with low scraping overhead | Requires valid App ID/App Key; country must be in Adzuna-supported list | `ADZUNA_APP_ID`, `ADZUNA_APP_KEY`, `ADZUNA_MAX_JOBS_PER_TERM` | API pagination to dataset output; orchestrator maps progress and de-duplicates by `sourceJobId`/`jobUrl` | | [Adzuna](/docs/next/extractors/adzuna) | API-based multi-country discovery with low scraping overhead | Requires valid App ID/App Key; country must be in Adzuna-supported list | `ADZUNA_APP_ID`, `ADZUNA_APP_KEY`, `ADZUNA_MAX_JOBS_PER_TERM` | API pagination to dataset output; orchestrator maps progress and de-duplicates by `sourceJobId`/`jobUrl` |
| [Hiring Cafe](/docs/next/extractors/hiring-cafe) | Browser-backed discovery using Hiring Cafe search APIs | Subject to upstream anti-bot checks; uses browser context and encoded search-state payloads | `HIRING_CAFE_SEARCH_TERMS`, `HIRING_CAFE_COUNTRY`, `HIRING_CAFE_MAX_JOBS_PER_TERM`, `HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS` | Uses existing pipeline term/country/budget knobs and maps directly to normalized jobs |
| [UKVisaJobs](/docs/next/extractors/ukvisajobs) | UK visa sponsorship-focused roles | Requires authenticated session and periodic token/cookie refresh | `UKVISAJOBS_EMAIL`, `UKVISAJOBS_PASSWORD`, `UKVISAJOBS_MAX_JOBS`, `UKVISAJOBS_SEARCH_KEYWORD` | API pagination + dataset output; orchestrator de-dupes and may fetch missing descriptions | | [UKVisaJobs](/docs/next/extractors/ukvisajobs) | UK visa sponsorship-focused roles | Requires authenticated session and periodic token/cookie refresh | `UKVISAJOBS_EMAIL`, `UKVISAJOBS_PASSWORD`, `UKVISAJOBS_MAX_JOBS`, `UKVISAJOBS_SEARCH_KEYWORD` | API pagination + dataset output; orchestrator de-dupes and may fetch missing descriptions |
| [Manual Import](/docs/next/extractors/manual) | One-off jobs not covered by scrapers | Inference quality depends on model/provider and input quality; some URLs cannot be fetched reliably | App/API endpoints (`/api/manual-jobs/infer`, `/api/manual-jobs/import`) | Accepts text/HTML/URL, runs inference, then saves and scores job after review | | [Manual Import](/docs/next/extractors/manual) | One-off jobs not covered by scrapers | Inference quality depends on model/provider and input quality; some URLs cannot be fetched reliably | App/API endpoints (`/api/manual-jobs/infer`, `/api/manual-jobs/import`) | Accepts text/HTML/URL, runs inference, then saves and scores job after review |
@ -21,6 +22,7 @@ This page helps you choose the right extractor for your run, understand key cons
- Use **JobSpy** for broad first-pass sourcing across common boards. - Use **JobSpy** for broad first-pass sourcing across common boards.
- Use **Adzuna** when you want API-first discovery in supported non-UK markets. - Use **Adzuna** when you want API-first discovery in supported non-UK markets.
- Use **Hiring Cafe** when you want another term/country-driven source without adding credentials.
- Use **Gradcracker** when targeting graduate pipelines in the UK. - Use **Gradcracker** when targeting graduate pipelines in the UK.
- Use **UKVisaJobs** for sponsorship-specific UK searches. - Use **UKVisaJobs** for sponsorship-specific UK searches.
- Use **Manual Import** when you already have a specific posting and need direct import. - Use **Manual Import** when you already have a specific posting and need direct import.
@ -32,5 +34,6 @@ Many runs combine sources: broad discovery first, then manual import for high-pr
- [Gradcracker](/docs/next/extractors/gradcracker) - [Gradcracker](/docs/next/extractors/gradcracker)
- [JobSpy](/docs/next/extractors/jobspy) - [JobSpy](/docs/next/extractors/jobspy)
- [Adzuna](/docs/next/extractors/adzuna) - [Adzuna](/docs/next/extractors/adzuna)
- [Hiring Cafe](/docs/next/extractors/hiring-cafe)
- [UKVisaJobs](/docs/next/extractors/ukvisajobs) - [UKVisaJobs](/docs/next/extractors/ukvisajobs)
- [Manual Import](/docs/next/extractors/manual) - [Manual Import](/docs/next/extractors/manual)

View File

@ -7,6 +7,8 @@ sidebar_position: 5
UKVisaJobs is the most complex extractor because authenticated sessions are required. UKVisaJobs is the most complex extractor because authenticated sessions are required.
Original website: [my.ukvisajobs.com](https://my.ukvisajobs.com)
## Big picture ## Big picture
Two layers: Two layers:

View File

@ -45,6 +45,7 @@ const sidebars: SidebarsConfig = {
"extractors/gradcracker", "extractors/gradcracker",
"extractors/jobspy", "extractors/jobspy",
"extractors/adzuna", "extractors/adzuna",
"extractors/hiring-cafe",
"extractors/manual", "extractors/manual",
"extractors/ukvisajobs", "extractors/ukvisajobs",
], ],

View File

@ -0,0 +1,20 @@
# Hiring Cafe Extractor
Browser-backed extractor for Hiring Cafe search APIs.
Special thanks: initial implementation inspiration came from [umur957/hiring-cafe-job-scraper](https://github.com/umur957/hiring-cafe-job-scraper).
## Environment
- `HIRING_CAFE_SEARCH_TERMS` (JSON array or `|` / comma / newline-delimited)
- `HIRING_CAFE_COUNTRY` (default: `united kingdom`)
- `HIRING_CAFE_MAX_JOBS_PER_TERM` (default: `200`)
- `HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS` (default: `30`)
- `HIRING_CAFE_OUTPUT_JSON` (default: `storage/datasets/default/jobs.json`)
- `JOBOPS_EMIT_PROGRESS=1` to emit `JOBOPS_PROGRESS` events
- `HIRING_CAFE_HEADLESS=false` to run headed
## Notes
- The extractor uses `s = base64(url-encoded JSON search state)`.
- `worldwide` and `usa/ca` are treated as broad search modes without hard country location filters.

View File

@ -0,0 +1,26 @@
{
"name": "hiringcafe-extractor",
"version": "0.0.1",
"type": "module",
"description": "Hiring Cafe extractor - fetches jobs via browser-backed API requests",
"main": "src/main.ts",
"dependencies": {
"camoufox-js": "^0.8.0",
"job-ops-shared": "^1.0.0",
"playwright": "^1.57.0",
"tsx": "^4.4.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"optionalDependencies": {
"impit-linux-x64-gnu": "^0.1.0"
},
"scripts": {
"start": "tsx src/main.ts",
"start:dev": "tsx src/main.ts",
"check:types": "tsc --noEmit",
"get-binaries": "camoufox-js fetch"
}
}

View File

@ -0,0 +1,118 @@
/**
 * Converts a free-form country string into the lowercase key used by the
 * lookup tables in this module.
 *
 * The input is trimmed and lower-cased; a handful of common aliases are then
 * folded onto a single canonical spelling. `null`, `undefined` and blank
 * input all yield the empty string.
 *
 * @param value - Raw country text, possibly missing.
 * @returns The canonical lowercase key, or `""` when there is no input.
 */
export function normalizeCountryKey(value: string | null | undefined): string {
  const key = (value ?? "").trim().toLowerCase();
  switch (key) {
    case "uk":
      return "united kingdom";
    case "us":
    case "usa":
      return "united states";
    case "türkiye":
      return "turkey";
    case "czech republic":
      return "czechia";
    default:
      return key;
  }
}
/**
 * Country-level location entry as produced by
 * `resolveHiringCafeCountryLocation`.
 *
 * Apart from the names, every field is a fixed literal: the entry is always
 * typed as a `"country"`, always carries the id `"user_country"`, and always
 * lists the same two flexible-region options.
 */
export interface HiringCafeCountryLocation {
// Display name of the country (same value as `long_name` below).
formatted_address: string;
types: ["country"];
id: "user_country";
address_components: Array<{
// Human-readable country name.
long_name: string;
// Two-letter region code.
short_name: string;
types: ["country"];
}>;
options: {
flexible_regions: ["anywhere_in_continent", "anywhere_in_world"];
};
}
// Normalized country keys for which no country location filter is built
// (see shouldUseGlobalLocation / resolveHiringCafeCountryLocation).
const GLOBAL_SEARCH_KEYS = new Set(["worldwide", "usa/ca"]);
// Display labels keyed by normalized country key. toCountryLabel consults
// this table first and only falls back to generic capitalisation on a miss.
const COUNTRY_NAME_OVERRIDES: Record<string, string> = {
"united states": "United States",
"united kingdom": "United Kingdom",
"united arab emirates": "United Arab Emirates",
"new zealand": "New Zealand",
"south korea": "South Korea",
"south africa": "South Africa",
"costa rica": "Costa Rica",
"saudi arabia": "Saudi Arabia",
"hong kong": "Hong Kong",
czechia: "Czechia",
türkiye: "Turkey",
turkey: "Turkey",
};
// Two-letter region codes keyed by normalized country key. toIso2 consults
// this table before the Intl-derived regionNameMap.
const ISO2_ALIASES: Record<string, string> = {
"united states": "US",
"united kingdom": "GB",
"united arab emirates": "AE",
"new zealand": "NZ",
"south korea": "KR",
"south africa": "ZA",
"costa rica": "CR",
"saudi arabia": "SA",
"hong kong": "HK",
czechia: "CZ",
türkiye: "TR",
turkey: "TR",
};
// Built once at module load: normalized English region name -> two-letter code.
const regionNameMap = buildRegionNameMap();

/**
 * Enumerates every two-letter uppercase code from "AA" to "ZZ", asks
 * `Intl.DisplayNames` for its English region name, and records the
 * normalized name -> code pairing for each code that has a name.
 *
 * Codes for which `Intl.DisplayNames` returns nothing, or just echoes the
 * code back, are skipped. If two codes share a normalized name, the later
 * code wins.
 *
 * @returns Map from normalized region name to two-letter code.
 */
function buildRegionNameMap(): Map<string, string> {
  const regionNames = new Intl.DisplayNames(["en"], { type: "region" });
  const lookup = new Map<string, string>();
  const letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  for (const first of letters) {
    for (const second of letters) {
      const code = first + second;
      const label = regionNames.of(code);
      if (label && label !== code) {
        lookup.set(normalizeCountryKey(label), code);
      }
    }
  }
  return lookup;
}
/**
 * Produces a display label for a normalized (lowercase) country key.
 *
 * An entry in `COUNTRY_NAME_OVERRIDES` takes precedence. Otherwise the first
 * letter of every word is upper-cased.
 *
 * The word-start detection is Unicode-aware. The previous `/\b\w/` pattern
 * treated every non-ASCII letter as a word break, so "curaçao" became
 * "CuraçAo" and "réunion" became "RéUnion". The override lookup also only
 * honours the table's own keys, so a key such as "constructor" no longer
 * resolves to an inherited `Object.prototype` member.
 *
 * @param countryKey - Normalized country key (see `normalizeCountryKey`).
 * @returns Human-readable country label.
 */
function toCountryLabel(countryKey: string): string {
  if (
    Object.prototype.hasOwnProperty.call(COUNTRY_NAME_OVERRIDES, countryKey)
  ) {
    return COUNTRY_NAME_OVERRIDES[countryKey];
  }
  // Upper-case a letter only when it starts the string or follows a
  // character that is not a letter, combining mark, digit or underscore.
  // For ASCII input this matches exactly what `\b\w` matched.
  return countryKey.replace(
    /(^|[^\p{L}\p{M}\p{N}_])(\p{L})/gu,
    (_match, lead: string, letter: string) => lead + letter.toUpperCase(),
  );
}
/**
 * Resolves a normalized country key to a two-letter region code.
 *
 * `ISO2_ALIASES` is checked first, then the Intl-derived `regionNameMap`.
 * The alias lookup is restricted to the table's own keys: a plain
 * `ISO2_ALIASES[countryKey]` truthiness check would also match inherited
 * members such as "constructor" or "toString" and return a function instead
 * of a code.
 *
 * @param countryKey - Normalized country key (see `normalizeCountryKey`).
 * @returns The two-letter code, or `null` when the key is unknown.
 */
function toIso2(countryKey: string): string | null {
  if (Object.prototype.hasOwnProperty.call(ISO2_ALIASES, countryKey)) {
    return ISO2_ALIASES[countryKey];
  }
  return regionNameMap.get(countryKey) ?? null;
}
/**
 * Reports whether a country input should be searched without a country
 * filter.
 *
 * @param countryInput - Raw country text, possibly missing.
 * @returns `true` when the input normalizes to an empty key or to one of the
 *   keys in `GLOBAL_SEARCH_KEYS`; `false` otherwise.
 */
export function shouldUseGlobalLocation(countryInput?: string | null): boolean {
  const key = normalizeCountryKey(countryInput);
  if (key === "") return true;
  return GLOBAL_SEARCH_KEYS.has(key);
}
/**
 * Builds the country location entry for a country input.
 *
 * @param countryInput - Raw country text, possibly missing.
 * @returns A `HiringCafeCountryLocation` for the country, or `null` when the
 *   input is empty, is one of the `GLOBAL_SEARCH_KEYS`, or cannot be mapped
 *   to a two-letter region code.
 */
export function resolveHiringCafeCountryLocation(
  countryInput?: string | null,
): HiringCafeCountryLocation | null {
  const key = normalizeCountryKey(countryInput);
  const isBroadSearch = key.length === 0 || GLOBAL_SEARCH_KEYS.has(key);
  if (isBroadSearch) return null;

  const shortName = toIso2(key);
  if (!shortName) return null;

  const displayName = toCountryLabel(key);
  const location: HiringCafeCountryLocation = {
    formatted_address: displayName,
    types: ["country"],
    id: "user_country",
    address_components: [
      { long_name: displayName, short_name: shortName, types: ["country"] },
    ],
    options: {
      flexible_regions: ["anywhere_in_continent", "anywhere_in_world"],
    },
  };
  return location;
}

View File

@ -0,0 +1,91 @@
import type { HiringCafeCountryLocation } from "./country-map.js";
/**
 * Shape of the search-state object returned by `createDefaultSearchState`.
 *
 * Several fields are typed as single literals (`null`, `"default"`, `"all"`,
 * `[null, null]`), so only those exact values can be assigned to them.
 */
export interface HiringCafeSearchState {
// Country filters; an empty array means no country filter was supplied.
locations: HiringCafeCountryLocation[];
workplaceTypes: Array<"Remote" | "Hybrid" | "Onsite">;
defaultToUserLocation: boolean;
userLocation: null;
commitmentTypes: string[];
seniorityLevel: string[];
roleTypes: string[];
roleYoeRange: [number, number];
excludeIfRoleYoeIsNotSpecified: boolean;
managementYoeRange: [number, number];
excludeIfManagementYoeIsNotSpecified: boolean;
securityClearances: string[];
// Free-text search term.
searchQuery: string;
// Recency window in days.
dateFetchedPastNDays: number;
hiddenCompanies: string[];
sortBy: "default";
companyPublicOrPrivate: "all";
latestInvestmentYearRange: [null, null];
latestInvestmentSeries: string[];
latestInvestmentAmount: null;
latestInvestmentCurrency: string[];
investors: string[];
excludedInvestors: string[];
isNonProfit: "all";
companySizeRanges: string[];
minYearFounded: null;
maxYearFounded: null;
excludedLatestInvestmentSeries: string[];
}
/**
 * Creates a fresh search-state object with every filter at its default
 * setting, varying only the query, the optional country location and the
 * recency window.
 *
 * Property order is kept stable on purpose, because the object is meant to
 * be serialized. Every call returns newly allocated arrays.
 *
 * @param args.searchQuery - Search term to place in the state.
 * @param args.location - Country location to filter by, or `null` for none.
 * @param args.dateFetchedPastNDays - Recency window in days.
 * @returns The populated search state.
 */
export function createDefaultSearchState(args: {
  searchQuery: string;
  location: HiringCafeCountryLocation | null;
  dateFetchedPastNDays: number;
}): HiringCafeSearchState {
  const { searchQuery, location, dateFetchedPastNDays } = args;

  const state: HiringCafeSearchState = {
    locations: [],
    workplaceTypes: ["Remote", "Hybrid", "Onsite"],
    defaultToUserLocation: false,
    userLocation: null,
    commitmentTypes: [
      "Full Time",
      "Part Time",
      "Contract",
      "Internship",
      "Temporary",
      "Seasonal",
      "Volunteer",
    ],
    seniorityLevel: [
      "No Prior Experience Required",
      "Entry Level",
      "Mid Level",
      "Senior Level",
    ],
    roleTypes: ["Individual Contributor", "People Manager"],
    roleYoeRange: [0, 20],
    excludeIfRoleYoeIsNotSpecified: false,
    managementYoeRange: [0, 20],
    excludeIfManagementYoeIsNotSpecified: false,
    securityClearances: [
      "None",
      "Confidential",
      "Secret",
      "Top Secret",
      "Top Secret/SCI",
      "Public Trust",
      "Interim Clearances",
      "Other",
    ],
    searchQuery,
    dateFetchedPastNDays,
    hiddenCompanies: [],
    sortBy: "default",
    companyPublicOrPrivate: "all",
    latestInvestmentYearRange: [null, null],
    latestInvestmentSeries: [],
    latestInvestmentAmount: null,
    latestInvestmentCurrency: [],
    investors: [],
    excludedInvestors: [],
    isNonProfit: "all",
    companySizeRanges: [],
    minYearFounded: null,
    maxYearFounded: null,
    excludedLatestInvestmentSeries: [],
  };

  if (location) {
    state.locations.push(location);
  }
  return state;
}

View File

@ -0,0 +1,439 @@
import { mkdir, writeFile } from "node:fs/promises";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import { launchOptions } from "camoufox-js";
import {
toNumberOrNull,
toStringOrNull,
} from "job-ops-shared/utils/type-conversion";
import { firefox, type Page } from "playwright";
import {
normalizeCountryKey,
resolveHiringCafeCountryLocation,
} from "./country-map.js";
import { createDefaultSearchState } from "./default-search-state.js";
// Directory containing this module, derived from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Origin the browser page is navigated to before any API call is made.
const BASE_URL = "https://hiring.cafe";
// Prefix put in front of every JSON progress line written to stdout.
const JOBOPS_PROGRESS_PREFIX = "JOBOPS_PROGRESS ";
// Fallback when HIRING_CAFE_MAX_JOBS_PER_TERM is unset or invalid.
const DEFAULT_MAX_JOBS_PER_TERM = 200;
// Fallback when HIRING_CAFE_SEARCH_TERMS is unset or yields no terms.
const DEFAULT_SEARCH_TERM = "web developer";
// Fallback when HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS is unset or invalid.
const DEFAULT_DATE_FETCHED_PAST_N_DAYS = 30;
// Maximum number of result pages requested per search term.
const PAGE_LIMIT = 50;
// One untyped job object as it appears in an API payload; fields are read
// defensively by mapHiringCafeJob.
type RawHiringCafeJob = Record<string, unknown>;
/**
 * Normalized job row written to the output JSON file.
 * Optional fields are omitted when the raw job does not provide them.
 */
interface ExtractedJob {
source: "hiringcafe";
sourceJobId?: string;
title: string;
employer: string;
jobUrl: string;
// Populated with the same value as jobUrl by mapHiringCafeJob.
applicationLink: string;
location?: string;
salary?: string;
datePosted?: string;
jobDescription?: string;
jobType?: string;
}
/**
 * Plain-data summary of a fetch response, assembled inside the browser page
 * and returned to Node by callHiringCafeApi.
 */
interface BrowserApiResponse {
ok: boolean;
status: number;
statusText: string;
// Parsed JSON body, or null when the body could not be parsed as JSON.
data: unknown;
// Raw response body, kept for error diagnostics.
responseText: string;
}
/**
 * Writes a progress event to stdout as a single prefixed JSON line.
 * Does nothing unless the `JOBOPS_EMIT_PROGRESS` environment variable is
 * exactly `"1"`.
 *
 * @param payload - Event data to serialize.
 */
function emitProgress(payload: Record<string, unknown>): void {
  const enabled = process.env.JOBOPS_EMIT_PROGRESS === "1";
  if (enabled) {
    console.log(JOBOPS_PROGRESS_PREFIX + JSON.stringify(payload));
  }
}
/**
 * Parses a base-10 integer from text, accepting only values of 1 or more.
 *
 * @param input - Text to parse; may be missing or empty.
 * @param fallback - Value returned when `input` is missing, is not a
 *   number, or parses to less than 1.
 * @returns The parsed integer, or `fallback`.
 */
function parsePositiveInt(input: string | undefined, fallback: number): number {
  if (!input) return fallback;
  const candidate = Number.parseInt(input, 10);
  return Number.isFinite(candidate) && candidate >= 1 ? candidate : fallback;
}
/**
 * Turns the raw search-terms setting into a list of terms.
 *
 * Accepted forms, tried in order:
 * 1. A JSON array (input starting with `[`); entries are passed through
 *    `toStringOrNull` and falsy results are dropped.
 * 2. A delimited string, split on `|` if one is present, else on a newline
 *    if one is present, else on commas.
 *
 * A JSON array that fails to parse, or yields no terms, falls through to
 * the delimiter form.
 *
 * @param raw - Raw setting value; may be missing.
 * @returns At least one term; `[DEFAULT_SEARCH_TERM]` when nothing usable
 *   was supplied.
 */
function parseSearchTerms(raw: string | undefined): string[] {
  const text = raw?.trim() ?? "";
  if (text.length === 0) return [DEFAULT_SEARCH_TERM];

  if (text.startsWith("[")) {
    try {
      const decoded: unknown = JSON.parse(text);
      if (Array.isArray(decoded)) {
        const fromJson: string[] = [];
        for (const entry of decoded) {
          const term = toStringOrNull(entry);
          if (term) fromJson.push(term);
        }
        if (fromJson.length > 0) return fromJson;
      }
    } catch {
      // Not valid JSON; treat the input as a delimited list instead.
    }
  }

  let separator = ",";
  if (text.includes("|")) {
    separator = "|";
  } else if (text.includes("\n")) {
    separator = "\n";
  }

  const pieces = text
    .split(separator)
    .map((piece) => piece.trim())
    .filter((piece) => piece.length > 0);
  return pieces.length > 0 ? pieces : [DEFAULT_SEARCH_TERM];
}
/**
 * Encodes a search state as base64 of its URL-encoded JSON form.
 *
 * @param searchState - Value to serialize with `JSON.stringify`.
 * @returns Base64 text of the percent-encoded JSON.
 */
function encodeSearchState(searchState: unknown): string {
  const percentEncoded = encodeURIComponent(JSON.stringify(searchState));
  return Buffer.from(percentEncoded, "utf-8").toString("base64");
}
/**
 * Narrows an unknown value to a string-keyed record.
 *
 * @param value - Anything.
 * @returns `value` itself when it is a non-null, non-array object;
 *   otherwise `null`.
 */
function asRecord(value: unknown): Record<string, unknown> | null {
  const isPlainObject =
    typeof value === "object" && value !== null && !Array.isArray(value);
  return isPlainObject ? (value as Record<string, unknown>) : null;
}
/**
 * Collects the usable strings out of an unknown value.
 *
 * @param value - Expected to be an array; anything else yields `[]`.
 * @returns Each element passed through `toStringOrNull`, with falsy results
 *   dropped, in original order.
 */
function asStringArray(value: unknown): string[] {
  const collected: string[] = [];
  if (Array.isArray(value)) {
    value.forEach((entry) => {
      const text = toStringOrNull(entry);
      if (text) collected.push(text);
    });
  }
  return collected;
}
/**
 * Returns the first usable string of an array-like unknown value.
 *
 * @param value - Passed to `asStringArray`.
 * @returns The first string `asStringArray` yields, or `null` when it
 *   yields none.
 */
function firstArrayValue(value: unknown): string | null {
  const [head] = asStringArray(value);
  return head ?? null;
}
/**
 * Renders the compensation fields of a processed job as one display string
 * of the form "<currency> <amount> / <frequency>".
 *
 * The amount is "min-max" when both bounds are present, "min+" when only the
 * minimum is, and the bare maximum when only the maximum is; values are
 * rounded to whole numbers. The frequency defaults to "Yearly" when the
 * field is missing. A missing currency is simply left out.
 *
 * @param processedJobData - Processed job record, or `null`.
 * @returns The formatted string, or `undefined` when there is no record or
 *   neither bound is present.
 */
function formatCompensation(
  processedJobData: Record<string, unknown> | null,
): string | undefined {
  if (processedJobData === null) return undefined;

  const lower = toNumberOrNull(processedJobData.yearly_min_compensation);
  const upper = toNumberOrNull(processedJobData.yearly_max_compensation);
  if (lower === null && upper === null) return undefined;

  const currency = toStringOrNull(
    processedJobData.listed_compensation_currency,
  );
  const frequency =
    toStringOrNull(processedJobData.listed_compensation_frequency) ?? "Yearly";

  let amount: string;
  if (lower !== null && upper !== null) {
    amount = `${Math.round(lower)}-${Math.round(upper)}`;
  } else if (lower !== null) {
    amount = `${Math.round(lower)}+`;
  } else {
    amount = `${Math.round(upper ?? 0)}`;
  }

  const segments: string[] = [];
  if (currency) segments.push(currency);
  segments.push(amount);
  if (frequency) segments.push(`/ ${frequency}`);

  const text = segments.join(" ").trim();
  return text.length > 0 ? text : undefined;
}
/**
 * Converts one raw API job object into the extractor's output row.
 *
 * Each output field is taken from the first candidate source field that
 * yields a value. The title and employer fall back to the placeholders
 * "Unknown Title" and "Unknown Employer".
 *
 * @param raw - Raw job object from the API payload.
 * @returns The mapped job, or `null` when the raw job has no `apply_url`.
 */
function mapHiringCafeJob(raw: RawHiringCafeJob): ExtractedJob | null {
  const applyUrl = toStringOrNull(raw.apply_url);
  if (!applyUrl) return null;

  const info = asRecord(raw.job_information);
  const processedData = asRecord(raw.v5_processed_job_data);
  const company = asRecord(info?.company_info);

  const sourceJobId =
    toStringOrNull(raw.id) ??
    toStringOrNull(raw.objectID) ??
    toStringOrNull(raw.original_source_id) ??
    toStringOrNull(raw.requisition_id) ??
    undefined;

  const title =
    toStringOrNull(info?.title) ??
    toStringOrNull(info?.job_title_raw) ??
    toStringOrNull(processedData?.core_job_title) ??
    "Unknown Title";

  const employer =
    toStringOrNull(company?.name) ??
    toStringOrNull(processedData?.company_name) ??
    "Unknown Employer";

  const location =
    toStringOrNull(processedData?.formatted_workplace_location) ??
    firstArrayValue(processedData?.workplace_cities) ??
    firstArrayValue(processedData?.workplace_states) ??
    firstArrayValue(processedData?.workplace_countries) ??
    undefined;

  // asStringArray only yields non-empty strings, so an empty join means
  // there were no commitments at all.
  const jobType =
    asStringArray(processedData?.commitment).join(", ") || undefined;

  const salary = formatCompensation(processedData);
  const datePosted =
    toStringOrNull(processedData?.estimated_publish_date) ?? undefined;
  const jobDescription = toStringOrNull(info?.description) ?? undefined;

  return {
    source: "hiringcafe",
    sourceJobId,
    title,
    employer,
    jobUrl: applyUrl,
    applicationLink: applyUrl,
    location,
    salary,
    datePosted,
    jobDescription,
    jobType,
  };
}
/**
 * Pulls the list of raw job objects out of an API payload.
 *
 * The payload may be the job array itself or an object carrying the array
 * under `results`. Entries that are not plain (non-null, non-array) objects
 * are discarded.
 *
 * @param payload - Parsed API response body.
 * @returns The raw job objects, or `[]` when no job array is found.
 */
function extractResultsBatch(payload: unknown): RawHiringCafeJob[] {
  const isJobRecord = (item: unknown): item is RawHiringCafeJob =>
    typeof item === "object" && item !== null && !Array.isArray(item);

  let candidates: unknown;
  if (Array.isArray(payload)) {
    candidates = payload;
  } else if (typeof payload === "object" && payload !== null) {
    candidates = (payload as Record<string, unknown>).results;
  }

  return Array.isArray(candidates) ? candidates.filter(isJobRecord) : [];
}
/**
 * Reads the `total` field of a count payload.
 *
 * @param payload - Parsed API response body.
 * @returns `total` converted by `toNumberOrNull`, or `null` when the payload
 *   is not a plain object.
 */
function parseTotalCount(payload: unknown): number | null {
  const record = asRecord(payload);
  return record ? toNumberOrNull(record.total) : null;
}
/**
 * Performs a GET request against a Hiring Cafe endpoint from inside the
 * browser page and returns the parsed JSON body.
 *
 * The request is issued with `fetch` inside `page.evaluate`, so it is sent
 * from the page's own origin with `credentials: "include"`.
 *
 * @param page - Playwright page used to run the request.
 * @param endpoint - Endpoint path, resolved against the page's origin.
 * @param params - Query-string parameters to set on the URL.
 * @returns The parsed JSON response body.
 * @throws Error when the response status is not OK, or when the body could
 *   not be parsed as JSON (a body that is the JSON literal `null` is
 *   reported the same way). The message includes up to 250 characters of
 *   the response text.
 */
async function callHiringCafeApi(
page: Page,
endpoint: string,
params: Record<string, string>,
): Promise<unknown> {
// The callback runs in the browser page, so it only uses its own arguments
// and browser globals.
const response = await page.evaluate(
async ({ endpointArg, paramsArg }) => {
const url = new URL(endpointArg, window.location.origin);
for (const [key, value] of Object.entries(paramsArg)) {
url.searchParams.set(key, value);
}
const res = await fetch(url.toString(), {
method: "GET",
credentials: "include",
headers: {
Accept: "application/json, text/plain, */*",
},
});
// Read the body as text first so it is still available when JSON parsing fails.
const text = await res.text();
let data: unknown = null;
try {
data = JSON.parse(text);
} catch {
// Keep response text for diagnostics.
}
const output: BrowserApiResponse = {
ok: res.ok,
status: res.status,
statusText: res.statusText,
data,
responseText: text,
};
return output;
},
{ endpointArg: endpoint, paramsArg: params },
);
const result = response as BrowserApiResponse;
if (!result.ok) {
const snippet = result.responseText.slice(0, 250);
throw new Error(
`Hiring Cafe API ${endpoint} failed (${result.status} ${result.statusText}): ${snippet}`,
);
}
// data stays null when the body was not valid JSON.
if (result.data === null) {
const snippet = result.responseText.slice(0, 250);
throw new Error(
`Hiring Cafe API ${endpoint} returned non-JSON response: ${snippet}`,
);
}
return result.data;
}
/**
 * Extractor entry point.
 *
 * Reads its configuration from `HIRING_CAFE_*` environment variables, opens
 * Hiring Cafe in a Camoufox-configured Firefox, and queries the search API
 * for each search term. If the initial page load fails, it retries once with
 * a plain Firefox launch. The de-duplicated jobs are written to the output
 * JSON file.
 *
 * Progress events (`term_start`, `page_fetched`, `term_complete`) are sent
 * through `emitProgress`.
 *
 * Changes from the previous version:
 * - Every request for a term uses the same page size. Previously the size
 *   shrank to the number of jobs still wanted while the page index kept
 *   growing, so with page-index/size pagination later pages overlapped
 *   results that had already been fetched.
 * - The browser context and page are created inside the `try`, so the
 *   browser is closed even when creating them fails.
 * - The country location is resolved once rather than once per term.
 *
 * @throws Propagates errors from browser start-up, the job search requests
 *   and file output. A failing total-count request is only logged.
 */
async function run(): Promise<void> {
  // Upper bound on the `size` parameter of a single search request.
  const MAX_PAGE_SIZE = 1000;

  const searchTerms = parseSearchTerms(process.env.HIRING_CAFE_SEARCH_TERMS);
  const country = normalizeCountryKey(
    process.env.HIRING_CAFE_COUNTRY ?? "united kingdom",
  );
  const maxJobsPerTerm = parsePositiveInt(
    process.env.HIRING_CAFE_MAX_JOBS_PER_TERM,
    DEFAULT_MAX_JOBS_PER_TERM,
  );
  const dateFetchedPastNDays = parsePositiveInt(
    process.env.HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS,
    DEFAULT_DATE_FETCHED_PAST_N_DAYS,
  );
  const outputPath =
    process.env.HIRING_CAFE_OUTPUT_JSON ||
    join(__dirname, "../storage/datasets/default/jobs.json");
  const headless = process.env.HIRING_CAFE_HEADLESS !== "false";

  // The country filter is identical for every term, so resolve it once.
  const location = resolveHiringCafeCountryLocation(country);

  let browser = await firefox.launch(
    await launchOptions({
      headless,
      humanize: true,
      geoip: true,
    }),
  );

  const allJobs: ExtractedJob[] = [];
  // De-duplication keys (source job id, else job URL) across all terms.
  const seen = new Set<string>();

  try {
    let context = await browser.newContext();
    let page = await context.newPage();

    // Loads the site so later API calls are made from the page's origin.
    // Reads `page` at call time, so it also works after the fallback below
    // replaces the page.
    const initializePage = async () => {
      await page.goto(BASE_URL, {
        waitUntil: "domcontentloaded",
        timeout: 60_000,
      });
      await page.waitForTimeout(2_000);
    };

    try {
      await initializePage();
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(
        `Camoufox browser startup was unstable, retrying with vanilla Firefox: ${message}`,
      );
      await browser.close();
      browser = await firefox.launch({ headless });
      context = await browser.newContext();
      page = await context.newPage();
      await initializePage();
    }

    for (let i = 0; i < searchTerms.length; i += 1) {
      const searchTerm = searchTerms[i];
      const termIndex = i + 1;

      emitProgress({
        event: "term_start",
        termIndex,
        termTotal: searchTerms.length,
        searchTerm,
      });

      const searchState = createDefaultSearchState({
        searchQuery: searchTerm,
        location,
        dateFetchedPastNDays,
      });
      const encodedSearchState = encodeSearchState(searchState);

      // The total count only tightens the target; if the request fails the
      // term is still processed with the configured cap.
      let totalAvailable: number | null = null;
      try {
        const countPayload = await callHiringCafeApi(
          page,
          "/api/search-jobs/get-total-count",
          {
            s: encodedSearchState,
          },
        );
        totalAvailable = parseTotalCount(countPayload);
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(
          `Hiring Cafe count request failed for term '${searchTerm}': ${message}`,
        );
      }

      const termTarget =
        totalAvailable !== null
          ? Math.min(maxJobsPerTerm, totalAvailable)
          : maxJobsPerTerm;

      // Fixed for the whole term so consecutive page indexes never overlap.
      const pageSize = Math.min(MAX_PAGE_SIZE, termTarget);

      let pageNo = 0;
      let termCollected = 0;

      while (termCollected < termTarget && pageNo < PAGE_LIMIT) {
        const jobsPayload = await callHiringCafeApi(page, "/api/search-jobs", {
          size: String(pageSize),
          page: String(pageNo),
          s: encodedSearchState,
        });

        const batch = extractResultsBatch(jobsPayload);
        if (batch.length === 0) break;

        let mappedOnPage = 0;
        for (const rawJob of batch) {
          if (termCollected >= termTarget) break;

          const mapped = mapHiringCafeJob(rawJob);
          if (!mapped) continue;

          const dedupeKey = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(dedupeKey)) continue;

          seen.add(dedupeKey);
          allJobs.push(mapped);
          termCollected += 1;
          mappedOnPage += 1;
        }

        emitProgress({
          event: "page_fetched",
          termIndex,
          termTotal: searchTerms.length,
          searchTerm,
          pageNo,
          resultsOnPage: mappedOnPage,
          totalCollected: termCollected,
        });

        // A short page means the result set is exhausted.
        if (batch.length < pageSize) break;
        pageNo += 1;
      }

      emitProgress({
        event: "term_complete",
        termIndex,
        termTotal: searchTerms.length,
        searchTerm,
        jobsFoundTerm: termCollected,
      });
    }
  } finally {
    await browser.close();
  }

  await mkdir(dirname(outputPath), { recursive: true });
  await writeFile(outputPath, `${JSON.stringify(allJobs, null, 2)}\n`, "utf-8");

  console.log(`Hiring Cafe extractor wrote ${allJobs.length} jobs`);
}
// Entry point: start the extractor and convert any rejection into a failing
// exit status. `process.exitCode` is assigned instead of calling
// `process.exit()`, so Node terminates on its own once pending work is done.
run().catch((error: unknown) => {
  // Same convention as the other handlers in this file: keep the text of
  // non-Error throwables instead of replacing it with a generic placeholder.
  const message = error instanceof Error ? error.message : String(error);
  console.error(`Hiring Cafe extractor failed: ${message}`);
  process.exitCode = 1;
});

View File

@ -0,0 +1,13 @@
{
"compilerOptions": {
"module": "NodeNext",
"moduleResolution": "NodeNext",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"]
},
"include": ["./src/**/*"]
}

View File

@ -24,7 +24,13 @@ interface PipelineProgress {
| "failed"; | "failed";
message: string; message: string;
detail?: string; detail?: string;
crawlingSource: "gradcracker" | "jobspy" | "ukvisajobs" | "adzuna" | null; crawlingSource:
| "gradcracker"
| "jobspy"
| "ukvisajobs"
| "adzuna"
| "hiringcafe"
| null;
crawlingSourcesCompleted: number; crawlingSourcesCompleted: number;
crawlingSourcesTotal: number; crawlingSourcesTotal: number;
crawlingTermsProcessed: number; crawlingTermsProcessed: number;
@ -85,6 +91,7 @@ const sourceLabel: Record<
jobspy: "JobSpy", jobspy: "JobSpy",
ukvisajobs: "UKVisaJobs", ukvisajobs: "UKVisaJobs",
adzuna: "Adzuna", adzuna: "Adzuna",
hiringcafe: "Hiring Cafe",
}; };
const clamp = (value: number, min: number, max: number) => const clamp = (value: number, min: number, max: number) =>

View File

@ -92,4 +92,20 @@ describe("automatic-run utilities", () => {
expect(estimate.discovered.cap).toBeGreaterThan(0); expect(estimate.discovered.cap).toBeGreaterThan(0);
expect(estimate.discovered.cap).toBeLessThanOrEqual(120); expect(estimate.discovered.cap).toBeLessThanOrEqual(120);
}); });
// With Hiring Cafe as the only selected source, the estimate must still
// report a non-zero discovery cap that stays within the configured run budget.
it("includes hiringcafe in estimate caps using the shared term budget", () => {
const estimate = calculateAutomaticEstimate({
values: {
topN: 10,
minSuitabilityScore: 50,
searchTerms: ["backend", "platform"],
runBudget: 120,
country: "united kingdom",
},
sources: ["hiringcafe"],
});
// Non-zero: the source contributes to the cap at all.
expect(estimate.discovered.cap).toBeGreaterThan(0);
// Upper bound equals the runBudget passed above.
expect(estimate.discovered.cap).toBeLessThanOrEqual(120);
});
}); });

View File

@ -77,6 +77,7 @@ export function deriveExtractorLimits(args: {
const includesGradcracker = args.sources.includes("gradcracker"); const includesGradcracker = args.sources.includes("gradcracker");
const includesUkVisaJobs = args.sources.includes("ukvisajobs"); const includesUkVisaJobs = args.sources.includes("ukvisajobs");
const includesAdzuna = args.sources.includes("adzuna"); const includesAdzuna = args.sources.includes("adzuna");
const includesHiringCafe = args.sources.includes("hiringcafe");
const weightedContributors = const weightedContributors =
(includesIndeed ? termCount : 0) + (includesIndeed ? termCount : 0) +
@ -84,7 +85,8 @@ export function deriveExtractorLimits(args: {
(includesGlassdoor ? termCount : 0) + (includesGlassdoor ? termCount : 0) +
(includesGradcracker ? termCount : 0) + (includesGradcracker ? termCount : 0) +
(includesUkVisaJobs ? 1 : 0) + (includesUkVisaJobs ? 1 : 0) +
(includesAdzuna ? termCount : 0); (includesAdzuna ? termCount : 0) +
(includesHiringCafe ? termCount : 0);
if (weightedContributors <= 0) { if (weightedContributors <= 0) {
return { return {
@ -143,6 +145,7 @@ export function calculateAutomaticEstimate(args: {
const hasLinkedIn = sources.includes("linkedin"); const hasLinkedIn = sources.includes("linkedin");
const hasGlassdoor = sources.includes("glassdoor"); const hasGlassdoor = sources.includes("glassdoor");
const hasAdzuna = sources.includes("adzuna"); const hasAdzuna = sources.includes("adzuna");
const hasHiringCafe = sources.includes("hiringcafe");
const limits = deriveExtractorLimits({ const limits = deriveExtractorLimits({
budget: values.runBudget, budget: values.runBudget,
searchTerms: values.searchTerms, searchTerms: values.searchTerms,
@ -158,8 +161,12 @@ export function calculateAutomaticEstimate(args: {
: 0; : 0;
const ukvisaCap = hasUkVisaJobs ? limits.ukvisajobsMaxJobs : 0; const ukvisaCap = hasUkVisaJobs ? limits.ukvisajobsMaxJobs : 0;
const adzunaCap = hasAdzuna ? limits.adzunaMaxJobsPerTerm * termCount : 0; const adzunaCap = hasAdzuna ? limits.adzunaMaxJobsPerTerm * termCount : 0;
const hiringCafeCap = hasHiringCafe
? limits.jobspyResultsWanted * termCount
: 0;
const discoveredCap = jobspyCap + gradcrackerCap + ukvisaCap + adzunaCap; const discoveredCap =
jobspyCap + gradcrackerCap + ukvisaCap + adzunaCap + hiringCafeCap;
const discoveredMin = Math.round(discoveredCap * 0.35); const discoveredMin = Math.round(discoveredCap * 0.35);
const discoveredMax = Math.round(discoveredCap * 0.75); const discoveredMax = Math.round(discoveredCap * 0.75);
const processedMin = Math.min(values.topN, discoveredMin); const processedMin = Math.min(values.topN, discoveredMin);

View File

@ -14,6 +14,7 @@ export const orderedSources: JobSource[] = [
"linkedin", "linkedin",
"glassdoor", "glassdoor",
"adzuna", "adzuna",
"hiringcafe",
"ukvisajobs", "ukvisajobs",
]; ];
export const orderedFilterSources: JobSource[] = [...orderedSources, "manual"]; export const orderedFilterSources: JobSource[] = [...orderedSources, "manual"];

View File

@ -168,7 +168,8 @@ export const getSourcesWithJobs = (jobs: JobListItem[]): JobSource[] => {
export const getEnabledSources = ( export const getEnabledSources = (
settings: AppSettings | null, settings: AppSettings | null,
): JobSource[] => { ): JobSource[] => {
if (!settings) return [...DEFAULT_PIPELINE_SOURCES, "glassdoor"]; if (!settings)
return [...DEFAULT_PIPELINE_SOURCES, "glassdoor", "hiringcafe"];
const enabled: JobSource[] = []; const enabled: JobSource[] = [];
const hasUkVisaJobsAuth = Boolean( const hasUkVisaJobsAuth = Boolean(
@ -191,6 +192,10 @@ export const getEnabledSources = (
if (hasAdzunaAuth) enabled.push(source); if (hasAdzunaAuth) enabled.push(source);
continue; continue;
} }
if (source === "hiringcafe") {
enabled.push(source);
continue;
}
if ( if (
source === "indeed" || source === "indeed" ||
source === "linkedin" || source === "linkedin" ||

View File

@ -144,5 +144,6 @@ export const sourceLabel: Record<Job["source"], string> = {
glassdoor: "Glassdoor", glassdoor: "Glassdoor",
ukvisajobs: "UK Visa Jobs", ukvisajobs: "UK Visa Jobs",
adzuna: "Adzuna", adzuna: "Adzuna",
hiringcafe: "Hiring Cafe",
manual: "Manual", manual: "Manual",
}; };

View File

@ -101,6 +101,7 @@ const runPipelineSchema = z.object({
"glassdoor", "glassdoor",
"ukvisajobs", "ukvisajobs",
"adzuna", "adzuna",
"hiringcafe",
]), ]),
) )
.min(1) .min(1)

View File

@ -253,6 +253,7 @@ export const DEMO_SOURCE_BASE_URLS: Record<JobSource, string> = {
gradcracker: "https://www.gradcracker.com", gradcracker: "https://www.gradcracker.com",
ukvisajobs: "https://www.ukvisajobs.com", ukvisajobs: "https://www.ukvisajobs.com",
adzuna: "https://www.adzuna.com", adzuna: "https://www.adzuna.com",
hiringcafe: "https://hiring.cafe",
manual: "https://example.com", manual: "https://example.com",
}; };

View File

@ -40,6 +40,7 @@ export const jobs = sqliteTable("jobs", {
"glassdoor", "glassdoor",
"ukvisajobs", "ukvisajobs",
"adzuna", "adzuna",
"hiringcafe",
"manual", "manual",
], ],
}) })

View File

@ -14,7 +14,12 @@ export type PipelineStep =
| "cancelled" | "cancelled"
| "failed"; | "failed";
export type CrawlSource = "gradcracker" | "jobspy" | "ukvisajobs" | "adzuna"; export type CrawlSource =
| "gradcracker"
| "jobspy"
| "ukvisajobs"
| "adzuna"
| "hiringcafe";
export interface PipelineProgress { export interface PipelineProgress {
step: PipelineStep; step: PipelineStep;

View File

@ -23,6 +23,10 @@ vi.mock("../../services/adzuna", () => ({
runAdzuna: vi.fn(), runAdzuna: vi.fn(),
})); }));
vi.mock("../../services/hiring-cafe", () => ({
runHiringCafe: vi.fn(),
}));
vi.mock("../../services/ukvisajobs", () => ({ vi.mock("../../services/ukvisajobs", () => ({
runUkVisaJobs: vi.fn(), runUkVisaJobs: vi.fn(),
})); }));
@ -218,6 +222,126 @@ describe("discoverJobsStep", () => {
expect(vi.mocked(adzuna.runAdzuna)).not.toHaveBeenCalled(); expect(vi.mocked(adzuna.runAdzuna)).not.toHaveBeenCalled();
}); });
// Happy path: when "hiringcafe" is the only requested source, the step calls
// runHiringCafe with values derived from settings and returns the jobs it yields.
it("runs hiringcafe when selected and passes country/terms/cap", async () => {
const settingsRepo = await import("../../repositories/settings");
const hiringCafe = await import("../../services/hiring-cafe");
// Settings are stored as strings; the expectations below check that the
// numeric cap is parsed ("25" -> 25) and that the expected country matches
// the mocked jobspyCountryIndeed value.
vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
searchTerms: JSON.stringify(["engineer"]),
jobspyCountryIndeed: "united states",
jobspyResultsWanted: "25",
} as any);
// The service is mocked, so no extractor process is spawned.
vi.mocked(hiringCafe.runHiringCafe).mockResolvedValue({
success: true,
jobs: [
{
source: "hiringcafe",
sourceJobId: "hc-1",
title: "Engineer",
employer: "ACME",
jobUrl: "https://example.com/hc",
applicationLink: "https://example.com/hc",
},
],
} as any);
const result = await discoverJobsStep({
mergedConfig: {
...config,
sources: ["hiringcafe"],
},
});
expect(result.discoveredJobs).toHaveLength(1);
// objectContaining: other options (e.g. the onProgress callback) are ignored.
expect(vi.mocked(hiringCafe.runHiringCafe)).toHaveBeenCalledWith(
expect.objectContaining({
country: "united states",
searchTerms: ["engineer"],
maxJobsPerTerm: 25,
}),
);
});
// Verifies that progress events emitted by the service are translated into
// the pipeline's crawling counters.
it("updates Hiring Cafe terms and pages via progress callbacks", async () => {
const settingsRepo = await import("../../repositories/settings");
const hiringCafe = await import("../../services/hiring-cafe");
vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
searchTerms: JSON.stringify(["engineer", "frontend"]),
jobspyCountryIndeed: "united kingdom",
jobspyResultsWanted: "50",
} as any);
// Simulate the service replaying one full term: start -> one page -> complete.
vi.mocked(hiringCafe.runHiringCafe).mockImplementation(
async (options: any) => {
options?.onProgress?.({
type: "term_start",
termIndex: 1,
termTotal: 2,
searchTerm: "engineer",
});
options?.onProgress?.({
type: "page_fetched",
termIndex: 1,
termTotal: 2,
searchTerm: "engineer",
pageNo: 0,
resultsOnPage: 10,
totalCollected: 10,
});
options?.onProgress?.({
type: "term_complete",
termIndex: 1,
termTotal: 2,
searchTerm: "engineer",
jobsFoundTerm: 10,
});
return { success: true, jobs: [] } as any;
},
);
await discoverJobsStep({
mergedConfig: {
...config,
sources: ["hiringcafe"],
},
});
const progress = getProgress();
// One of two terms completed.
expect(progress.crawlingTermsProcessed).toBe(1);
expect(progress.crawlingTermsTotal).toBe(2);
// pageNo is zero-based in the event; the counter reports pageNo + 1.
expect(progress.crawlingListPagesProcessed).toBe(1);
// Both job-page counters mirror totalCollected from the page_fetched event.
expect(progress.crawlingJobPagesEnqueued).toBe(10);
expect(progress.crawlingJobPagesProcessed).toBe(10);
});
// Failure path: Hiring Cafe is the only source and reports success: false, so
// the step rejects with an aggregate error that carries the source-prefixed message.
it("returns Hiring Cafe source error when extractor fails", async () => {
const settingsRepo = await import("../../repositories/settings");
const hiringCafe = await import("../../services/hiring-cafe");
vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
searchTerms: JSON.stringify(["engineer"]),
jobspyCountryIndeed: "united kingdom",
jobspyResultsWanted: "50",
} as any);
// The service resolves (does not throw) with a failure result.
vi.mocked(hiringCafe.runHiringCafe).mockResolvedValue({
success: false,
jobs: [],
error: "blocked upstream",
} as any);
await expect(
discoverJobsStep({
mergedConfig: {
...config,
sources: ["hiringcafe"],
},
}),
).rejects.toThrow("All sources failed: hiringcafe: blocked upstream");
});
it("maps Gradcracker progress callback into live crawling counters", async () => { it("maps Gradcracker progress callback into live crawling counters", async () => {
const settingsRepo = await import("../../repositories/settings"); const settingsRepo = await import("../../repositories/settings");
const crawler = await import("../../services/crawler"); const crawler = await import("../../services/crawler");
@ -402,6 +526,7 @@ describe("discoverJobsStep", () => {
it("does not throw when no sources are requested", async () => { it("does not throw when no sources are requested", async () => {
const settingsRepo = await import("../../repositories/settings"); const settingsRepo = await import("../../repositories/settings");
const adzuna = await import("../../services/adzuna"); const adzuna = await import("../../services/adzuna");
const hiringCafe = await import("../../services/hiring-cafe");
const jobSpy = await import("../../services/jobspy"); const jobSpy = await import("../../services/jobspy");
const crawler = await import("../../services/crawler"); const crawler = await import("../../services/crawler");
const ukVisa = await import("../../services/ukvisajobs"); const ukVisa = await import("../../services/ukvisajobs");
@ -422,6 +547,7 @@ describe("discoverJobsStep", () => {
expect(result.sourceErrors).toEqual([]); expect(result.sourceErrors).toEqual([]);
expect(vi.mocked(jobSpy.runJobSpy)).not.toHaveBeenCalled(); expect(vi.mocked(jobSpy.runJobSpy)).not.toHaveBeenCalled();
expect(vi.mocked(adzuna.runAdzuna)).not.toHaveBeenCalled(); expect(vi.mocked(adzuna.runAdzuna)).not.toHaveBeenCalled();
expect(vi.mocked(hiringCafe.runHiringCafe)).not.toHaveBeenCalled();
expect(vi.mocked(crawler.runCrawler)).not.toHaveBeenCalled(); expect(vi.mocked(crawler.runCrawler)).not.toHaveBeenCalled();
expect(vi.mocked(ukVisa.runUkVisaJobs)).not.toHaveBeenCalled(); expect(vi.mocked(ukVisa.runUkVisaJobs)).not.toHaveBeenCalled();
}); });

View File

@ -10,6 +10,7 @@ import * as jobsRepo from "../../repositories/jobs";
import * as settingsRepo from "../../repositories/settings"; import * as settingsRepo from "../../repositories/settings";
import { runAdzuna } from "../../services/adzuna"; import { runAdzuna } from "../../services/adzuna";
import { runCrawler } from "../../services/crawler"; import { runCrawler } from "../../services/crawler";
import { runHiringCafe } from "../../services/hiring-cafe";
import { runJobSpy } from "../../services/jobspy"; import { runJobSpy } from "../../services/jobspy";
import { runUkVisaJobs } from "../../services/ukvisajobs"; import { runUkVisaJobs } from "../../services/ukvisajobs";
import { progressHelpers, updateProgress } from "../progress"; import { progressHelpers, updateProgress } from "../progress";
@ -75,12 +76,14 @@ export async function discoverJobsStep(args: {
const shouldRunJobSpy = jobSpySites.length > 0; const shouldRunJobSpy = jobSpySites.length > 0;
const shouldRunAdzuna = compatibleSources.includes("adzuna"); const shouldRunAdzuna = compatibleSources.includes("adzuna");
const shouldRunHiringCafe = compatibleSources.includes("hiringcafe");
const shouldRunGradcracker = compatibleSources.includes("gradcracker"); const shouldRunGradcracker = compatibleSources.includes("gradcracker");
const shouldRunUkVisaJobs = compatibleSources.includes("ukvisajobs"); const shouldRunUkVisaJobs = compatibleSources.includes("ukvisajobs");
const totalSources = const totalSources =
Number(shouldRunJobSpy) + Number(shouldRunJobSpy) +
Number(shouldRunAdzuna) + Number(shouldRunAdzuna) +
Number(shouldRunHiringCafe) +
Number(shouldRunGradcracker) + Number(shouldRunGradcracker) +
Number(shouldRunUkVisaJobs); Number(shouldRunUkVisaJobs);
let completedSources = 0; let completedSources = 0;
@ -236,6 +239,84 @@ export async function discoverJobsStep(args: {
return { discoveredJobs, sourceErrors }; return { discoveredJobs, sourceErrors };
} }
if (shouldRunHiringCafe) {
progressHelpers.startSource("hiringcafe", completedSources, totalSources, {
termsTotal: searchTerms.length,
detail: "Hiring Cafe: fetching jobs...",
});
const hiringCafeMaxJobsPerTerm = settings.jobspyResultsWanted
? parseInt(settings.jobspyResultsWanted, 10)
: 200;
const hiringCafeResult = await runHiringCafe({
country: selectedCountry,
searchTerms,
maxJobsPerTerm: hiringCafeMaxJobsPerTerm,
onProgress: (event) => {
if (event.type === "term_start") {
progressHelpers.crawlingUpdate({
source: "hiringcafe",
termsProcessed: Math.max(event.termIndex - 1, 0),
termsTotal: event.termTotal,
phase: "list",
currentUrl: event.searchTerm,
});
updateProgress({
step: "crawling",
detail: `Hiring Cafe: term ${event.termIndex}/${event.termTotal} (${event.searchTerm})`,
});
return;
}
if (event.type === "page_fetched") {
const displayPageNo = event.pageNo + 1;
progressHelpers.crawlingUpdate({
source: "hiringcafe",
termsProcessed: Math.max(event.termIndex - 1, 0),
termsTotal: event.termTotal,
listPagesProcessed: displayPageNo,
jobPagesEnqueued: event.totalCollected,
jobPagesProcessed: event.totalCollected,
phase: "list",
currentUrl: `page ${displayPageNo}`,
});
updateProgress({
step: "crawling",
detail: `Hiring Cafe: term ${event.termIndex}/${event.termTotal}, page ${displayPageNo} (${event.totalCollected} collected)`,
});
return;
}
progressHelpers.crawlingUpdate({
source: "hiringcafe",
termsProcessed: event.termIndex,
termsTotal: event.termTotal,
phase: "list",
currentUrl: event.searchTerm,
});
updateProgress({
step: "crawling",
detail: `Hiring Cafe: completed term ${event.termIndex}/${event.termTotal} (${event.searchTerm})`,
});
},
});
if (!hiringCafeResult.success) {
sourceErrors.push(
`hiringcafe: ${hiringCafeResult.error ?? "unknown error"}`,
);
} else {
discoveredJobs.push(...hiringCafeResult.jobs);
}
markSourceComplete();
}
if (args.shouldCancel?.()) {
return { discoveredJobs, sourceErrors };
}
if (shouldRunGradcracker) { if (shouldRunGradcracker) {
progressHelpers.startSource("gradcracker", completedSources, totalSources, { progressHelpers.startSource("gradcracker", completedSources, totalSources, {
detail: "Gradcracker: scraping...", detail: "Gradcracker: scraping...",

View File

@ -0,0 +1,270 @@
import { spawn, spawnSync } from "node:child_process";
import { mkdir, readFile, rm } from "node:fs/promises";
import { createRequire } from "node:module";
import { dirname, join } from "node:path";
import { createInterface } from "node:readline";
import { fileURLToPath } from "node:url";
import { logger } from "@infra/logger";
import { sanitizeUnknown } from "@infra/sanitize";
import type { CreateJobInput } from "@shared/types";
import { toNumberOrNull, toStringOrNull } from "@shared/utils/type-conversion";
// Directory of this module, derived from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Root of the Hiring Cafe extractor package, resolved relative to this file;
// used as the working directory of the spawned extractor process.
const HIRING_CAFE_DIR = join(__dirname, "../../../../extractors/hiringcafe");
// JSON file the extractor is told to write (via HIRING_CAFE_OUTPUT_JSON) and
// that readDataset() reads back after the process exits.
const DATASET_PATH = join(
HIRING_CAFE_DIR,
"storage/datasets/default/jobs.json",
);
// Directory containing DATASET_PATH; removed and recreated before every run.
const STORAGE_DATASET_DIR = join(HIRING_CAFE_DIR, "storage/datasets/default");
// Marker that identifies a machine-readable progress line in the extractor's
// output; the remainder of such a line is a JSON payload.
const JOBOPS_PROGRESS_PREFIX = "JOBOPS_PROGRESS ";
// CommonJS-style require bound to this module; used by resolveTsxCliPath().
const require = createRequire(import.meta.url);
// Path of the tsx CLI entry point, or null when it cannot be resolved.
// Serves as the fallback launcher when the `npm` command is unavailable.
const TSX_CLI_PATH = resolveTsxCliPath();
// One untyped element of the extractor's JSON output; individual fields are
// validated in mapHiringCafeRow().
type HiringCafeRawJob = Record<string, unknown>;
/**
 * Progress notification decoded from the extractor's output, discriminated by
 * `type`. All variants carry the current term position (`termIndex` of
 * `termTotal`) and the search term being processed.
 */
export type HiringCafeProgressEvent =
| {
// Emitted when the extractor begins working on a search term.
type: "term_start";
termIndex: number;
termTotal: number;
searchTerm: string;
}
| {
// Emitted after each result page has been fetched and mapped.
type: "page_fetched";
termIndex: number;
termTotal: number;
searchTerm: string;
// Page number as reported by the extractor; consumers add 1 for display.
pageNo: number;
// Defaults to 0 when the payload omits it (see parseProgressLine).
resultsOnPage: number;
// Defaults to 0 when the payload omits it (see parseProgressLine).
totalCollected: number;
}
| {
// Emitted when the extractor has finished a search term.
type: "term_complete";
termIndex: number;
termTotal: number;
searchTerm: string;
// Defaults to 0 when the payload omits it (see parseProgressLine).
jobsFoundTerm: number;
};
/** Options accepted by runHiringCafe(); every field is optional. */
export interface RunHiringCafeOptions {
// Search terms to query; runHiringCafe() uses ["web developer"] when missing or empty.
searchTerms?: string[];
// Country name; trimmed and lower-cased, defaulting to "united kingdom".
country?: string;
// Per-term job cap forwarded to the extractor; defaults to 200.
maxJobsPerTerm?: number;
// Invoked for each progress line the extractor emits while it runs.
onProgress?: (event: HiringCafeProgressEvent) => void;
}
/**
 * Outcome of runHiringCafe(). The function reports failure through this
 * object (success: false, empty jobs, error message) instead of throwing.
 */
export interface HiringCafeResult {
success: boolean;
// Jobs read back from the extractor's dataset; empty on failure.
jobs: CreateJobInput[];
// Failure description; omitted on success.
error?: string;
}
/**
 * Locates the tsx CLI entry point via module resolution.
 *
 * @returns The resolved path of "tsx/dist/cli.mjs", or null when resolution fails.
 */
function resolveTsxCliPath(): string | null {
  let cliPath: string | null = null;
  try {
    cliPath = require.resolve("tsx/dist/cli.mjs");
  } catch {
    // Resolution failed; report "not available" via the null default.
  }
  return cliPath;
}
/**
 * Probes whether the `npm` command can be executed from this process by
 * running `npm --version` synchronously with all stdio discarded.
 *
 * @returns true only when the probe spawned without error and exited with status 0.
 */
function canRunNpmCommand(): boolean {
  const probe = spawnSync("npm", ["--version"], { stdio: "ignore" });
  if (probe.error) return false;
  return probe.status === 0;
}
/**
 * Decodes one line of extractor output into a progress event.
 *
 * A progress line is JOBOPS_PROGRESS_PREFIX followed by a JSON object with an
 * `event` name plus `termIndex` / `termTotal`. Anything else (ordinary log
 * output, malformed JSON, a non-object payload, an unknown event name, or
 * missing required fields) yields null so the caller can treat the line as
 * plain output.
 *
 * @param line A single line without its trailing newline.
 * @returns The decoded event, or null when the line is not a valid progress line.
 */
function parseProgressLine(line: string): HiringCafeProgressEvent | null {
  if (!line.startsWith(JOBOPS_PROGRESS_PREFIX)) return null;

  let payload: unknown;
  try {
    payload = JSON.parse(line.slice(JOBOPS_PROGRESS_PREFIX.length).trim());
  } catch {
    return null;
  }

  // JSON.parse legitimately returns null, primitives and arrays. Reading a
  // property off null would throw inside the stream's line handler, so only
  // plain objects are accepted here.
  if (
    payload === null ||
    typeof payload !== "object" ||
    Array.isArray(payload)
  ) {
    return null;
  }
  const parsed = payload as Record<string, unknown>;

  const event = toStringOrNull(parsed.event);
  const termIndex = toNumberOrNull(parsed.termIndex);
  const termTotal = toNumberOrNull(parsed.termTotal);
  // The search term is informational only, so a missing value is tolerated.
  const searchTerm = toStringOrNull(parsed.searchTerm) ?? "";

  if (!event || termIndex === null || termTotal === null) {
    return null;
  }

  switch (event) {
    case "term_start":
      return { type: "term_start", termIndex, termTotal, searchTerm };

    case "page_fetched": {
      // The page number is required; the two counters default to 0.
      const pageNo = toNumberOrNull(parsed.pageNo);
      if (pageNo === null) return null;
      return {
        type: "page_fetched",
        termIndex,
        termTotal,
        searchTerm,
        pageNo,
        resultsOnPage: toNumberOrNull(parsed.resultsOnPage) ?? 0,
        totalCollected: toNumberOrNull(parsed.totalCollected) ?? 0,
      };
    }

    case "term_complete":
      return {
        type: "term_complete",
        termIndex,
        termTotal,
        searchTerm,
        jobsFoundTerm: toNumberOrNull(parsed.jobsFoundTerm) ?? 0,
      };

    default:
      return null;
  }
}
/**
 * Converts one raw dataset row into a job record tagged with the
 * "hiringcafe" source.
 *
 * A row without a usable `jobUrl` is rejected. Title and employer fall back
 * to placeholder text, `applicationLink` falls back to the job URL, and every
 * other optional field is left undefined when absent.
 *
 * @param row Untyped object read from the extractor's JSON output.
 * @returns The mapped job, or null when the row has no job URL.
 */
function mapHiringCafeRow(row: HiringCafeRawJob): CreateJobInput | null {
  // Optional fields: collapse "no value" (null) to undefined.
  const optionalText = (value: unknown): string | undefined =>
    toStringOrNull(value) ?? undefined;

  const jobUrl = toStringOrNull(row.jobUrl);
  if (!jobUrl) {
    return null;
  }

  const title = toStringOrNull(row.title) ?? "Unknown Title";
  const employer = toStringOrNull(row.employer) ?? "Unknown Employer";
  const applicationLink = toStringOrNull(row.applicationLink) ?? jobUrl;

  return {
    source: "hiringcafe",
    sourceJobId: optionalText(row.sourceJobId),
    title,
    employer,
    jobUrl,
    applicationLink,
    location: optionalText(row.location),
    salary: optionalText(row.salary),
    datePosted: optionalText(row.datePosted),
    jobDescription: optionalText(row.jobDescription),
    jobType: optionalText(row.jobType),
  };
}
/**
 * Loads the extractor's output file and converts it into job records.
 *
 * Non-array content yields an empty list. Elements that are not plain
 * objects, or that mapHiringCafeRow() rejects, are skipped. Duplicates
 * (same `sourceJobId`, or same `jobUrl` when there is no id) keep only their
 * first occurrence, and the original order is preserved.
 *
 * @returns The de-duplicated jobs in file order.
 * @throws When the file cannot be read or does not contain valid JSON.
 */
async function readDataset(): Promise<CreateJobInput[]> {
  const fileText = await readFile(DATASET_PATH, "utf-8");
  const rows: unknown = JSON.parse(fileText);
  if (!Array.isArray(rows)) return [];

  // A Map iterates in insertion order, so first-seen order is retained.
  const uniqueJobs = new Map<string, CreateJobInput>();
  for (const row of rows) {
    const isPlainObject =
      typeof row === "object" && row !== null && !Array.isArray(row);
    if (!isPlainObject) continue;

    const job = mapHiringCafeRow(row as HiringCafeRawJob);
    if (!job) continue;

    const dedupeKey = job.sourceJobId || job.jobUrl;
    if (!uniqueJobs.has(dedupeKey)) {
      uniqueJobs.set(dedupeKey, job);
    }
  }
  return Array.from(uniqueJobs.values());
}
/**
 * Resets the extractor's dataset directory to an empty state: the directory
 * and everything in it is removed (a missing directory is not an error) and
 * then recreated, so output from an earlier run cannot be read back.
 */
async function clearStorageDataset(): Promise<void> {
  const datasetDir = STORAGE_DATASET_DIR;
  await rm(datasetDir, { force: true, recursive: true });
  await mkdir(datasetDir, { recursive: true });
}
/**
 * Runs the Hiring Cafe extractor as a child process and returns the jobs it
 * wrote to its dataset file.
 *
 * The extractor is launched with `npm run start` when `npm` is runnable,
 * otherwise directly through the tsx CLI. It is configured through
 * environment variables (search terms, country, per-term cap, output path).
 * Output lines that carry the progress prefix are decoded and passed to
 * `options.onProgress`; every other line is forwarded to this process's
 * stdout/stderr.
 *
 * @param options Search terms, country, per-term cap and progress callback;
 *   all optional (defaults: ["web developer"], "united kingdom", 200).
 * @returns `{ success: true, jobs }` on a clean exit, otherwise
 *   `{ success: false, jobs: [], error }`. Failures are logged and reported
 *   through the result; this function does not throw for extractor errors.
 */
export async function runHiringCafe(
  options: RunHiringCafeOptions = {},
): Promise<HiringCafeResult> {
  const searchTerms =
    options.searchTerms && options.searchTerms.length > 0
      ? options.searchTerms
      : ["web developer"];
  // Normalise first, then default, so a whitespace-only country does not
  // reach the extractor as an empty string.
  const country =
    (options.country ?? "").trim().toLowerCase() || "united kingdom";
  // A non-finite cap (e.g. NaN from a failed parseInt upstream) would be
  // stringified into the environment as "NaN"; use the default instead.
  const requestedMaxJobs = options.maxJobsPerTerm ?? 200;
  const maxJobsPerTerm = Number.isFinite(requestedMaxJobs)
    ? requestedMaxJobs
    : 200;

  const useNpmCommand = canRunNpmCommand();
  if (!useNpmCommand && !TSX_CLI_PATH) {
    return {
      success: false,
      jobs: [],
      error: "Unable to execute Hiring Cafe extractor (npm/tsx unavailable)",
    };
  }

  try {
    // Start from an empty dataset directory so stale output is never read back.
    await clearStorageDataset();

    await new Promise<void>((resolve, reject) => {
      const extractorEnv = {
        ...process.env,
        JOBOPS_EMIT_PROGRESS: "1",
        HIRING_CAFE_SEARCH_TERMS: JSON.stringify(searchTerms),
        HIRING_CAFE_COUNTRY: country,
        HIRING_CAFE_MAX_JOBS_PER_TERM: String(maxJobsPerTerm),
        HIRING_CAFE_OUTPUT_JSON: DATASET_PATH,
      };

      const child = useNpmCommand
        ? spawn("npm", ["run", "start"], {
            cwd: HIRING_CAFE_DIR,
            stdio: ["ignore", "pipe", "pipe"],
            env: extractorEnv,
          })
        : (() => {
            const tsxCliPath = TSX_CLI_PATH;
            if (!tsxCliPath) {
              // Thrown inside the Promise executor, so it becomes a rejection.
              throw new Error(
                "Unable to execute Hiring Cafe extractor (npm/tsx unavailable)",
              );
            }
            return spawn(process.execPath, [tsxCliPath, "src/main.ts"], {
              cwd: HIRING_CAFE_DIR,
              stdio: ["ignore", "pipe", "pipe"],
              env: extractorEnv,
            });
          })();

      // Progress lines are consumed here; everything else is passed through
      // to the matching stream of the current process.
      const handleLine = (line: string, stream: NodeJS.WriteStream) => {
        const progressEvent = parseProgressLine(line);
        if (progressEvent) {
          options.onProgress?.(progressEvent);
          return;
        }
        stream.write(`${line}\n`);
      };

      const stdoutRl = child.stdout
        ? createInterface({ input: child.stdout })
        : null;
      const stderrRl = child.stderr
        ? createInterface({ input: child.stderr })
        : null;
      stdoutRl?.on("line", (line) => handleLine(line, process.stdout));
      stderrRl?.on("line", (line) => handleLine(line, process.stderr));

      // "error" and "close" can both fire for one child; settle exactly once
      // and release the line readers on either path.
      let settled = false;
      const settle = (failure?: Error) => {
        if (settled) return;
        settled = true;
        stdoutRl?.close();
        stderrRl?.close();
        if (failure) reject(failure);
        else resolve();
      };

      child.on("close", (code, signal) => {
        if (code === 0) {
          settle();
          return;
        }
        // The exit code is null when the child was ended by a signal; name
        // the signal instead of reporting "code null".
        const failure =
          code === null && signal
            ? new Error(
                `Hiring Cafe extractor was terminated by signal ${signal}`,
              )
            : new Error(`Hiring Cafe extractor exited with code ${code}`);
        settle(failure);
      });
      child.on("error", (error) => settle(error));
    });

    const jobs = await readDataset();
    return { success: true, jobs };
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    logger.warn("Hiring Cafe extractor run failed", {
      error: message,
      details: sanitizeUnknown(error),
    });
    return { success: false, jobs: [], error: message };
  }
}

31
package-lock.json generated
View File

@ -153,6 +153,33 @@
"undici-types": "~7.16.0" "undici-types": "~7.16.0"
} }
}, },
"extractors/hiringcafe": {
"name": "hiringcafe-extractor",
"version": "0.0.1",
"dependencies": {
"camoufox-js": "^0.8.0",
"job-ops-shared": "^1.0.0",
"playwright": "^1.57.0",
"tsx": "^4.4.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"optionalDependencies": {
"impit-linux-x64-gnu": "^0.1.0"
}
},
"extractors/hiringcafe/node_modules/@types/node": {
"version": "24.10.13",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.13.tgz",
"integrity": "sha512-oH72nZRfDv9lADUBSo104Aq7gPHpQZc4BTx38r9xf9pg5LfP6EzSyH2n7qFmmxRQXh7YlUXODcYsg6PuTDSxGg==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/ukvisajobs": { "extractors/ukvisajobs": {
"name": "ukvisajobs-extractor", "name": "ukvisajobs-extractor",
"version": "0.0.1", "version": "0.0.1",
@ -13175,6 +13202,10 @@
"node": ">=16.0.0" "node": ">=16.0.0"
} }
}, },
"node_modules/hiringcafe-extractor": {
"resolved": "extractors/hiringcafe",
"link": true
},
"node_modules/history": { "node_modules/history": {
"version": "4.10.1", "version": "4.10.1",
"resolved": "https://registry.npmjs.org/history/-/history-4.10.1.tgz", "resolved": "https://registry.npmjs.org/history/-/history-4.10.1.tgz",

View File

@ -126,6 +126,7 @@ export type JobSource =
| "glassdoor" | "glassdoor"
| "ukvisajobs" | "ukvisajobs"
| "adzuna" | "adzuna"
| "hiringcafe"
| "manual"; | "manual";
export interface Job { export interface Job {