diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 97ca177..b6866ed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,6 +52,7 @@ jobs: project: - orchestrator - adzuna-extractor + - hiringcafe-extractor - gradcracker-extractor - ukvisajobs-extractor steps: diff --git a/Dockerfile b/Dockerfile index 2cfe57b..e7addd9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,6 +36,7 @@ COPY docs-site/package*.json ./docs-site/ COPY shared/package*.json ./shared/ COPY orchestrator/package*.json ./orchestrator/ COPY extractors/adzuna/package*.json ./extractors/adzuna/ +COPY extractors/hiringcafe/package*.json ./extractors/hiringcafe/ COPY extractors/gradcracker/package*.json ./extractors/gradcracker/ COPY extractors/ukvisajobs/package*.json ./extractors/ukvisajobs/ @@ -54,6 +55,7 @@ COPY shared ./shared COPY docs-site ./docs-site COPY orchestrator ./orchestrator COPY extractors/adzuna ./extractors/adzuna +COPY extractors/hiringcafe ./extractors/hiringcafe COPY extractors/gradcracker ./extractors/gradcracker COPY extractors/jobspy ./extractors/jobspy COPY extractors/ukvisajobs ./extractors/ukvisajobs @@ -100,6 +102,7 @@ COPY docs-site/package*.json ./docs-site/ COPY shared/package*.json ./shared/ COPY orchestrator/package*.json ./orchestrator/ COPY extractors/adzuna/package*.json ./extractors/adzuna/ +COPY extractors/hiringcafe/package*.json ./extractors/hiringcafe/ COPY extractors/gradcracker/package*.json ./extractors/gradcracker/ COPY extractors/ukvisajobs/package*.json ./extractors/ukvisajobs/ @@ -114,6 +117,7 @@ COPY --from=builder /app/docs-site/build ./orchestrator/dist/docs COPY shared ./shared COPY orchestrator ./orchestrator COPY extractors/adzuna ./extractors/adzuna +COPY extractors/hiringcafe ./extractors/hiringcafe COPY extractors/gradcracker ./extractors/gradcracker COPY extractors/jobspy ./extractors/jobspy COPY extractors/ukvisajobs ./extractors/ukvisajobs diff --git a/docker-compose.yml b/docker-compose.yml index 
99c16f2..fddd092 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -47,6 +47,9 @@ services: - path: ./extractors/gradcracker/src target: /app/extractors/gradcracker/src action: sync+restart + - path: ./extractors/hiringcafe/src + target: /app/extractors/hiringcafe/src + action: sync+restart - path: ./extractors/ukvisajobs/src target: /app/extractors/ukvisajobs/src action: sync+restart diff --git a/docs-site/docs/extractors/adzuna.md b/docs-site/docs/extractors/adzuna.md index f3ad6d0..43e77c2 100644 --- a/docs-site/docs/extractors/adzuna.md +++ b/docs-site/docs/extractors/adzuna.md @@ -7,6 +7,8 @@ sidebar_position: 6 ## What it is +Original website: [adzuna.com](https://www.adzuna.com) + Adzuna is an API-backed extractor implemented in two lean pieces: 1. `extractors/adzuna/src/main.ts` fetches paginated Adzuna search results and writes `jobs.json`. diff --git a/docs-site/docs/extractors/gradcracker.md b/docs-site/docs/extractors/gradcracker.md index 6933ed5..b04eec3 100644 --- a/docs-site/docs/extractors/gradcracker.md +++ b/docs-site/docs/extractors/gradcracker.md @@ -7,6 +7,8 @@ sidebar_position: 2 A plain-English walkthrough of the Gradcracker extractor in `extractors/gradcracker`. +Original website: [gradcracker.com](https://www.gradcracker.com) + ## Big picture The crawler builds search URLs, scrapes listing pages, then opens job details for descriptions and apply URLs. diff --git a/docs-site/docs/extractors/hiring-cafe.md b/docs-site/docs/extractors/hiring-cafe.md new file mode 100644 index 0000000..c4e4975 --- /dev/null +++ b/docs-site/docs/extractors/hiring-cafe.md @@ -0,0 +1,74 @@ +--- +id: hiring-cafe +title: Hiring Cafe Extractor +description: Browser-backed Hiring Cafe extraction integrated into the pipeline source selector. 
+sidebar_position: 7 +--- + +## What it is + +Original website: [hiring.cafe](https://hiring.cafe) + +Special thanks: Initial implementation inspiration came from [umur957/hiring-cafe-job-scraper](https://github.com/umur957/hiring-cafe-job-scraper). + +Hiring Cafe is a browser-backed extractor that queries Hiring Cafe search APIs and maps results into the orchestrator `CreateJobInput` shape. + +Implementation split: + +1. `extractors/hiringcafe/src/main.ts` builds search state, calls Hiring Cafe APIs, and writes dataset JSON. +2. `orchestrator/src/server/services/hiring-cafe.ts` runs the extractor, streams progress events, and maps rows for pipeline import. + +## Why it exists + +Hiring Cafe adds another non-credentialed source that can be enabled from the existing source picker, without adding new settings UI. + +It also supports term-by-term search and country-aware search state using the same pipeline knobs you already set for automatic runs. + +## How to use it + +1. Open **Run jobs** and choose **Automatic**. +2. **Hiring Cafe** is enabled by default in **Sources** (toggle it off if you do not want it for this run). +3. Set your existing automatic run knobs: + - `searchTerms` drive per-term Hiring Cafe `searchQuery`. + - selected country maps into Hiring Cafe location search state. + - run budget path (`jobspyResultsWanted`) is reused as the max jobs-per-term cap. +4. Start the run and watch progress in the pipeline progress card. + +Defaults and constraints: + +- No new Hiring Cafe settings fields were added. +- `worldwide` and `usa/ca` run in broad mode without a strict country location filter. +- Hiring Cafe is enabled by default in source selection. +- `HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS` controls recency window when running extractor directly (default `7`). 
+ +Local run example: + +```bash +HIRING_CAFE_SEARCH_TERMS='["backend engineer"]' \ +HIRING_CAFE_COUNTRY='united kingdom' \ +HIRING_CAFE_MAX_JOBS_PER_TERM='50' \ +npm --workspace hiringcafe-extractor run start +``` + +## Common problems + +### Hiring Cafe returns 429 / Vercel security checkpoint + +- The extractor first attempts Camoufox-backed Firefox and falls back to vanilla Firefox startup if Camoufox is unstable locally. +- If upstream blocks continue, retry later or reduce run concurrency at the pipeline level by selecting fewer sources. + +### Hiring Cafe does not appear in sources + +- Check that client is running on latest build containing the new source list. +- Hiring Cafe is source-only and does not require credentials, so it should appear once the new build is loaded. + +### Results are lower than expected + +- Cap is tied to automatic run budget path (`jobspyResultsWanted`) and search term count. +- Country mapping can narrow results when a strict country location is applied. + +## Related pages + +- [Extractors Overview](/docs/next/extractors/overview) +- [Pipeline Run](/docs/next/features/pipeline-run) +- [Settings](/docs/next/features/settings) diff --git a/docs-site/docs/extractors/jobspy.md b/docs-site/docs/extractors/jobspy.md index 7390967..9d55e91 100644 --- a/docs-site/docs/extractors/jobspy.md +++ b/docs-site/docs/extractors/jobspy.md @@ -7,6 +7,11 @@ sidebar_position: 3 A walkthrough of the JobSpy extractor for Indeed, LinkedIn, and Glassdoor. +Original websites: +- [indeed.com](https://www.indeed.com) +- [linkedin.com/jobs](https://www.linkedin.com/jobs) +- [glassdoor.com](https://www.glassdoor.com) + ## Big picture JobSpy runs as a Python script per search term, writes JSON, then orchestrator ingests and normalizes into internal job shape. 
diff --git a/docs-site/docs/extractors/overview.md b/docs-site/docs/extractors/overview.md index 203602f..553bbfa 100644 --- a/docs-site/docs/extractors/overview.md +++ b/docs-site/docs/extractors/overview.md @@ -14,6 +14,7 @@ This page helps you choose the right extractor for your run, understand key cons | [Gradcracker](/docs/next/extractors/gradcracker) | UK graduate roles from Gradcracker | Crawling stability depends on page structure and anti-bot behavior; tuned for low concurrency | `GRADCRACKER_SEARCH_TERMS`, `GRADCRACKER_MAX_JOBS_PER_TERM`, `JOBOPS_SKIP_APPLY_FOR_EXISTING` | Scrapes listing metadata, then detail pages and apply URL resolution | | [JobSpy](/docs/next/extractors/jobspy) | Multi-source discovery (Indeed, LinkedIn, Glassdoor) | Requires Python wrapper execution per term; source availability and quality vary by site/location | `JOBSPY_SITES`, `JOBSPY_SEARCH_TERMS`, `JOBSPY_RESULTS_WANTED`, `JOBSPY_HOURS_OLD`, `JOBSPY_LINKEDIN_FETCH_DESCRIPTION` | Produces JSON per term, then orchestrator normalizes and de-duplicates by `jobUrl` | | [Adzuna](/docs/next/extractors/adzuna) | API-based multi-country discovery with low scraping overhead | Requires valid App ID/App Key; country must be in Adzuna-supported list | `ADZUNA_APP_ID`, `ADZUNA_APP_KEY`, `ADZUNA_MAX_JOBS_PER_TERM` | API pagination to dataset output; orchestrator maps progress and de-duplicates by `sourceJobId`/`jobUrl` | +| [Hiring Cafe](/docs/next/extractors/hiring-cafe) | Browser-backed discovery using Hiring Cafe search APIs | Subject to upstream anti-bot checks; uses browser context and encoded search-state payloads | `HIRING_CAFE_SEARCH_TERMS`, `HIRING_CAFE_COUNTRY`, `HIRING_CAFE_MAX_JOBS_PER_TERM`, `HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS` | Uses existing pipeline term/country/budget knobs and maps directly to normalized jobs | | [UKVisaJobs](/docs/next/extractors/ukvisajobs) | UK visa sponsorship-focused roles | Requires authenticated session and periodic token/cookie refresh | 
`UKVISAJOBS_EMAIL`, `UKVISAJOBS_PASSWORD`, `UKVISAJOBS_MAX_JOBS`, `UKVISAJOBS_SEARCH_KEYWORD` | API pagination + dataset output; orchestrator de-dupes and may fetch missing descriptions | | [Manual Import](/docs/next/extractors/manual) | One-off jobs not covered by scrapers | Inference quality depends on model/provider and input quality; some URLs cannot be fetched reliably | App/API endpoints (`/api/manual-jobs/infer`, `/api/manual-jobs/import`) | Accepts text/HTML/URL, runs inference, then saves and scores job after review | @@ -21,6 +22,7 @@ This page helps you choose the right extractor for your run, understand key cons - Use **JobSpy** for broad first-pass sourcing across common boards. - Use **Adzuna** when you want API-first discovery in supported non-UK markets. +- Use **Hiring Cafe** when you want another term/country-driven source without adding credentials. - Use **Gradcracker** when targeting graduate pipelines in the UK. - Use **UKVisaJobs** for sponsorship-specific UK searches. - Use **Manual Import** when you already have a specific posting and need direct import. @@ -32,5 +34,6 @@ Many runs combine sources: broad discovery first, then manual import for high-pr - [Gradcracker](/docs/next/extractors/gradcracker) - [JobSpy](/docs/next/extractors/jobspy) - [Adzuna](/docs/next/extractors/adzuna) +- [Hiring Cafe](/docs/next/extractors/hiring-cafe) - [UKVisaJobs](/docs/next/extractors/ukvisajobs) - [Manual Import](/docs/next/extractors/manual) diff --git a/docs-site/docs/extractors/ukvisajobs.md b/docs-site/docs/extractors/ukvisajobs.md index 133042a..96ad6bc 100644 --- a/docs-site/docs/extractors/ukvisajobs.md +++ b/docs-site/docs/extractors/ukvisajobs.md @@ -7,6 +7,8 @@ sidebar_position: 5 UKVisaJobs is the most complex extractor because authenticated sessions are required. 
+Original website: [my.ukvisajobs.com](https://my.ukvisajobs.com) + ## Big picture Two layers: diff --git a/docs-site/sidebars.ts b/docs-site/sidebars.ts index d6e36a4..8c69311 100644 --- a/docs-site/sidebars.ts +++ b/docs-site/sidebars.ts @@ -45,6 +45,7 @@ const sidebars: SidebarsConfig = { "extractors/gradcracker", "extractors/jobspy", "extractors/adzuna", + "extractors/hiring-cafe", "extractors/manual", "extractors/ukvisajobs", ], diff --git a/extractors/hiringcafe/README.md b/extractors/hiringcafe/README.md new file mode 100644 index 0000000..8298cca --- /dev/null +++ b/extractors/hiringcafe/README.md @@ -0,0 +1,20 @@ +# Hiring Cafe Extractor + +Browser-backed extractor for Hiring Cafe search APIs. + +Special thanks: initial implementation inspiration came from [umur957/hiring-cafe-job-scraper](https://github.com/umur957/hiring-cafe-job-scraper). + +## Environment + +- `HIRING_CAFE_SEARCH_TERMS` (JSON array or `|` / comma / newline-delimited) +- `HIRING_CAFE_COUNTRY` (default: `united kingdom`) +- `HIRING_CAFE_MAX_JOBS_PER_TERM` (default: `200`) +- `HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS` (default: `7`) +- `HIRING_CAFE_OUTPUT_JSON` (default: `storage/datasets/default/jobs.json`) +- `JOBOPS_EMIT_PROGRESS=1` to emit `JOBOPS_PROGRESS` events +- `HIRING_CAFE_HEADLESS=false` to run headed + +## Notes + +- The extractor uses `s = base64(url-encoded JSON search state)`. +- `worldwide` and `usa/ca` are treated as broad search modes without hard country location filters. 
diff --git a/extractors/hiringcafe/package.json b/extractors/hiringcafe/package.json new file mode 100644 index 0000000..7832d9f --- /dev/null +++ b/extractors/hiringcafe/package.json @@ -0,0 +1,26 @@ +{ + "name": "hiringcafe-extractor", + "version": "0.0.1", + "type": "module", + "description": "Hiring Cafe extractor - fetches jobs via browser-backed API requests", + "main": "src/main.ts", + "dependencies": { + "camoufox-js": "^0.8.0", + "job-ops-shared": "^1.0.0", + "playwright": "^1.57.0", + "tsx": "^4.4.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + }, + "optionalDependencies": { + "impit-linux-x64-gnu": "^0.1.0" + }, + "scripts": { + "start": "tsx src/main.ts", + "start:dev": "tsx src/main.ts", + "check:types": "tsc --noEmit", + "get-binaries": "camoufox-js fetch" + } +} diff --git a/extractors/hiringcafe/src/country-map.ts b/extractors/hiringcafe/src/country-map.ts new file mode 100644 index 0000000..6878906 --- /dev/null +++ b/extractors/hiringcafe/src/country-map.ts @@ -0,0 +1,118 @@ +export function normalizeCountryKey(value: string | null | undefined): string { /* trims, lower-cases and folds a few aliases (uk, us, usa, türkiye, czech republic); nullish input yields "" */ + const normalized = value?.trim().toLowerCase() ?? 
""; + if (normalized === "uk") return "united kingdom"; + if (normalized === "us" || normalized === "usa") return "united states"; + if (normalized === "türkiye") return "turkey"; + if (normalized === "czech republic") return "czechia"; + return normalized; +} + +export interface HiringCafeCountryLocation { + formatted_address: string; + types: ["country"]; + id: "user_country"; + address_components: Array<{ + long_name: string; + short_name: string; + types: ["country"]; + }>; + options: { + flexible_regions: ["anywhere_in_continent", "anywhere_in_world"]; + }; +} + +const GLOBAL_SEARCH_KEYS = new Set(["worldwide", "usa/ca"]); + +const COUNTRY_NAME_OVERRIDES: Record<string, string> = { + "united states": "United States", + "united kingdom": "United Kingdom", + "united arab emirates": "United Arab Emirates", + "new zealand": "New Zealand", + "south korea": "South Korea", + "south africa": "South Africa", + "costa rica": "Costa Rica", + "saudi arabia": "Saudi Arabia", + "hong kong": "Hong Kong", + czechia: "Czechia", + türkiye: "Turkey", + turkey: "Turkey", +}; + +const ISO2_ALIASES: Record<string, string> = { + "united states": "US", + "united kingdom": "GB", + "united arab emirates": "AE", + "new zealand": "NZ", + "south korea": "KR", + "south africa": "ZA", + "costa rica": "CR", + "saudi arabia": "SA", + "hong kong": "HK", + czechia: "CZ", + türkiye: "TR", + turkey: "TR", +}; + +const regionNameMap = buildRegionNameMap(); /* normalized English region name -> two-letter region code */ + +function buildRegionNameMap(): Map<string, string> { /* probes every two-letter code AA..ZZ and keeps those Intl.DisplayNames can name */ + const names = new Intl.DisplayNames(["en"], { type: "region" }); + const map = new Map<string, string>(); + + for (let i = 65; i <= 90; i += 1) { + for (let j = 65; j <= 90; j += 1) { + const iso2 = String.fromCharCode(i, j); + const displayName = names.of(iso2); + if (!displayName || displayName === iso2) continue; /* unknown codes are echoed back unchanged */ + map.set(normalizeCountryKey(displayName), iso2); + } + } + + return map; +} + +function toCountryLabel(countryKey: string): string { + const override = COUNTRY_NAME_OVERRIDES[countryKey]; + if (override) return override; + return 
countryKey.replace(/(?<![\p{L}\p{M}\p{N}_])[\p{L}\p{N}_]/gu, (char) => char.toUpperCase()); /* Unicode-aware word start: the ASCII-only \b\w also upper-cased letters that follow accented characters ("curaçao" became "CuraçAo") */ +} + +function toIso2(countryKey: string): string | null { /* explicit aliases win over the Intl-derived map; null when the key is unknown */ + if (ISO2_ALIASES[countryKey]) { + return ISO2_ALIASES[countryKey]; + } + return regionNameMap.get(countryKey) ?? null; +} + +export function shouldUseGlobalLocation(countryInput?: string | null): boolean { /* true for empty input and for the broad-search keys */ + const countryKey = normalizeCountryKey(countryInput); + return !countryKey || GLOBAL_SEARCH_KEYS.has(countryKey); +} + +export function resolveHiringCafeCountryLocation( + countryInput?: string | null, +): HiringCafeCountryLocation | null { /* null means "no country filter": empty input, a broad-search key, or a country with no known two-letter code */ + const countryKey = normalizeCountryKey(countryInput); + if (!countryKey || GLOBAL_SEARCH_KEYS.has(countryKey)) return null; + + const iso2 = toIso2(countryKey); + if (!iso2) return null; + + const longName = toCountryLabel(countryKey); + + return { + formatted_address: longName, + types: ["country"], + id: "user_country", + address_components: [ + { + long_name: longName, + short_name: iso2, + types: ["country"], + }, + ], + options: { + flexible_regions: ["anywhere_in_continent", "anywhere_in_world"], + }, + }; +} diff --git a/extractors/hiringcafe/src/default-search-state.ts b/extractors/hiringcafe/src/default-search-state.ts new file mode 100644 index 0000000..77fe33e --- /dev/null +++ b/extractors/hiringcafe/src/default-search-state.ts @@ -0,0 +1,91 @@ +import type { HiringCafeCountryLocation } from "./country-map.js"; + +export interface HiringCafeSearchState { + locations: HiringCafeCountryLocation[]; + workplaceTypes: Array<"Remote" | "Hybrid" | "Onsite">; + defaultToUserLocation: boolean; + userLocation: null; + commitmentTypes: string[]; + seniorityLevel: string[]; + roleTypes: string[]; + roleYoeRange: [number, number]; + excludeIfRoleYoeIsNotSpecified: boolean; + managementYoeRange: [number, number]; + excludeIfManagementYoeIsNotSpecified: boolean; + securityClearances: string[]; + searchQuery: string; + dateFetchedPastNDays: number; + hiddenCompanies: string[]; + sortBy: "default"; + 
companyPublicOrPrivate: "all"; + latestInvestmentYearRange: [null, null]; + latestInvestmentSeries: string[]; + latestInvestmentAmount: null; + latestInvestmentCurrency: string[]; + investors: string[]; + excludedInvestors: string[]; + isNonProfit: "all"; + companySizeRanges: string[]; + minYearFounded: null; + maxYearFounded: null; + excludedLatestInvestmentSeries: string[]; +} + +export function createDefaultSearchState(args: { + searchQuery: string; + location: HiringCafeCountryLocation | null; + dateFetchedPastNDays: number; +}): HiringCafeSearchState { /* builds the search state for one term; only the query, the optional country location and the recency window vary, every other field is a fixed literal */ + return { + locations: args.location ? [args.location] : [], /* a null location produces an empty locations list */ + workplaceTypes: ["Remote", "Hybrid", "Onsite"], + defaultToUserLocation: false, + userLocation: null, + commitmentTypes: [ + "Full Time", + "Part Time", + "Contract", + "Internship", + "Temporary", + "Seasonal", + "Volunteer", + ], + seniorityLevel: [ + "No Prior Experience Required", + "Entry Level", + "Mid Level", + "Senior Level", + ], + roleTypes: ["Individual Contributor", "People Manager"], + roleYoeRange: [0, 20], + excludeIfRoleYoeIsNotSpecified: false, + managementYoeRange: [0, 20], + excludeIfManagementYoeIsNotSpecified: false, + securityClearances: [ + "None", + "Confidential", + "Secret", + "Top Secret", + "Top Secret/SCI", + "Public Trust", + "Interim Clearances", + "Other", + ], + searchQuery: args.searchQuery, + dateFetchedPastNDays: args.dateFetchedPastNDays, + hiddenCompanies: [], + sortBy: "default", + companyPublicOrPrivate: "all", + latestInvestmentYearRange: [null, null], + latestInvestmentSeries: [], + latestInvestmentAmount: null, + latestInvestmentCurrency: [], + investors: [], + excludedInvestors: [], + isNonProfit: "all", + companySizeRanges: [], + minYearFounded: null, + maxYearFounded: null, + excludedLatestInvestmentSeries: [], + }; +} diff --git a/extractors/hiringcafe/src/main.ts b/extractors/hiringcafe/src/main.ts new file mode 100644 index 0000000..c4b9e0b --- /dev/null +++ b/extractors/hiringcafe/src/main.ts @@ -0,0 
+1,439 @@ +import { mkdir, writeFile } from "node:fs/promises"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { launchOptions } from "camoufox-js"; +import { + toNumberOrNull, + toStringOrNull, +} from "job-ops-shared/utils/type-conversion"; +import { firefox, type Page } from "playwright"; +import { + normalizeCountryKey, + resolveHiringCafeCountryLocation, +} from "./country-map.js"; +import { createDefaultSearchState } from "./default-search-state.js"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const BASE_URL = "https://hiring.cafe"; +const JOBOPS_PROGRESS_PREFIX = "JOBOPS_PROGRESS "; +const DEFAULT_MAX_JOBS_PER_TERM = 200; +const DEFAULT_SEARCH_TERM = "web developer"; +const DEFAULT_DATE_FETCHED_PAST_N_DAYS = 7; /* matches the default documented in the extractor README and docs page */ +const PAGE_LIMIT = 50; /* upper bound on page requests per search term */ + +type RawHiringCafeJob = Record<string, unknown>; + +interface ExtractedJob { + source: "hiringcafe"; + sourceJobId?: string; + title: string; + employer: string; + jobUrl: string; + applicationLink: string; + location?: string; + salary?: string; + datePosted?: string; + jobDescription?: string; + jobType?: string; +} + +interface BrowserApiResponse { + ok: boolean; + status: number; + statusText: string; + data: unknown; + responseText: string; +} + +function emitProgress(payload: Record<string, unknown>): void { /* prints a prefixed JSON progress line, only when JOBOPS_EMIT_PROGRESS is "1" */ + if (process.env.JOBOPS_EMIT_PROGRESS !== "1") return; + console.log(`${JOBOPS_PROGRESS_PREFIX}${JSON.stringify(payload)}`); +} + +function parsePositiveInt(input: string | undefined, fallback: number): number { + const parsed = input ? 
Number.parseInt(input, 10) : Number.NaN; + if (!Number.isFinite(parsed) || parsed < 1) return fallback; /* missing, non-numeric and values below 1 all use the fallback */ + return parsed; +} + +function parseSearchTerms(raw: string | undefined): string[] { /* accepts a JSON array or a list delimited by pipe, newline or comma; empty input yields the default term */ + if (!raw || raw.trim().length === 0) return [DEFAULT_SEARCH_TERM]; + + const trimmed = raw.trim(); + if (trimmed.startsWith("[")) { + try { + const parsed = JSON.parse(trimmed) as unknown; + if (Array.isArray(parsed)) { + const terms = parsed + .map((value) => toStringOrNull(value)) + .filter((value): value is string => Boolean(value)); + if (terms.length > 0) return terms; + } + } catch { + // Fall through to delimiter parsing. + } + } + + const delimiter = trimmed.includes("|") + ? "|" + : trimmed.includes("\n") + ? "\n" + : ","; + + const terms = trimmed + .split(delimiter) + .map((value) => value.trim()) + .filter(Boolean); + + return terms.length > 0 ? terms : [DEFAULT_SEARCH_TERM]; +} + +function encodeSearchState(searchState: unknown): string { /* JSON -> URL-encoded -> base64 */ + const json = JSON.stringify(searchState); + const urlEncodedJson = encodeURIComponent(json); + return Buffer.from(urlEncodedJson, "utf-8").toString("base64"); +} + +function asRecord(value: unknown): Record<string, unknown> | null { /* non-null, non-array objects only; anything else is null */ + if (!value || typeof value !== "object" || Array.isArray(value)) return null; + return value as Record<string, unknown>; +} + +function asStringArray(value: unknown): string[] { /* non-arrays become []; entries toStringOrNull cannot convert are dropped */ + if (!Array.isArray(value)) return []; + return value + .map((item) => toStringOrNull(item)) + .filter((item): item is string => Boolean(item)); +} + +function firstArrayValue(value: unknown): string | null { + const values = asStringArray(value); + return values.length > 0 ? 
values[0] : null; +} + +function formatCompensation( + processedJobData: Record<string, unknown> | null, +): string | undefined { /* joins currency, rounded yearly min/max and the listed frequency; undefined when neither bound is numeric */ + if (!processedJobData) return undefined; + + const min = toNumberOrNull(processedJobData.yearly_min_compensation); + const max = toNumberOrNull(processedJobData.yearly_max_compensation); + if (min === null && max === null) return undefined; + + const currency = toStringOrNull( + processedJobData.listed_compensation_currency, + ); + const frequency = + toStringOrNull(processedJobData.listed_compensation_frequency) ?? "Yearly"; + + const amount = + min !== null && max !== null + ? `${Math.round(min)}-${Math.round(max)}` + : min !== null + ? `${Math.round(min)}+` + : `${Math.round(max ?? 0)}`; + + const parts = [currency, amount, frequency ? `/ ${frequency}` : ""] + .filter(Boolean) + .join(" ") + .trim(); + + return parts || undefined; +} + +function mapHiringCafeJob(raw: RawHiringCafeJob): ExtractedJob | null { /* returns null when the row has no apply_url; title and employer fall back to placeholder strings */ + const jobInformation = asRecord(raw.job_information); + const processed = asRecord(raw.v5_processed_job_data); + const companyInfo = asRecord(jobInformation?.company_info); + + const sourceJobId = + toStringOrNull(raw.id) ?? + toStringOrNull(raw.objectID) ?? + toStringOrNull(raw.original_source_id) ?? + toStringOrNull(raw.requisition_id) ?? + undefined; + + const jobUrl = toStringOrNull(raw.apply_url); + if (!jobUrl) return null; + + const title = + toStringOrNull(jobInformation?.title) ?? + toStringOrNull(jobInformation?.job_title_raw) ?? + toStringOrNull(processed?.core_job_title) ?? + "Unknown Title"; + + const employer = + toStringOrNull(companyInfo?.name) ?? + toStringOrNull(processed?.company_name) ?? + "Unknown Employer"; + + const location = + toStringOrNull(processed?.formatted_workplace_location) ?? + firstArrayValue(processed?.workplace_cities) ?? + firstArrayValue(processed?.workplace_states) ?? + firstArrayValue(processed?.workplace_countries) ?? 
+ undefined; + + const commitments = asStringArray(processed?.commitment); + const jobType = commitments.length > 0 ? commitments.join(", ") : undefined; + + return { + source: "hiringcafe", + sourceJobId, + title, + employer, + jobUrl, + applicationLink: jobUrl, + location, + salary: formatCompensation(processed), + datePosted: toStringOrNull(processed?.estimated_publish_date) ?? undefined, + jobDescription: toStringOrNull(jobInformation?.description) ?? undefined, + jobType, + }; +} + +function extractResultsBatch(payload: unknown): RawHiringCafeJob[] { /* accepts a bare array or an object with a results array; keeps plain-object rows only */ + if (Array.isArray(payload)) { + return payload.filter( + (item): item is RawHiringCafeJob => + Boolean(item) && typeof item === "object" && !Array.isArray(item), + ); + } + + const payloadRecord = asRecord(payload); + const results = payloadRecord?.results; + if (!Array.isArray(results)) return []; + + return results.filter( + (item): item is RawHiringCafeJob => + Boolean(item) && typeof item === "object" && !Array.isArray(item), + ); +} + +function parseTotalCount(payload: unknown): number | null { /* reads the total field of an object payload; null otherwise */ + const payloadRecord = asRecord(payload); + if (!payloadRecord) return null; + return toNumberOrNull(payloadRecord.total); +} + +async function callHiringCafeApi( + page: Page, + endpoint: string, + params: Record<string, string>, +): Promise<unknown> { /* issues the GET from inside the page with credentials included; throws when the response is not ok or its body is not JSON */ + const response = await page.evaluate( + async ({ endpointArg, paramsArg }) => { + const url = new URL(endpointArg, window.location.origin); + for (const [key, value] of Object.entries(paramsArg)) { + url.searchParams.set(key, value); + } + + const res = await fetch(url.toString(), { + method: "GET", + credentials: "include", + headers: { + Accept: "application/json, text/plain, */*", + }, + }); + + const text = await res.text(); + let data: unknown = null; + try { + data = JSON.parse(text); + } catch { + // Keep response text for diagnostics. 
+ } + + const output: BrowserApiResponse = { + ok: res.ok, + status: res.status, + statusText: res.statusText, + data, + responseText: text, + }; + + return output; + }, + { endpointArg: endpoint, paramsArg: params }, + ); + + const result = response as BrowserApiResponse; + + if (!result.ok) { + const snippet = result.responseText.slice(0, 250); + throw new Error( + `Hiring Cafe API ${endpoint} failed (${result.status} ${result.statusText}): ${snippet}`, + ); + } + + if (result.data === null) { + const snippet = result.responseText.slice(0, 250); + throw new Error( + `Hiring Cafe API ${endpoint} returned non-JSON response: ${snippet}`, + ); + } + + return result.data; +} + +async function run(): Promise<void> { /* entry point: reads the HIRING_CAFE_* environment variables, collects jobs term by term and writes them as JSON */ + const searchTerms = parseSearchTerms(process.env.HIRING_CAFE_SEARCH_TERMS); + const country = normalizeCountryKey( + process.env.HIRING_CAFE_COUNTRY ?? "united kingdom", + ); + const maxJobsPerTerm = parsePositiveInt( + process.env.HIRING_CAFE_MAX_JOBS_PER_TERM, + DEFAULT_MAX_JOBS_PER_TERM, + ); + const dateFetchedPastNDays = parsePositiveInt( + process.env.HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS, + DEFAULT_DATE_FETCHED_PAST_N_DAYS, + ); + const outputPath = + process.env.HIRING_CAFE_OUTPUT_JSON || + join(__dirname, "../storage/datasets/default/jobs.json"); + const headless = process.env.HIRING_CAFE_HEADLESS !== "false"; /* headless unless explicitly set to "false" */ + + let browser = await firefox.launch( + await launchOptions({ + headless, + humanize: true, + geoip: true, + }), + ); + let context = await browser.newContext(); + let page = await context.newPage(); + + const allJobs: ExtractedJob[] = []; + const seen = new Set<string>(); /* dedupe keys shared across all search terms */ + + try { + const initializePage = async () => { + await page.goto(BASE_URL, { + waitUntil: "domcontentloaded", + timeout: 60_000, + }); + await page.waitForTimeout(2_000); + }; + + try { + await initializePage(); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + console.warn( + `Camoufox browser startup was unstable, retrying with vanilla Firefox: ${message}`, + ); + await browser.close(); + browser = await firefox.launch({ headless }); + context = await browser.newContext(); + page = await context.newPage(); + await initializePage(); /* second attempt is not guarded: a failure here propagates out of run() */ + } + + for (let i = 0; i < searchTerms.length; i += 1) { + const searchTerm = searchTerms[i]; + const termIndex = i + 1; /* 1-based index used in progress events */ + + emitProgress({ + event: "term_start", + termIndex, + termTotal: searchTerms.length, + searchTerm, + }); + + const location = resolveHiringCafeCountryLocation(country); + const searchState = createDefaultSearchState({ + searchQuery: searchTerm, + location, + dateFetchedPastNDays, + }); + const encodedSearchState = encodeSearchState(searchState); + + let totalAvailable: number | null = null; + try { + const countPayload = await callHiringCafeApi( + page, + "/api/search-jobs/get-total-count", + { + s: encodedSearchState, + }, + ); + totalAvailable = parseTotalCount(countPayload); + } catch (error) { /* the count is advisory: on failure only a warning is logged and the per-term cap alone bounds the fetch */ + const message = error instanceof Error ? error.message : String(error); + console.warn( + `Hiring Cafe count request failed for term '${searchTerm}': ${message}`, + ); + } + + const termTarget = + totalAvailable !== null + ? 
Math.min(maxJobsPerTerm, totalAvailable) + : maxJobsPerTerm; + + let pageNo = 0; + let termCollected = 0; + + while (termCollected < termTarget && pageNo < PAGE_LIMIT) { + const size = Math.min(1000, termTarget); /* constant per term: shrinking the page size while incrementing the page number would make successive requests address overlapping or skipped windows (assumes page * size offsets - TODO confirm against the API) */ + const jobsPayload = await callHiringCafeApi(page, "/api/search-jobs", { + size: String(size), + page: String(pageNo), + s: encodedSearchState, + }); + + const batch = extractResultsBatch(jobsPayload); + if (batch.length === 0) break; + + let mappedOnPage = 0; + for (const rawJob of batch) { + if (termCollected >= termTarget) break; /* never collect more than the per-term target, even from a full page */ + const mapped = mapHiringCafeJob(rawJob); + if (!mapped) continue; + + const dedupeKey = mapped.sourceJobId || mapped.jobUrl; + if (seen.has(dedupeKey)) continue; + seen.add(dedupeKey); + + allJobs.push(mapped); + termCollected += 1; + mappedOnPage += 1; + } + + emitProgress({ + event: "page_fetched", + termIndex, + termTotal: searchTerms.length, + searchTerm, + pageNo, + resultsOnPage: mappedOnPage, + totalCollected: termCollected, + }); + + if (batch.length < size) break; /* short page: nothing further to fetch */ + pageNo += 1; + } + + emitProgress({ + event: "term_complete", + termIndex, + termTotal: searchTerms.length, + searchTerm, + jobsFoundTerm: termCollected, + }); + } + } finally { + await browser.close(); + } + + await mkdir(dirname(outputPath), { recursive: true }); + await writeFile(outputPath, `${JSON.stringify(allJobs, null, 2)}\n`, "utf-8"); + + console.log(`Hiring Cafe extractor wrote ${allJobs.length} jobs`); +} + +run().catch((error: unknown) => { + const message = error instanceof Error ? 
error.message : String(error); /* keep the thrown value in the log even when it is not an Error, as the other handlers in this file do */ + console.error(`Hiring Cafe extractor failed: ${message}`); + process.exitCode = 1; +}); diff --git a/extractors/hiringcafe/tsconfig.json b/extractors/hiringcafe/tsconfig.json new file mode 100644 index 0000000..6ace792 --- /dev/null +++ b/extractors/hiringcafe/tsconfig.json @@ -0,0 +1,13 @@ +{ + "compilerOptions": { + "module": "NodeNext", + "moduleResolution": "NodeNext", + "target": "ES2022", + "outDir": "dist", + "strict": true, + "noUnusedLocals": false, + "lib": ["ES2022", "DOM"], + "types": ["node"] + }, + "include": ["./src/**/*"] +} diff --git a/orchestrator/src/client/components/PipelineProgress.tsx b/orchestrator/src/client/components/PipelineProgress.tsx index 42198e2..16ab531 100644 --- a/orchestrator/src/client/components/PipelineProgress.tsx +++ b/orchestrator/src/client/components/PipelineProgress.tsx @@ -24,7 +24,13 @@ interface PipelineProgress { | "failed"; message: string; detail?: string; - crawlingSource: "gradcracker" | "jobspy" | "ukvisajobs" | "adzuna" | null; + crawlingSource: + | "gradcracker" + | "jobspy" + | "ukvisajobs" + | "adzuna" + | "hiringcafe" + | null; crawlingSourcesCompleted: number; crawlingSourcesTotal: number; crawlingTermsProcessed: number; @@ -85,6 +91,7 @@ const sourceLabel: Record< jobspy: "JobSpy", ukvisajobs: "UKVisaJobs", adzuna: "Adzuna", + hiringcafe: "Hiring Cafe", }; const clamp = (value: number, min: number, max: number) => diff --git a/orchestrator/src/client/pages/orchestrator/automatic-run.test.ts b/orchestrator/src/client/pages/orchestrator/automatic-run.test.ts index cd17979..7ecf508 100644 --- a/orchestrator/src/client/pages/orchestrator/automatic-run.test.ts +++ b/orchestrator/src/client/pages/orchestrator/automatic-run.test.ts @@ -92,4 +92,20 @@ describe("automatic-run utilities", () => { expect(estimate.discovered.cap).toBeGreaterThan(0); expect(estimate.discovered.cap).toBeLessThanOrEqual(120); }); + + it("includes hiringcafe in estimate caps using the shared term 
budget", () => { + const estimate = calculateAutomaticEstimate({ + values: { + topN: 10, + minSuitabilityScore: 50, + searchTerms: ["backend", "platform"], + runBudget: 120, + country: "united kingdom", + }, + sources: ["hiringcafe"], + }); + + expect(estimate.discovered.cap).toBeGreaterThan(0); + expect(estimate.discovered.cap).toBeLessThanOrEqual(120); + }); }); diff --git a/orchestrator/src/client/pages/orchestrator/automatic-run.ts b/orchestrator/src/client/pages/orchestrator/automatic-run.ts index 6823caf..f94b6a2 100644 --- a/orchestrator/src/client/pages/orchestrator/automatic-run.ts +++ b/orchestrator/src/client/pages/orchestrator/automatic-run.ts @@ -77,6 +77,7 @@ export function deriveExtractorLimits(args: { const includesGradcracker = args.sources.includes("gradcracker"); const includesUkVisaJobs = args.sources.includes("ukvisajobs"); const includesAdzuna = args.sources.includes("adzuna"); + const includesHiringCafe = args.sources.includes("hiringcafe"); const weightedContributors = (includesIndeed ? termCount : 0) + @@ -84,7 +85,8 @@ export function deriveExtractorLimits(args: { (includesGlassdoor ? termCount : 0) + (includesGradcracker ? termCount : 0) + (includesUkVisaJobs ? 1 : 0) + - (includesAdzuna ? termCount : 0); + (includesAdzuna ? termCount : 0) + + (includesHiringCafe ? termCount : 0); if (weightedContributors <= 0) { return { @@ -143,6 +145,7 @@ export function calculateAutomaticEstimate(args: { const hasLinkedIn = sources.includes("linkedin"); const hasGlassdoor = sources.includes("glassdoor"); const hasAdzuna = sources.includes("adzuna"); + const hasHiringCafe = sources.includes("hiringcafe"); const limits = deriveExtractorLimits({ budget: values.runBudget, searchTerms: values.searchTerms, @@ -158,8 +161,12 @@ export function calculateAutomaticEstimate(args: { : 0; const ukvisaCap = hasUkVisaJobs ? limits.ukvisajobsMaxJobs : 0; const adzunaCap = hasAdzuna ? 
limits.adzunaMaxJobsPerTerm * termCount : 0; + const hiringCafeCap = hasHiringCafe + ? limits.jobspyResultsWanted * termCount + : 0; - const discoveredCap = jobspyCap + gradcrackerCap + ukvisaCap + adzunaCap; + const discoveredCap = + jobspyCap + gradcrackerCap + ukvisaCap + adzunaCap + hiringCafeCap; const discoveredMin = Math.round(discoveredCap * 0.35); const discoveredMax = Math.round(discoveredCap * 0.75); const processedMin = Math.min(values.topN, discoveredMin); diff --git a/orchestrator/src/client/pages/orchestrator/constants.ts b/orchestrator/src/client/pages/orchestrator/constants.ts index 6f61943..1f5d2c6 100644 --- a/orchestrator/src/client/pages/orchestrator/constants.ts +++ b/orchestrator/src/client/pages/orchestrator/constants.ts @@ -14,6 +14,7 @@ export const orderedSources: JobSource[] = [ "linkedin", "glassdoor", "adzuna", + "hiringcafe", "ukvisajobs", ]; export const orderedFilterSources: JobSource[] = [...orderedSources, "manual"]; diff --git a/orchestrator/src/client/pages/orchestrator/utils.ts b/orchestrator/src/client/pages/orchestrator/utils.ts index 3c89286..b1f66ad 100644 --- a/orchestrator/src/client/pages/orchestrator/utils.ts +++ b/orchestrator/src/client/pages/orchestrator/utils.ts @@ -168,7 +168,8 @@ export const getSourcesWithJobs = (jobs: JobListItem[]): JobSource[] => { export const getEnabledSources = ( settings: AppSettings | null, ): JobSource[] => { - if (!settings) return [...DEFAULT_PIPELINE_SOURCES, "glassdoor"]; + if (!settings) + return [...DEFAULT_PIPELINE_SOURCES, "glassdoor", "hiringcafe"]; const enabled: JobSource[] = []; const hasUkVisaJobsAuth = Boolean( @@ -191,6 +192,10 @@ export const getEnabledSources = ( if (hasAdzunaAuth) enabled.push(source); continue; } + if (source === "hiringcafe") { + enabled.push(source); + continue; + } if ( source === "indeed" || source === "linkedin" || diff --git a/orchestrator/src/lib/utils.ts b/orchestrator/src/lib/utils.ts index 59fae8c..c8def09 100644 --- 
a/orchestrator/src/lib/utils.ts +++ b/orchestrator/src/lib/utils.ts @@ -144,5 +144,6 @@ export const sourceLabel: Record = { glassdoor: "Glassdoor", ukvisajobs: "UK Visa Jobs", adzuna: "Adzuna", + hiringcafe: "Hiring Cafe", manual: "Manual", }; diff --git a/orchestrator/src/server/api/routes/pipeline.ts b/orchestrator/src/server/api/routes/pipeline.ts index 84c69bf..b3414e9 100644 --- a/orchestrator/src/server/api/routes/pipeline.ts +++ b/orchestrator/src/server/api/routes/pipeline.ts @@ -101,6 +101,7 @@ const runPipelineSchema = z.object({ "glassdoor", "ukvisajobs", "adzuna", + "hiringcafe", ]), ) .min(1) diff --git a/orchestrator/src/server/config/demo-defaults.data.ts b/orchestrator/src/server/config/demo-defaults.data.ts index ed6ee10..2bb9275 100644 --- a/orchestrator/src/server/config/demo-defaults.data.ts +++ b/orchestrator/src/server/config/demo-defaults.data.ts @@ -253,6 +253,7 @@ export const DEMO_SOURCE_BASE_URLS: Record = { gradcracker: "https://www.gradcracker.com", ukvisajobs: "https://www.ukvisajobs.com", adzuna: "https://www.adzuna.com", + hiringcafe: "https://hiring.cafe", manual: "https://example.com", }; diff --git a/orchestrator/src/server/db/schema.ts b/orchestrator/src/server/db/schema.ts index 09348c6..99c0ff7 100644 --- a/orchestrator/src/server/db/schema.ts +++ b/orchestrator/src/server/db/schema.ts @@ -40,6 +40,7 @@ export const jobs = sqliteTable("jobs", { "glassdoor", "ukvisajobs", "adzuna", + "hiringcafe", "manual", ], }) diff --git a/orchestrator/src/server/pipeline/progress.ts b/orchestrator/src/server/pipeline/progress.ts index 8bff458..3d0ad54 100644 --- a/orchestrator/src/server/pipeline/progress.ts +++ b/orchestrator/src/server/pipeline/progress.ts @@ -14,7 +14,12 @@ export type PipelineStep = | "cancelled" | "failed"; -export type CrawlSource = "gradcracker" | "jobspy" | "ukvisajobs" | "adzuna"; +export type CrawlSource = + | "gradcracker" + | "jobspy" + | "ukvisajobs" + | "adzuna" + | "hiringcafe"; export interface 
PipelineProgress { step: PipelineStep; diff --git a/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts b/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts index 265d345..b37c240 100644 --- a/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts +++ b/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts @@ -23,6 +23,10 @@ vi.mock("../../services/adzuna", () => ({ runAdzuna: vi.fn(), })); +vi.mock("../../services/hiring-cafe", () => ({ + runHiringCafe: vi.fn(), +})); + vi.mock("../../services/ukvisajobs", () => ({ runUkVisaJobs: vi.fn(), })); @@ -218,6 +222,126 @@ describe("discoverJobsStep", () => { expect(vi.mocked(adzuna.runAdzuna)).not.toHaveBeenCalled(); }); + it("runs hiringcafe when selected and passes country/terms/cap", async () => { + const settingsRepo = await import("../../repositories/settings"); + const hiringCafe = await import("../../services/hiring-cafe"); + + vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({ + searchTerms: JSON.stringify(["engineer"]), + jobspyCountryIndeed: "united states", + jobspyResultsWanted: "25", + } as any); + + vi.mocked(hiringCafe.runHiringCafe).mockResolvedValue({ + success: true, + jobs: [ + { + source: "hiringcafe", + sourceJobId: "hc-1", + title: "Engineer", + employer: "ACME", + jobUrl: "https://example.com/hc", + applicationLink: "https://example.com/hc", + }, + ], + } as any); + + const result = await discoverJobsStep({ + mergedConfig: { + ...config, + sources: ["hiringcafe"], + }, + }); + + expect(result.discoveredJobs).toHaveLength(1); + expect(vi.mocked(hiringCafe.runHiringCafe)).toHaveBeenCalledWith( + expect.objectContaining({ + country: "united states", + searchTerms: ["engineer"], + maxJobsPerTerm: 25, + }), + ); + }); + + it("updates Hiring Cafe terms and pages via progress callbacks", async () => { + const settingsRepo = await import("../../repositories/settings"); + const hiringCafe = await import("../../services/hiring-cafe"); + + 
vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({ + searchTerms: JSON.stringify(["engineer", "frontend"]), + jobspyCountryIndeed: "united kingdom", + jobspyResultsWanted: "50", + } as any); + + vi.mocked(hiringCafe.runHiringCafe).mockImplementation( + async (options: any) => { + options?.onProgress?.({ + type: "term_start", + termIndex: 1, + termTotal: 2, + searchTerm: "engineer", + }); + options?.onProgress?.({ + type: "page_fetched", + termIndex: 1, + termTotal: 2, + searchTerm: "engineer", + pageNo: 0, + resultsOnPage: 10, + totalCollected: 10, + }); + options?.onProgress?.({ + type: "term_complete", + termIndex: 1, + termTotal: 2, + searchTerm: "engineer", + jobsFoundTerm: 10, + }); + return { success: true, jobs: [] } as any; + }, + ); + + await discoverJobsStep({ + mergedConfig: { + ...config, + sources: ["hiringcafe"], + }, + }); + + const progress = getProgress(); + expect(progress.crawlingTermsProcessed).toBe(1); + expect(progress.crawlingTermsTotal).toBe(2); + expect(progress.crawlingListPagesProcessed).toBe(1); + expect(progress.crawlingJobPagesEnqueued).toBe(10); + expect(progress.crawlingJobPagesProcessed).toBe(10); + }); + + it("returns Hiring Cafe source error when extractor fails", async () => { + const settingsRepo = await import("../../repositories/settings"); + const hiringCafe = await import("../../services/hiring-cafe"); + + vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({ + searchTerms: JSON.stringify(["engineer"]), + jobspyCountryIndeed: "united kingdom", + jobspyResultsWanted: "50", + } as any); + + vi.mocked(hiringCafe.runHiringCafe).mockResolvedValue({ + success: false, + jobs: [], + error: "blocked upstream", + } as any); + + await expect( + discoverJobsStep({ + mergedConfig: { + ...config, + sources: ["hiringcafe"], + }, + }), + ).rejects.toThrow("All sources failed: hiringcafe: blocked upstream"); + }); + it("maps Gradcracker progress callback into live crawling counters", async () => { const settingsRepo = await 
import("../../repositories/settings"); const crawler = await import("../../services/crawler"); @@ -402,6 +526,7 @@ describe("discoverJobsStep", () => { it("does not throw when no sources are requested", async () => { const settingsRepo = await import("../../repositories/settings"); const adzuna = await import("../../services/adzuna"); + const hiringCafe = await import("../../services/hiring-cafe"); const jobSpy = await import("../../services/jobspy"); const crawler = await import("../../services/crawler"); const ukVisa = await import("../../services/ukvisajobs"); @@ -422,6 +547,7 @@ describe("discoverJobsStep", () => { expect(result.sourceErrors).toEqual([]); expect(vi.mocked(jobSpy.runJobSpy)).not.toHaveBeenCalled(); expect(vi.mocked(adzuna.runAdzuna)).not.toHaveBeenCalled(); + expect(vi.mocked(hiringCafe.runHiringCafe)).not.toHaveBeenCalled(); expect(vi.mocked(crawler.runCrawler)).not.toHaveBeenCalled(); expect(vi.mocked(ukVisa.runUkVisaJobs)).not.toHaveBeenCalled(); }); diff --git a/orchestrator/src/server/pipeline/steps/discover-jobs.ts b/orchestrator/src/server/pipeline/steps/discover-jobs.ts index 87cc462..12d936e 100644 --- a/orchestrator/src/server/pipeline/steps/discover-jobs.ts +++ b/orchestrator/src/server/pipeline/steps/discover-jobs.ts @@ -10,6 +10,7 @@ import * as jobsRepo from "../../repositories/jobs"; import * as settingsRepo from "../../repositories/settings"; import { runAdzuna } from "../../services/adzuna"; import { runCrawler } from "../../services/crawler"; +import { runHiringCafe } from "../../services/hiring-cafe"; import { runJobSpy } from "../../services/jobspy"; import { runUkVisaJobs } from "../../services/ukvisajobs"; import { progressHelpers, updateProgress } from "../progress"; @@ -75,12 +76,14 @@ export async function discoverJobsStep(args: { const shouldRunJobSpy = jobSpySites.length > 0; const shouldRunAdzuna = compatibleSources.includes("adzuna"); + const shouldRunHiringCafe = compatibleSources.includes("hiringcafe"); const 
shouldRunGradcracker = compatibleSources.includes("gradcracker"); const shouldRunUkVisaJobs = compatibleSources.includes("ukvisajobs"); const totalSources = Number(shouldRunJobSpy) + Number(shouldRunAdzuna) + + Number(shouldRunHiringCafe) + Number(shouldRunGradcracker) + Number(shouldRunUkVisaJobs); let completedSources = 0; @@ -236,6 +239,84 @@ export async function discoverJobsStep(args: { return { discoveredJobs, sourceErrors }; } + if (shouldRunHiringCafe) { + progressHelpers.startSource("hiringcafe", completedSources, totalSources, { + termsTotal: searchTerms.length, + detail: "Hiring Cafe: fetching jobs...", + }); + + const hiringCafeMaxJobsPerTerm = settings.jobspyResultsWanted + ? parseInt(settings.jobspyResultsWanted, 10) + : 200; + + const hiringCafeResult = await runHiringCafe({ + country: selectedCountry, + searchTerms, + maxJobsPerTerm: hiringCafeMaxJobsPerTerm, + onProgress: (event) => { + if (event.type === "term_start") { + progressHelpers.crawlingUpdate({ + source: "hiringcafe", + termsProcessed: Math.max(event.termIndex - 1, 0), + termsTotal: event.termTotal, + phase: "list", + currentUrl: event.searchTerm, + }); + updateProgress({ + step: "crawling", + detail: `Hiring Cafe: term ${event.termIndex}/${event.termTotal} (${event.searchTerm})`, + }); + return; + } + + if (event.type === "page_fetched") { + const displayPageNo = event.pageNo + 1; + progressHelpers.crawlingUpdate({ + source: "hiringcafe", + termsProcessed: Math.max(event.termIndex - 1, 0), + termsTotal: event.termTotal, + listPagesProcessed: displayPageNo, + jobPagesEnqueued: event.totalCollected, + jobPagesProcessed: event.totalCollected, + phase: "list", + currentUrl: `page ${displayPageNo}`, + }); + updateProgress({ + step: "crawling", + detail: `Hiring Cafe: term ${event.termIndex}/${event.termTotal}, page ${displayPageNo} (${event.totalCollected} collected)`, + }); + return; + } + + progressHelpers.crawlingUpdate({ + source: "hiringcafe", + termsProcessed: event.termIndex, + 
termsTotal: event.termTotal, + phase: "list", + currentUrl: event.searchTerm, + }); + updateProgress({ + step: "crawling", + detail: `Hiring Cafe: completed term ${event.termIndex}/${event.termTotal} (${event.searchTerm})`, + }); + }, + }); + + if (!hiringCafeResult.success) { + sourceErrors.push( + `hiringcafe: ${hiringCafeResult.error ?? "unknown error"}`, + ); + } else { + discoveredJobs.push(...hiringCafeResult.jobs); + } + + markSourceComplete(); + } + + if (args.shouldCancel?.()) { + return { discoveredJobs, sourceErrors }; + } + if (shouldRunGradcracker) { progressHelpers.startSource("gradcracker", completedSources, totalSources, { detail: "Gradcracker: scraping...", diff --git a/orchestrator/src/server/services/hiring-cafe.ts b/orchestrator/src/server/services/hiring-cafe.ts new file mode 100644 index 0000000..2060963 --- /dev/null +++ b/orchestrator/src/server/services/hiring-cafe.ts @@ -0,0 +1,270 @@ +import { spawn, spawnSync } from "node:child_process"; +import { mkdir, readFile, rm } from "node:fs/promises"; +import { createRequire } from "node:module"; +import { dirname, join } from "node:path"; +import { createInterface } from "node:readline"; +import { fileURLToPath } from "node:url"; +import { logger } from "@infra/logger"; +import { sanitizeUnknown } from "@infra/sanitize"; +import type { CreateJobInput } from "@shared/types"; +import { toNumberOrNull, toStringOrNull } from "@shared/utils/type-conversion"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const HIRING_CAFE_DIR = join(__dirname, "../../../../extractors/hiringcafe"); +const DATASET_PATH = join( + HIRING_CAFE_DIR, + "storage/datasets/default/jobs.json", +); +const STORAGE_DATASET_DIR = join(HIRING_CAFE_DIR, "storage/datasets/default"); +const JOBOPS_PROGRESS_PREFIX = "JOBOPS_PROGRESS "; + +const require = createRequire(import.meta.url); +const TSX_CLI_PATH = resolveTsxCliPath(); + +type HiringCafeRawJob = Record; + +export type HiringCafeProgressEvent = + | { + type: 
"term_start"; + termIndex: number; + termTotal: number; + searchTerm: string; + } + | { + type: "page_fetched"; + termIndex: number; + termTotal: number; + searchTerm: string; + pageNo: number; + resultsOnPage: number; + totalCollected: number; + } + | { + type: "term_complete"; + termIndex: number; + termTotal: number; + searchTerm: string; + jobsFoundTerm: number; + }; + +export interface RunHiringCafeOptions { + searchTerms?: string[]; + country?: string; + maxJobsPerTerm?: number; + onProgress?: (event: HiringCafeProgressEvent) => void; +} + +export interface HiringCafeResult { + success: boolean; + jobs: CreateJobInput[]; + error?: string; +} + +function resolveTsxCliPath(): string | null { + try { + return require.resolve("tsx/dist/cli.mjs"); + } catch { + return null; + } +} + +function canRunNpmCommand(): boolean { + const result = spawnSync("npm", ["--version"], { stdio: "ignore" }); + return !result.error && result.status === 0; +} + +function parseProgressLine(line: string): HiringCafeProgressEvent | null { + if (!line.startsWith(JOBOPS_PROGRESS_PREFIX)) return null; + + const raw = line.slice(JOBOPS_PROGRESS_PREFIX.length).trim(); + + let parsed: Record; + try { + parsed = JSON.parse(raw) as Record; + } catch { + return null; + } + + const event = toStringOrNull(parsed.event); + const termIndex = toNumberOrNull(parsed.termIndex); + const termTotal = toNumberOrNull(parsed.termTotal); + const searchTerm = toStringOrNull(parsed.searchTerm) ?? ""; + + if (!event || termIndex === null || termTotal === null) { + return null; + } + + if (event === "term_start") { + return { type: "term_start", termIndex, termTotal, searchTerm }; + } + + if (event === "page_fetched") { + const pageNo = toNumberOrNull(parsed.pageNo); + if (pageNo === null) return null; + + return { + type: "page_fetched", + termIndex, + termTotal, + searchTerm, + pageNo, + resultsOnPage: toNumberOrNull(parsed.resultsOnPage) ?? 0, + totalCollected: toNumberOrNull(parsed.totalCollected) ?? 
0, + }; + } + + if (event === "term_complete") { + return { + type: "term_complete", + termIndex, + termTotal, + searchTerm, + jobsFoundTerm: toNumberOrNull(parsed.jobsFoundTerm) ?? 0, + }; + } + + return null; +} + +function mapHiringCafeRow(row: HiringCafeRawJob): CreateJobInput | null { + const jobUrl = toStringOrNull(row.jobUrl); + if (!jobUrl) return null; + + return { + source: "hiringcafe", + sourceJobId: toStringOrNull(row.sourceJobId) ?? undefined, + title: toStringOrNull(row.title) ?? "Unknown Title", + employer: toStringOrNull(row.employer) ?? "Unknown Employer", + jobUrl, + applicationLink: toStringOrNull(row.applicationLink) ?? jobUrl, + location: toStringOrNull(row.location) ?? undefined, + salary: toStringOrNull(row.salary) ?? undefined, + datePosted: toStringOrNull(row.datePosted) ?? undefined, + jobDescription: toStringOrNull(row.jobDescription) ?? undefined, + jobType: toStringOrNull(row.jobType) ?? undefined, + }; +} + +async function readDataset(): Promise { + const content = await readFile(DATASET_PATH, "utf-8"); + const parsed = JSON.parse(content) as unknown; + if (!Array.isArray(parsed)) return []; + + const jobs: CreateJobInput[] = []; + const seen = new Set(); + + for (const value of parsed) { + if (!value || typeof value !== "object" || Array.isArray(value)) continue; + + const mapped = mapHiringCafeRow(value as HiringCafeRawJob); + if (!mapped) continue; + + const dedupeKey = mapped.sourceJobId || mapped.jobUrl; + if (seen.has(dedupeKey)) continue; + + seen.add(dedupeKey); + jobs.push(mapped); + } + + return jobs; +} + +async function clearStorageDataset(): Promise { + await rm(STORAGE_DATASET_DIR, { recursive: true, force: true }); + await mkdir(STORAGE_DATASET_DIR, { recursive: true }); +} + +export async function runHiringCafe( + options: RunHiringCafeOptions = {}, +): Promise { + const searchTerms = + options.searchTerms && options.searchTerms.length > 0 + ? 
options.searchTerms + : ["web developer"]; + const country = (options.country || "united kingdom").trim().toLowerCase(); + const maxJobsPerTerm = options.maxJobsPerTerm ?? 200; + + const useNpmCommand = canRunNpmCommand(); + if (!useNpmCommand && !TSX_CLI_PATH) { + return { + success: false, + jobs: [], + error: "Unable to execute Hiring Cafe extractor (npm/tsx unavailable)", + }; + } + + try { + await clearStorageDataset(); + + await new Promise((resolve, reject) => { + const extractorEnv = { + ...process.env, + JOBOPS_EMIT_PROGRESS: "1", + HIRING_CAFE_SEARCH_TERMS: JSON.stringify(searchTerms), + HIRING_CAFE_COUNTRY: country, + HIRING_CAFE_MAX_JOBS_PER_TERM: String(maxJobsPerTerm), + HIRING_CAFE_OUTPUT_JSON: DATASET_PATH, + }; + + const child = useNpmCommand + ? spawn("npm", ["run", "start"], { + cwd: HIRING_CAFE_DIR, + stdio: ["ignore", "pipe", "pipe"], + env: extractorEnv, + }) + : (() => { + const tsxCliPath = TSX_CLI_PATH; + if (!tsxCliPath) { + throw new Error( + "Unable to execute Hiring Cafe extractor (npm/tsx unavailable)", + ); + } + + return spawn(process.execPath, [tsxCliPath, "src/main.ts"], { + cwd: HIRING_CAFE_DIR, + stdio: ["ignore", "pipe", "pipe"], + env: extractorEnv, + }); + })(); + + const handleLine = (line: string, stream: NodeJS.WriteStream) => { + const progressEvent = parseProgressLine(line); + if (progressEvent) { + options.onProgress?.(progressEvent); + return; + } + + stream.write(`${line}\n`); + }; + + const stdoutRl = child.stdout + ? createInterface({ input: child.stdout }) + : null; + const stderrRl = child.stderr + ? 
createInterface({ input: child.stderr }) + : null; + + stdoutRl?.on("line", (line) => handleLine(line, process.stdout)); + stderrRl?.on("line", (line) => handleLine(line, process.stderr)); + + child.on("close", (code) => { + stdoutRl?.close(); + stderrRl?.close(); + if (code === 0) resolve(); + else + reject(new Error(`Hiring Cafe extractor exited with code ${code}`)); + }); + child.on("error", reject); + }); + + const jobs = await readDataset(); + return { success: true, jobs }; + } catch (error) { + const message = error instanceof Error ? error.message : "Unknown error"; + logger.warn("Hiring Cafe extractor run failed", { + error: message, + details: sanitizeUnknown(error), + }); + return { success: false, jobs: [], error: message }; + } +} diff --git a/package-lock.json b/package-lock.json index b7de445..17fab50 100644 --- a/package-lock.json +++ b/package-lock.json @@ -153,6 +153,33 @@ "undici-types": "~7.16.0" } }, + "extractors/hiringcafe": { + "name": "hiringcafe-extractor", + "version": "0.0.1", + "dependencies": { + "camoufox-js": "^0.8.0", + "job-ops-shared": "^1.0.0", + "playwright": "^1.57.0", + "tsx": "^4.4.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + }, + "optionalDependencies": { + "impit-linux-x64-gnu": "^0.1.0" + } + }, + "extractors/hiringcafe/node_modules/@types/node": { + "version": "24.10.13", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.13.tgz", + "integrity": "sha512-oH72nZRfDv9lADUBSo104Aq7gPHpQZc4BTx38r9xf9pg5LfP6EzSyH2n7qFmmxRQXh7YlUXODcYsg6PuTDSxGg==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, "extractors/ukvisajobs": { "name": "ukvisajobs-extractor", "version": "0.0.1", @@ -13175,6 +13202,10 @@ "node": ">=16.0.0" } }, + "node_modules/hiringcafe-extractor": { + "resolved": "extractors/hiringcafe", + "link": true + }, "node_modules/history": { "version": "4.10.1", "resolved": 
"https://registry.npmjs.org/history/-/history-4.10.1.tgz", diff --git a/shared/src/types.ts b/shared/src/types.ts index 388b328..1500fb0 100644 --- a/shared/src/types.ts +++ b/shared/src/types.ts @@ -126,6 +126,7 @@ export type JobSource = | "glassdoor" | "ukvisajobs" | "adzuna" + | "hiringcafe" | "manual"; export interface Job {