From 7b3dfb002a6302cba1d1c549e3c073057a782c2e Mon Sep 17 00:00:00 2001 From: ilia Date: Tue, 12 May 2026 20:17:52 -0400 Subject: [PATCH] feat(extractors): add 16 job source extractors and cross-source dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek, greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive, themuse, usajobs, weworkremotely, workday — each with manifest, package metadata and README. Pipeline / shared: - shared/job-fingerprint: stable hash for cross-source dedup, with tests - discover-jobs: dedup via fingerprint and richer per-source merging - jobs repository: fingerprint-aware upsert / lookup - settings-registry, settings types/routes, demo-defaults: knobs for the new sources - shared extractors index: register the new manifests - location-support, profiles route: small fixes for the new sources Tooling: - scripts/smoke-extractors.ts to sanity-check each source locally - scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron templates (CHANGEME placeholders only) - .env.example: documented env vars for the new extractors - .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only) Co-authored-by: Cursor --- .env.example | 113 +++++ .gitignore | 4 +- extractors/arbeitnow/README.md | 11 + extractors/arbeitnow/manifest.ts | 172 +++++++ extractors/arbeitnow/package.json | 17 + extractors/arbeitnow/tsconfig.json | 17 + extractors/ashby/README.md | 8 + extractors/ashby/manifest.ts | 185 ++++++++ extractors/ashby/package.json | 17 + extractors/ashby/tsconfig.json | 17 + extractors/careerjet/README.md | 12 + extractors/careerjet/manifest.ts | 267 +++++++++++ extractors/careerjet/package.json | 17 + extractors/careerjet/tsconfig.json | 17 + extractors/fourdayweek/README.md | 10 + extractors/fourdayweek/manifest.ts | 226 ++++++++++ extractors/fourdayweek/package.json | 17 + 
extractors/fourdayweek/tsconfig.json | 17 + extractors/greenhouse/README.md | 8 + extractors/greenhouse/manifest.ts | 188 ++++++++ extractors/greenhouse/package.json | 17 + extractors/greenhouse/tsconfig.json | 17 + extractors/himalayas/README.md | 10 + extractors/himalayas/manifest.ts | 195 ++++++++ extractors/himalayas/package.json | 17 + extractors/himalayas/tsconfig.json | 17 + extractors/jobicy/README.md | 8 + extractors/jobicy/manifest.ts | 186 ++++++++ extractors/jobicy/package.json | 17 + extractors/jobicy/tsconfig.json | 17 + extractors/jooble/README.md | 7 + extractors/jooble/manifest.ts | 177 ++++++++ extractors/jooble/package.json | 17 + extractors/jooble/tsconfig.json | 17 + extractors/lever/README.md | 9 + extractors/lever/manifest.ts | 182 ++++++++ extractors/lever/package.json | 17 + extractors/lever/tsconfig.json | 17 + extractors/reed/README.md | 8 + extractors/reed/manifest.ts | 188 ++++++++ extractors/reed/package.json | 17 + extractors/reed/tsconfig.json | 17 + extractors/remoteok/README.md | 15 + extractors/remoteok/manifest.ts | 190 ++++++++ extractors/remoteok/package.json | 17 + extractors/remoteok/tsconfig.json | 17 + extractors/remotive/README.md | 9 + extractors/remotive/manifest.ts | 153 +++++++ extractors/remotive/package.json | 17 + extractors/remotive/tsconfig.json | 17 + extractors/themuse/README.md | 8 + extractors/themuse/manifest.ts | 224 +++++++++ extractors/themuse/package.json | 17 + extractors/themuse/tsconfig.json | 17 + extractors/usajobs/README.md | 22 + extractors/usajobs/manifest.ts | 263 +++++++++++ extractors/usajobs/package.json | 17 + extractors/usajobs/tsconfig.json | 17 + extractors/weworkremotely/README.md | 10 + extractors/weworkremotely/manifest.ts | 192 ++++++++ extractors/weworkremotely/package.json | 17 + extractors/weworkremotely/tsconfig.json | 17 + extractors/workday/README.md | 25 ++ extractors/workday/manifest.ts | 263 +++++++++++ extractors/workday/package.json | 17 + extractors/workday/tsconfig.json | 
17 + orchestrator/package.json | 2 +- orchestrator/src/client/api/client.ts | 37 +- .../client/pages/orchestrator/utils.test.ts | 6 + .../src/client/pages/orchestrator/utils.ts | 66 +++ .../src/server/api/routes/profiles.ts | 31 +- .../src/server/api/routes/settings.ts | 27 ++ .../src/server/config/demo-defaults.data.ts | 16 + orchestrator/src/server/db/migrate.ts | 7 + orchestrator/src/server/db/schema.ts | 9 + .../server/pipeline/steps/discover-jobs.ts | 187 +++++++- orchestrator/src/server/repositories/jobs.ts | 127 +++++- orchestrator/src/server/services/settings.ts | 14 + package-lock.json | 425 +++++++++++++++++- package.json | 1 + scripts/jobber-cron-cherepaha.env.example | 24 + scripts/jobber-cron-dobkin.env.example | 24 + scripts/smoke-extractors.ts | 219 +++++++++ shared/src/extractors/index.ts | 165 ++++++- shared/src/index.ts | 1 + shared/src/job-fingerprint.test.ts | 80 ++++ shared/src/job-fingerprint.ts | 77 ++++ shared/src/location-support.ts | 12 +- shared/src/settings-registry.ts | 207 +++++++++ shared/src/testing/factories.ts | 25 ++ shared/src/types/settings.ts | 25 ++ 91 files changed, 5849 insertions(+), 57 deletions(-) create mode 100644 extractors/arbeitnow/README.md create mode 100644 extractors/arbeitnow/manifest.ts create mode 100644 extractors/arbeitnow/package.json create mode 100644 extractors/arbeitnow/tsconfig.json create mode 100644 extractors/ashby/README.md create mode 100644 extractors/ashby/manifest.ts create mode 100644 extractors/ashby/package.json create mode 100644 extractors/ashby/tsconfig.json create mode 100644 extractors/careerjet/README.md create mode 100644 extractors/careerjet/manifest.ts create mode 100644 extractors/careerjet/package.json create mode 100644 extractors/careerjet/tsconfig.json create mode 100644 extractors/fourdayweek/README.md create mode 100644 extractors/fourdayweek/manifest.ts create mode 100644 extractors/fourdayweek/package.json create mode 100644 extractors/fourdayweek/tsconfig.json create mode 
100644 extractors/greenhouse/README.md create mode 100644 extractors/greenhouse/manifest.ts create mode 100644 extractors/greenhouse/package.json create mode 100644 extractors/greenhouse/tsconfig.json create mode 100644 extractors/himalayas/README.md create mode 100644 extractors/himalayas/manifest.ts create mode 100644 extractors/himalayas/package.json create mode 100644 extractors/himalayas/tsconfig.json create mode 100644 extractors/jobicy/README.md create mode 100644 extractors/jobicy/manifest.ts create mode 100644 extractors/jobicy/package.json create mode 100644 extractors/jobicy/tsconfig.json create mode 100644 extractors/jooble/README.md create mode 100644 extractors/jooble/manifest.ts create mode 100644 extractors/jooble/package.json create mode 100644 extractors/jooble/tsconfig.json create mode 100644 extractors/lever/README.md create mode 100644 extractors/lever/manifest.ts create mode 100644 extractors/lever/package.json create mode 100644 extractors/lever/tsconfig.json create mode 100644 extractors/reed/README.md create mode 100644 extractors/reed/manifest.ts create mode 100644 extractors/reed/package.json create mode 100644 extractors/reed/tsconfig.json create mode 100644 extractors/remoteok/README.md create mode 100644 extractors/remoteok/manifest.ts create mode 100644 extractors/remoteok/package.json create mode 100644 extractors/remoteok/tsconfig.json create mode 100644 extractors/remotive/README.md create mode 100644 extractors/remotive/manifest.ts create mode 100644 extractors/remotive/package.json create mode 100644 extractors/remotive/tsconfig.json create mode 100644 extractors/themuse/README.md create mode 100644 extractors/themuse/manifest.ts create mode 100644 extractors/themuse/package.json create mode 100644 extractors/themuse/tsconfig.json create mode 100644 extractors/usajobs/README.md create mode 100644 extractors/usajobs/manifest.ts create mode 100644 extractors/usajobs/package.json create mode 100644 extractors/usajobs/tsconfig.json 
create mode 100644 extractors/weworkremotely/README.md create mode 100644 extractors/weworkremotely/manifest.ts create mode 100644 extractors/weworkremotely/package.json create mode 100644 extractors/weworkremotely/tsconfig.json create mode 100644 extractors/workday/README.md create mode 100644 extractors/workday/manifest.ts create mode 100644 extractors/workday/package.json create mode 100644 extractors/workday/tsconfig.json create mode 100644 scripts/jobber-cron-cherepaha.env.example create mode 100644 scripts/jobber-cron-dobkin.env.example create mode 100644 scripts/smoke-extractors.ts create mode 100644 shared/src/job-fingerprint.test.ts create mode 100644 shared/src/job-fingerprint.ts diff --git a/.env.example b/.env.example index 71c4fee..75efce5 100644 --- a/.env.example +++ b/.env.example @@ -97,3 +97,116 @@ ADZUNA_APP_KEY= # ============================================================================= # Filter for remote-only jobs (default: 0 = disabled) # JOBSPY_IS_REMOTE=0 + +# ============================================================================= +# USAJOBS API (US federal jobs) - optional, US-only +# ============================================================================= +# Register at https://developer.usajobs.gov/APIRequest/Index +# USAJOBS requires a User-Agent that is a real contact email (per their TOS). +# Leave unset to disable the source. +# USAJOBS_API_KEY= +# USAJOBS_USER_AGENT=you@example.com +# USAJOBS_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# Jobicy (remote jobs feed) - optional, no auth +# ============================================================================= +# Public JSON endpoint, capped at 50 results per call. 
+# JOBICY_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# The Muse (jobs API) - optional, API key recommended +# ============================================================================= +# https://www.themuse.com/developers/api/v2 — works without a key but is +# heavily rate-limited. Set THEMUSE_API_KEY for higher quotas. +# THEMUSE_API_KEY= +# THEMUSE_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# Jooble (aggregator API) - optional +# ============================================================================= +# Sign up at https://jooble.org/api/about for an API key. +# JOOBLE_API_KEY= +# JOOBLE_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# Careerjet (publisher API v4) - optional +# ============================================================================= +# Register at https://www.careerjet.com/partners/api/ — declare API key + server IP(s). +# CAREERJET_AFFID=your_api_key +# CAREERJET_REFERER=https://your-site.com/path-to-job-search/ +# CAREERJET_USER_IP=203.0.113.1 +# Optional override for the required user_agent query param: +# CAREERJET_USER_AGENT=Mozilla/5.0 ... +# CAREERJET_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# Reed.co.uk (UK jobs API) - optional, UK-only +# ============================================================================= +# Register at https://www.reed.co.uk/developers/jobseeker for an API key. +# REED_API_KEY= +# REED_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# Remote OK (remote jobs feed) - optional, no auth +# ============================================================================= +# Public single-shot JSON feed at https://remoteok.com/api. 
We filter +# client-side by your search terms (matched against position + tags). +# Per Remote OK's TOS, link back to the original posting URLs when republishing. +# REMOTEOK_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# Remotive (remote jobs feed) - optional, no auth +# ============================================================================= +# Public JSON API at https://remotive.com/api/remote-jobs?limit=N&search=term. +# Each search term is sent as the `search` parameter. +# REMOTIVE_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# Arbeitnow (multi-ATS aggregator) - optional, no auth +# ============================================================================= +# Public JSON API at https://www.arbeitnow.com/api/job-board-api?page=N. +# Aggregates from Greenhouse, SmartRecruiters, Join, TeamTailor, Recruitee, +# and Comeet. No server-side search; filtering is done client-side. +# ARBEITNOW_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# Himalayas (remote jobs feed) - optional, no auth +# ============================================================================= +# Public JSON API at https://himalayas.app/jobs/api?limit=N&offset=M. +# No server-side search; filtering is done client-side by title + categories. +# HIMALAYAS_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# We Work Remotely (RSS feed) - optional, no auth +# ============================================================================= +# Public RSS at https://weworkremotely.com/remote-jobs.rss (all categories). +# Single fetch; filtering is done client-side by title + skills + category. 
+# WEWORKREMOTELY_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# 4 Day Week (reduced-schedule jobs) - optional, no auth +# ============================================================================= +# Public JSON API at https://4dayweek.io/api/jobs?page=N. +# Paginated; filtering is done client-side by title + tech stack. +# No job description in listings; links to 4dayweek.io for details. +# FOURDAYWEEK_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# Public ATS sources (Lever / Ashby / Greenhouse) - optional +# ============================================================================= +# Comma- or newline-separated company slugs. The slug is the path segment used +# in each provider's public job board, e.g. `lever.co/some-company` → "some-company". +# LEVER_COMPANIES=netflix,figma +# ASHBY_COMPANIES=ramp,linear +# GREENHOUSE_COMPANIES=stripe,airbnb + +# ============================================================================= +# Workday (public career sites) - optional +# ============================================================================= +# Newline- or comma-separated entries. Each entry is either: +# 1) A career-site URL we'll auto-parse, e.g. 
+# https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite
+# 2) A JSON object with explicit fields:
+# {"company":"NVIDIA","tenantUrl":"https://nvidia.wd5.myworkdayjobs.com","tenant":"nvidia","site":"NVIDIAExternalCareerSite","locale":"en-US"}
+# WORKDAY_TENANTS=
diff --git a/.gitignore b/.gitignore
index 599d118..54c88f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,8 +15,8 @@ docs-site/build/
 # Data directory (bind mount in Docker)
 data/
 
-# Extractor storage outputs and cached auth
-extractors/ukvisajobs/storage/
+# Extractor storage outputs and cached auth (per-extractor runtime data)
+extractors/*/storage/
 
 # OS files
 .DS_Store
diff --git a/extractors/arbeitnow/README.md b/extractors/arbeitnow/README.md
new file mode 100644
index 0000000..da18c9c
--- /dev/null
+++ b/extractors/arbeitnow/README.md
@@ -0,0 +1,11 @@
+# arbeitnow-extractor
+
+Pulls listings from the public [Arbeitnow API](https://www.arbeitnow.com/api/job-board-api).
+
+- No authentication required.
+- Returns 100 jobs per page; we paginate up to 5 pages (500 jobs).
+- No server-side search — we filter client-side by matching title + tags
+  against each pipeline search term.
+- Aggregates postings from Greenhouse, SmartRecruiters, Join, TeamTailor,
+  Recruitee, and Comeet.
+- Caps results per term via the `arbeitnowMaxJobsPerTerm` setting (default 100).
diff --git a/extractors/arbeitnow/manifest.ts b/extractors/arbeitnow/manifest.ts
new file mode 100644
index 0000000..b4de88b
--- /dev/null
+++ b/extractors/arbeitnow/manifest.ts
@@ -0,0 +1,224 @@
+/**
+ * Arbeitnow public job board API.
+ *
+ * https://www.arbeitnow.com/api/job-board-api?page=N
+ *
+ * No auth. Returns 100 results per page, sorted by creation date.
+ * No server-side search — we paginate and filter client-side by
+ * title + tags against each pipeline search term.
+ *
+ * Aggregates listings from Greenhouse, SmartRecruiters, Join,
+ * TeamTailor, Recruitee, and Comeet.
+ */
+
+import type {
+  ExtractorManifest,
+  ExtractorRunResult,
+} from "@shared/types/extractors";
+import type { CreateJobInput } from "@shared/types/jobs";
+
+const API_URL = "https://www.arbeitnow.com/api/job-board-api";
+/** Hard ceiling on pages fetched per run, regardless of the job budget. */
+const MAX_PAGES = 5;
+/** Used when `arbeitnowMaxJobsPerTerm` is unset or not a number. */
+const DEFAULT_MAX_JOBS_PER_TERM = 100;
+
+/** One listing from the `data` array; every field is treated as optional. */
+interface ArbeitnowJob {
+  slug?: string;
+  company_name?: string;
+  title?: string;
+  description?: string;
+  remote?: boolean;
+  url?: string;
+  tags?: string[];
+  job_types?: string[];
+  location?: string;
+  /** Multiplied by 1000 before use, i.e. treated as Unix epoch seconds. */
+  created_at?: number;
+}
+
+/** Envelope of a single API page. */
+interface ArbeitnowResponse {
+  data?: ArbeitnowJob[];
+  /** A missing or empty `next` ends pagination. */
+  links?: { next?: string | null };
+  meta?: { current_page?: number };
+}
+
+/** Returns the trimmed string, or `undefined` for non-strings and blanks. */
+function asString(value: unknown): string | undefined {
+  if (typeof value !== "string") return undefined;
+  const trimmed = value.trim();
+  return trimmed || undefined;
+}
+
+/**
+ * Resolves the per-term job cap from its string setting.
+ *
+ * Unset or non-numeric values fall back to the default instead of yielding
+ * `NaN`, which would silently disable the cap (`length >= NaN` is always
+ * false). Negative values are clamped to 0.
+ */
+function parseMaxJobs(raw: string | null | undefined): number {
+  const parsed = raw ? Number.parseInt(raw, 10) : Number.NaN;
+  if (Number.isNaN(parsed)) return DEFAULT_MAX_JOBS_PER_TERM;
+  return Math.max(parsed, 0);
+}
+
+/**
+ * Converts epoch seconds to an ISO-8601 timestamp.
+ *
+ * Returns `undefined` rather than throwing when the value is not a number
+ * or lies outside the range `Date` can represent, so a single malformed
+ * record cannot abort the whole run.
+ */
+function toIsoDate(epochSeconds: unknown): string | undefined {
+  if (typeof epochSeconds !== "number") return undefined;
+  const date = new Date(epochSeconds * 1000);
+  return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
+}
+
+/** Case-insensitive substring match of `term` against title and tags. */
+function matchesTerm(job: ArbeitnowJob, term: string): boolean {
+  const needle = term.toLowerCase();
+  if (typeof job.title === "string") {
+    if (job.title.toLowerCase().includes(needle)) return true;
+  }
+  return (
+    Array.isArray(job.tags) &&
+    job.tags.some(
+      (tag) => typeof tag === "string" && tag.toLowerCase().includes(needle),
+    )
+  );
+}
+
+/**
+ * Maps a raw listing onto the pipeline's job input shape.
+ *
+ * @returns the mapped job, or `null` when the listing has no usable URL.
+ */
+function mapJob(raw: ArbeitnowJob): CreateJobInput | null {
+  const jobUrl = asString(raw.url);
+  if (!jobUrl) return null;
+
+  const tags = Array.isArray(raw.tags)
+    ? raw.tags.filter((t): t is string => typeof t === "string" && t.length > 0)
+    : [];
+
+  const jobTypes = Array.isArray(raw.job_types)
+    ? raw.job_types
+        .filter((t): t is string => typeof t === "string" && t.length > 0)
+        .join(", ")
+    : undefined;
+
+  return {
+    source: "arbeitnow",
+    sourceJobId: asString(raw.slug),
+    title: asString(raw.title) ?? "Unknown Title",
+    employer: asString(raw.company_name) ?? "Unknown Employer",
+    jobUrl,
+    applicationLink: jobUrl,
+    location: asString(raw.location) ?? "Unknown",
+    isRemote: raw.remote === true,
+    jobType: jobTypes || undefined,
+    datePosted: toIsoDate(raw.created_at),
+    jobDescription: asString(raw.description),
+    disciplines: tags.length > 0 ? tags.join(", ") : undefined,
+  };
+}
+
+/**
+ * Fetches one page of listings.
+ *
+ * @throws Error when the response status is not 2xx.
+ */
+async function fetchPage(page: number): Promise<ArbeitnowResponse> {
+  const url = `${API_URL}?page=${page}`;
+  const response = await fetch(url, {
+    headers: { Accept: "application/json" },
+  });
+  if (!response.ok) {
+    throw new Error(`Arbeitnow request failed with status ${response.status}`);
+  }
+  return (await response.json()) as ArbeitnowResponse;
+}
+
+export const manifest: ExtractorManifest = {
+  id: "arbeitnow",
+  displayName: "Arbeitnow",
+  providesSources: ["arbeitnow"],
+  /**
+   * Pages through the feed, keeps listings matching any search term (or
+   * all listings when there are no terms), and de-duplicates by slug/URL.
+   * On a failed request, returns `success: false` together with whatever
+   * was collected before the failure.
+   */
+  async run(context): Promise<ExtractorRunResult> {
+    if (context.shouldCancel?.()) return { success: true, jobs: [] };
+
+    const maxJobs = parseMaxJobs(context.settings.arbeitnowMaxJobsPerTerm);
+    // Blank terms would match every titled listing, so drop them up front.
+    const terms = context.searchTerms
+      .map((term) => term.trim())
+      .filter((term) => term.length > 0);
+    // The feed cannot be queried per term, so the per-term cap is applied
+    // as one shared budget across all terms.
+    const budget = maxJobs * Math.max(terms.length, 1);
+    const seen = new Set<string>();
+    const out: CreateJobInput[] = [];
+
+    try {
+      for (let page = 1; page <= MAX_PAGES; page += 1) {
+        if (context.shouldCancel?.()) break;
+        if (out.length >= budget) break;
+
+        context.onProgress?.({
+          phase: "list",
+          termsProcessed: 0,
+          termsTotal: 1,
+          currentUrl: `page ${page}`,
+          detail: `Arbeitnow: fetching page ${page}`,
+        });
+
+        const body = await fetchPage(page);
+        const jobs = Array.isArray(body.data) ? body.data : [];
+
+        if (jobs.length === 0) break;
+
+        for (const raw of jobs) {
+          // Stop mid-page once the budget is spent instead of overshooting.
+          if (out.length >= budget) break;
+          if (raw === null || typeof raw !== "object") continue;
+          if (terms.length > 0 && !terms.some((t) => matchesTerm(raw, t))) {
+            continue;
+          }
+          const mapped = mapJob(raw);
+          if (!mapped) continue;
+          const key = mapped.sourceJobId || mapped.jobUrl;
+          if (seen.has(key)) continue;
+          seen.add(key);
+          out.push(mapped);
+        }
+
+        context.onProgress?.({
+          phase: "list",
+          termsProcessed: 0,
+          termsTotal: 1,
+          currentUrl: `page ${page}`,
+          jobPagesProcessed: out.length,
+          detail: `Arbeitnow: page ${page} done (${out.length} matched so far)`,
+        });
+
+        if (!body.links?.next) break;
+      }
+    } catch (error) {
+      const message = error instanceof Error ? error.message : "Unknown error";
+      return { success: false, jobs: out, error: message };
+    }
+
+    return { success: true, jobs: out };
+  },
+};
+
+export default manifest;
diff --git a/extractors/arbeitnow/package.json b/extractors/arbeitnow/package.json
new file mode 100644
index 0000000..88b3a78
--- /dev/null
+++ b/extractors/arbeitnow/package.json
@@ -0,0 +1,17 @@
+{
+  "name": "arbeitnow-extractor",
+  "version": "0.0.1",
+  "type": "module",
+  "description": "Arbeitnow public job board API extractor",
+  "main": "manifest.ts",
+  "dependencies": {
+    "job-ops-shared": "^1.0.0"
+  },
+  "devDependencies": {
+    "@types/node": "^24.0.0",
+    "typescript": "~5.9.0"
+  },
+  "scripts": {
+    "check:types": "tsc --noEmit"
+  }
+}
diff --git a/extractors/arbeitnow/tsconfig.json b/extractors/arbeitnow/tsconfig.json
new file mode 100644
index 0000000..4f7ce58
--- /dev/null
+++ b/extractors/arbeitnow/tsconfig.json
@@ -0,0 +1,17 @@
+{
+  "compilerOptions": {
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "target": "ES2022",
+    "outDir": "dist",
+    "strict": true,
+    "noUnusedLocals": false,
+    "lib": ["ES2022", "DOM"],
+    "types": ["node"],
+    "baseUrl": ".",
+    "paths": {
+      "@shared/*": ["../../shared/src/*"]
+    }
+  },
+  "include": ["./manifest.ts", "./src/**/*"]
+}
diff --git a/extractors/ashby/README.md
b/extractors/ashby/README.md
new file mode 100644
index 0000000..9ab9bb1
--- /dev/null
+++ b/extractors/ashby/README.md
@@ -0,0 +1,8 @@
+# ashby-extractor
+
+Public Ashby job-board feeds via
+`GET https://api.ashbyhq.com/posting-api/job-board/{company}`.
+
+- No auth.
+- Configure target slugs via `ashbyCompanies` (comma/newline) or
+  `ASHBY_COMPANIES` env.
diff --git a/extractors/ashby/manifest.ts b/extractors/ashby/manifest.ts
new file mode 100644
index 0000000..f4cdf66
--- /dev/null
+++ b/extractors/ashby/manifest.ts
@@ -0,0 +1,228 @@
+/**
+ * Ashby public job board API.
+ *
+ * https://developers.ashbyhq.com/reference/posting-api-job-board
+ * GET https://api.ashbyhq.com/posting-api/job-board/{company}
+ *
+ * No auth. Each entry in `ashbyCompanies` is fetched independently.
+ */
+
+import type {
+  ExtractorManifest,
+  ExtractorRunResult,
+} from "@shared/types/extractors";
+import type { CreateJobInput } from "@shared/types/jobs";
+
+interface AshbyAddress {
+  postalAddress?: {
+    addressLocality?: string;
+    addressRegion?: string;
+    addressCountry?: string;
+  };
+}
+/** One posting from the `jobs` array; every field is treated as optional. */
+interface AshbyJob {
+  id?: string;
+  title?: string;
+  jobUrl?: string;
+  applyUrl?: string;
+  publishedAt?: string;
+  employmentType?: string;
+  isRemote?: boolean;
+  team?: string;
+  department?: string;
+  location?: string;
+  locationName?: string;
+  secondaryLocations?: Array<{ location?: string; locationName?: string }>;
+  address?: AshbyAddress;
+  descriptionPlain?: string;
+  descriptionHtml?: string;
+}
+interface AshbyResponse {
+  jobs?: AshbyJob[];
+  apiVersion?: string;
+}
+
+/** Returns the trimmed string, or `undefined` for non-strings and blanks. */
+function asString(value: unknown): string | undefined {
+  if (typeof value !== "string") return undefined;
+  const trimmed = value.trim();
+  return trimmed ? trimmed : undefined;
+}
+
+/**
+ * Parses the configured company slugs.
+ *
+ * Accepts either a JSON array of strings or a plain list separated by
+ * newlines, commas, semicolons or pipes. Blank entries are dropped and
+ * duplicates removed (first occurrence wins) so no board is fetched twice.
+ */
+function readCompanies(raw: string | undefined): string[] {
+  if (!raw) return [];
+  let entries: string[] | undefined;
+  try {
+    const parsed: unknown = JSON.parse(raw);
+    if (Array.isArray(parsed)) {
+      entries = parsed.map((entry) =>
+        typeof entry === "string" ? entry.trim() : "",
+      );
+    }
+  } catch {
+    // Not JSON — fall through to the delimiter-separated form.
+  }
+  entries ??= raw.split(/[\n,;|]+/).map((entry) => entry.trim());
+  return [...new Set(entries.filter(Boolean))];
+}
+
+/**
+ * Joins primary and secondary location names into one `; `-separated
+ * string, skipping blanks and repeated names.
+ */
+function locationFor(job: AshbyJob): string | undefined {
+  const primary = asString(job.locationName) ?? asString(job.location);
+  const secondary =
+    job.secondaryLocations?.map(
+      (entry) => asString(entry.locationName) ?? asString(entry.location),
+    ) ?? [];
+  const names = [primary, ...secondary].filter(
+    (value): value is string => Boolean(value),
+  );
+  return names.length > 0 ? [...new Set(names)].join("; ") : undefined;
+}
+
+/**
+ * Maps a raw posting onto the pipeline's job input shape.
+ *
+ * The employer name is derived from the board slug by splitting on `-`/`_`
+ * and capitalising each part (e.g. `some-company` becomes `Some Company`).
+ *
+ * @returns the mapped job, or `null` when the posting has no usable URL.
+ */
+function mapJob(job: AshbyJob, company: string): CreateJobInput | null {
+  const jobUrl = asString(job.jobUrl) ?? asString(job.applyUrl);
+  if (!jobUrl) return null;
+  const employer = company
+    .split(/[-_]/)
+    .filter(Boolean)
+    .map((part) => part.charAt(0).toUpperCase() + part.slice(1))
+    .join(" ");
+  return {
+    source: "ashby",
+    sourceJobId: asString(job.id),
+    title: asString(job.title) ?? "Unknown Title",
+    employer: employer || company,
+    jobUrl,
+    applicationLink: asString(job.applyUrl) ?? jobUrl,
+    location: locationFor(job),
+    isRemote: typeof job.isRemote === "boolean" ? job.isRemote : undefined,
+    jobType: asString(job.employmentType),
+    jobFunction: asString(job.team),
+    companyIndustry: asString(job.department),
+    datePosted: asString(job.publishedAt),
+    jobDescription:
+      asString(job.descriptionPlain) ?? asString(job.descriptionHtml),
+  };
+}
+
+/**
+ * Fetches all postings of one company board.
+ *
+ * A 404 is treated as "no jobs" and yields an empty list.
+ *
+ * @throws Error for any other non-2xx status.
+ */
+async function fetchCompany(company: string): Promise<AshbyJob[]> {
+  const url = `https://api.ashbyhq.com/posting-api/job-board/${encodeURIComponent(company)}`;
+  const response = await fetch(url, {
+    headers: { Accept: "application/json" },
+  });
+  if (response.status === 404) return [];
+  if (!response.ok) {
+    throw new Error(
+      `Ashby request for "${company}" failed with status ${response.status}`,
+    );
+  }
+  const body = (await response.json()) as AshbyResponse;
+  return Array.isArray(body.jobs) ? body.jobs : [];
+}
+
+export const manifest: ExtractorManifest = {
+  id: "ashby",
+  displayName: "Ashby (ATS)",
+  providesSources: ["ashby"],
+  /**
+   * Fetches every configured board and de-duplicates postings by id/URL.
+   *
+   * A failing board is recorded and skipped so the remaining boards are
+   * still fetched; if any board failed the result is `success: false`
+   * with the collected jobs and the joined failure messages.
+   */
+  async run(context): Promise<ExtractorRunResult> {
+    if (context.shouldCancel?.()) return { success: true, jobs: [] };
+
+    const companies = readCompanies(context.settings.ashbyCompanies);
+    if (companies.length === 0) {
+      return {
+        success: true,
+        jobs: [],
+        error:
+          "No Ashby companies configured. Set ASHBY_COMPANIES or the ashbyCompanies setting (comma- or newline-separated slugs).",
+      };
+    }
+
+    const seen = new Set<string>();
+    const out: CreateJobInput[] = [];
+    const failures: string[] = [];
+
+    try {
+      for (const [i, company] of companies.entries()) {
+        if (context.shouldCancel?.()) break;
+        context.onProgress?.({
+          phase: "list",
+          termsProcessed: i,
+          termsTotal: companies.length,
+          currentUrl: company,
+          detail: `Ashby: ${company} (${i + 1}/${companies.length})`,
+        });
+
+        let added = 0;
+        let failure: string | undefined;
+        try {
+          for (const job of await fetchCompany(company)) {
+            const mapped = mapJob(job, company);
+            if (!mapped) continue;
+            const key = mapped.sourceJobId || mapped.jobUrl;
+            if (seen.has(key)) continue;
+            seen.add(key);
+            out.push(mapped);
+            added += 1;
+          }
+        } catch (error) {
+          failure = error instanceof Error ? error.message : "Unknown error";
+          failures.push(failure);
+        }
+
+        context.onProgress?.({
+          phase: "list",
+          termsProcessed: i + 1,
+          termsTotal: companies.length,
+          currentUrl: company,
+          jobPagesProcessed: out.length,
+          detail: failure
+            ? `Ashby: ${company} failed (${failure})`
+            : `Ashby: ${company} → ${added} jobs (${out.length} total)`,
+        });
+      }
+    } catch (error) {
+      // Anything thrown outside a board fetch (e.g. by a progress callback).
+      failures.push(error instanceof Error ? error.message : "Unknown error");
+    }
+
+    if (failures.length > 0) {
+      return { success: false, jobs: out, error: failures.join("; ") };
+    }
+    return { success: true, jobs: out };
+  },
+};
+
+export default manifest;
diff --git a/extractors/ashby/package.json b/extractors/ashby/package.json
new file mode 100644
index 0000000..e39eb78
--- /dev/null
+++ b/extractors/ashby/package.json
@@ -0,0 +1,17 @@
+{
+  "name": "ashby-extractor",
+  "version": "0.0.1",
+  "type": "module",
+  "description": "Ashby public ATS extractor",
+  "main": "manifest.ts",
+  "dependencies": {
+    "job-ops-shared": "^1.0.0"
+  },
+  "devDependencies": {
+    "@types/node": "^24.0.0",
+    "typescript": "~5.9.0"
+  },
+  "scripts": {
+    "check:types": "tsc --noEmit"
+  }
+}
diff --git a/extractors/ashby/tsconfig.json b/extractors/ashby/tsconfig.json
new file mode 100644
index 0000000..4f7ce58
--- /dev/null
+++ b/extractors/ashby/tsconfig.json
@@ -0,0 +1,17 @@
+{
+  "compilerOptions": {
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "target": "ES2022",
+    "outDir": "dist",
+    "strict": true,
+    "noUnusedLocals": false,
+    "lib": ["ES2022", "DOM"],
+    "types": ["node"],
+    "baseUrl": ".",
+    "paths": {
+      "@shared/*": ["../../shared/src/*"]
+    }
+  },
+  "include": ["./manifest.ts", "./src/**/*"]
+}
diff --git a/extractors/careerjet/README.md b/extractors/careerjet/README.md
new file mode 100644
index 0000000..f8ffedd
--- /dev/null
+++ b/extractors/careerjet/README.md
@@ -0,0 +1,12 @@
+# careerjet-extractor
+
+[Careerjet publisher API v4](https://www.careerjet.com/partners/api/) (`https://search.api.careerjet.net/v4/query`).
+
+## Required configuration
+
+- **`CAREERJET_AFFID`** — Your publisher **API key** (settings key `careerjetAffid`). Used as the Basic auth **username**; password is empty.
+- **`CAREERJET_REFERER`** — The `Referer` header Careerjet requires: the full URL of the job-search page on your registered site (e.g. `https://yoursite.com/find-jobs/`).
+- **`CAREERJET_USER_IP`** — The `user_ip` query parameter.
In the [publisher dashboard](https://www.careerjet.com/partners/), add your **server’s outbound IP** (and any dev machine IP) under “Server IP address”; this value should match an allowlisted address. +- **`CAREERJET_USER_AGENT`** (optional) — Override the default `user_agent` param if Careerjet asks for a specific string. + +`selectedCountry` maps to `locale_code`; the first `searchCities` token is sent as `location`. Capped per term via `careerjetMaxJobsPerTerm` (default 100). The v4 API allows up to **10** pages per query. diff --git a/extractors/careerjet/manifest.ts b/extractors/careerjet/manifest.ts new file mode 100644 index 0000000..8f3f424 --- /dev/null +++ b/extractors/careerjet/manifest.ts @@ -0,0 +1,267 @@ +/** + * Careerjet publisher search API (v4). + * + * https://www.careerjet.com/partners/api/ + * GET https://search.api.careerjet.net/v4/query + * + * Uses Basic auth (username = publisher API key, password empty). Requires a + * Referer header and `user_ip` / `user_agent` query params. Register your + * server's outbound IP(s) in the Careerjet publisher dashboard. + * + * Env: CAREERJET_AFFID (API key), CAREERJET_REFERER (job-search page URL), + * CAREERJET_USER_IP (must match an allowlisted IP), optional CAREERJET_USER_AGENT. 
+ */ + +import type { + ExtractorManifest, + ExtractorRunResult, +} from "@shared/types/extractors"; +import type { CreateJobInput } from "@shared/types/jobs"; + +const API_URL = "https://search.api.careerjet.net/v4/query"; + +const DEFAULT_USER_AGENT = + "Mozilla/5.0 (compatible; JobOps/1.0; job-search pipeline)"; + +interface CareerjetJob { + title?: string; + description?: string; + company?: string; + salary?: string; + date?: string; + url?: string; + site?: string; + locations?: string; +} +interface CareerjetResponse { + type?: string; + jobs?: CareerjetJob[]; + hits?: number; + pages?: number; +} + +function asString(value: unknown): string | undefined { + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed ? trimmed : undefined; +} + +function mapJob(raw: CareerjetJob): CreateJobInput | null { + const jobUrl = asString(raw.url); + if (!jobUrl) return null; + return { + source: "careerjet", + title: asString(raw.title) ?? "Unknown Title", + employer: asString(raw.company) ?? 
"Unknown Employer", + jobUrl, + applicationLink: jobUrl, + location: asString(raw.locations), + salary: asString(raw.salary), + datePosted: asString(raw.date), + jobDescription: asString(raw.description), + companyDescription: asString(raw.site), + }; +} + +function localeForCountry(country: string): string { + const key = country.trim().toLowerCase(); + switch (key) { + case "united kingdom": + case "uk": + return "en_GB"; + case "united states": + case "usa": + case "us": + return "en_US"; + case "canada": + return "en_CA"; + case "australia": + return "en_AU"; + case "germany": + return "de_DE"; + case "france": + return "fr_FR"; + case "spain": + return "es_ES"; + case "italy": + return "it_IT"; + case "netherlands": + return "nl_NL"; + default: + return "en_GB"; + } +} + +function basicAuthorizationHeader(apiKey: string): string { + const credentials = `${apiKey}:`; + const encoded = Buffer.from(credentials, "utf8").toString("base64"); + return `Basic ${encoded}`; +} + +async function fetchPage(args: { + apiKey: string; + keywords: string; + location?: string; + page: number; + pageSize: number; + localeCode: string; + referer: string; + userIp: string; + userAgent: string; +}): Promise { + const url = new URL(API_URL); + url.searchParams.set("locale_code", args.localeCode); + url.searchParams.set("keywords", args.keywords); + if (args.location) url.searchParams.set("location", args.location); + url.searchParams.set("page", String(args.page)); + url.searchParams.set("page_size", String(args.pageSize)); + url.searchParams.set("user_ip", args.userIp); + url.searchParams.set("user_agent", args.userAgent); + + const response = await fetch(url.toString(), { + headers: { + Accept: "application/json", + Authorization: basicAuthorizationHeader(args.apiKey), + Referer: args.referer, + }, + }); + if (!response.ok) { + const snippet = (await response.text()).slice(0, 200); + throw new Error( + `Careerjet request failed with status ${response.status}${snippet ? 
`: ${snippet}` : ""}`, + ); + } + return (await response.json()) as CareerjetResponse; +} + +export const manifest: ExtractorManifest = { + id: "careerjet", + displayName: "Careerjet", + providesSources: ["careerjet"], + requiredEnvVars: [ + "CAREERJET_AFFID", + "CAREERJET_REFERER", + "CAREERJET_USER_IP", + ], + async run(context): Promise { + if (context.shouldCancel?.()) return { success: true, jobs: [] }; + + const apiKey = + context.settings.careerjetAffid?.trim() || + process.env.CAREERJET_AFFID?.trim(); + const referer = + context.settings.careerjetReferer?.trim() || + process.env.CAREERJET_REFERER?.trim(); + const userIp = + context.settings.careerjetUserIp?.trim() || + process.env.CAREERJET_USER_IP?.trim(); + const userAgent = + context.settings.careerjetUserAgent?.trim() || + process.env.CAREERJET_USER_AGENT?.trim() || + DEFAULT_USER_AGENT; + + if (!apiKey) { + return { + success: false, + jobs: [], + error: + "Careerjet requires CAREERJET_AFFID (publisher API key for Basic auth).", + }; + } + if (!referer) { + return { + success: false, + jobs: [], + error: + "Careerjet v4 requires CAREERJET_REFERER (the Referer URL of your job-search page, per Careerjet docs).", + }; + } + if (!userIp) { + return { + success: false, + jobs: [], + error: + "Careerjet v4 requires CAREERJET_USER_IP. Use an IP you have allowlisted in the Careerjet publisher dashboard (typically your server's public egress IP).", + }; + } + + const maxJobsPerTerm = context.settings.careerjetMaxJobsPerTerm + ? Number.parseInt(context.settings.careerjetMaxJobsPerTerm, 10) + : 100; + const pageSize = 50; + const localeCode = localeForCountry(context.selectedCountry || ""); + + const terms = context.searchTerms.length > 0 ? 
context.searchTerms : [""]; + const location = + context.settings.searchCities?.split("|")[0]?.trim() || undefined; + + const seen = new Set(); + const out: CreateJobInput[] = []; + + try { + for (let i = 0; i < terms.length; i += 1) { + if (context.shouldCancel?.()) break; + const term = terms[i].trim(); + context.onProgress?.({ + phase: "list", + termsProcessed: i, + termsTotal: terms.length, + currentUrl: term || "(all)", + detail: `Careerjet: term ${i + 1}/${terms.length}`, + }); + + let collected = 0; + let page = 1; + let totalPages = Number.POSITIVE_INFINITY; + while ( + collected < maxJobsPerTerm && + page <= totalPages && + page <= 10 + ) { + if (context.shouldCancel?.()) break; + const body = await fetchPage({ + apiKey, + keywords: term, + location, + page, + pageSize, + localeCode, + referer, + userIp, + userAgent, + }); + if (typeof body.pages === "number") totalPages = body.pages; + const items = Array.isArray(body.jobs) ? body.jobs : []; + if (items.length === 0) break; + for (const raw of items) { + const mapped = mapJob(raw); + if (!mapped) continue; + const key = mapped.jobUrl; + if (seen.has(key)) continue; + seen.add(key); + out.push(mapped); + collected += 1; + if (collected >= maxJobsPerTerm) break; + } + page += 1; + } + + context.onProgress?.({ + phase: "list", + termsProcessed: i + 1, + termsTotal: terms.length, + currentUrl: term || "(all)", + jobPagesProcessed: out.length, + detail: `Careerjet: completed term ${i + 1}/${terms.length} (${collected} found)`, + }); + } + } catch (error) { + const message = error instanceof Error ? 
error.message : "Unknown error"; + return { success: false, jobs: out, error: message }; + } + + return { success: true, jobs: out }; + }, +}; + +export default manifest; diff --git a/extractors/careerjet/package.json b/extractors/careerjet/package.json new file mode 100644 index 0000000..d46a7b7 --- /dev/null +++ b/extractors/careerjet/package.json @@ -0,0 +1,17 @@ +{ + "name": "careerjet-extractor", + "version": "0.0.1", + "type": "module", + "description": "Careerjet public search API extractor", + "main": "manifest.ts", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + }, + "scripts": { + "check:types": "tsc --noEmit" + } +} diff --git a/extractors/careerjet/tsconfig.json b/extractors/careerjet/tsconfig.json new file mode 100644 index 0000000..4f7ce58 --- /dev/null +++ b/extractors/careerjet/tsconfig.json @@ -0,0 +1,17 @@ +{ + "compilerOptions": { + "module": "ESNext", + "moduleResolution": "bundler", + "target": "ES2022", + "outDir": "dist", + "strict": true, + "noUnusedLocals": false, + "lib": ["ES2022", "DOM"], + "types": ["node"], + "baseUrl": ".", + "paths": { + "@shared/*": ["../../shared/src/*"] + } + }, + "include": ["./manifest.ts", "./src/**/*"] +} diff --git a/extractors/fourdayweek/README.md b/extractors/fourdayweek/README.md new file mode 100644 index 0000000..ab1b367 --- /dev/null +++ b/extractors/fourdayweek/README.md @@ -0,0 +1,10 @@ +# fourdayweek-extractor + +Pulls listings from the public [4 Day Week API](https://4dayweek.io/api/jobs). + +- No authentication required. +- Paginated JSON (up to 3 pages). Filters client-side by title + stack tags + against pipeline search terms. +- No description in listings — links point to `https://4dayweek.io/job/{slug}`. +- Rich metadata: schedule type, work-life score, salary, tech stack, level. +- Caps results per term via the `fourdayweekMaxJobsPerTerm` setting (default 100). 
diff --git a/extractors/fourdayweek/manifest.ts b/extractors/fourdayweek/manifest.ts new file mode 100644 index 0000000..16ecbdb --- /dev/null +++ b/extractors/fourdayweek/manifest.ts @@ -0,0 +1,226 @@ +/** + * 4 Day Week public jobs API. + * + * https://4dayweek.io/api/jobs?page=N + * + * No auth. Paginated JSON. No description in listing response — + * we link to https://4dayweek.io/job/{slug} for details. + * Supports category filtering server-side; we also filter + * client-side by title + stack tags against pipeline search terms. + */ + +import type { + ExtractorManifest, + ExtractorRunResult, +} from "@shared/types/extractors"; +import type { CreateJobInput } from "@shared/types/jobs"; + +const API_URL = "https://4dayweek.io/api/jobs"; +const MAX_PAGES = 3; + +interface FdwCompany { + name?: string; + slug?: string; + logo_url?: string; +} + +interface FdwRemoteAllowed { + country?: string; + continent?: string; + is_primary?: boolean; +} + +interface FdwStackItem { + name?: string; + slug?: string; +} + +interface FdwJob { + id?: string; + title?: string; + slug?: string; + company_name?: string; + company?: FdwCompany; + work_arrangement?: string; + remote_allowed?: FdwRemoteAllowed[]; + timezones?: string[]; + posted?: number; + schedule_type?: string; + stack?: FdwStackItem[]; + category?: string; + level?: string; + salary?: string; + salary_lower?: number; + salary_upper?: number; + salary_currency?: string; + salary_period?: string; + is_expired?: boolean; + work_life_score?: number; +} + +interface FdwResponse { + jobs?: FdwJob[]; + total?: number; + page?: number; + has_more?: boolean; +} + +function asString(value: unknown): string | undefined { + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed || undefined; +} + +function matchesTerm(job: FdwJob, term: string): boolean { + const lower = term.toLowerCase(); + if (job.title?.toLowerCase().includes(lower)) return true; + if 
(job.category?.toLowerCase().includes(lower)) return true; + if ( + Array.isArray(job.stack) && + job.stack.some( + (s) => typeof s.name === "string" && s.name.toLowerCase().includes(lower), + ) + ) + return true; + return false; +} + +function formatSchedule(raw: string | undefined): string { + if (!raw) return "4-day week"; + return raw.replace(/_/g, " "); +} + +function formatLocation(job: FdwJob): string { + const countries = Array.isArray(job.remote_allowed) + ? job.remote_allowed + .map((r) => r.country) + .filter((c): c is string => typeof c === "string") + : []; + if (countries.length > 0) return countries.join(", "); + return job.work_arrangement === "remote" ? "Remote" : "Unknown"; +} + +function formatSalary(job: FdwJob): string | undefined { + if (job.salary) return job.salary; + if (job.salary_lower == null && job.salary_upper == null) return undefined; + const cur = job.salary_currency ?? "USD"; + const period = job.salary_period ?? "year"; + if (job.salary_lower != null && job.salary_upper != null) { + return `${cur} ${(job.salary_lower / 100).toLocaleString()}–${(job.salary_upper / 100).toLocaleString()} / ${period}`; + } + const val = job.salary_lower ?? job.salary_upper; + return val != null + ? `${cur} ${(val / 100).toLocaleString()} / ${period}` + : undefined; +} + +function mapJob(raw: FdwJob): CreateJobInput | null { + const slug = asString(raw.slug); + if (!slug) return null; + + const jobUrl = `https://4dayweek.io/job/${slug}`; + const stackTags = Array.isArray(raw.stack) + ? raw.stack + .map((s) => s.name) + .filter((n): n is string => typeof n === "string") + : []; + + return { + source: "fourdayweek", + sourceJobId: raw.id ?? slug, + title: asString(raw.title) ?? "Unknown Title", + employer: raw.company?.name ?? raw.company_name ?? 
"Unknown Employer", + jobUrl, + applicationLink: jobUrl, + location: formatLocation(raw), + isRemote: raw.work_arrangement === "remote", + jobType: formatSchedule(raw.schedule_type), + companyLogo: raw.company?.logo_url ?? undefined, + datePosted: + typeof raw.posted === "number" + ? new Date(raw.posted * 1000).toISOString() + : undefined, + salary: formatSalary(raw), + disciplines: stackTags.length > 0 ? stackTags.join(", ") : undefined, + companyIndustry: asString(raw.category), + }; +} + +async function fetchPage(page: number): Promise { + const url = `${API_URL}?page=${page}`; + const response = await fetch(url, { + headers: { Accept: "application/json" }, + }); + if (!response.ok) { + throw new Error(`4 Day Week request failed with status ${response.status}`); + } + return (await response.json()) as FdwResponse; +} + +export const manifest: ExtractorManifest = { + id: "fourdayweek", + displayName: "4 Day Week", + providesSources: ["fourdayweek"], + async run(context): Promise { + if (context.shouldCancel?.()) return { success: true, jobs: [] }; + + const maxJobs = context.settings.fourdayweekMaxJobsPerTerm + ? Number.parseInt(context.settings.fourdayweekMaxJobsPerTerm, 10) + : 100; + + const terms = context.searchTerms.length > 0 ? context.searchTerms : []; + const seen = new Set(); + const out: CreateJobInput[] = []; + + try { + for (let page = 1; page <= MAX_PAGES; page += 1) { + if (context.shouldCancel?.()) break; + if (out.length >= maxJobs * Math.max(terms.length, 1)) break; + + context.onProgress?.({ + phase: "list", + termsProcessed: 0, + termsTotal: 1, + currentUrl: `page ${page}`, + detail: `4 Day Week: fetching page ${page}`, + }); + + const body = await fetchPage(page); + const jobs = Array.isArray(body.jobs) ? 
body.jobs : []; + + if (jobs.length === 0) break; + + for (const raw of jobs) { + if (raw.is_expired) continue; + if (terms.length > 0 && !terms.some((t) => matchesTerm(raw, t))) { + continue; + } + const mapped = mapJob(raw); + if (!mapped) continue; + const key = mapped.sourceJobId || mapped.jobUrl; + if (seen.has(key)) continue; + seen.add(key); + out.push(mapped); + } + + context.onProgress?.({ + phase: "list", + termsProcessed: 0, + termsTotal: 1, + currentUrl: `page ${page}`, + jobPagesProcessed: out.length, + detail: `4 Day Week: page ${page} done (${out.length} matched so far)`, + }); + + if (!body.has_more) break; + } + } catch (error) { + const message = error instanceof Error ? error.message : "Unknown error"; + return { success: false, jobs: out, error: message }; + } + + return { success: true, jobs: out }; + }, +}; + +export default manifest; diff --git a/extractors/fourdayweek/package.json b/extractors/fourdayweek/package.json new file mode 100644 index 0000000..54e57e4 --- /dev/null +++ b/extractors/fourdayweek/package.json @@ -0,0 +1,17 @@ +{ + "name": "fourdayweek-extractor", + "version": "0.0.1", + "type": "module", + "description": "4 Day Week public jobs API extractor", + "main": "manifest.ts", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + }, + "scripts": { + "check:types": "tsc --noEmit" + } +} diff --git a/extractors/fourdayweek/tsconfig.json b/extractors/fourdayweek/tsconfig.json new file mode 100644 index 0000000..4f7ce58 --- /dev/null +++ b/extractors/fourdayweek/tsconfig.json @@ -0,0 +1,17 @@ +{ + "compilerOptions": { + "module": "ESNext", + "moduleResolution": "bundler", + "target": "ES2022", + "outDir": "dist", + "strict": true, + "noUnusedLocals": false, + "lib": ["ES2022", "DOM"], + "types": ["node"], + "baseUrl": ".", + "paths": { + "@shared/*": ["../../shared/src/*"] + } + }, + "include": ["./manifest.ts", "./src/**/*"] +} diff --git 
a/extractors/greenhouse/README.md b/extractors/greenhouse/README.md new file mode 100644 index 0000000..eb24f3a --- /dev/null +++ b/extractors/greenhouse/README.md @@ -0,0 +1,8 @@ +# greenhouse-extractor + +Public Greenhouse Job Boards via +`GET https://boards-api.greenhouse.io/v1/boards/{company}/jobs?content=true`. + +- No auth. +- Configure target slugs via `greenhouseCompanies` (comma/newline) or + `GREENHOUSE_COMPANIES` env (e.g. `airbnb,stripe`). diff --git a/extractors/greenhouse/manifest.ts b/extractors/greenhouse/manifest.ts new file mode 100644 index 0000000..c321392 --- /dev/null +++ b/extractors/greenhouse/manifest.ts @@ -0,0 +1,188 @@ +/** + * Greenhouse public job boards API. + * + * https://developers.greenhouse.io/job-board.html + * GET https://boards-api.greenhouse.io/v1/boards/{company}/jobs?content=true + * + * No auth. Each entry in `greenhouseCompanies` is fetched independently. + */ + +import type { + ExtractorManifest, + ExtractorRunResult, +} from "@shared/types/extractors"; +import type { CreateJobInput } from "@shared/types/jobs"; + +interface GhDepartment { + id?: number; + name?: string; +} +interface GhMetadata { + name?: string; + value?: unknown; +} +interface GhJob { + id?: number; + title?: string; + absolute_url?: string; + internal_job_id?: number; + updated_at?: string; + requisition_id?: string | null; + location?: { name?: string }; + content?: string; // HTML, may be entity-encoded + metadata?: GhMetadata[]; + departments?: GhDepartment[]; + offices?: Array<{ name?: string }>; +} +interface GhResponse { + jobs?: GhJob[]; +} + +function asString(value: unknown): string | undefined { + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed ? trimmed : undefined; +} + +function readCompanies(raw: string | undefined): string[] { + if (!raw) return []; + try { + const parsed = JSON.parse(raw); + if (Array.isArray(parsed)) { + return parsed + .map((entry) => (typeof entry === "string" ? 
entry.trim() : "")) + .filter(Boolean); + } + } catch { + // fall through + } + return raw + .split(/[\n,;|]+/) + .map((entry) => entry.trim()) + .filter(Boolean); +} + +function decodeHtmlEntities(value: string): string { + return value + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/ /g, " "); +} + +function mapJob(job: GhJob, company: string): CreateJobInput | null { + const jobUrl = asString(job.absolute_url); + if (!jobUrl) return null; + + const employer = company + .split(/[-_]/) + .filter(Boolean) + .map((part) => part.charAt(0).toUpperCase() + part.slice(1)) + .join(" "); + const officeNames = + job.offices + ?.map((office) => asString(office.name)) + .filter((name): name is string => Boolean(name)) ?? []; + const departmentNames = + job.departments + ?.map((dept) => asString(dept.name)) + .filter((name): name is string => Boolean(name)) ?? []; + + const description = job.content ? decodeHtmlEntities(job.content) : undefined; + + return { + source: "greenhouse", + sourceJobId: job.id != null ? String(job.id) : undefined, + title: asString(job.title) ?? "Unknown Title", + employer: employer || company, + jobUrl, + applicationLink: jobUrl, + location: + asString(job.location?.name) ?? (officeNames.join("; ") || undefined), + jobFunction: + departmentNames.length > 0 ? 
departmentNames.join(", ") : undefined, + datePosted: asString(job.updated_at), + jobDescription: description, + }; +} + +async function fetchCompany(company: string): Promise { + const url = `https://boards-api.greenhouse.io/v1/boards/${encodeURIComponent(company)}/jobs?content=true`; + const response = await fetch(url, { + headers: { Accept: "application/json" }, + }); + if (response.status === 404) return []; + if (!response.ok) { + throw new Error( + `Greenhouse request for "${company}" failed with status ${response.status}`, + ); + } + const body = (await response.json()) as GhResponse; + return Array.isArray(body.jobs) ? body.jobs : []; +} + +export const manifest: ExtractorManifest = { + id: "greenhouse", + displayName: "Greenhouse (ATS)", + providesSources: ["greenhouse"], + async run(context): Promise { + if (context.shouldCancel?.()) return { success: true, jobs: [] }; + + const companies = readCompanies(context.settings.greenhouseCompanies); + if (companies.length === 0) { + return { + success: true, + jobs: [], + error: + "No Greenhouse companies configured. 
Set GREENHOUSE_COMPANIES or the greenhouseCompanies setting (comma- or newline-separated slugs).", + }; + } + + const seen = new Set(); + const out: CreateJobInput[] = []; + + try { + for (let i = 0; i < companies.length; i += 1) { + if (context.shouldCancel?.()) break; + const company = companies[i]; + context.onProgress?.({ + phase: "list", + termsProcessed: i, + termsTotal: companies.length, + currentUrl: company, + detail: `Greenhouse: ${company} (${i + 1}/${companies.length})`, + }); + + let added = 0; + const jobs = await fetchCompany(company); + for (const job of jobs) { + const mapped = mapJob(job, company); + if (!mapped) continue; + const key = mapped.sourceJobId || mapped.jobUrl; + if (seen.has(key)) continue; + seen.add(key); + out.push(mapped); + added += 1; + } + + context.onProgress?.({ + phase: "list", + termsProcessed: i + 1, + termsTotal: companies.length, + currentUrl: company, + jobPagesProcessed: out.length, + detail: `Greenhouse: ${company} → ${added} jobs (${out.length} total)`, + }); + } + } catch (error) { + const message = error instanceof Error ? 
error.message : "Unknown error"; + return { success: false, jobs: out, error: message }; + } + + return { success: true, jobs: out }; + }, +}; + +export default manifest; diff --git a/extractors/greenhouse/package.json b/extractors/greenhouse/package.json new file mode 100644 index 0000000..9c7ec49 --- /dev/null +++ b/extractors/greenhouse/package.json @@ -0,0 +1,17 @@ +{ + "name": "greenhouse-extractor", + "version": "0.0.1", + "type": "module", + "description": "Greenhouse public ATS extractor", + "main": "manifest.ts", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + }, + "scripts": { + "check:types": "tsc --noEmit" + } +} diff --git a/extractors/greenhouse/tsconfig.json b/extractors/greenhouse/tsconfig.json new file mode 100644 index 0000000..4f7ce58 --- /dev/null +++ b/extractors/greenhouse/tsconfig.json @@ -0,0 +1,17 @@ +{ + "compilerOptions": { + "module": "ESNext", + "moduleResolution": "bundler", + "target": "ES2022", + "outDir": "dist", + "strict": true, + "noUnusedLocals": false, + "lib": ["ES2022", "DOM"], + "types": ["node"], + "baseUrl": ".", + "paths": { + "@shared/*": ["../../shared/src/*"] + } + }, + "include": ["./manifest.ts", "./src/**/*"] +} diff --git a/extractors/himalayas/README.md b/extractors/himalayas/README.md new file mode 100644 index 0000000..e45ad5c --- /dev/null +++ b/extractors/himalayas/README.md @@ -0,0 +1,10 @@ +# himalayas-extractor + +Pulls listings from the public [Himalayas API](https://himalayas.app/jobs/api). + +- No authentication required. +- Paginates with `limit` + `offset` (50 per page, up to 5 pages / 250 jobs). +- No server-side search — filters client-side by matching title + categories + against each pipeline search term. +- All listings are flagged `isRemote: true`. +- Caps results per term via the `himalayasMaxJobsPerTerm` setting (default 100). 
diff --git a/extractors/himalayas/manifest.ts b/extractors/himalayas/manifest.ts new file mode 100644 index 0000000..a944b8a --- /dev/null +++ b/extractors/himalayas/manifest.ts @@ -0,0 +1,195 @@ +/** + * Himalayas public remote-jobs API. + * + * https://himalayas.app/jobs/api?limit=N&offset=M + * + * No auth. Returns up to `limit` results per call. No server-side + * search — we paginate and filter client-side by title + categories. + */ + +import type { + ExtractorManifest, + ExtractorRunResult, +} from "@shared/types/extractors"; +import type { CreateJobInput } from "@shared/types/jobs"; + +const API_URL = "https://himalayas.app/jobs/api"; +const PAGE_SIZE = 50; +const MAX_PAGES = 5; + +interface HimalayasJob { + title?: string; + excerpt?: string; + companyName?: string; + companySlug?: string; + companyLogo?: string; + employmentType?: string; + minSalary?: number | null; + maxSalary?: number | null; + currency?: string; + seniority?: string[]; + locationRestrictions?: string[]; + timezoneRestrictions?: number[]; + categories?: string[]; + parentCategories?: string[]; + description?: string; + pubDate?: number; + expiryDate?: number; + applicationLink?: string; + guid?: string; +} + +interface HimalayasResponse { + jobs?: HimalayasJob[]; +} + +function asString(value: unknown): string | undefined { + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed || undefined; +} + +function matchesTerm(job: HimalayasJob, term: string): boolean { + const lower = term.toLowerCase(); + if (job.title?.toLowerCase().includes(lower)) return true; + if ( + Array.isArray(job.categories) && + job.categories.some( + (c) => + typeof c === "string" && + c.toLowerCase().replace(/-/g, " ").includes(lower), + ) + ) + return true; + return false; +} + +function formatSalary(job: HimalayasJob): string | undefined { + if (job.minSalary == null && job.maxSalary == null) return undefined; + const cur = job.currency ?? 
"USD"; + if (job.minSalary != null && job.maxSalary != null) { + return `${cur} ${job.minSalary.toLocaleString()}–${job.maxSalary.toLocaleString()}`; + } + const val = job.minSalary ?? job.maxSalary; + return val != null ? `${cur} ${val.toLocaleString()}` : undefined; +} + +function mapJob(raw: HimalayasJob): CreateJobInput | null { + const jobUrl = asString(raw.applicationLink) ?? asString(raw.guid); + if (!jobUrl) return null; + + const categories = Array.isArray(raw.categories) + ? raw.categories.filter( + (c): c is string => typeof c === "string" && c.length > 0, + ) + : []; + + const locations = Array.isArray(raw.locationRestrictions) + ? raw.locationRestrictions.filter( + (l): l is string => typeof l === "string" && l.length > 0, + ) + : []; + + const datePosted = + typeof raw.pubDate === "number" + ? new Date(raw.pubDate * 1000).toISOString() + : undefined; + + return { + source: "himalayas", + sourceJobId: asString(raw.guid), + title: asString(raw.title) ?? "Unknown Title", + employer: asString(raw.companyName) ?? "Unknown Employer", + jobUrl, + applicationLink: jobUrl, + location: locations.length > 0 ? locations.join(", ") : "Remote", + isRemote: true, + jobType: asString(raw.employmentType), + companyLogo: asString(raw.companyLogo), + datePosted, + salary: formatSalary(raw), + jobDescription: asString(raw.description), + disciplines: categories.length > 0 ? categories.join(", ") : undefined, + }; +} + +async function fetchPage( + offset: number, + limit: number, +): Promise { + const url = `${API_URL}?limit=${limit}&offset=${offset}`; + const response = await fetch(url, { + headers: { Accept: "application/json" }, + }); + if (!response.ok) { + throw new Error(`Himalayas request failed with status ${response.status}`); + } + const body = (await response.json()) as HimalayasResponse; + return Array.isArray(body.jobs) ? 
body.jobs : []; +} + +export const manifest: ExtractorManifest = { + id: "himalayas", + displayName: "Himalayas", + providesSources: ["himalayas"], + async run(context): Promise { + if (context.shouldCancel?.()) return { success: true, jobs: [] }; + + const maxJobs = context.settings.himalayasMaxJobsPerTerm + ? Number.parseInt(context.settings.himalayasMaxJobsPerTerm, 10) + : 100; + + const terms = context.searchTerms.length > 0 ? context.searchTerms : []; + const seen = new Set(); + const out: CreateJobInput[] = []; + + try { + for (let page = 0; page < MAX_PAGES; page += 1) { + if (context.shouldCancel?.()) break; + if (out.length >= maxJobs * Math.max(terms.length, 1)) break; + + const offset = page * PAGE_SIZE; + context.onProgress?.({ + phase: "list", + termsProcessed: 0, + termsTotal: 1, + currentUrl: `offset ${offset}`, + detail: `Himalayas: fetching page ${page + 1}`, + }); + + const raw = await fetchPage(offset, PAGE_SIZE); + if (raw.length === 0) break; + + for (const item of raw) { + if (terms.length > 0 && !terms.some((t) => matchesTerm(item, t))) { + continue; + } + const mapped = mapJob(item); + if (!mapped) continue; + const key = mapped.sourceJobId || mapped.jobUrl; + if (seen.has(key)) continue; + seen.add(key); + out.push(mapped); + } + + context.onProgress?.({ + phase: "list", + termsProcessed: 0, + termsTotal: 1, + currentUrl: `offset ${offset}`, + jobPagesProcessed: out.length, + detail: `Himalayas: page ${page + 1} done (${out.length} matched so far)`, + }); + + if (raw.length < PAGE_SIZE) break; + } + } catch (error) { + const message = error instanceof Error ? 
error.message : "Unknown error"; + return { success: false, jobs: out, error: message }; + } + + return { success: true, jobs: out }; + }, +}; + +export default manifest; diff --git a/extractors/himalayas/package.json b/extractors/himalayas/package.json new file mode 100644 index 0000000..abd8da3 --- /dev/null +++ b/extractors/himalayas/package.json @@ -0,0 +1,17 @@ +{ + "name": "himalayas-extractor", + "version": "0.0.1", + "type": "module", + "description": "Himalayas public remote-jobs API extractor", + "main": "manifest.ts", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + }, + "scripts": { + "check:types": "tsc --noEmit" + } +} diff --git a/extractors/himalayas/tsconfig.json b/extractors/himalayas/tsconfig.json new file mode 100644 index 0000000..4f7ce58 --- /dev/null +++ b/extractors/himalayas/tsconfig.json @@ -0,0 +1,17 @@ +{ + "compilerOptions": { + "module": "ESNext", + "moduleResolution": "bundler", + "target": "ES2022", + "outDir": "dist", + "strict": true, + "noUnusedLocals": false, + "lib": ["ES2022", "DOM"], + "types": ["node"], + "baseUrl": ".", + "paths": { + "@shared/*": ["../../shared/src/*"] + } + }, + "include": ["./manifest.ts", "./src/**/*"] +} diff --git a/extractors/jobicy/README.md b/extractors/jobicy/README.md new file mode 100644 index 0000000..276f59e --- /dev/null +++ b/extractors/jobicy/README.md @@ -0,0 +1,8 @@ +# jobicy-extractor + +Pulls remote jobs from the public [Jobicy v2 feed](https://jobicy.com/api/v2/remote-jobs). + +- No authentication required. +- Each pipeline `searchTerm` is sent as a `tag`; without terms we fetch the + generic remote feed. +- Caps results via the `jobicyMaxJobsPerTerm` setting (default 100). diff --git a/extractors/jobicy/manifest.ts b/extractors/jobicy/manifest.ts new file mode 100644 index 0000000..5fb11a9 --- /dev/null +++ b/extractors/jobicy/manifest.ts @@ -0,0 +1,186 @@ +/** + * Jobicy remote-jobs feed. 
+ * + * Public, unauthenticated JSON endpoint: + * https://jobicy.com/api/v2/remote-jobs?count=50 + * + * The feed is intentionally remote-only; we still pass each `searchTerm` as a + * `tag` so the same pipeline-level term iteration drives results. We do *not* + * try to invent a country filter — Jobicy postings are remote-friendly by + * design and the registry already restricts ukOnly extractors elsewhere. + */ + +import type { + ExtractorManifest, + ExtractorRunResult, +} from "@shared/types/extractors"; +import type { CreateJobInput } from "@shared/types/jobs"; + +const API_URL = "https://jobicy.com/api/v2/remote-jobs"; + +interface JobicyRawJob { + id?: number | string; + url?: string; + jobTitle?: string; + companyName?: string; + companyLogo?: string; + jobIndustry?: string[] | string; + jobType?: string[] | string; + jobGeo?: string; + jobLevel?: string; + jobExcerpt?: string; + jobDescription?: string; + pubDate?: string; + annualSalaryMin?: number | string; + annualSalaryMax?: number | string; + salaryCurrency?: string; +} + +interface JobicyResponse { + jobs?: JobicyRawJob[]; +} + +function asString(value: unknown): string | undefined { + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed ? trimmed : undefined; +} + +function joinList(value: unknown): string | undefined { + if (Array.isArray(value)) { + const cleaned = value + .map((item) => (typeof item === "string" ? item.trim() : "")) + .filter(Boolean); + return cleaned.length > 0 ? cleaned.join(", ") : undefined; + } + return asString(value); +} + +function toNumberOrUndefined(value: unknown): number | undefined { + if (typeof value === "number" && Number.isFinite(value)) return value; + if (typeof value === "string") { + const parsed = Number.parseFloat(value); + return Number.isFinite(parsed) ? 
/**
 * Maps one raw Jobicy feed entry onto the pipeline's `CreateJobInput` shape.
 *
 * @param raw - A single entry of the feed's `jobs` array.
 * @returns The mapped job, or `null` when the entry has no usable `url`
 *   (the URL is also used as the application link).
 */
function mapJob(raw: JobicyRawJob): CreateJobInput | null {
  const jobUrl = asString(raw.url);
  if (!jobUrl) return null;

  const employer = asString(raw.companyName) ?? "Unknown Employer";
  const title = asString(raw.jobTitle) ?? "Unknown Title";
  const minSalary = toNumberOrUndefined(raw.annualSalaryMin);
  const maxSalary = toNumberOrUndefined(raw.annualSalaryMax);

  return {
    source: "jobicy",
    sourceJobId: raw.id != null ? String(raw.id) : undefined,
    title,
    employer,
    jobUrl,
    applicationLink: jobUrl,
    location: asString(raw.jobGeo) ?? "Remote",
    isRemote: true,
    jobType: joinList(raw.jobType),
    jobLevel: asString(raw.jobLevel),
    companyIndustry: joinList(raw.jobIndustry),
    companyLogo: asString(raw.companyLogo),
    datePosted: asString(raw.pubDate),
    jobDescription: asString(raw.jobDescription) ?? asString(raw.jobExcerpt),
    salaryMinAmount: minSalary,
    salaryMaxAmount: maxSalary,
    salaryCurrency: asString(raw.salaryCurrency),
    // The amounts come from the feed's `annualSalary*` fields.
    salaryInterval: minSalary || maxSalary ? "yearly" : undefined,
  };
}

/**
 * Requests one window of the Jobicy feed.
 *
 * @param tag - Optional `tag` filter; `null` requests the unfiltered feed.
 * @param count - Requested number of entries, clamped to 1..50 before sending.
 * @returns The `jobs` array of the response, or `[]` when it is not an array.
 * @throws Error when the HTTP status is not OK.
 */
async function fetchJobicy(
  tag: string | null,
  count: number,
): Promise<JobicyRawJob[]> {
  const url = new URL(API_URL);
  url.searchParams.set("count", String(Math.min(Math.max(count, 1), 50)));
  if (tag) url.searchParams.set("tag", tag);

  const response = await fetch(url.toString(), {
    headers: { Accept: "application/json" },
  });
  if (!response.ok) {
    throw new Error(`Jobicy request failed with status ${response.status}`);
  }
  const body = (await response.json()) as JobicyResponse;
  return Array.isArray(body.jobs) ? body.jobs : [];
}

/**
 * Jobicy extractor: runs every search term as a `tag` query (or one unfiltered
 * query when there are no terms) and de-duplicates across terms by
 * `sourceJobId`, falling back to the job URL.
 */
export const manifest: ExtractorManifest = {
  id: "jobicy",
  displayName: "Jobicy (Remote)",
  providesSources: ["jobicy"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting used to parse to NaN, which made every
    // `collected < maxJobsPerTerm` check false and silently returned no jobs.
    const parsedMax = Number.parseInt(
      context.settings.jobicyMaxJobsPerTerm ?? "",
      10,
    );
    const maxJobsPerTerm = Number.isNaN(parsedMax) ? 100 : parsedMax;

    const terms = context.searchTerms.length > 0 ? context.searchTerms : [null];

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i];
        const tag = term ? term.trim().toLowerCase() : null;
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: tag ?? "(all remote)",
          detail: `Jobicy: term ${i + 1}/${terms.length}`,
        });

        // Jobicy caps `count` at 50 per call; loop until we either hit the
        // requested cap or the feed runs out (length < take).
        let collected = 0;
        let safetyHops = 0;
        while (collected < maxJobsPerTerm && safetyHops < 10) {
          if (context.shouldCancel?.()) break;
          const take = Math.min(50, maxJobsPerTerm - collected);
          const raw = await fetchJobicy(tag, take);
          if (raw.length === 0) break;

          let addedThisHop = 0;
          for (const item of raw) {
            const mapped = mapJob(item);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            addedThisHop += 1;
            if (collected >= maxJobsPerTerm) break;
          }
          if (raw.length < take) break;
          // The request carries no offset, so a hop that adds nothing new is
          // the same window coming back; stop instead of re-requesting it
          // until the safety counter runs out.
          if (addedThisHop === 0) break;
          safetyHops += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: tag ?? "(all remote)",
          jobPagesProcessed: out.length,
          detail: `Jobicy: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      // Jobs gathered before the failure are still handed back.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};

export default manifest;
/**
 * Jooble aggregator API.
 *
 * https://jooble.org/api/about — `POST https://jooble.org/api/{key}` with a
 * JSON body of `{ keywords, location, page, ResultOnPage }`.
 *
 * Requires JOOBLE_API_KEY (`joobleApiKey` setting).
 */

import type {
  ExtractorManifest,
  ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";

const API_BASE = "https://jooble.org/api";

interface JoobleJob {
  id?: number | string;
  title?: string;
  location?: string;
  snippet?: string;
  salary?: string;
  source?: string;
  type?: string;
  link?: string;
  company?: string;
  updated?: string;
}
interface JoobleResponse {
  totalCount?: number;
  jobs?: JoobleJob[];
}

/** Returns the trimmed string, or `undefined` for non-strings and blank strings. */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}

/**
 * Maps one Jooble result onto `CreateJobInput`.
 *
 * @returns The mapped job, or `null` when the result has no usable `link`.
 */
function mapJob(raw: JoobleJob): CreateJobInput | null {
  const jobUrl = asString(raw.link);
  if (!jobUrl) return null;
  return {
    source: "jooble",
    sourceJobId: raw.id != null ? String(raw.id) : undefined,
    title: asString(raw.title) ?? "Unknown Title",
    employer: asString(raw.company) ?? "Unknown Employer",
    jobUrl,
    applicationLink: jobUrl,
    location: asString(raw.location),
    jobType: asString(raw.type),
    salary: asString(raw.salary),
    datePosted: asString(raw.updated),
    jobDescription: asString(raw.snippet),
    companyDescription: asString(raw.source),
  };
}

/**
 * POSTs one search page to Jooble.
 *
 * The API key is part of the request path, so it is URL-encoded; `page` and
 * `ResultOnPage` are sent as strings.
 *
 * @throws Error when the HTTP status is not OK.
 */
async function fetchPage(args: {
  apiKey: string;
  keywords: string;
  location?: string;
  page: number;
  resultOnPage: number;
}): Promise<JoobleResponse> {
  const response = await fetch(
    `${API_BASE}/${encodeURIComponent(args.apiKey)}`,
    {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Accept: "application/json",
      },
      body: JSON.stringify({
        keywords: args.keywords,
        location: args.location ?? "",
        page: String(args.page),
        ResultOnPage: String(args.resultOnPage),
      }),
    },
  );
  if (!response.ok) {
    throw new Error(`Jooble request failed with status ${response.status}`);
  }
  return (await response.json()) as JoobleResponse;
}

/**
 * Jooble extractor: pages through results for every search term, using the
 * first `searchCities` token as the location, and de-duplicates across terms
 * by `sourceJobId`, falling back to the job URL.
 */
export const manifest: ExtractorManifest = {
  id: "jooble",
  displayName: "Jooble",
  providesSources: ["jooble"],
  requiredEnvVars: ["JOOBLE_API_KEY"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const apiKey =
      context.settings.joobleApiKey?.trim() ||
      process.env.JOOBLE_API_KEY?.trim();
    if (!apiKey) {
      return {
        success: false,
        jobs: [],
        error: "Jooble extractor requires JOOBLE_API_KEY",
      };
    }

    // A non-numeric setting used to parse to NaN, which made every
    // `collected < maxJobsPerTerm` check false and silently returned no jobs.
    const parsedMax = Number.parseInt(
      context.settings.joobleMaxJobsPerTerm ?? "",
      10,
    );
    const maxJobsPerTerm = Number.isNaN(parsedMax) ? 100 : parsedMax;
    const resultOnPage = 50;

    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    const location =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `Jooble: term ${i + 1}/${terms.length}`,
        });

        let collected = 0;
        let page = 1;
        // `page < 50` is a hard stop so a misbehaving feed cannot loop forever.
        while (collected < maxJobsPerTerm && page < 50) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            keywords: term,
            location,
            page,
            resultOnPage,
          });
          // `body?.` tolerates a literal `null` JSON response.
          const items = Array.isArray(body?.jobs) ? body.jobs : [];
          if (items.length === 0) break;
          for (const raw of items) {
            const mapped = mapJob(raw);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }
          // A short page means the result set is exhausted.
          if (items.length < resultOnPage) break;
          page += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `Jooble: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      // Jobs gathered before the failure are still handed back.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};

export default manifest;
/**
 * Lever public job postings API.
 *
 * https://github.com/lever/postings-api/blob/master/README.md
 * GET https://api.lever.co/v0/postings/{company}?mode=json
 *
 * No auth. We iterate `leverCompanies` (set in Settings or LEVER_COMPANIES env)
 * and pull every active posting; downstream filtering by `searchTerms` /
 * country happens in the pipeline.
 */

import type {
  ExtractorManifest,
  ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";

interface LeverCategories {
  team?: string;
  department?: string;
  commitment?: string;
  location?: string;
  allLocations?: string[];
}
interface LeverPosting {
  id?: string;
  text?: string;
  hostedUrl?: string;
  applyUrl?: string;
  description?: string;
  descriptionPlain?: string;
  categories?: LeverCategories;
  createdAt?: number;
  workplaceType?: string;
}

/** Returns the trimmed string, or `undefined` for non-strings and blank strings. */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}

/**
 * Parses the configured company slugs.
 *
 * Accepts either a JSON array of strings or a list delimited by newline,
 * comma, semicolon or pipe. Slugs are trimmed, lower-cased and de-duplicated
 * (first occurrence wins) so the same board is never fetched twice.
 */
function readCompanies(raw: string | undefined): string[] {
  if (!raw) return [];

  let jsonEntries: unknown[] | undefined;
  try {
    const parsed: unknown = JSON.parse(raw);
    if (Array.isArray(parsed)) jsonEntries = parsed;
  } catch {
    // Not JSON: fall through to delimited-list parsing below.
  }

  const candidates: unknown[] = jsonEntries ?? raw.split(/[\n,;|]+/);
  const slugs = candidates
    .map((entry) => (typeof entry === "string" ? entry.trim().toLowerCase() : ""))
    .filter((entry) => entry.length > 0);
  return [...new Set(slugs)];
}

/**
 * Picks a display location: the non-empty `allLocations` entries joined with
 * "; ", otherwise the single `location` category.
 */
function locationFor(posting: LeverPosting): string | undefined {
  const cats = posting.categories;
  if (!cats) return undefined;
  if (Array.isArray(cats.allLocations) && cats.allLocations.length > 0) {
    const joined = cats.allLocations.filter(Boolean).join("; ");
    // An array made only of blanks used to produce "" here; fall back instead.
    if (joined) return joined;
  }
  return asString(cats.location);
}

/**
 * Converts an epoch-millisecond value to ISO-8601, or `undefined` when the
 * value is not a number or is outside the range `Date` can represent
 * (`toISOString` throws a RangeError for an invalid date).
 */
function toIsoDate(epochMs: unknown): string | undefined {
  if (typeof epochMs !== "number") return undefined;
  const date = new Date(epochMs);
  return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
}

/**
 * Maps one Lever posting onto `CreateJobInput`.
 *
 * @param posting - Raw posting from the company feed.
 * @param company - Company slug; its hyphen-separated parts are capitalized to
 *   form the employer name (e.g. `acme-corp` becomes `Acme Corp`).
 * @returns The mapped job, or `null` when the posting has no `hostedUrl`.
 */
function mapPosting(
  posting: LeverPosting,
  company: string,
): CreateJobInput | null {
  const jobUrl = asString(posting.hostedUrl);
  if (!jobUrl) return null;
  const employer = company
    .split("-")
    .filter(Boolean)
    .map((part) => part.charAt(0).toUpperCase() + part.slice(1))
    .join(" ");
  return {
    source: "lever",
    sourceJobId: asString(posting.id),
    title: asString(posting.text) ?? "Unknown Title",
    employer: employer || company,
    jobUrl,
    applicationLink: asString(posting.applyUrl) ?? jobUrl,
    location: locationFor(posting),
    jobType: asString(posting.categories?.commitment),
    jobFunction: asString(posting.categories?.team),
    companyIndustry: asString(posting.categories?.department),
    // `asString` also guards against a non-string `workplaceType` at runtime.
    isRemote:
      asString(posting.workplaceType)?.toLowerCase() === "remote"
        ? true
        : undefined,
    datePosted: toIsoDate(posting.createdAt),
    jobDescription:
      asString(posting.descriptionPlain) ?? asString(posting.description),
  };
}

/**
 * Fetches every posting of one company.
 *
 * @returns The postings array; `[]` for an unknown slug (HTTP 404) or a
 *   non-array body.
 * @throws Error for any other non-OK status.
 */
async function fetchCompany(company: string): Promise<LeverPosting[]> {
  const url = `https://api.lever.co/v0/postings/${encodeURIComponent(company)}?mode=json`;
  const response = await fetch(url, {
    headers: { Accept: "application/json" },
  });
  if (response.status === 404) return [];
  if (!response.ok) {
    throw new Error(
      `Lever request for "${company}" failed with status ${response.status}`,
    );
  }
  const body = (await response.json()) as unknown;
  return Array.isArray(body) ? (body as LeverPosting[]) : [];
}

/**
 * Lever extractor: pulls all postings of every configured company and
 * de-duplicates by posting id, falling back to the hosted URL.
 */
export const manifest: ExtractorManifest = {
  id: "lever",
  displayName: "Lever (ATS)",
  providesSources: ["lever"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // The setting wins; the env variable named in the message below is the
    // fallback, matching how the key-based extractors resolve their config.
    const companies = readCompanies(
      context.settings.leverCompanies?.trim() || process.env.LEVER_COMPANIES,
    );
    if (companies.length === 0) {
      return {
        success: true,
        jobs: [],
        error:
          "No Lever companies configured. Set LEVER_COMPANIES or the leverCompanies setting (comma- or newline-separated slugs).",
      };
    }

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let i = 0; i < companies.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const company = companies[i];
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: companies.length,
          currentUrl: company,
          detail: `Lever: ${company} (${i + 1}/${companies.length})`,
        });

        let added = 0;
        const postings = await fetchCompany(company);
        for (const posting of postings) {
          const mapped = mapPosting(posting, company);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
          added += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: companies.length,
          currentUrl: company,
          jobPagesProcessed: out.length,
          detail: `Lever: ${company} → ${added} jobs (${out.length} total)`,
        });
      }
    } catch (error) {
      // Jobs gathered before the failure are still handed back.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};

export default manifest;
/**
 * Reed.co.uk Jobseeker API.
 *
 * https://www.reed.co.uk/developers/jobseeker
 * GET https://www.reed.co.uk/api/1.0/search?...
 * HTTP Basic with the API key as the username and an empty password.
 *
 * Requires REED_API_KEY (`reedApiKey` setting). The catalog gates this source
 * to UK only via `isSourceAllowedForCountry`.
 */

import type {
  ExtractorManifest,
  ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";

const API_URL = "https://www.reed.co.uk/api/1.0/search";

interface ReedJob {
  jobId?: number;
  jobTitle?: string;
  employerName?: string;
  employerProfileUrl?: string;
  jobDescription?: string;
  jobUrl?: string;
  locationName?: string;
  date?: string;
  expirationDate?: string;
  applications?: number;
  currency?: string;
  minimumSalary?: number;
  maximumSalary?: number;
  yearlyMinimumSalary?: number;
  yearlyMaximumSalary?: number;
}
interface ReedResponse {
  totalResults?: number;
  results?: ReedJob[];
}

/** Returns the trimmed string, or `undefined` for non-strings and blank strings. */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}

/** Returns the value when it is a number, otherwise `undefined`. */
function asNumber(value: unknown): number | undefined {
  return typeof value === "number" ? value : undefined;
}

/**
 * Maps one Reed result onto `CreateJobInput`.
 *
 * Salary handling: when Reed supplies either `yearly*Salary` figure, those
 * figures are reported with a "yearly" interval. Otherwise the plain
 * `minimumSalary` / `maximumSalary` are passed through with no interval,
 * because nothing in the payload read here says what period they cover.
 * (Previously the plain amounts were labelled "yearly" whenever
 * `yearlyMinimumSalary` was present, pairing the label with the wrong figures.)
 *
 * @returns The mapped job, or `null` when the result has no `jobUrl`.
 */
function mapJob(raw: ReedJob): CreateJobInput | null {
  const jobUrl = asString(raw.jobUrl);
  if (!jobUrl) return null;

  const yearlyMin = asNumber(raw.yearlyMinimumSalary);
  const yearlyMax = asNumber(raw.yearlyMaximumSalary);
  const hasYearly = yearlyMin !== undefined || yearlyMax !== undefined;

  return {
    source: "reed",
    sourceJobId: raw.jobId != null ? String(raw.jobId) : undefined,
    title: asString(raw.jobTitle) ?? "Unknown Title",
    employer: asString(raw.employerName) ?? "Unknown Employer",
    employerUrl: asString(raw.employerProfileUrl),
    jobUrl,
    applicationLink: jobUrl,
    location: asString(raw.locationName),
    datePosted: asString(raw.date),
    deadline: asString(raw.expirationDate),
    jobDescription: asString(raw.jobDescription),
    salaryMinAmount: hasYearly ? yearlyMin : asNumber(raw.minimumSalary),
    salaryMaxAmount: hasYearly ? yearlyMax : asNumber(raw.maximumSalary),
    salaryCurrency: asString(raw.currency) ?? "GBP",
    salaryInterval: hasYearly ? "yearly" : undefined,
  };
}

/**
 * Fetches one page of search results.
 *
 * Authenticates with HTTP Basic, using the API key as the username and an
 * empty password.
 *
 * @throws Error when the HTTP status is not OK.
 */
async function fetchPage(args: {
  apiKey: string;
  keywords: string;
  locationName?: string;
  resultsToTake: number;
  resultsToSkip: number;
}): Promise<ReedResponse> {
  const url = new URL(API_URL);
  url.searchParams.set("keywords", args.keywords);
  if (args.locationName)
    url.searchParams.set("locationName", args.locationName);
  url.searchParams.set("resultsToTake", String(args.resultsToTake));
  url.searchParams.set("resultsToSkip", String(args.resultsToSkip));

  const auth = Buffer.from(`${args.apiKey}:`).toString("base64");
  const response = await fetch(url.toString(), {
    headers: {
      Accept: "application/json",
      Authorization: `Basic ${auth}`,
    },
  });
  if (!response.ok) {
    throw new Error(`Reed request failed with status ${response.status}`);
  }
  return (await response.json()) as ReedResponse;
}

/**
 * Reed extractor: pages through results for every search term, using the
 * first `searchCities` token as the location, and de-duplicates across terms
 * by `sourceJobId`, falling back to the job URL.
 */
export const manifest: ExtractorManifest = {
  id: "reed",
  displayName: "Reed",
  providesSources: ["reed"],
  requiredEnvVars: ["REED_API_KEY"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const apiKey =
      context.settings.reedApiKey?.trim() || process.env.REED_API_KEY?.trim();
    if (!apiKey) {
      return {
        success: false,
        jobs: [],
        error: "Reed extractor requires REED_API_KEY",
      };
    }

    // A non-numeric setting used to parse to NaN, which made every
    // `collected < maxJobsPerTerm` check false and silently returned no jobs.
    const parsedMax = Number.parseInt(
      context.settings.reedMaxJobsPerTerm ?? "",
      10,
    );
    const maxJobsPerTerm = Number.isNaN(parsedMax) ? 100 : parsedMax;
    // Reed accepts up to 100 per page.
    const pageSize = Math.min(100, maxJobsPerTerm);

    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    const locationName =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `Reed: term ${i + 1}/${terms.length}`,
        });

        let collected = 0;
        let resultsToSkip = 0;
        while (collected < maxJobsPerTerm) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            keywords: term,
            locationName,
            resultsToTake: pageSize,
            resultsToSkip,
          });
          // `body?.` tolerates a literal `null` JSON response.
          const items = Array.isArray(body?.results) ? body.results : [];
          if (items.length === 0) break;
          for (const raw of items) {
            const mapped = mapJob(raw);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }
          // A short page means the result set is exhausted.
          if (items.length < pageSize) break;
          resultsToSkip += pageSize;
          // Hard stop so a misbehaving feed cannot loop forever.
          if (resultsToSkip > 5000) break;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `Reed: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      // Jobs gathered before the failure are still handed back.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};

export default manifest;
/**
 * Remote OK public feed.
 *
 * https://remoteok.com/api — single JSON endpoint that returns the entire
 * active remote-jobs board in one shot. The first array element is metadata
 * (legal/attribution); jobs follow.
 *
 * No auth, no server-side pagination, no per-term query — we fetch once per
 * pipeline run and apply each `searchTerm` as a case-insensitive filter over
 * `position` + `tags` so the orchestrator's per-term iteration still works.
 *
 * Per Remote OK's TOS we send a descriptive User-Agent so they can identify
 * traffic. The legal/attribution element carries no posting, so it is skipped
 * when the job list is built.
 */

import type {
  ExtractorManifest,
  ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";

const API_URL = "https://remoteok.com/api";
const USER_AGENT =
  "Mozilla/5.0 (compatible; JobOps/1.0; +https://github.com/) job-search pipeline";

interface RemoteOkJob {
  id?: string | number;
  slug?: string;
  position?: string;
  company?: string;
  company_logo?: string;
  logo?: string;
  location?: string;
  tags?: string[];
  description?: string;
  url?: string;
  apply_url?: string;
  date?: string;
  epoch?: number;
  salary_min?: number;
  salary_max?: number;
}

interface RemoteOkLegalEntry {
  legal?: string;
  last_updated?: number;
}

type RemoteOkResponseEntry = RemoteOkJob | RemoteOkLegalEntry;

/** Returns the trimmed string, or `undefined` for non-strings and blank strings. */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}

/**
 * Tells job entries apart from the legal/metadata entry by looking for any of
 * the job-identifying keys.
 *
 * Accepts `unknown` because the array comes straight from `response.json()`:
 * the `in` operator throws on `null` and primitives, so those are rejected
 * first instead of failing the whole fetch.
 */
function isJobEntry(entry: unknown): entry is RemoteOkJob {
  if (typeof entry !== "object" || entry === null) return false;
  return (
    "id" in entry || "position" in entry || "url" in entry || "slug" in entry
  );
}

/**
 * Case-insensitive substring match of `normalizedTerm` (already lower-cased by
 * the caller) against the job's position and tags. An empty term matches all.
 */
function tagMatchesTerm(job: RemoteOkJob, normalizedTerm: string): boolean {
  if (!normalizedTerm) return true;
  const haystack = [
    job.position ?? "",
    ...(Array.isArray(job.tags) ? job.tags : []),
  ]
    .join(" ")
    .toLowerCase();
  return haystack.includes(normalizedTerm);
}

/**
 * Maps one Remote OK entry onto `CreateJobInput`.
 *
 * @returns The mapped job, or `null` when neither `url` nor `apply_url` is set.
 */
function mapJob(job: RemoteOkJob): CreateJobInput | null {
  const jobUrl = asString(job.url) ?? asString(job.apply_url);
  if (!jobUrl) return null;

  // Remote OK reports salary as raw numbers; 0 means "not specified".
  const minSalary =
    typeof job.salary_min === "number" && job.salary_min > 0
      ? job.salary_min
      : undefined;
  const maxSalary =
    typeof job.salary_max === "number" && job.salary_max > 0
      ? job.salary_max
      : undefined;

  const tags = Array.isArray(job.tags)
    ? job.tags.filter((tag): tag is string => typeof tag === "string")
    : [];

  return {
    source: "remoteok",
    sourceJobId: job.id != null ? String(job.id) : asString(job.slug),
    title: asString(job.position) ?? "Unknown Title",
    employer: asString(job.company) ?? "Unknown Employer",
    jobUrl,
    applicationLink: asString(job.apply_url) ?? jobUrl,
    location: asString(job.location) ?? "Remote",
    isRemote: true,
    datePosted: asString(job.date),
    jobDescription: asString(job.description),
    companyLogo: asString(job.company_logo) ?? asString(job.logo),
    disciplines: tags.length > 0 ? tags.join(", ") : undefined,
    salaryMinAmount: minSalary,
    salaryMaxAmount: maxSalary,
    salaryCurrency: minSalary || maxSalary ? "USD" : undefined,
    salaryInterval: minSalary || maxSalary ? "yearly" : undefined,
  };
}

/**
 * Downloads the whole board and returns only the job entries.
 *
 * @throws Error when the HTTP status is not OK.
 */
async function fetchAll(): Promise<RemoteOkJob[]> {
  const response = await fetch(API_URL, {
    headers: {
      Accept: "application/json",
      "User-Agent": USER_AGENT,
    },
  });
  if (!response.ok) {
    throw new Error(`Remote OK request failed with status ${response.status}`);
  }
  const body: unknown = await response.json();
  if (!Array.isArray(body)) return [];
  return body.filter(isJobEntry);
}

/**
 * Remote OK extractor: fetches the board once, then filters it per search
 * term. A job is emitted at most once per run, under the first matching term.
 */
export const manifest: ExtractorManifest = {
  id: "remoteok",
  displayName: "Remote OK",
  providesSources: ["remoteok"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting used to parse to NaN, which made
    // `collected >= maxJobsPerTerm` always false and disabled the cap.
    const parsedMax = Number.parseInt(
      context.settings.remoteokMaxJobsPerTerm ?? "",
      10,
    );
    const maxJobsPerTerm = Number.isNaN(parsedMax) ? 100 : parsedMax;

    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];

    let allJobs: RemoteOkJob[];
    try {
      allJobs = await fetchAll();
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: [], error: message };
    }

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    for (let i = 0; i < terms.length; i += 1) {
      if (context.shouldCancel?.()) break;
      const term = terms[i].trim();
      const normalizedTerm = term.toLowerCase();
      context.onProgress?.({
        phase: "list",
        termsProcessed: i,
        termsTotal: terms.length,
        currentUrl: term || "(all remote)",
        detail: `Remote OK: term ${i + 1}/${terms.length}`,
      });

      let collected = 0;
      for (const job of allJobs) {
        if (collected >= maxJobsPerTerm) break;
        if (!tagMatchesTerm(job, normalizedTerm)) continue;
        const mapped = mapJob(job);
        if (!mapped) continue;
        const key = mapped.sourceJobId || mapped.jobUrl;
        if (seen.has(key)) continue;
        seen.add(key);
        out.push(mapped);
        collected += 1;
      }

      context.onProgress?.({
        phase: "list",
        termsProcessed: i + 1,
        termsTotal: terms.length,
        currentUrl: term || "(all remote)",
        jobPagesProcessed: out.length,
        detail: `Remote OK: completed term ${i + 1}/${terms.length} (${collected} matched)`,
      });
    }

    return { success: true, jobs: out };
  },
};

export default manifest;
"moduleResolution": "bundler", + "target": "ES2022", + "outDir": "dist", + "strict": true, + "noUnusedLocals": false, + "lib": ["ES2022", "DOM"], + "types": ["node"], + "baseUrl": ".", + "paths": { + "@shared/*": ["../../shared/src/*"] + } + }, + "include": ["./manifest.ts", "./src/**/*"] +} diff --git a/extractors/remotive/README.md b/extractors/remotive/README.md new file mode 100644 index 0000000..dd25c5c --- /dev/null +++ b/extractors/remotive/README.md @@ -0,0 +1,9 @@ +# remotive-extractor + +Pulls listings from the public [Remotive API](https://remotive.com/api/remote-jobs). + +- No authentication required. +- Each pipeline `searchTerm` is passed as the `search` query parameter; + without terms we fetch the generic remote feed. +- Caps results per term via the `remotiveMaxJobsPerTerm` setting (default 100). +- All listings are flagged `isRemote: true`. diff --git a/extractors/remotive/manifest.ts b/extractors/remotive/manifest.ts new file mode 100644 index 0000000..f3c2883 --- /dev/null +++ b/extractors/remotive/manifest.ts @@ -0,0 +1,153 @@ +/** + * Remotive public remote-jobs API. + * + * https://remotive.com/api/remote-jobs?limit=N&search=term + * + * No auth. Returns up to `limit` results per call with a `search` keyword + * filter. We iterate pipeline search terms as the `search` parameter. 
/**
 * Remotive public remote-jobs API.
 *
 * https://remotive.com/api/remote-jobs?limit=N&search=term
 *
 * No auth. Returns up to `limit` results per call with a `search` keyword
 * filter. We iterate pipeline search terms as the `search` parameter.
 */

import type {
  ExtractorManifest,
  ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";

const API_URL = "https://remotive.com/api/remote-jobs";

interface RemotiveJob {
  id?: number;
  url?: string;
  title?: string;
  company_name?: string;
  company_logo?: string;
  category?: string;
  tags?: string[];
  job_type?: string;
  publication_date?: string;
  candidate_required_location?: string;
  salary?: string;
  description?: string;
}

interface RemotiveResponse {
  jobs?: RemotiveJob[];
}

/** Returns the trimmed string, or `undefined` for non-strings and blank strings. */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}

/** Turns Remotive's snake_case job type (e.g. `full_time`) into spaced words. */
function normalizeJobType(raw: string | undefined): string | undefined {
  if (!raw) return undefined;
  return raw.replace(/_/g, " ").trim() || undefined;
}

/**
 * Maps one Remotive entry onto `CreateJobInput`.
 *
 * @returns The mapped job, or `null` when the entry has no usable `url`.
 */
function mapJob(raw: RemotiveJob): CreateJobInput | null {
  const jobUrl = asString(raw.url);
  if (!jobUrl) return null;

  const tags = Array.isArray(raw.tags)
    ? raw.tags.filter((t): t is string => typeof t === "string" && t.length > 0)
    : [];

  return {
    source: "remotive",
    sourceJobId: raw.id != null ? String(raw.id) : undefined,
    title: asString(raw.title) ?? "Unknown Title",
    employer: asString(raw.company_name) ?? "Unknown Employer",
    jobUrl,
    applicationLink: jobUrl,
    location: asString(raw.candidate_required_location) ?? "Remote",
    isRemote: true,
    jobType: normalizeJobType(raw.job_type),
    companyIndustry: asString(raw.category),
    companyLogo: asString(raw.company_logo),
    datePosted: asString(raw.publication_date),
    salary: asString(raw.salary),
    jobDescription: asString(raw.description),
    disciplines: tags.length > 0 ? tags.join(", ") : undefined,
  };
}

/**
 * Fetches one batch of listings.
 *
 * @param search - Optional keyword; `null` requests the unfiltered feed.
 * @param limit - Requested number of entries, clamped to 1..100 before sending.
 * @returns The `jobs` array of the response, or `[]` when it is not an array.
 * @throws Error when the HTTP status is not OK.
 */
async function fetchJobs(
  search: string | null,
  limit: number,
): Promise<RemotiveJob[]> {
  const url = new URL(API_URL);
  url.searchParams.set("limit", String(Math.min(Math.max(limit, 1), 100)));
  if (search) url.searchParams.set("search", search);

  const response = await fetch(url.toString(), {
    headers: { Accept: "application/json" },
  });
  if (!response.ok) {
    throw new Error(`Remotive request failed with status ${response.status}`);
  }
  const body = (await response.json()) as RemotiveResponse;
  return Array.isArray(body?.jobs) ? body.jobs : [];
}

/**
 * Remotive extractor: one request per search term (or one unfiltered request
 * when there are no terms), de-duplicated across terms by `sourceJobId`,
 * falling back to the job URL.
 */
export const manifest: ExtractorManifest = {
  id: "remotive",
  displayName: "Remotive",
  providesSources: ["remotive"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting used to parse to NaN, which was sent to the API
    // as `limit=NaN` and also disabled the per-term cap below.
    const parsedMax = Number.parseInt(
      context.settings.remotiveMaxJobsPerTerm ?? "",
      10,
    );
    const maxJobsPerTerm = Number.isNaN(parsedMax) ? 100 : parsedMax;

    const terms = context.searchTerms.length > 0 ? context.searchTerms : [null];

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i];
        const search = term ? term.trim() : null;
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: search ?? "(all remote)",
          detail: `Remotive: term ${i + 1}/${terms.length}`,
        });

        const raw = await fetchJobs(search, maxJobsPerTerm);
        let collected = 0;
        for (const item of raw) {
          if (collected >= maxJobsPerTerm) break;
          const mapped = mapJob(item);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
          collected += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: search ?? "(all remote)",
          jobPagesProcessed: out.length,
          detail: `Remotive: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      // Jobs gathered before the failure are still handed back.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};

export default manifest;
/**
 * Narrows an untyped API field to usable text.
 *
 * @param value - Any value taken from the decoded response.
 * @returns The whitespace-trimmed string, or `undefined` when `value` is not a
 *   string or is blank after trimming.
 */
function asString(value: unknown): string | undefined {
  const text = typeof value === "string" ? value.trim() : "";
  return text.length > 0 ? text : undefined;
}
/**
 * Reports whether any location name marks the posting as remote-friendly.
 *
 * @param locations - Location entries of a Muse job; may be missing or empty.
 * @returns `undefined` when there are no locations to inspect; otherwise
 *   `true` if at least one name contains the standalone word "flexible" or
 *   "remote" (case-insensitive), else `false`.
 */
function isRemoteFromLocations(
  locations: MuseLocation[] | undefined,
): boolean | undefined {
  if (!locations || locations.length === 0) return undefined;
  // The alternation is grouped so that both word boundaries apply to both
  // words. Ungrouped, `\bflexible|remote\b` parses as `\bflexible` OR
  // `remote\b` and also matches names such as "Flexibleville" or "Fremote".
  const remotePattern = /\b(?:flexible|remote)\b/i;
  return locations.some(
    (loc) => typeof loc.name === "string" && remotePattern.test(loc.name),
  );
}
/**
 * The Muse extractor manifest.
 *
 * `run` pages through the public jobs endpoint once per pipeline search term
 * (a single unfiltered pass when no terms are configured). Each term is sent
 * as a title-cased `category`; the first `searchCities` token, if any, is sent
 * as `location`. Results are mapped to `CreateJobInput` and de-duplicated
 * across terms by `sourceJobId`, falling back to `jobUrl`.
 *
 * Returns `success: false` with the jobs gathered so far when a request
 * throws; a cancellation stops the loops and returns what was collected.
 */
export const manifest: ExtractorManifest = {
  id: "themuse",
  displayName: "The Muse",
  providesSources: ["themuse"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const apiKey = context.settings.themuseApiKey?.trim() || undefined;
    // A non-numeric setting parses to NaN, and `collected < NaN` is always
    // false, so the paging loop would never run and every term would silently
    // yield zero jobs. Fall back to the default cap instead.
    const rawMax = context.settings.themuseMaxJobsPerTerm;
    const parsedMax = rawMax ? Number.parseInt(rawMax, 10) : Number.NaN;
    const maxJobsPerTerm = Number.isNaN(parsedMax) ? 100 : parsedMax;

    // An empty term means "no category filter".
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    const locationHint =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `The Muse: term ${i + 1}/${terms.length}`,
        });

        let collected = 0;
        let page = 0;
        let pageCount = Number.POSITIVE_INFINITY;
        // The Muse returns page_count; pages are additionally capped at 100 to
        // avoid runaway loops if the API misbehaves. The term is tried as a
        // category first and, if the very first page is empty, the category
        // filter is dropped once so an unknown category doesn't silently nuke
        // the entire term.
        let categoryToUse: string | undefined = toMuseCategory(term);
        let droppedCategory = false;
        while (collected < maxJobsPerTerm && page < pageCount && page < 100) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            page,
            category: categoryToUse,
            location: locationHint,
          });
          if (typeof body.page_count === "number") {
            pageCount = body.page_count;
          }
          const results = Array.isArray(body.results) ? body.results : [];
          if (results.length === 0) {
            if (page === 0 && categoryToUse && !droppedCategory) {
              categoryToUse = undefined;
              droppedCategory = true;
              // The page count reported for the filtered query no longer
              // applies; retry page 0 without the filter.
              pageCount = Number.POSITIVE_INFINITY;
              continue;
            }
            break;
          }

          for (const item of results) {
            const mapped = mapJob(item);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }
          page += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `The Muse: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};
Set `USAJOBS_USER_AGENT` to a **real contact email** (same one you used for registration is typical). This is required by their terms, not optional branding—the HTTP `User-Agent` header must identify you. + +Reference material on the developer site includes API Reference, tutorials, and code lists (locations, pay plans, etc.). + +## Configuration + +| Env / setting | Required | Notes | +| --- | --- | --- | +| `USAJOBS_API_KEY` / `usajobsApiKey` | yes | From the email USAJOBS sends after API access is granted | +| `USAJOBS_USER_AGENT` / `usajobsUserAgent` | yes | Real contact email per USAJOBS TOS | +| `usajobsMaxJobsPerTerm` | no | Per-term cap (default 100) | + +The orchestrator's country gating restricts this source to the United States. diff --git a/extractors/usajobs/manifest.ts b/extractors/usajobs/manifest.ts new file mode 100644 index 0000000..175fcb1 --- /dev/null +++ b/extractors/usajobs/manifest.ts @@ -0,0 +1,263 @@ +/** + * USAJOBS public search API. + * + * https://developer.usajobs.gov/api-reference/get-api-search + * + * Requires: + * - USAJOBS_API_KEY (`usajobsApiKey` setting) + * - USAJOBS_USER_AGENT — must be a real contact email per their TOS + * + * The orchestrator already gates this source to United States via + * `isSourceAllowedForCountry`, so we don't re-validate country here. 
/**
 * Returns `value` as trimmed text when it is a non-blank string.
 *
 * @param value - Any field read from the decoded API response.
 * @returns The trimmed string, or `undefined` for non-strings and for strings
 *   that are empty once trimmed.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const cleaned = value.trim();
    if (cleaned !== "") {
      return cleaned;
    }
  }
  return undefined;
}
// Lower-cased rate-interval code -> pay-period label.
// NOTE(review): OPM pay-basis tables appear to define "PW" as piece work
// rather than a weekly rate — confirm this entry against the USAJOBS code list.
const RATE_INTERVAL_LABELS: ReadonlyMap<string, string> = new Map([
  ["py", "yearly"],
  ["pa", "yearly"],
  ["ph", "hourly"],
  ["pd", "daily"],
  ["pm", "monthly"],
  ["pw", "weekly"],
]);

/**
 * Translates a rate-interval code into a pay-period label.
 *
 * @param code - Interval code from the remuneration entry; matched
 *   case-insensitively.
 * @returns "yearly", "hourly", "daily", "monthly" or "weekly" for a known
 *   code; `undefined` when the code is missing, empty, or not recognised.
 */
function mapInterval(code: string | undefined): string | undefined {
  if (!code) {
    return undefined;
  }
  return RATE_INTERVAL_LABELS.get(code.toLowerCase());
}
/**
 * USAJOBS extractor manifest.
 *
 * `run` requires an API key and a User-Agent value, read from settings first
 * and from the `USAJOBS_API_KEY` / `USAJOBS_USER_AGENT` environment variables
 * otherwise; without both it returns `success: false` and makes no request.
 * It then pages through the search endpoint once per pipeline search term
 * (a single empty-keyword pass when no terms are configured), forwarding the
 * first `searchCities` token as `LocationName`. Results are mapped to
 * `CreateJobInput` and de-duplicated across terms by `sourceJobId`, falling
 * back to `jobUrl`.
 *
 * Returns `success: false` with the jobs gathered so far when a request
 * throws; a cancellation stops the loops and returns what was collected.
 */
export const manifest: ExtractorManifest = {
  id: "usajobs",
  displayName: "USAJOBS",
  providesSources: ["usajobs"],
  requiredEnvVars: ["USAJOBS_API_KEY", "USAJOBS_USER_AGENT"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const apiKey =
      context.settings.usajobsApiKey?.trim() ||
      process.env.USAJOBS_API_KEY?.trim();
    const userAgent =
      context.settings.usajobsUserAgent?.trim() ||
      process.env.USAJOBS_USER_AGENT?.trim();
    if (!apiKey || !userAgent) {
      return {
        success: false,
        jobs: [],
        error:
          "USAJOBS extractor requires USAJOBS_API_KEY and USAJOBS_USER_AGENT (a contact email)",
      };
    }

    // A non-numeric setting parses to NaN, and `collected < NaN` is always
    // false, so the paging loop would never run and every term would silently
    // yield zero jobs. Fall back to the default cap instead.
    const rawMax = context.settings.usajobsMaxJobsPerTerm;
    const parsedMax = rawMax ? Number.parseInt(rawMax, 10) : Number.NaN;
    const maxJobsPerTerm = Number.isNaN(parsedMax) ? 100 : parsedMax;
    // USAJOBS caps page size at 500, but smaller pages are friendlier on retry.
    const resultsPerPage = 50;

    // An empty term means "no keyword filter".
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    const locationName =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `USAJOBS: term ${i + 1}/${terms.length}`,
        });

        let collected = 0;
        let page = 1; // Pages are requested starting from 1.
        let total = Number.POSITIVE_INFINITY;
        // Stop at the per-term cap, once the reported total is exhausted, or
        // at a hard page ceiling that bounds the loop if the API misbehaves.
        while (
          collected < maxJobsPerTerm &&
          (page - 1) * resultsPerPage < total &&
          page < 200
        ) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            userAgent,
            keyword: term,
            locationName,
            page,
            resultsPerPage,
          });
          if (typeof body.SearchResult?.SearchResultCountAll === "number") {
            total = body.SearchResult.SearchResultCountAll;
          }
          const items = body.SearchResult?.SearchResultItems ?? [];
          if (items.length === 0) break;
          for (const item of items) {
            const mapped = mapJob(item);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }
          // A short page is treated as the last one.
          if (items.length < resultsPerPage) break;
          page += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `USAJOBS: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};
/**
 * Extracts the text content of the first `<tag>…</tag>` element in `xml`.
 *
 * This is a lightweight regex lookup, not an XML parser: it does not handle
 * nested elements of the same name or self-closing tags.
 *
 * @param xml - XML fragment to search (for example one RSS `<item>` body).
 * @param tag - Element name, optionally namespaced (for example "pubDate").
 * @returns The element text with CDATA wrappers removed and surrounding
 *   whitespace trimmed, or `undefined` when the element is absent or empty.
 */
function xmlText(xml: string, tag: string): string | undefined {
  // Escape regex metacharacters so the tag name is always matched literally.
  const name = tag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // `(?:\s[^>]*)?` allows attributes but keeps `<type>` from matching
  // `<types>`; the explicit closing tag bounds the lazy capture.
  const pattern = new RegExp(`<${name}(?:\\s[^>]*)?>([\\s\\S]*?)</${name}>`);
  const inner = pattern.exec(xml)?.[1];
  if (!inner) return undefined;
  // Unwrap `<![CDATA[ … ]]>` sections, keeping their contents verbatim.
  return (
    inner.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1").trim() || undefined
  );
}
// Named character references decoded by `decodeHtmlEntities`.
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Decodes the common HTML character references in `html`.
 *
 * Handles `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;` and numeric references
 * (`&#39;`, `&#x27;`). Unknown or out-of-range references are left untouched.
 *
 * @param html - Entity-encoded text.
 * @returns The text with recognised references replaced by their characters.
 */
function decodeHtmlEntities(html: string): string {
  // One pass over the input: output of a replacement is never rescanned, so
  // "&amp;lt;" decodes to "&lt;" instead of being double-decoded to "<".
  return html.replace(
    /&(#[xX][0-9a-fA-F]+|#\d+|[a-z]+);/g,
    (entity: string, body: string): string => {
      if (!body.startsWith("#")) {
        return NAMED_ENTITIES.get(body) ?? entity;
      }
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = Number.parseInt(
        body.slice(isHex ? 2 : 1),
        isHex ? 16 : 10,
      );
      // String.fromCodePoint throws outside the Unicode range; keep such
      // references as-is rather than failing the whole job.
      return codePoint > 0 && codePoint <= 0x10ffff
        ? String.fromCodePoint(codePoint)
        : entity;
    },
  );
}
/**
 * We Work Remotely extractor manifest.
 *
 * `run` downloads the RSS feed once, parses its items, keeps those matching at
 * least one pipeline search term (every item when no terms are configured),
 * maps them to `CreateJobInput`, and de-duplicates by `sourceJobId`, falling
 * back to `jobUrl`. The total number of returned jobs is capped at the
 * per-term setting multiplied by the number of terms (at least one).
 *
 * Returns `success: false` with no jobs when the feed request or parsing
 * throws.
 */
export const manifest: ExtractorManifest = {
  id: "weworkremotely",
  displayName: "We Work Remotely",
  providesSources: ["weworkremotely"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting parses to NaN, and `out.length >= NaN` is always
    // false, which would disable the cap entirely. Fall back to the default.
    const rawMax = context.settings.weworkremotelyMaxJobsPerTerm;
    const parsedMax = rawMax ? Number.parseInt(rawMax, 10) : Number.NaN;
    const maxJobs = Number.isNaN(parsedMax) ? 100 : parsedMax;

    // An empty list means "no filter": every feed item is kept.
    const terms = context.searchTerms;

    context.onProgress?.({
      phase: "list",
      termsProcessed: 0,
      termsTotal: 1,
      currentUrl: RSS_URL,
      detail: "We Work Remotely: fetching RSS feed",
    });

    try {
      const response = await fetch(RSS_URL, {
        headers: { Accept: "application/rss+xml, application/xml, text/xml" },
      });
      if (!response.ok) {
        throw new Error(`WWR RSS failed with status ${response.status}`);
      }
      const xml = await response.text();
      const items = parseItems(xml);

      const seen = new Set<string>();
      const out: CreateJobInput[] = [];
      const maxTotal = maxJobs * Math.max(terms.length, 1);

      for (const item of items) {
        if (out.length >= maxTotal) break;
        if (terms.length > 0 && !terms.some((t) => matchesTerm(item, t))) {
          continue;
        }
        const mapped = mapJob(item);
        if (!mapped) continue;
        const key = mapped.sourceJobId || mapped.jobUrl;
        if (seen.has(key)) continue;
        seen.add(key);
        out.push(mapped);
      }

      context.onProgress?.({
        phase: "list",
        termsProcessed: 1,
        termsTotal: 1,
        currentUrl: RSS_URL,
        jobPagesProcessed: out.length,
        detail: `We Work Remotely: ${out.length} matched from ${items.length} total`,
      });

      return { success: true, jobs: out };
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: [], error: message };
    }
  },
};
/**
 * Accepts only non-blank strings.
 *
 * @param value - Any field read from the decoded API response.
 * @returns `value` with surrounding whitespace removed, or `undefined` when it
 *   is not a string or contains nothing but whitespace.
 */
function asString(value: unknown): string | undefined {
  return typeof value === "string" && value.trim() !== ""
    ? value.trim()
    : undefined;
}
// Matches a leading locale path segment such as "en" or "en-US".
const LOCALE_SEGMENT = /^[a-z]{2}(?:-[a-z]{2})?$/i;

/**
 * Parses one `workdayTenants` entry into a target.
 *
 * Two forms are accepted:
 *  - a JSON object with string `company`, `tenantUrl`, `tenant` and `site`
 *    fields (plus an optional string `locale`); a trailing slash on
 *    `tenantUrl` is removed;
 *  - a career-site URL on a `*.wdN.myworkdayjobs.com` host, with or without a
 *    leading locale segment, e.g. `/en-US/Site`, `/Site`, or `/Site/job/...`.
 *
 * @param entry - Raw entry text.
 * @returns The parsed target, or `null` when the entry is blank or the tenant
 *   and site cannot be recovered from it.
 */
function parseTargetEntry(entry: string): WorkdayTarget | null {
  const trimmed = entry.trim();
  if (!trimmed) return null;

  // First, try the explicit JSON form.
  try {
    const parsed = JSON.parse(trimmed) as Partial<WorkdayTarget> | null;
    if (
      parsed &&
      typeof parsed.company === "string" &&
      typeof parsed.tenantUrl === "string" &&
      typeof parsed.tenant === "string" &&
      typeof parsed.site === "string"
    ) {
      return {
        company: parsed.company,
        tenantUrl: parsed.tenantUrl.replace(/\/$/, ""),
        tenant: parsed.tenant,
        site: parsed.site,
        locale: typeof parsed.locale === "string" ? parsed.locale : undefined,
      };
    }
  } catch {
    // Not JSON; fall through to URL parsing.
  }

  // URL form, e.g.
  //   https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite
  try {
    const url = new URL(trimmed);
    const tenant = inferTenantFromHost(url.host);
    if (!tenant) return null;
    const [first, second] = url.pathname.split("/").filter(Boolean);
    // Only treat the first segment as a locale when it looks like one.
    // Otherwise a locale-less URL such as `/Site/job/...` would yield
    // locale "Site" and site "job", and `/Site` would be rejected outright.
    const hasLocale = first !== undefined && LOCALE_SEGMENT.test(first);
    const site = hasLocale ? second : first;
    if (!site) return null;
    return {
      company: tenant,
      tenantUrl: `${url.protocol}//${url.host}`,
      tenant,
      site,
      locale: hasLocale ? first : undefined,
    };
  } catch {
    return null;
  }
}
/**
 * Converts one Workday posting into a `CreateJobInput` for the given target.
 *
 * The public job URL is built as `{tenantUrl}/{locale}/{site}{externalPath}`,
 * with the locale defaulting to "en-US" when the target has none. The
 * posting's `externalPath` doubles as `sourceJobId`.
 *
 * @param posting - Posting entry from the jobs response.
 * @param target - Tenant/site the posting was fetched from.
 * @returns The mapped job, or `null` when the posting has no usable
 *   `externalPath`.
 */
function mapPosting(
  posting: WorkdayJobPosting,
  target: WorkdayTarget,
): CreateJobInput | null {
  const path = asString(posting.externalPath);
  if (path === undefined) {
    return null;
  }

  const base = [target.tenantUrl, target.locale ?? "en-US", target.site].join(
    "/",
  );
  const link = base + path;

  // NOTE(review): the first non-empty bullet field is used as the job type;
  // confirm it is not a requisition id for the tenants in use.
  const firstBullet = (posting.bulletFields ?? []).find(
    (field) => field?.length,
  );

  return {
    source: "workday",
    sourceJobId: path,
    title: asString(posting.title) ?? "Unknown Title",
    employer: target.company,
    jobUrl: link,
    applicationLink: link,
    location: asString(posting.locationsText),
    datePosted: asString(posting.postedOn),
    jobType: firstBullet?.trim(),
  };
}
return { success: true, jobs: [] }; + + const targets = readTargets(context.settings.workdayTenants); + if (targets.length === 0) { + return { + success: true, + jobs: [], + error: + "No Workday tenants configured. Set WORKDAY_TENANTS or the workdayTenants setting to a list of career-site URLs (or JSON entries with company/tenantUrl/tenant/site).", + }; + } + + const terms = context.searchTerms.length > 0 ? context.searchTerms : [""]; + const seen = new Set(); + const out: CreateJobInput[] = []; + const limit = 20; + const errors: string[] = []; + + for (let t = 0; t < targets.length; t += 1) { + if (context.shouldCancel?.()) break; + const target = targets[t]; + try { + for (let i = 0; i < terms.length; i += 1) { + if (context.shouldCancel?.()) break; + const term = terms[i].trim(); + context.onProgress?.({ + phase: "list", + termsProcessed: t * terms.length + i, + termsTotal: targets.length * terms.length, + currentUrl: `${target.company} (${term || "all"})`, + detail: `Workday: ${target.company} term ${i + 1}/${terms.length}`, + }); + + let offset = 0; + let total = Number.POSITIVE_INFINITY; + while (offset < total && offset < 1000) { + if (context.shouldCancel?.()) break; + const body = await fetchPage({ + target, + searchText: term, + offset, + limit, + }); + if (typeof body.total === "number") total = body.total; + const postings = Array.isArray(body.jobPostings) + ? body.jobPostings + : []; + if (postings.length === 0) break; + for (const posting of postings) { + const mapped = mapPosting(posting, target); + if (!mapped) continue; + const key = mapped.sourceJobId || mapped.jobUrl; + if (seen.has(key)) continue; + seen.add(key); + out.push(mapped); + } + offset += postings.length; + if (postings.length < limit) break; + } + } + } catch (error) { + const message = + error instanceof Error ? 
error.message : "Unknown error"; + errors.push(`${target.company}: ${message}`); + } + } + + if (out.length === 0 && errors.length > 0) { + return { success: false, jobs: out, error: errors.join("; ") }; + } + return { + success: true, + jobs: out, + error: errors.length > 0 ? errors.join("; ") : undefined, + }; + }, +}; + +export default manifest; diff --git a/extractors/workday/package.json b/extractors/workday/package.json new file mode 100644 index 0000000..7306050 --- /dev/null +++ b/extractors/workday/package.json @@ -0,0 +1,17 @@ +{ + "name": "workday-extractor", + "version": "0.0.1", + "type": "module", + "description": "Workday public career-site extractor", + "main": "manifest.ts", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + }, + "scripts": { + "check:types": "tsc --noEmit" + } +} diff --git a/extractors/workday/tsconfig.json b/extractors/workday/tsconfig.json new file mode 100644 index 0000000..4f7ce58 --- /dev/null +++ b/extractors/workday/tsconfig.json @@ -0,0 +1,17 @@ +{ + "compilerOptions": { + "module": "ESNext", + "moduleResolution": "bundler", + "target": "ES2022", + "outDir": "dist", + "strict": true, + "noUnusedLocals": false, + "lib": ["ES2022", "DOM"], + "types": ["node"], + "baseUrl": ".", + "paths": { + "@shared/*": ["../../shared/src/*"] + } + }, + "include": ["./manifest.ts", "./src/**/*"] +} diff --git a/orchestrator/package.json b/orchestrator/package.json index ee6fd26..f916b96 100644 --- a/orchestrator/package.json +++ b/orchestrator/package.json @@ -1,6 +1,6 @@ { "name": "job-ops-orchestrator", - "version": "0.2.0", + "version": "0.2.1", "type": "module", "description": "Unified orchestrator for job application pipeline", "main": "src/server/index.ts", diff --git a/orchestrator/src/client/api/client.ts b/orchestrator/src/client/api/client.ts index 4829538..62bc583 100644 --- a/orchestrator/src/client/api/client.ts +++ 
b/orchestrator/src/client/api/client.ts @@ -502,11 +502,33 @@ async function fetchApi( options?: RequestInit, ): Promise { const method = (options?.method || "GET").toUpperCase(); - let authHeader = cachedBasicAuthCredentials - ? encodeBasicAuthHeaderValue(cachedBasicAuthCredentials) - : undefined; + const activeCreds = getActiveBasicAuthCredentials(); + let authHeader = activeCreds ? encodeBasicAuthHeaderValue(activeCreds) : undefined; let authAttempt = 0; - let usernameHint = cachedBasicAuthCredentials?.username; + let usernameHint = activeCreds?.username; + + const shouldPromptForAuth = (args: { + method: string; + endpoint: string; + response: Response; + parsed: ApiResponse | LegacyApiResponse; + }): boolean => { + if (!basicAuthPromptHandler) return false; + if (authAttempt >= 2) return false; + if (!isUnauthorizedResponse(args.response, args.parsed)) return false; + + // By default we only prompt for write methods. However, some parts of the UI + // (Settings/Profile) require auth even for reads. Without prompting, those + // screens appear to "forget" saved data after refresh. 
+ if (isWriteMethod(args.method)) return true; + + const readAuthEndpoints = new Set([ + "/settings", + "/profile", + "/profiles", + ]); + return readAuthEndpoints.has(args.endpoint); + }; while (true) { const { response, parsed } = await fetchAndParse( @@ -515,12 +537,7 @@ async function fetchApi( authHeader, ); - if ( - isWriteMethod(method) && - isUnauthorizedResponse(response, parsed) && - basicAuthPromptHandler && - authAttempt < 2 - ) { + if (shouldPromptForAuth({ method, endpoint, response, parsed })) { const credentials = await requestBasicAuthCredentials({ endpoint, method, diff --git a/orchestrator/src/client/pages/orchestrator/utils.test.ts b/orchestrator/src/client/pages/orchestrator/utils.test.ts index f5ddd91..e4b6a7a 100644 --- a/orchestrator/src/client/pages/orchestrator/utils.test.ts +++ b/orchestrator/src/client/pages/orchestrator/utils.test.ts @@ -21,6 +21,12 @@ describe("orchestrator utils", () => { expect(getEnabledSources(createAppSettings())).toContain("startupjobs"); }); + it("enables jobicy and themuse without credentials", () => { + const enabled = getEnabledSources(createAppSettings()); + expect(enabled).toContain("jobicy"); + expect(enabled).toContain("themuse"); + }); + it("counts processing jobs in ready and discovered tabs", () => { const jobs = [ createJob({ id: "ready", status: "ready", closedAt: null }), diff --git a/orchestrator/src/client/pages/orchestrator/utils.ts b/orchestrator/src/client/pages/orchestrator/utils.ts index 816c472..a597827 100644 --- a/orchestrator/src/client/pages/orchestrator/utils.ts +++ b/orchestrator/src/client/pages/orchestrator/utils.ts @@ -195,6 +195,21 @@ export const getEnabledSources = ( const hasAdzunaAuth = Boolean( settings.adzunaAppId?.trim() && settings.adzunaAppKeyHint, ); + const hasUsajobsAuth = Boolean( + settings.usajobsUserAgent?.trim() && settings.usajobsApiKeyHint, + ); + const hasJoobleAuth = Boolean(settings.joobleApiKeyHint); + const hasCareerjetAuth = Boolean( + 
settings.careerjetAffid?.trim() && + settings.careerjetReferer?.trim() && + settings.careerjetUserIp?.trim(), + ); + const hasReedAuth = Boolean(settings.reedApiKeyHint); + const hasLeverCompanies = (settings.leverCompanies?.value ?? []).length > 0; + const hasAshbyCompanies = (settings.ashbyCompanies?.value ?? []).length > 0; + const hasGreenhouseCompanies = + (settings.greenhouseCompanies?.value ?? []).length > 0; + const hasWorkdayTenants = (settings.workdayTenants?.value ?? []).length > 0; for (const source of orderedSources) { if (source === "gradcracker") { @@ -209,6 +224,22 @@ export const getEnabledSources = ( if (hasAdzunaAuth) enabled.push(source); continue; } + if (source === "usajobs") { + if (hasUsajobsAuth) enabled.push(source); + continue; + } + if (source === "jooble") { + if (hasJoobleAuth) enabled.push(source); + continue; + } + if (source === "careerjet") { + if (hasCareerjetAuth) enabled.push(source); + continue; + } + if (source === "reed") { + if (hasReedAuth) enabled.push(source); + continue; + } if (source === "hiringcafe") { enabled.push(source); continue; @@ -217,10 +248,45 @@ export const getEnabledSources = ( enabled.push(source); continue; } + if (source === "jobicy") { + enabled.push(source); + continue; + } + if (source === "themuse") { + enabled.push(source); + continue; + } + if (source === "lever") { + if (hasLeverCompanies) enabled.push(source); + continue; + } + if (source === "ashby") { + if (hasAshbyCompanies) enabled.push(source); + continue; + } + if (source === "greenhouse") { + if (hasGreenhouseCompanies) enabled.push(source); + continue; + } + if (source === "workday") { + if (hasWorkdayTenants) enabled.push(source); + continue; + } if ( source === "indeed" || source === "linkedin" || source === "glassdoor" + ) { + enabled.push(source); + continue; + } + if ( + source === "remoteok" || + source === "remotive" || + source === "arbeitnow" || + source === "himalayas" || + source === "weworkremotely" || + source === 
"fourdayweek" ) { enabled.push(source); } diff --git a/orchestrator/src/server/api/routes/profiles.ts b/orchestrator/src/server/api/routes/profiles.ts index 98d9586..8473ef2 100644 --- a/orchestrator/src/server/api/routes/profiles.ts +++ b/orchestrator/src/server/api/routes/profiles.ts @@ -1,4 +1,4 @@ -import { badRequest, forbidden } from "@infra/errors"; +import { badRequest, forbidden, unauthorized } from "@infra/errors"; import { asyncRoute, fail, ok } from "@infra/http"; import { logger } from "@infra/logger"; import { @@ -28,10 +28,11 @@ function profileMatchesBasicAuthUser( function assertProfileVisibleToRequest( req: Request, profile: SearchProfile, -): boolean { +): true | "unauthorized" | "forbidden" { if (!isBasicAuthEnabled()) return true; const username = parseBasicAuthUsername(req.headers.authorization); - return profileMatchesBasicAuthUser(profile, username); + if (!username?.trim()) return "unauthorized"; + return profileMatchesBasicAuthUser(profile, username) ? true : "forbidden"; } profilesRouter.get( @@ -53,7 +54,11 @@ profilesRouter.get( if (!profile) { return fail(res, badRequest("Profile not found")); } - if (!assertProfileVisibleToRequest(req, profile)) { + const visible = assertProfileVisibleToRequest(req, profile); + if (visible === "unauthorized") { + return fail(res, unauthorized("Authentication required")); + } + if (visible === "forbidden") { return fail(res, forbidden("You cannot access this profile")); } return ok(res, profile); @@ -93,7 +98,11 @@ profilesRouter.patch( if (!existing) { return fail(res, badRequest("Profile not found")); } - if (!assertProfileVisibleToRequest(req, existing)) { + const visible = assertProfileVisibleToRequest(req, existing); + if (visible === "unauthorized") { + return fail(res, unauthorized("Authentication required")); + } + if (visible === "forbidden") { return fail(res, forbidden("You cannot update this profile")); } const { name, data } = req.body; @@ -139,7 +148,11 @@ profilesRouter.delete( if 
(!existing) { return fail(res, badRequest("Profile not found")); } - if (!assertProfileVisibleToRequest(req, existing)) { + const visible = assertProfileVisibleToRequest(req, existing); + if (visible === "unauthorized") { + return fail(res, unauthorized("Authentication required")); + } + if (visible === "forbidden") { return fail(res, forbidden("You cannot delete this profile")); } const deleted = await profilesRepo.deleteProfile(req.params.id); @@ -157,7 +170,11 @@ profilesRouter.post( if (!profile) { return fail(res, badRequest("Profile not found")); } - if (!assertProfileVisibleToRequest(req, profile)) { + const visible = assertProfileVisibleToRequest(req, profile); + if (visible === "unauthorized") { + return fail(res, unauthorized("Authentication required")); + } + if (visible === "forbidden") { return fail(res, forbidden("You cannot activate this profile")); } const basicUser = parseBasicAuthUsername(req.headers.authorization)?.trim(); diff --git a/orchestrator/src/server/api/routes/settings.ts b/orchestrator/src/server/api/routes/settings.ts index d44dad2..f145714 100644 --- a/orchestrator/src/server/api/routes/settings.ts +++ b/orchestrator/src/server/api/routes/settings.ts @@ -10,6 +10,10 @@ import { asyncRoute, fail, ok } from "@infra/http"; import { logger } from "@infra/logger"; import { getRequestId } from "@infra/request-context"; import { isDemoMode, sendDemoBlocked } from "@server/config/demo"; +import { getJobOwnerProfileId } from "@server/infra/request-context"; +import { DEFAULT_JOB_OWNER_PROFILE_ID } from "@server/infra/job-owner-context"; +import { parseBasicAuthUsername } from "@server/infra/basic-auth-credentials"; +import * as profilesRepo from "@server/repositories/profiles"; import { getSetting } from "@server/repositories/settings"; import { setBackupSettings } from "@server/services/backup/index"; import { LlmService } from "@server/services/llm/service"; @@ -30,6 +34,7 @@ import { type UpdateSettingsInput, updateSettingsSchema, } from 
"@shared/settings-schema"; +import { jobSearchProfileSchema } from "@shared/settings-registry"; import { type Request, type Response, Router } from "express"; export const settingsRouter = Router(); @@ -232,6 +237,28 @@ settingsRouter.patch( clearProfileCache(); } + // When Basic Auth is enabled, the effective job search profile comes from + // the authenticated user's saved SearchProfile row (basicAuthUser), not the + // `jobSearchProfile` setting override. If the client is saving a + // jobSearchProfile, persist it directly to the request owner profile so the + // next settings fetch immediately reflects the changes (and doesn't "snap back"). + if (Object.hasOwn(input, "jobSearchProfile") && input.jobSearchProfile) { + const ownerId = getJobOwnerProfileId(); + if ( + ownerId && + ownerId !== DEFAULT_JOB_OWNER_PROFILE_ID && + ownerId !== "__unmapped__" + ) { + const parsed = jobSearchProfileSchema.safeParse(input.jobSearchProfile); + if (parsed.success) { + const username = parseBasicAuthUsername(req.headers.authorization)?.trim(); + const dataWithOwner = + username ? 
{ ...parsed.data, basicAuthUser: username } : parsed.data; + await profilesRepo.updateProfile(ownerId, { data: dataWithOwner }); + } + } + } + const data = await getEffectiveSettings(); if (plan.shouldRefreshBackupScheduler) { diff --git a/orchestrator/src/server/config/demo-defaults.data.ts b/orchestrator/src/server/config/demo-defaults.data.ts index 8565d2b..055379a 100644 --- a/orchestrator/src/server/config/demo-defaults.data.ts +++ b/orchestrator/src/server/config/demo-defaults.data.ts @@ -256,6 +256,22 @@ export const DEMO_SOURCE_BASE_URLS: Record = { adzuna: "https://www.adzuna.com", hiringcafe: "https://hiring.cafe", startupjobs: "https://startup.jobs", + usajobs: "https://www.usajobs.gov", + jobicy: "https://jobicy.com", + themuse: "https://www.themuse.com", + jooble: "https://jooble.org", + careerjet: "https://www.careerjet.com", + reed: "https://www.reed.co.uk", + remoteok: "https://remoteok.com", + remotive: "https://remotive.com", + arbeitnow: "https://www.arbeitnow.com", + himalayas: "https://himalayas.app", + weworkremotely: "https://weworkremotely.com", + fourdayweek: "https://4dayweek.io", + ashby: "https://jobs.ashbyhq.com", + lever: "https://jobs.lever.co", + greenhouse: "https://boards.greenhouse.io", + workday: "https://workday.com", manual: "https://example.com", }; diff --git a/orchestrator/src/server/db/migrate.ts b/orchestrator/src/server/db/migrate.ts index e0a8b73..2d50c13 100644 --- a/orchestrator/src/server/db/migrate.ts +++ b/orchestrator/src/server/db/migrate.ts @@ -781,6 +781,13 @@ const migrations = [ `CREATE INDEX IF NOT EXISTS idx_jobs_status_discovered_at ON jobs(status, discovered_at)`, `CREATE INDEX IF NOT EXISTS idx_jobs_owner_profile_id ON jobs(owner_profile_id)`, + // Cross-source dedup: store a normalized (employer, title) fingerprint and + // index it per owner so import-time skip lookups are fast. 
Backfill happens + // lazily in the repository on next insert; existing rows just get NULL until + // they're re-imported or rewritten. + `ALTER TABLE jobs ADD COLUMN content_fingerprint TEXT`, + `CREATE INDEX IF NOT EXISTS idx_jobs_owner_profile_content_fingerprint ON jobs(owner_profile_id, content_fingerprint)`, + // Seed default job-search personas (INSERT OR IGNORE — safe on existing DBs). sqlInsertSearchProfileSeed({ id: "685b0000-0000-4000-8000-000000000001", diff --git a/orchestrator/src/server/db/schema.ts b/orchestrator/src/server/db/schema.ts index adbc35d..9fd87b1 100644 --- a/orchestrator/src/server/db/schema.ts +++ b/orchestrator/src/server/db/schema.ts @@ -45,6 +45,12 @@ export const jobs = sqliteTable( employer: text("employer").notNull(), employerUrl: text("employer_url"), jobUrl: text("job_url").notNull(), + /** + * Cross-source dedup key derived from normalized (employer, title). + * Nullable because some legacy rows / very thin postings can't produce a + * fingerprint. New imports skip when this matches an existing row. 
+ */ + contentFingerprint: text("content_fingerprint"), applicationLink: text("application_link"), disciplines: text("disciplines"), deadline: text("deadline"), @@ -126,6 +132,9 @@ export const jobs = sqliteTable( table.ownerProfileId, table.jobUrl, ), + ownerContentFingerprintIndex: index( + "idx_jobs_owner_profile_content_fingerprint", + ).on(table.ownerProfileId, table.contentFingerprint), }), ); diff --git a/orchestrator/src/server/pipeline/steps/discover-jobs.ts b/orchestrator/src/server/pipeline/steps/discover-jobs.ts index 1df99c3..e16b407 100644 --- a/orchestrator/src/server/pipeline/steps/discover-jobs.ts +++ b/orchestrator/src/server/pipeline/steps/discover-jobs.ts @@ -12,6 +12,7 @@ import { normalizeCountryKey, } from "@shared/location-support.js"; import { resolveBlockedCompanyKeywordsFromStoredString } from "@shared/resolve-blocked-company-keywords.js"; +import { jobSearchProfileSchema } from "@shared/settings-registry.js"; import { inferCountryKeyFromSearchGeography, matchesRequestedCity, @@ -67,6 +68,127 @@ function filterJobsByRequestedCities(args: { ); } +const ROLE_TOKEN_STOPWORDS = new Set([ + "a", + "an", + "and", + "the", + "of", + "to", + "for", + "in", + "on", + "with", + "at", + "by", + "from", + "senior", + "sr", + "jr", + "junior", + "lead", + "principal", + "staff", + "i", + "ii", + "iii", + "iv", + "v", + "remote", + "hybrid", + "onsite", + // These are too generic and cause massive false positives. + "software", + "development", + "developer", + "engineer", + "engineering", +]); + +function normalizeText(value: string | null | undefined): string { + return (value ?? 
"") + .toLowerCase() + .replace(/\s+/g, " ") + .trim(); +} + +function buildRoleMatchers(phrases: string[]): { + phraseMatchers: string[]; + tokenMatchers: string[]; +} { + const phraseMatchers = phrases + .map((p) => normalizeText(p)) + .filter(Boolean); + + const tokenSet = new Set(); + for (const phrase of phraseMatchers) { + for (const token of phrase.split(/[^a-z0-9+.#]+/g)) { + const cleaned = token.trim(); + if (!cleaned) continue; + if (cleaned.length < 2) continue; + if (ROLE_TOKEN_STOPWORDS.has(cleaned)) continue; + tokenSet.add(cleaned); + } + } + + // Ensure common QA acronyms remain even if user only typed long-form roles. + for (const token of ["qa", "sdet", "test", "testing", "automation"]) { + tokenSet.add(token); + } + + return { phraseMatchers, tokenMatchers: [...tokenSet] }; +} + +function matchesAny(text: string, needles: string[]): boolean { + if (!text) return false; + for (const needle of needles) { + if (needle && text.includes(needle)) return true; + } + return false; +} + +function filterJobsBySearchProfile(args: { + jobs: CreateJobInput[]; + targetRolePhrases: string[]; + mustHaveSkills: string[]; + dealBreakers: string[]; +}): { jobs: CreateJobInput[]; dropped: number } { + const { jobs, targetRolePhrases, mustHaveSkills, dealBreakers } = args; + + const roleMatchers = buildRoleMatchers(targetRolePhrases); + const mustHaveLower = mustHaveSkills.map(normalizeText).filter(Boolean); + const dealBreakersLower = dealBreakers.map(normalizeText).filter(Boolean); + + const filtered = jobs.filter((job) => { + const title = normalizeText(job.title); + const body = normalizeText(job.jobDescription); + const haystack = `${title}\n${body}`; + + if (dealBreakersLower.length > 0 && matchesAny(haystack, dealBreakersLower)) { + return false; + } + + // If the user specified target roles, enforce a strict role match so we + // don't surface irrelevant jobs (e.g. legal/sales/finance) in Discovered. 
+ if (roleMatchers.phraseMatchers.length > 0) { + const roleMatch = + matchesAny(title, roleMatchers.phraseMatchers) || + matchesAny(title, roleMatchers.tokenMatchers) || + matchesAny(body, roleMatchers.phraseMatchers) || + matchesAny(body, roleMatchers.tokenMatchers); + if (!roleMatch) return false; + } + + if (mustHaveLower.length > 0 && !matchesAny(haystack, mustHaveLower)) { + return false; + } + + return true; + }); + + return { jobs: filtered, dropped: jobs.length - filtered.length }; +} + export async function discoverJobsStep(args: { mergedConfig: PipelineConfig; shouldCancel?: () => boolean; @@ -98,6 +220,9 @@ export async function discoverJobsStep(args: { const ownerProfileId = args.mergedConfig.ownerProfileId ?? DEFAULT_JOB_OWNER_PROFILE_ID; + let searchProfileTargetRoles: string[] = []; + let searchProfileMustHaveSkills: string[] = []; + let searchProfileDealBreakers: string[] = []; const mergeTargetRoles = (targetRoles: unknown) => { if (!Array.isArray(targetRoles) || targetRoles.length === 0) return; @@ -120,19 +245,38 @@ export async function discoverJobsStep(args: { if (ownerProfileId && ownerProfileId !== DEFAULT_JOB_OWNER_PROFILE_ID) { const row = await getProfileById(ownerProfileId); - if (row?.data?.targetRoles?.length) { - mergeTargetRoles(row.data.targetRoles); + if (row?.data) { + const parsed = jobSearchProfileSchema.safeParse(row.data); + if (parsed.success) { + searchProfileTargetRoles = parsed.data.targetRoles ?? []; + searchProfileMustHaveSkills = parsed.data.mustHaveSkills ?? []; + searchProfileDealBreakers = parsed.data.dealBreakers ?? []; + if (searchProfileTargetRoles.length > 0) { + mergeTargetRoles(searchProfileTargetRoles); + } + } else if (row.data.targetRoles?.length) { + // Legacy profile shapes: keep augmenting terms but we won't enforce strict filtering. 
+ mergeTargetRoles(row.data.targetRoles); + } } } else { const profileSetting = settings.jobSearchProfile; if (profileSetting) { try { const profile = JSON.parse(profileSetting); - if ( - Array.isArray(profile.targetRoles) && - profile.targetRoles.length > 0 + const parsed = jobSearchProfileSchema.safeParse(profile); + if (parsed.success) { + searchProfileTargetRoles = parsed.data.targetRoles ?? []; + searchProfileMustHaveSkills = parsed.data.mustHaveSkills ?? []; + searchProfileDealBreakers = parsed.data.dealBreakers ?? []; + if (searchProfileTargetRoles.length > 0) { + mergeTargetRoles(searchProfileTargetRoles); + } + } else if ( + Array.isArray((profile as { targetRoles?: unknown }).targetRoles) && + (profile as { targetRoles: unknown[] }).targetRoles.length > 0 ) { - mergeTargetRoles(profile.targetRoles); + mergeTargetRoles((profile as { targetRoles: unknown }).targetRoles); } } catch { // malformed profile JSON, continue with existing terms @@ -406,7 +550,32 @@ export async function discoverJobsStep(args: { return { discoveredJobs: filteredDiscoveredJobs, sourceErrors }; } - if (filteredDiscoveredJobs.length === 0 && sourceErrors.length > 0) { + const strictProfileFilteringEnabled = + searchProfileTargetRoles.length > 0 || + searchProfileMustHaveSkills.length > 0 || + searchProfileDealBreakers.length > 0; + const profileFiltered = strictProfileFilteringEnabled + ? filterJobsBySearchProfile({ + jobs: filteredDiscoveredJobs, + targetRolePhrases: searchProfileTargetRoles.length + ? 
searchProfileTargetRoles + : searchTerms, + mustHaveSkills: searchProfileMustHaveSkills, + dealBreakers: searchProfileDealBreakers, + }) + : { jobs: filteredDiscoveredJobs, dropped: 0 }; + + if (profileFiltered.dropped > 0) { + logger.info("Dropped discovered jobs that didn't match search profile", { + step: "discover-jobs", + droppedCount: profileFiltered.dropped, + targetRolesCount: searchProfileTargetRoles.length, + mustHaveSkillsCount: searchProfileMustHaveSkills.length, + dealBreakersCount: searchProfileDealBreakers.length, + }); + } + + if (profileFiltered.jobs.length === 0 && sourceErrors.length > 0) { throw new Error(`All sources failed: ${sourceErrors.join("; ")}`); } @@ -414,9 +583,9 @@ export async function discoverJobsStep(args: { logger.warn("Some discovery sources failed", { sourceErrors }); } - progressHelpers.crawlingComplete(filteredDiscoveredJobs.length); + progressHelpers.crawlingComplete(profileFiltered.jobs.length); - const stamped = filteredDiscoveredJobs.map((job) => ({ + const stamped = profileFiltered.jobs.map((job) => ({ ...job, ownerProfileId, })); diff --git a/orchestrator/src/server/repositories/jobs.ts b/orchestrator/src/server/repositories/jobs.ts index bbcc402..d3b17d0 100644 --- a/orchestrator/src/server/repositories/jobs.ts +++ b/orchestrator/src/server/repositories/jobs.ts @@ -5,6 +5,7 @@ import { randomUUID } from "node:crypto"; import { getJobOwnerProfileId } from "@infra/request-context"; import { DEFAULT_JOB_OWNER_PROFILE_ID } from "@server/infra/job-owner-context"; +import { buildJobContentFingerprint } from "@shared/job-fingerprint"; import { canonicalizeJobUrl } from "@shared/job-url-canonical"; import type { CreateJobInput, @@ -40,12 +41,16 @@ function resolveOwnerForCreate(input: CreateJobInput): string { async function loadJobDedupIndexes(ownerProfileId: string): Promise<{ existingCanonicalSet: Set; existingSourceJobKeySet: Set; + existingContentFingerprintSet: Set; }> { const rows = await db .select({ jobUrl: 
jobs.jobUrl, source: jobs.source, sourceJobId: jobs.sourceJobId, + contentFingerprint: jobs.contentFingerprint, + employer: jobs.employer, + title: jobs.title, }) .from(jobs) .where(eq(jobs.ownerProfileId, ownerProfileId)); @@ -60,7 +65,29 @@ async function loadJobDedupIndexes(ownerProfileId: string): Promise<{ ) .map((r) => sourceJobKey(r.source, String(r.sourceJobId))), ); - return { existingCanonicalSet, existingSourceJobKeySet }; + // Cross-source dedup: prefer the persisted fingerprint, but fall back to + // recomputing it from (employer, title) so legacy rows participate in + // dedup until they're rewritten. + const existingContentFingerprintSet = new Set(); + for (const row of rows) { + const stored = row.contentFingerprint?.trim(); + if (stored) { + existingContentFingerprintSet.add(stored); + continue; + } + const recomputed = buildJobContentFingerprint({ + employer: row.employer, + title: row.title, + }); + if (recomputed) { + existingContentFingerprintSet.add(recomputed); + } + } + return { + existingCanonicalSet, + existingSourceJobKeySet, + existingContentFingerprintSet, + }; } async function findJobByCanonicalUrl( @@ -87,6 +114,46 @@ async function findJobByCanonicalUrl( return null; } +async function findJobByContentFingerprint( + fingerprint: string, + ownerProfileId: string, +): Promise { + // Fast path: stored fingerprint match. + const [stored] = await db + .select() + .from(jobs) + .where( + and( + eq(jobs.ownerProfileId, ownerProfileId), + eq(jobs.contentFingerprint, fingerprint), + ), + ) + .limit(1); + if (stored) return mapRowToJob(stored); + + // Fallback for legacy rows without a persisted fingerprint: scan and + // recompute. Owner-scoped table size keeps this cheap in practice. 
+ const allRows = await db + .select() + .from(jobs) + .where( + and( + eq(jobs.ownerProfileId, ownerProfileId), + isNull(jobs.contentFingerprint), + ), + ); + for (const row of allRows) { + const recomputed = buildJobContentFingerprint({ + employer: row.employer, + title: row.title, + }); + if (recomputed === fingerprint) { + return mapRowToJob(row); + } + } + return null; +} + async function getJobBySourceAndExternalId( source: string, sourceJobId: string, @@ -302,6 +369,10 @@ async function insertJob(input: CreateJobInput): Promise { const now = new Date().toISOString(); const ownerProfileId = resolveOwnerForCreate(input); + const contentFingerprint = buildJobContentFingerprint({ + employer: input.employer, + title: input.title, + }); await db.insert(jobs).values({ id, @@ -314,6 +385,7 @@ async function insertJob(input: CreateJobInput): Promise { employer: input.employer, employerUrl: input.employerUrl ?? null, jobUrl: input.jobUrl, + contentFingerprint, applicationLink: input.applicationLink ?? null, disciplines: input.disciplines ?? null, deadline: input.deadline ?? 
null, @@ -395,8 +467,11 @@ export async function createJobs( ...normalized, ownerProfileId, }; - const { existingCanonicalSet, existingSourceJobKeySet } = - await loadJobDedupIndexes(ownerProfileId); + const { + existingCanonicalSet, + existingSourceJobKeySet, + existingContentFingerprintSet, + } = await loadJobDedupIndexes(ownerProfileId); const sid = normalized.sourceJobId?.trim(); if (sid) { @@ -419,6 +494,18 @@ export async function createJobs( if (existing) return existing; } + const fingerprint = buildJobContentFingerprint({ + employer: normalized.employer, + title: normalized.title, + }); + if (fingerprint && existingContentFingerprintSet.has(fingerprint)) { + const existing = await findJobByContentFingerprint( + fingerprint, + ownerProfileId, + ); + if (existing) return existing; + } + const inserted = await tryInsertJob(normalizedWithOwner); if (inserted) return inserted; @@ -437,14 +524,18 @@ export async function createJobs( } const ownerProfileId = resolveOwnerForCreate(inputOrInputs[0] ?? {}); - const { existingCanonicalSet, existingSourceJobKeySet } = - await loadJobDedupIndexes(ownerProfileId); + const { + existingCanonicalSet, + existingSourceJobKeySet, + existingContentFingerprintSet, + } = await loadJobDedupIndexes(ownerProfileId); const batchBuckets = new Map< string, { input: CreateJobInput; count: number; + fingerprint: string | null; } >(); @@ -454,21 +545,30 @@ export async function createJobs( ownerProfileId, }); const sidForKey = normalized.sourceJobId?.trim(); - const batchKey = sidForKey - ? `sid:${sourceJobKey(normalized.source, sidForKey)}` - : `url:${normalized.jobUrl}`; + const fingerprint = buildJobContentFingerprint({ + employer: normalized.employer, + title: normalized.title, + }); + // Coalesce duplicates within a single batch, preferring fingerprint when + // available so two different feeds posting the same role merge into one + // bucket. Fall back to source-job-id, then canonical URL. + const batchKey = fingerprint + ? 
`fp:${fingerprint}` + : sidForKey + ? `sid:${sourceJobKey(normalized.source, sidForKey)}` + : `url:${normalized.jobUrl}`; const prev = batchBuckets.get(batchKey); if (prev) { prev.count += 1; } else { - batchBuckets.set(batchKey, { input: normalized, count: 1 }); + batchBuckets.set(batchKey, { input: normalized, count: 1, fingerprint }); } } let created = 0; let skipped = 0; - for (const { input, count } of batchBuckets.values()) { + for (const { input, count, fingerprint } of batchBuckets.values()) { const canonical = input.jobUrl; const sid = input.sourceJobId?.trim(); const sk = sid ? sourceJobKey(input.source, sid) : null; @@ -481,6 +581,10 @@ export async function createJobs( skipped += count; continue; } + if (fingerprint && existingContentFingerprintSet.has(fingerprint)) { + skipped += count; + continue; + } const inserted = await tryInsertJob(input); if (!inserted) { @@ -494,6 +598,9 @@ export async function createJobs( if (sk) { existingSourceJobKeySet.add(sk); } + if (fingerprint) { + existingContentFingerprintSet.add(fingerprint); + } } return { created, skipped }; diff --git a/orchestrator/src/server/services/settings.ts b/orchestrator/src/server/services/settings.ts index cae4672..8d7a91f 100644 --- a/orchestrator/src/server/services/settings.ts +++ b/orchestrator/src/server/services/settings.ts @@ -151,6 +151,20 @@ export async function getEffectiveSettings(): Promise { ...envSettings, }; + // In Basic Auth mode, the "active" search profile is derived from the request + // context (basic auth username → profile.basicAuthUser), not from the stored + // activeProfileId setting. Expose that derived id so the UI updates the right + // profile record on save. 
+ const requestOwnerProfileId = getJobOwnerProfileId(); + if ( + tenantJobSearchProfile && + requestOwnerProfileId && + requestOwnerProfileId !== DEFAULT_JOB_OWNER_PROFILE_ID && + requestOwnerProfileId !== "__unmapped__" + ) { + result.activeProfileId = requestOwnerProfileId; + } + const rawModel = overrides.model; const modelDef = settingsRegistry.model; const overrideModel = normalizeModelForProviderCompatibility( diff --git a/package-lock.json b/package-lock.json index 8baf9ea..9989819 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,6 +13,7 @@ ], "devDependencies": { "@types/node": "^25.2.3", + "dotenv": "^17.2.3", "knip": "^5.83.1", "tsx": "^4.19.2", "typescript": "^5.9.3" @@ -103,6 +104,90 @@ "undici-types": "~7.16.0" } }, + "extractors/arbeitnow": { + "name": "arbeitnow-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/arbeitnow/node_modules/@types/node": { + "version": "24.12.4", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.4.tgz", + "integrity": "sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "extractors/ashby": { + "name": "ashby-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/ashby/node_modules/@types/node": { + "version": "24.12.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "extractors/careerjet": { + "name": "careerjet-extractor", + "version": "0.0.1", + 
"dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/careerjet/node_modules/@types/node": { + "version": "24.12.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "extractors/fourdayweek": { + "name": "fourdayweek-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/fourdayweek/node_modules/@types/node": { + "version": "24.12.4", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.4.tgz", + "integrity": "sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, "extractors/gradcracker": { "name": "gradcracker-extractor", "version": "0.0.1", @@ -154,6 +239,48 @@ "undici-types": "~7.16.0" } }, + "extractors/greenhouse": { + "name": "greenhouse-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/greenhouse/node_modules/@types/node": { + "version": "24.12.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "extractors/himalayas": { + "name": "himalayas-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + 
"typescript": "~5.9.0" + } + }, + "extractors/himalayas/node_modules/@types/node": { + "version": "24.12.4", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.4.tgz", + "integrity": "sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, "extractors/hiringcafe": { "name": "hiringcafe-extractor", "version": "0.0.1", @@ -181,6 +308,132 @@ "undici-types": "~7.16.0" } }, + "extractors/jobicy": { + "name": "jobicy-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/jobicy/node_modules/@types/node": { + "version": "24.12.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "extractors/jooble": { + "name": "jooble-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/jooble/node_modules/@types/node": { + "version": "24.12.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "extractors/lever": { + "name": "lever-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/lever/node_modules/@types/node": { + "version": "24.12.3", + "resolved": 
"https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "extractors/reed": { + "name": "reed-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/reed/node_modules/@types/node": { + "version": "24.12.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "extractors/remoteok": { + "name": "remoteok-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/remoteok/node_modules/@types/node": { + "version": "24.12.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "extractors/remotive": { + "name": "remotive-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/remotive/node_modules/@types/node": { + "version": "24.12.4", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.4.tgz", + "integrity": "sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, 
"extractors/startupjobs": { "name": "startupjobs-extractor", "version": "0.0.1", @@ -202,6 +455,27 @@ "undici-types": "~7.16.0" } }, + "extractors/themuse": { + "name": "themuse-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/themuse/node_modules/@types/node": { + "version": "24.12.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, "extractors/ukvisajobs": { "name": "ukvisajobs-extractor", "version": "0.0.1", @@ -234,6 +508,69 @@ "undici-types": "~7.16.0" } }, + "extractors/usajobs": { + "name": "usajobs-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/usajobs/node_modules/@types/node": { + "version": "24.12.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "extractors/weworkremotely": { + "name": "weworkremotely-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/weworkremotely/node_modules/@types/node": { + "version": "24.12.4", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.4.tgz", + "integrity": "sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + 
} + }, + "extractors/workday": { + "name": "workday-extractor", + "version": "0.0.1", + "dependencies": { + "job-ops-shared": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^24.0.0", + "typescript": "~5.9.0" + } + }, + "extractors/workday/node_modules/@types/node": { + "version": "24.12.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz", + "integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, "node_modules/@algolia/abtesting": { "version": "1.14.1", "resolved": "https://registry.npmjs.org/@algolia/abtesting/-/abtesting-1.14.1.tgz", @@ -9384,6 +9721,10 @@ "type-fest": "^4.0.0" } }, + "node_modules/arbeitnow-extractor": { + "resolved": "extractors/arbeitnow", + "link": true + }, "node_modules/arg": { "version": "5.0.2", "resolved": "https://registry.npmjs.org/arg/-/arg-5.0.2.tgz", @@ -9423,6 +9764,10 @@ "node": ">=8" } }, + "node_modules/ashby-extractor": { + "resolved": "extractors/ashby", + "link": true + }, "node_modules/asn1js": { "version": "3.0.7", "resolved": "https://registry.npmjs.org/asn1js/-/asn1js-3.0.7.tgz", @@ -10208,6 +10553,10 @@ ], "license": "CC-BY-4.0" }, + "node_modules/careerjet-extractor": { + "resolved": "extractors/careerjet", + "link": true + }, "node_modules/ccount": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz", @@ -11899,6 +12248,18 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/dotenv": { + "version": "17.4.2", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.4.2.tgz", + "integrity": "sha512-nI4U3TottKAcAD9LLud4Cb7b2QztQMUEfHbvhTH09bqXTxnSie8WnjPALV/WMCrJZ6UV/qHJ6L03OqO3LcdYZw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": 
"https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -13168,6 +13529,10 @@ "node": ">= 0.6" } }, + "node_modules/fourdayweek-extractor": { + "resolved": "extractors/fourdayweek", + "link": true + }, "node_modules/fraction.js": { "version": "5.3.4", "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-5.3.4.tgz", @@ -13695,6 +14060,10 @@ "js-yaml": "bin/js-yaml.js" } }, + "node_modules/greenhouse-extractor": { + "resolved": "extractors/greenhouse", + "link": true + }, "node_modules/gzip-size": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/gzip-size/-/gzip-size-6.0.0.tgz", @@ -13995,6 +14364,10 @@ "node": ">=16.0.0" } }, + "node_modules/himalayas-extractor": { + "resolved": "extractors/himalayas", + "link": true + }, "node_modules/hiringcafe-extractor": { "resolved": "extractors/hiringcafe", "link": true @@ -15261,6 +15634,10 @@ "resolved": "shared", "link": true }, + "node_modules/jobicy-extractor": { + "resolved": "extractors/jobicy", + "link": true + }, "node_modules/joi": { "version": "17.13.3", "resolved": "https://registry.npmjs.org/joi/-/joi-17.13.3.tgz", @@ -15274,6 +15651,10 @@ "@sideway/pinpoint": "^2.0.0" } }, + "node_modules/jooble-extractor": { + "resolved": "extractors/jooble", + "link": true + }, "node_modules/jquery": { "version": "3.7.1", "resolved": "https://registry.npmjs.org/jquery/-/jquery-3.7.1.tgz", @@ -15573,6 +15954,10 @@ "node": ">=6" } }, + "node_modules/lever-extractor": { + "resolved": "extractors/lever", + "link": true + }, "node_modules/lightningcss-android-arm64": { "version": "1.30.2", "resolved": "https://registry.npmjs.org/lightningcss-android-arm64/-/lightningcss-android-arm64-1.30.2.tgz", @@ -21836,6 +22221,10 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/reed-extractor": { + "resolved": "extractors/reed", + "link": true + }, "node_modules/reflect-metadata": { "version": "0.2.2", "resolved": 
"https://registry.npmjs.org/reflect-metadata/-/reflect-metadata-0.2.2.tgz", @@ -22089,6 +22478,14 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/remoteok-extractor": { + "resolved": "extractors/remoteok", + "link": true + }, + "node_modules/remotive-extractor": { + "resolved": "extractors/remotive", + "link": true + }, "node_modules/renderkid": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/renderkid/-/renderkid-3.0.0.tgz", @@ -23752,6 +24149,10 @@ "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", "license": "MIT" }, + "node_modules/themuse-extractor": { + "resolved": "extractors/themuse", + "link": true + }, "node_modules/thingies": { "version": "2.5.0", "resolved": "https://registry.npmjs.org/thingies/-/thingies-2.5.0.tgz", @@ -24601,6 +25002,10 @@ "url": "https://opencollective.com/webpack" } }, + "node_modules/usajobs-extractor": { + "resolved": "extractors/usajobs", + "link": true + }, "node_modules/use-callback-ref": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/use-callback-ref/-/use-callback-ref-1.3.3.tgz", @@ -25191,6 +25596,10 @@ "node": ">=0.8.0" } }, + "node_modules/weworkremotely-extractor": { + "resolved": "extractors/weworkremotely", + "link": true + }, "node_modules/whatwg-encoding": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", @@ -25324,6 +25733,10 @@ "integrity": "sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==", "license": "MIT" }, + "node_modules/workday-extractor": { + "resolved": "extractors/workday", + "link": true + }, "node_modules/wrap-ansi": { "version": "6.2.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-6.2.0.tgz", @@ -25640,7 +26053,7 @@ }, "orchestrator": { "name": "job-ops-orchestrator", - "version": "0.2.0", + "version": "0.2.1", "dependencies": { "@hookform/resolvers": "^5.2.2", 
"@paralleldrive/cuid2": "^3.0.6", @@ -27412,16 +27825,6 @@ "csstype": "^3.0.2" } }, - "orchestrator/node_modules/dotenv": { - "version": "17.2.3", - "license": "BSD-2-Clause", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://dotenvx.com" - } - }, "orchestrator/node_modules/drizzle-kit": { "version": "0.30.6", "dev": true, diff --git a/package.json b/package.json index af5c682..43a87a5 100644 --- a/package.json +++ b/package.json @@ -24,6 +24,7 @@ "knip": "knip" }, "devDependencies": { + "dotenv": "^17.2.3", "@types/node": "^25.2.3", "knip": "^5.83.1", "tsx": "^4.19.2", diff --git a/scripts/jobber-cron-cherepaha.env.example b/scripts/jobber-cron-cherepaha.env.example new file mode 100644 index 0000000..06f5c29 --- /dev/null +++ b/scripts/jobber-cron-cherepaha.env.example @@ -0,0 +1,24 @@ +# +# Jobber cron env — cherepaha +# Copy to /root/.jobber-cron-cherepaha.env (chmod 600) +# +# Used by: scripts/jobber-pipeline-telegram.sh +# + +# JobOps base URL (where the app is reachable from the cron host) +JOBOPS_URL=http://127.0.0.1:3005 + +# Optional: limit number of jobs linked in Telegram message +JOB_TELEGRAM_MAX_JOBS=25 + +# Optional: comma-separated sources to run (leave empty to use server defaults) +# JOBBER_PIPELINE_SOURCES=adzuna,gradcracker,ukvisajobs + +# App-level Basic Auth (enables per-user separation when set on the server) +BASIC_AUTH_USER=cherepaha +BASIC_AUTH_PASSWORD=CHANGEME + +# Telegram bot + chat destination +TELEGRAM_BOT_TOKEN=CHANGEME +TELEGRAM_CHAT_ID=CHANGEME + diff --git a/scripts/jobber-cron-dobkin.env.example b/scripts/jobber-cron-dobkin.env.example new file mode 100644 index 0000000..4362f9f --- /dev/null +++ b/scripts/jobber-cron-dobkin.env.example @@ -0,0 +1,24 @@ +# +# Jobber cron env — dobkin +# Copy to /root/.jobber-cron-dobkin.env (chmod 600) +# +# Used by: scripts/jobber-pipeline-telegram.sh +# + +# JobOps base URL (where the app is reachable from the cron host) +JOBOPS_URL=http://127.0.0.1:3005 + +# Optional: 
limit number of jobs linked in Telegram message +JOB_TELEGRAM_MAX_JOBS=25 + +# Optional: comma-separated sources to run (leave empty to use server defaults) +# JOBBER_PIPELINE_SOURCES=adzuna,gradcracker,ukvisajobs + +# App-level Basic Auth (enables per-user separation when set on the server) +BASIC_AUTH_USER=dobkin +BASIC_AUTH_PASSWORD=CHANGEME + +# Telegram bot + chat destination +TELEGRAM_BOT_TOKEN=CHANGEME +TELEGRAM_CHAT_ID=CHANGEME + diff --git a/scripts/smoke-extractors.ts b/scripts/smoke-extractors.ts new file mode 100644 index 0000000..9869c97 --- /dev/null +++ b/scripts/smoke-extractors.ts @@ -0,0 +1,219 @@ +/** + * Tiny smoke-test for new extractors: imports each manifest, runs it with a + * minimal context, and prints the count of mapped jobs + the first one as a sample. + * + * Run from repo root: npx tsx scripts/smoke-extractors.ts [comma,separated,ids] + * + * Loads repo-root `.env` so keyed extractors match orchestrator behavior (plain + * `tsx` does not read `.env` automatically). + */ + +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import { config as loadEnv } from "dotenv"; +import type { + ExtractorManifest, + ExtractorRuntimeContext, +} from "../shared/src/types/extractors"; + +const repoRoot = path.resolve( + path.dirname(fileURLToPath(import.meta.url)), + "..", +); +loadEnv({ path: path.join(repoRoot, ".env") }); + +interface Target { + id: string; + importPath: string; + needs?: string[]; // env vars required to run; skipped if missing + settings?: Record<string, string>; +} + +const ALL_TARGETS: Target[] = [ + { + id: "jobicy", + importPath: "../extractors/jobicy/manifest", + settings: { jobicyMaxJobsPerTerm: "10" }, + }, + { + id: "themuse", + importPath: "../extractors/themuse/manifest", + settings: { themuseMaxJobsPerTerm: "10" }, + }, + { + id: "usajobs", + importPath: "../extractors/usajobs/manifest", + needs: ["USAJOBS_API_KEY", "USAJOBS_USER_AGENT"], + settings: { usajobsMaxJobsPerTerm: "10" }, + }, + { + id: "jooble", + importPath:
"../extractors/jooble/manifest", + needs: ["JOOBLE_API_KEY"], + settings: { joobleMaxJobsPerTerm: "10" }, + }, + { + id: "careerjet", + importPath: "../extractors/careerjet/manifest", + needs: ["CAREERJET_AFFID", "CAREERJET_REFERER", "CAREERJET_USER_IP"], + settings: { careerjetMaxJobsPerTerm: "10" }, + }, + { + id: "reed", + importPath: "../extractors/reed/manifest", + needs: ["REED_API_KEY"], + settings: { reedMaxJobsPerTerm: "10" }, + }, + { + id: "lever", + importPath: "../extractors/lever/manifest", + settings: { + // Known active public Lever board used purely as a connectivity check. + leverCompanies: JSON.stringify(["palantir", "netflix"]), + }, + }, + { + id: "ashby", + importPath: "../extractors/ashby/manifest", + settings: { + ashbyCompanies: JSON.stringify(["ramp", "linear"]), + }, + }, + { + id: "greenhouse", + importPath: "../extractors/greenhouse/manifest", + settings: { + greenhouseCompanies: JSON.stringify(["stripe", "airbnb"]), + }, + }, + { + id: "workday", + importPath: "../extractors/workday/manifest", + settings: { + workdayTenants: JSON.stringify([ + "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite", + ]), + }, + }, + { + id: "remoteok", + importPath: "../extractors/remoteok/manifest", + settings: { remoteokMaxJobsPerTerm: "10" }, + }, + { + id: "remotive", + importPath: "../extractors/remotive/manifest", + settings: { remotiveMaxJobsPerTerm: "10" }, + }, + { + id: "arbeitnow", + importPath: "../extractors/arbeitnow/manifest", + settings: { arbeitnowMaxJobsPerTerm: "10" }, + }, + { + id: "himalayas", + importPath: "../extractors/himalayas/manifest", + settings: { himalayasMaxJobsPerTerm: "10" }, + }, + { + id: "weworkremotely", + importPath: "../extractors/weworkremotely/manifest", + settings: { weworkremotelyMaxJobsPerTerm: "10" }, + }, + { + id: "fourdayweek", + importPath: "../extractors/fourdayweek/manifest", + settings: { fourdayweekMaxJobsPerTerm: "10" }, + }, +]; + +function buildContext( + source: string, + 
settings: Record<string, string>, +): ExtractorRuntimeContext { + return { + source, + selectedSources: [source], + settings, + searchTerms: ["software engineer"], + selectedCountry: "United States", + getExistingJobUrls: async () => [], + shouldCancel: () => false, + onProgress: () => {}, + }; +} + +function pad(s: string, n: number): string { + return s.length >= n ? s : s + " ".repeat(n - s.length); +} + +async function runOne(target: Target): Promise<void> { + const missing = (target.needs ?? []).filter((k) => !process.env[k]); + if (missing.length > 0) { + console.log( + `${pad(target.id, 12)} SKIP missing env: ${missing.join(", ")}`, + ); + return; + } + + let mod: { manifest?: ExtractorManifest; default?: ExtractorManifest }; + try { + mod = await import(target.importPath); + } catch (err) { + console.log( + `${pad(target.id, 12)} FAIL import error: ${(err as Error).message}`, + ); + return; + } + + const manifest = mod.manifest ?? mod.default; + if (!manifest) { + console.log(`${pad(target.id, 12)} FAIL manifest export missing`); + return; + } + + const started = Date.now(); + try { + const ctx = buildContext(target.id, target.settings ?? {}); + const result = await manifest.run(ctx); + const ms = Date.now() - started; + const status = result.success ? "OK " : "ERR "; + const sample = result.jobs[0]; + const sampleStr = sample + ? ` | first: "${sample.title}" @ ${sample.employer}` + : ""; + console.log( + `${pad(target.id, 12)} ${status} jobs=${result.jobs.length} ${ms}ms${result.error ? ` | error: ${result.error}` : ""}${sampleStr}`, + ); + } catch (err) { + const ms = Date.now() - started; + console.log( + `${pad(target.id, 12)} CRASH ${ms}ms ${(err as Error).message}`, + ); + } +} + +async function main() { + const requested = (process.argv[2] ?? "").trim(); + const filter = requested + ? new Set( + requested + .split(",") + .map((s) => s.trim()) + .filter(Boolean), + ) + : null; + const targets = filter + ?
ALL_TARGETS.filter((t) => filter.has(t.id)) + : ALL_TARGETS; + + console.log(`Smoke testing ${targets.length} extractor(s)...\n`); + for (const t of targets) { + await runOne(t); + } +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/shared/src/extractors/index.ts b/shared/src/extractors/index.ts index 261dc4c..ed9030f 100644 --- a/shared/src/extractors/index.ts +++ b/shared/src/extractors/index.ts @@ -9,6 +9,24 @@ export const EXTRACTOR_SOURCE_IDS = [ "adzuna", "hiringcafe", "startupjobs", + // --- Public APIs / feeds --- + "usajobs", + "jobicy", + "themuse", + "jooble", + "careerjet", + "reed", + "remoteok", + "remotive", + "arbeitnow", + "himalayas", + "weworkremotely", + "fourdayweek", + // --- Public ATS / career-page sources --- + "ashby", + "lever", + "greenhouse", + "workday", "manual", ] as const; @@ -20,6 +38,10 @@ export interface ExtractorSourceMetadata { category: "pipeline" | "manual"; requiresCredentials?: boolean; ukOnly?: boolean; + /** Country gating: when set, only run/show this source for these country keys. */ + countryAllowlist?: readonly string[]; + /** Region tag for grouping / filtering in the UI. 
*/ + region?: "us" | "uk" | "global" | "remote"; } export const EXTRACTOR_SOURCE_METADATA: Record< @@ -31,26 +53,157 @@ export const EXTRACTOR_SOURCE_METADATA: Record< order: 10, category: "pipeline", ukOnly: true, + region: "uk", + }, + indeed: { + label: "Indeed", + order: 20, + category: "pipeline", + region: "global", + }, + linkedin: { + label: "LinkedIn", + order: 30, + category: "pipeline", + region: "global", + }, + glassdoor: { + label: "Glassdoor", + order: 40, + category: "pipeline", + region: "global", }, - indeed: { label: "Indeed", order: 20, category: "pipeline" }, - linkedin: { label: "LinkedIn", order: 30, category: "pipeline" }, - glassdoor: { label: "Glassdoor", order: 40, category: "pipeline" }, ukvisajobs: { label: "UK Visa Jobs", order: 50, category: "pipeline", requiresCredentials: true, ukOnly: true, + region: "uk", }, adzuna: { label: "Adzuna", order: 60, category: "pipeline", requiresCredentials: true, + region: "global", }, - hiringcafe: { label: "Hiring Cafe", order: 70, category: "pipeline" }, - startupjobs: { label: "startup.jobs", order: 80, category: "pipeline" }, - manual: { label: "Manual", order: 90, category: "manual" }, + hiringcafe: { + label: "Hiring Cafe", + order: 70, + category: "pipeline", + region: "global", + }, + startupjobs: { + label: "startup.jobs", + order: 80, + category: "pipeline", + region: "global", + }, + usajobs: { + label: "USAJOBS", + order: 110, + category: "pipeline", + requiresCredentials: true, + countryAllowlist: ["united states", "usa", "us"], + region: "us", + }, + jobicy: { + label: "Jobicy (Remote)", + order: 120, + category: "pipeline", + region: "remote", + }, + themuse: { + label: "The Muse", + order: 130, + category: "pipeline", + region: "global", + }, + jooble: { + label: "Jooble", + order: 140, + category: "pipeline", + requiresCredentials: true, + region: "global", + }, + careerjet: { + label: "Careerjet", + order: 150, + category: "pipeline", + requiresCredentials: true, + region: 
"global", + }, + reed: { + label: "Reed", + order: 160, + category: "pipeline", + requiresCredentials: true, + ukOnly: true, + countryAllowlist: ["united kingdom", "uk", "great britain", "england"], + region: "uk", + }, + remoteok: { + label: "Remote OK", + order: 170, + category: "pipeline", + region: "remote", + }, + remotive: { + label: "Remotive", + order: 175, + category: "pipeline", + region: "remote", + }, + arbeitnow: { + label: "Arbeitnow", + order: 180, + category: "pipeline", + region: "global", + }, + himalayas: { + label: "Himalayas", + order: 185, + category: "pipeline", + region: "remote", + }, + weworkremotely: { + label: "We Work Remotely", + order: 190, + category: "pipeline", + region: "remote", + }, + fourdayweek: { + label: "4 Day Week", + order: 195, + category: "pipeline", + region: "remote", + }, + ashby: { + label: "Ashby (ATS)", + order: 210, + category: "pipeline", + region: "global", + }, + lever: { + label: "Lever (ATS)", + order: 220, + category: "pipeline", + region: "global", + }, + greenhouse: { + label: "Greenhouse (ATS)", + order: 230, + category: "pipeline", + region: "global", + }, + workday: { + label: "Workday (ATS)", + order: 240, + category: "pipeline", + region: "global", + }, + manual: { label: "Manual", order: 900, category: "manual" }, }; export const PIPELINE_EXTRACTOR_SOURCE_IDS = EXTRACTOR_SOURCE_IDS.filter( diff --git a/shared/src/index.ts b/shared/src/index.ts index 01bd213..6ab1295 100644 --- a/shared/src/index.ts +++ b/shared/src/index.ts @@ -1,4 +1,5 @@ export * from "./extractors"; +export * from "./job-fingerprint"; export * from "./job-url-canonical"; export * from "./location-support"; export * from "./types"; diff --git a/shared/src/job-fingerprint.test.ts b/shared/src/job-fingerprint.test.ts new file mode 100644 index 0000000..6c47b5c --- /dev/null +++ b/shared/src/job-fingerprint.test.ts @@ -0,0 +1,80 @@ +import { describe, expect, it } from "vitest"; +import { + buildJobContentFingerprint, + 
normalizeEmployerForFingerprint, + normalizeTitleForFingerprint, +} from "./job-fingerprint"; + +describe("buildJobContentFingerprint", () => { + it("collapses the same role across sources", () => { + const a = buildJobContentFingerprint({ + employer: "Stripe, Inc.", + title: "Senior Software Engineer (Backend) - Toronto, ON", + }); + const b = buildJobContentFingerprint({ + employer: "stripe inc", + title: "Senior Software Engineer (Backend)", + }); + expect(a).toBe(b); + expect(a).not.toBeNull(); + }); + + it("ignores trailing location decorations on titles", () => { + const a = buildJobContentFingerprint({ + employer: "Acme", + title: "Software Engineer — Remote", + }); + const b = buildJobContentFingerprint({ + employer: "Acme", + title: "Software Engineer", + }); + expect(a).toBe(b); + }); + + it("strips diacritics and punctuation", () => { + const a = buildJobContentFingerprint({ + employer: "Café Münchën", + title: "Étudiant – Stage", + }); + const b = buildJobContentFingerprint({ + employer: "cafe munchen", + title: "Etudiant Stage", + }); + expect(a).toBe(b); + }); + + it("returns null when employer or title is empty", () => { + expect( + buildJobContentFingerprint({ employer: "", title: "Engineer" }), + ).toBeNull(); + expect( + buildJobContentFingerprint({ employer: "Acme", title: "" }), + ).toBeNull(); + }); + + it("does not collapse different roles at the same employer", () => { + const a = buildJobContentFingerprint({ + employer: "Acme", + title: "Software Engineer", + }); + const b = buildJobContentFingerprint({ + employer: "Acme", + title: "Product Designer", + }); + expect(a).not.toBe(b); + }); + + describe("normalizers", () => { + it("normalizeEmployerForFingerprint strips legal suffixes", () => { + expect(normalizeEmployerForFingerprint("Acme Corporation")).toBe("acme"); + expect(normalizeEmployerForFingerprint("Acme, LLC")).toBe("acme"); + expect(normalizeEmployerForFingerprint("Acme GmbH")).toBe("acme"); + }); + + 
it("normalizeTitleForFingerprint drops leading repost markers", () => { + expect(normalizeTitleForFingerprint("[Reposted] Software Engineer")).toBe( + "softwareengineer", + ); + }); + }); +}); diff --git a/shared/src/job-fingerprint.ts b/shared/src/job-fingerprint.ts new file mode 100644 index 0000000..4bd4aae --- /dev/null +++ b/shared/src/job-fingerprint.ts @@ -0,0 +1,77 @@ +/** + * Cross-source duplicate detection. + * + * Two postings from different sources almost always describe the same role + * when their employer + title agree once you strip noise (case, punctuation, + * tracking suffixes, common decorations like "(Remote)", "- Toronto, ON" etc). + * The fingerprint is intentionally coarse so we err on the side of skipping + * a duplicate rather than re-showing it from a second source. + */ + +const PUNCTUATION_RE = /[\p{P}\p{S}]+/gu; +const WHITESPACE_RE = /\s+/g; +const LEADING_NOISE_RE = /^(?:re-?post(?:ed)?|new|hot|urgent)\b\s*[-:]?\s*/i; +const PARENS_RE = /\s*[([][^)\]]*[)\]]/g; +// Trailing decorations we know are location / arrangement metadata, not role +// suffix. Matched after the title body and stripped before fingerprinting. +// Examples we want to strip: +// "Software Engineer — Remote" +// "Senior Engineer - Toronto, ON" +// "Designer | Hybrid" +// Examples we must NOT strip (otherwise we'd collide unrelated roles): +// "Etudiant – Stage" (Stage is a role qualifier in French postings) +// "Designer — Senior" (level qualifier) +const TRAILING_LOCATION_KEYWORDS_RE = + /\s+[-|–—]\s+(?:remote|hybrid|on[\s-]?site|wfh|telework|anywhere)\s*$/i; +// Escape the ASCII hyphen explicitly so it doesn't form a character range +// with the surrounding delimiters (which would silently swallow letters).
+const TRAILING_CITY_REGION_RE = /\s+[-|–—]\s+[^,\-|–—]+,\s*[^,\-|–—]+\s*$/; +const COMPANY_LEGAL_SUFFIX_RE = + /\b(?:inc|inc\.|ltd|ltd\.|llc|gmbh|s\.a\.|s\.r\.l|sa|nv|bv|plc|corp|corporation|co|company|holdings|holding)\b/g; + +function stripDiacritics(input: string): string { + return input.normalize("NFKD").replace(/[\u0300-\u036f]/g, ""); +} + +export function normalizeEmployerForFingerprint( + employer: string | null | undefined, +): string { + if (!employer) return ""; + let value = stripDiacritics(employer.toLowerCase()).trim(); + value = value.replace(PARENS_RE, " "); + value = value.replace(COMPANY_LEGAL_SUFFIX_RE, " "); + value = value.replace(PUNCTUATION_RE, " "); + value = value.replace(WHITESPACE_RE, "").trim(); + return value; +} + +export function normalizeTitleForFingerprint( + title: string | null | undefined, +): string { + if (!title) return ""; + let value = stripDiacritics(title.toLowerCase()).trim(); + value = value.replace(LEADING_NOISE_RE, ""); + value = value.replace(PARENS_RE, " "); + value = value.replace(TRAILING_LOCATION_KEYWORDS_RE, " "); + value = value.replace(TRAILING_CITY_REGION_RE, " "); + value = value.replace(PUNCTUATION_RE, " "); + value = value.replace(WHITESPACE_RE, "").trim(); + return value; +} + +/** + * Build a stable, source-agnostic fingerprint for a posting. + * + * Returns `null` when employer or title is empty after normalization, so + * callers fall back to URL/sourceJobId equality and don't accidentally + * collapse unrelated rows under the empty key. 
+ */ +export function buildJobContentFingerprint(args: { + employer: string | null | undefined; + title: string | null | undefined; +}): string | null { + const employer = normalizeEmployerForFingerprint(args.employer); + const title = normalizeTitleForFingerprint(args.title); + if (!employer || !title) return null; + return `${employer}::${title}`; +} diff --git a/shared/src/location-support.ts b/shared/src/location-support.ts index a9586fd..c77da1e 100644 --- a/shared/src/location-support.ts +++ b/shared/src/location-support.ts @@ -99,7 +99,12 @@ export const SUPPORTED_COUNTRY_INPUTS = [ "worldwide", ] as const; -const UK_ONLY_SOURCES = new Set(["gradcracker", "ukvisajobs"]); +const UK_ONLY_SOURCES = new Set([ + "gradcracker", + "ukvisajobs", + "reed", +]); +const US_ONLY_SOURCES = new Set(["usajobs"]); const GLASSDOOR_SUPPORTED_COUNTRIES = new Set( [ "australia", @@ -170,6 +175,10 @@ export function isUkCountry(country: string | null | undefined): boolean { return normalizeCountryKey(country) === "united kingdom"; } +export function isUsCountry(country: string | null | undefined): boolean { + return normalizeCountryKey(country) === "united states"; +} + export function isGlassdoorCountry( country: string | null | undefined, ): boolean { @@ -187,6 +196,7 @@ export function isSourceAllowedForCountry( country: string | null | undefined, ): boolean { if (UK_ONLY_SOURCES.has(source)) return isUkCountry(country); + if (US_ONLY_SOURCES.has(source)) return isUsCountry(country); if (source === "glassdoor") return isGlassdoorCountry(country); if (source === "adzuna") return getAdzunaCountryCode(country) !== null; return true; diff --git a/shared/src/settings-registry.ts b/shared/src/settings-registry.ts index 05db688..7772968 100644 --- a/shared/src/settings-registry.ts +++ b/shared/src/settings-registry.ts @@ -28,6 +28,24 @@ function parseJsonArrayOrNull(raw: string | undefined): string[] | null { } } +/** + * Parse a delimited list (comma / newline / pipe) into a 
/**
 * Turn a delimited string into a list of unique, trimmed entries.
 *
 * Entries may be separated by commas, newlines, semicolons or pipes (any run
 * of them counts as one separator). Blank entries are dropped and, when a
 * value repeats, only its first occurrence is kept; the comparison is exact,
 * so "Acme" and "acme" are distinct. Used for env-backed defaults like
 * LEVER_COMPANIES="acme,stripe".
 *
 * @param raw The delimited text; `null`, `undefined` and `""` yield `[]`.
 * @returns The entries in first-seen order.
 */
function parseCompanyList(raw: string | undefined | null): string[] {
  if (!raw) return [];
  const entries = raw
    .split(/[\n,;|]+/)
    .map((piece) => piece.trim())
    .filter((piece) => piece.length > 0);
  // A Set iterates in insertion order, so first occurrences keep their place.
  return Array.from(new Set<string>(entries));
}
serialize: serializeNullableNumber, + }, + remoteokMaxJobsPerTerm: { + kind: "typed" as const, + schema: z.number().int().min(1).max(1000), + default: (): number => 100, + parse: parseIntOrNull, + serialize: serializeNullableNumber, + }, + remotiveMaxJobsPerTerm: { + kind: "typed" as const, + schema: z.number().int().min(1).max(1000), + default: (): number => 100, + parse: parseIntOrNull, + serialize: serializeNullableNumber, + }, + arbeitnowMaxJobsPerTerm: { + kind: "typed" as const, + schema: z.number().int().min(1).max(1000), + default: (): number => 100, + parse: parseIntOrNull, + serialize: serializeNullableNumber, + }, + himalayasMaxJobsPerTerm: { + kind: "typed" as const, + schema: z.number().int().min(1).max(1000), + default: (): number => 100, + parse: parseIntOrNull, + serialize: serializeNullableNumber, + }, + weworkremotelyMaxJobsPerTerm: { + kind: "typed" as const, + schema: z.number().int().min(1).max(1000), + default: (): number => 100, + parse: parseIntOrNull, + serialize: serializeNullableNumber, + }, + fourdayweekMaxJobsPerTerm: { + kind: "typed" as const, + schema: z.number().int().min(1).max(1000), + default: (): number => 100, + parse: parseIntOrNull, + serialize: serializeNullableNumber, + }, + /** + * Comma- or newline-separated company slugs to fetch from public ATS feeds. + * `lever`, `ashby`, and `greenhouse` each take one entry per company. + */ + leverCompanies: { + kind: "typed" as const, + schema: z.array(z.string().trim().min(1).max(100)).max(200), + default: (): string[] => + parseCompanyList( + typeof process !== "undefined" ? process.env.LEVER_COMPANIES : "", + ), + parse: parseJsonArrayOrNull, + serialize: serializeNullableJsonArray, + }, + ashbyCompanies: { + kind: "typed" as const, + schema: z.array(z.string().trim().min(1).max(100)).max(200), + default: (): string[] => + parseCompanyList( + typeof process !== "undefined" ? 
/**
 * Workday tenant configurations as JSON, e.g.
 * `[{"company":"Acme","tenantUrl":"https://acme.wd1.myworkdayjobs.com","sites":["External"]}]`.
 *
 * The env default (`WORKDAY_TENANTS`) is read as a JSON array when it looks
 * like one. Passing that text straight to `parseCompanyList` would split it
 * on every comma and pipe and cut each tenant object into fragments. Values
 * that are not a JSON array still go through `parseCompanyList`, so plain
 * delimited lists behave as before.
 *
 * NOTE(review): object entries are re-serialized to JSON strings because the
 * schema only admits strings — confirm the workday extractor JSON-parses each
 * entry.
 */
workdayTenants: {
  kind: "typed" as const,
  schema: z.array(z.string().trim().min(1).max(2000)).max(50),
  default: (): string[] => {
    const raw =
      typeof process !== "undefined" ? process.env.WORKDAY_TENANTS : "";
    const text = (raw ?? "").trim();
    if (text.startsWith("[")) {
      try {
        const parsed: unknown = JSON.parse(text);
        if (Array.isArray(parsed)) {
          const entries = parsed
            .filter((entry) => entry != null)
            .map((entry): string =>
              typeof entry === "string" ? entry.trim() : JSON.stringify(entry),
            )
            .filter((entry) => entry.length > 0);
          // Same first-occurrence dedupe that parseCompanyList applies.
          return Array.from(new Set<string>(entries));
        }
      } catch {
        // Not valid JSON after all; fall through to the delimited form.
      }
    }
    return parseCompanyList(text);
  },
  parse: parseJsonArrayOrNull,
  serialize: serializeNullableJsonArray,
},
*/ + careerjetUserIp: { + kind: "string" as const, + envKey: "CAREERJET_USER_IP", + schema: z.string().trim().max(80), + }, + careerjetUserAgent: { + kind: "string" as const, + envKey: "CAREERJET_USER_AGENT", + schema: z.string().trim().max(500), + }, basicAuthUser: { kind: "string" as const, envKey: "BASIC_AUTH_USER", @@ -658,6 +849,22 @@ export const settingsRegistry = { envKey: "ADZUNA_APP_KEY", schema: z.string().trim().max(2000), }, + // --- Secrets for new extractors --- + usajobsApiKey: { + kind: "secret" as const, + envKey: "USAJOBS_API_KEY", + schema: z.string().trim().max(2000), + }, + joobleApiKey: { + kind: "secret" as const, + envKey: "JOOBLE_API_KEY", + schema: z.string().trim().max(2000), + }, + reedApiKey: { + kind: "secret" as const, + envKey: "REED_API_KEY", + schema: z.string().trim().max(2000), + }, basicAuthPassword: { kind: "secret" as const, envKey: "BASIC_AUTH_PASSWORD", diff --git a/shared/src/testing/factories.ts b/shared/src/testing/factories.ts index fa85100..ec807a0 100644 --- a/shared/src/testing/factories.ts +++ b/shared/src/testing/factories.ts @@ -188,6 +188,22 @@ export const createAppSettings = ( adzunaMaxJobsPerTerm: { value: 50, default: 50, override: null }, gradcrackerMaxJobsPerTerm: { value: 50, default: 50, override: null }, startupjobsMaxJobsPerTerm: { value: 50, default: 50, override: null }, + usajobsMaxJobsPerTerm: { value: 50, default: 50, override: null }, + jobicyMaxJobsPerTerm: { value: 50, default: 50, override: null }, + themuseMaxJobsPerTerm: { value: 50, default: 50, override: null }, + joobleMaxJobsPerTerm: { value: 50, default: 50, override: null }, + careerjetMaxJobsPerTerm: { value: 50, default: 50, override: null }, + reedMaxJobsPerTerm: { value: 50, default: 50, override: null }, + remoteokMaxJobsPerTerm: { value: 50, default: 50, override: null }, + remotiveMaxJobsPerTerm: { value: 50, default: 50, override: null }, + arbeitnowMaxJobsPerTerm: { value: 50, default: 50, override: null }, + 
himalayasMaxJobsPerTerm: { value: 50, default: 50, override: null }, + weworkremotelyMaxJobsPerTerm: { value: 50, default: 50, override: null }, + fourdayweekMaxJobsPerTerm: { value: 50, default: 50, override: null }, + leverCompanies: { value: [], default: [], override: null }, + ashbyCompanies: { value: [], default: [], override: null }, + greenhouseCompanies: { value: [], default: [], override: null }, + workdayTenants: { value: [], default: [], override: null }, searchTerms: { value: ["Software Engineer"], default: ["Software Engineer"], @@ -256,6 +272,15 @@ export const createAppSettings = ( adzunaAppId: null, adzunaAppKeyHint: null, webhookSecretHint: null, + usajobsUserAgent: null, + themuseApiKey: null, + careerjetAffid: null, + careerjetReferer: null, + careerjetUserIp: null, + careerjetUserAgent: null, + usajobsApiKeyHint: null, + joobleApiKeyHint: null, + reedApiKeyHint: null, basicAuthActive: false, localResumeFileConfigured: false, backupEnabled: { value: false, default: false, override: null }, diff --git a/shared/src/types/settings.ts b/shared/src/types/settings.ts index c1b00f3..65edf63 100644 --- a/shared/src/types/settings.ts +++ b/shared/src/types/settings.ts @@ -201,6 +201,22 @@ export interface AppSettings { adzunaMaxJobsPerTerm: Resolved; gradcrackerMaxJobsPerTerm: Resolved; startupjobsMaxJobsPerTerm: Resolved; + usajobsMaxJobsPerTerm: Resolved; + jobicyMaxJobsPerTerm: Resolved; + themuseMaxJobsPerTerm: Resolved; + joobleMaxJobsPerTerm: Resolved; + careerjetMaxJobsPerTerm: Resolved; + reedMaxJobsPerTerm: Resolved; + remoteokMaxJobsPerTerm: Resolved; + remotiveMaxJobsPerTerm: Resolved; + arbeitnowMaxJobsPerTerm: Resolved; + himalayasMaxJobsPerTerm: Resolved; + weworkremotelyMaxJobsPerTerm: Resolved; + fourdayweekMaxJobsPerTerm: Resolved; + leverCompanies: Resolved; + ashbyCompanies: Resolved; + greenhouseCompanies: Resolved; + workdayTenants: Resolved; searchTerms: Resolved; workplaceTypes: Resolved>; blockedCompanyKeywords: Resolved; @@ -241,6 
+257,12 @@ export interface AppSettings { ukvisajobsEmail: string | null; adzunaAppId: string | null; basicAuthUser: string | null; + usajobsUserAgent: string | null; + themuseApiKey: string | null; + careerjetAffid: string | null; + careerjetReferer: string | null; + careerjetUserIp: string | null; + careerjetUserAgent: string | null; // Secret hints: llmApiKeyHint: string | null; @@ -250,6 +272,9 @@ export interface AppSettings { adzunaAppKeyHint: string | null; basicAuthPasswordHint: string | null; webhookSecretHint: string | null; + usajobsApiKeyHint: string | null; + joobleApiKeyHint: string | null; + reedApiKeyHint: string | null; // Computed: basicAuthActive: boolean;