feat(extractors): add 17 job source extractors and cross-source dedup
Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s

Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.

Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
  new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources

Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
  templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ilia 2026-05-12 20:17:52 -04:00
parent b72612fd06
commit 7b3dfb002a
91 changed files with 5849 additions and 57 deletions

View File

@ -97,3 +97,116 @@ ADZUNA_APP_KEY=
# ============================================================================= # =============================================================================
# Filter for remote-only jobs (default: 0 = disabled) # Filter for remote-only jobs (default: 0 = disabled)
# JOBSPY_IS_REMOTE=0 # JOBSPY_IS_REMOTE=0
# =============================================================================
# USAJOBS API (US federal jobs) - optional, US-only
# =============================================================================
# Register at https://developer.usajobs.gov/APIRequest/Index
# USAJOBS requires a User-Agent that is a real contact email (per their TOS).
# Leave unset to disable the source.
# USAJOBS_API_KEY=
# USAJOBS_USER_AGENT=you@example.com
# USAJOBS_MAX_JOBS_PER_TERM=100
# =============================================================================
# Jobicy (remote jobs feed) - optional, no auth
# =============================================================================
# Public JSON endpoint, capped at 50 results per call.
# JOBICY_MAX_JOBS_PER_TERM=100
# =============================================================================
# The Muse (jobs API) - optional, API key recommended
# =============================================================================
# https://www.themuse.com/developers/api/v2 — works without a key but is
# heavily rate-limited. Set THEMUSE_API_KEY for higher quotas.
# THEMUSE_API_KEY=
# THEMUSE_MAX_JOBS_PER_TERM=100
# =============================================================================
# Jooble (aggregator API) - optional
# =============================================================================
# Sign up at https://jooble.org/api/about for an API key.
# JOOBLE_API_KEY=
# JOOBLE_MAX_JOBS_PER_TERM=100
# =============================================================================
# Careerjet (publisher API v4) - optional
# =============================================================================
# Register at https://www.careerjet.com/partners/api/ — declare API key + server IP(s).
# CAREERJET_AFFID=your_api_key
# CAREERJET_REFERER=https://your-site.com/path-to-job-search/
# CAREERJET_USER_IP=203.0.113.1
# Optional override for the required user_agent query param:
# CAREERJET_USER_AGENT=Mozilla/5.0 ...
# CAREERJET_MAX_JOBS_PER_TERM=100
# =============================================================================
# Reed.co.uk (UK jobs API) - optional, UK-only
# =============================================================================
# Register at https://www.reed.co.uk/developers/jobseeker for an API key.
# REED_API_KEY=
# REED_MAX_JOBS_PER_TERM=100
# =============================================================================
# Remote OK (remote jobs feed) - optional, no auth
# =============================================================================
# Public single-shot JSON feed at https://remoteok.com/api. We filter
# client-side by your search terms (matched against position + tags).
# Per Remote OK's TOS, link back to the original posting URLs when republishing.
# REMOTEOK_MAX_JOBS_PER_TERM=100
# =============================================================================
# Remotive (remote jobs feed) - optional, no auth
# =============================================================================
# Public JSON API at https://remotive.com/api/remote-jobs?limit=N&search=term.
# Each search term is sent as the `search` parameter.
# REMOTIVE_MAX_JOBS_PER_TERM=100
# =============================================================================
# Arbeitnow (multi-ATS aggregator) - optional, no auth
# =============================================================================
# Public JSON API at https://www.arbeitnow.com/api/job-board-api?page=N.
# Aggregates from Greenhouse, SmartRecruiters, Join, TeamTailor, Recruitee,
# and Comeet. No server-side search; filtering is done client-side.
# ARBEITNOW_MAX_JOBS_PER_TERM=100
# =============================================================================
# Himalayas (remote jobs feed) - optional, no auth
# =============================================================================
# Public JSON API at https://himalayas.app/jobs/api?limit=N&offset=M.
# No server-side search; filtering is done client-side by title + categories.
# HIMALAYAS_MAX_JOBS_PER_TERM=100
# =============================================================================
# We Work Remotely (RSS feed) - optional, no auth
# =============================================================================
# Public RSS at https://weworkremotely.com/remote-jobs.rss (all categories).
# Single fetch; filtering is done client-side by title + skills + category.
# WEWORKREMOTELY_MAX_JOBS_PER_TERM=100
# =============================================================================
# 4 Day Week (reduced-schedule jobs) - optional, no auth
# =============================================================================
# Public JSON API at https://4dayweek.io/api/jobs?page=N.
# Paginated; filtering is done client-side by title + tech stack.
# No job description in listings; links to 4dayweek.io for details.
# FOURDAYWEEK_MAX_JOBS_PER_TERM=100
# =============================================================================
# Public ATS sources (Lever / Ashby / Greenhouse) - optional
# =============================================================================
# Comma- or newline-separated company slugs. The slug is the path segment used
# in each provider's public job board, e.g. `lever.co/some-company` → "some-company".
# LEVER_COMPANIES=netflix,figma
# ASHBY_COMPANIES=ramp,linear
# GREENHOUSE_COMPANIES=stripe,airbnb
# =============================================================================
# Workday (public career sites) - optional
# =============================================================================
# Newline- or comma-separated entries. Each entry is either:
# 1) A career-site URL we'll auto-parse, e.g.
# https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite
# 2) A JSON object with explicit fields:
# {"company":"NVIDIA","tenantUrl":"https://nvidia.wd5.myworkdayjobs.com","tenant":"nvidia","site":"NVIDIAExternalCareerSite","locale":"en-US"}
# WORKDAY_TENANTS=

4
.gitignore vendored
View File

@ -15,8 +15,8 @@ docs-site/build/
# Data directory (bind mount in Docker) # Data directory (bind mount in Docker)
data/ data/
# Extractor storage outputs and cached auth # Extractor storage outputs and cached auth (per-extractor runtime data)
extractors/ukvisajobs/storage/ extractors/*/storage/
# OS files # OS files
.DS_Store .DS_Store

View File

@ -0,0 +1,11 @@
# arbeitnow-extractor
Pulls listings from the public [Arbeitnow API](https://www.arbeitnow.com/api/job-board-api).
- No authentication required.
- Returns 100 jobs per page; we paginate up to 5 pages (500 jobs).
- No server-side search — we filter client-side by matching title + tags
against each pipeline search term.
- Aggregates postings from Greenhouse, SmartRecruiters, Join, TeamTailor,
Recruitee, and Comeet.
- Caps results per term via the `arbeitnowMaxJobsPerTerm` setting (default 100).

View File

@ -0,0 +1,172 @@
/**
* Arbeitnow public job board API.
*
* https://www.arbeitnow.com/api/job-board-api?page=N
*
* No auth. Returns 100 results per page, sorted by creation date.
* No server-side search, so we paginate and filter client-side by
* title + tags against each pipeline search term.
*
* Aggregates listings from Greenhouse, SmartRecruiters, Join,
* TeamTailor, Recruitee, and Comeet.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// Base endpoint of the Arbeitnow job-board API; fetchPage appends `?page=N`.
const API_URL = "https://www.arbeitnow.com/api/job-board-api";
// Upper bound on the number of pages requested during a single run.
const MAX_PAGES = 5;
/**
 * One job entry as declared for the API response. Every field is optional,
 * so consumers validate each value before using it.
 */
interface ArbeitnowJob {
// Used as the source job id and as the preferred de-duplication key.
slug?: string;
company_name?: string;
title?: string;
description?: string;
remote?: boolean;
// Required by mapJob: entries without a usable URL are dropped.
url?: string;
// Searched (together with the title) when matching pipeline search terms.
tags?: string[];
job_types?: string[];
location?: string;
// Multiplied by 1000 before being handed to `Date`, i.e. treated as epoch seconds.
created_at?: number;
}
/** Envelope of one page of results. */
interface ArbeitnowResponse {
data?: ArbeitnowJob[];
// A missing or falsy `next` ends pagination in the run loop.
links?: { next?: string | null };
meta?: { current_page?: number };
}
/**
 * Normalizes an arbitrary value to a trimmed, non-empty string.
 *
 * @param value - Any value taken from an untyped API payload.
 * @returns The trimmed text, or `undefined` for non-strings and for strings
 *   that are empty after trimming.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const text = value.trim();
    if (text) return text;
  }
  return undefined;
}
/**
 * Case-insensitive substring match of a search term against a job's title
 * and tags.
 *
 * @param job - Raw job entry from the feed.
 * @param term - Pipeline search term.
 * @returns `true` when the title or any string tag contains the term.
 */
function matchesTerm(job: ArbeitnowJob, term: string): boolean {
  const needle = term.toLowerCase();
  const titleHit = job.title?.toLowerCase().includes(needle);
  if (titleHit) return true;
  if (!Array.isArray(job.tags)) return false;
  for (const tag of job.tags) {
    // Non-string tags are ignored rather than coerced.
    if (typeof tag === "string" && tag.toLowerCase().includes(needle)) {
      return true;
    }
  }
  return false;
}
/**
 * Converts a raw Arbeitnow entry into the pipeline's job input shape.
 *
 * @param raw - Job entry as returned by the API.
 * @returns The mapped job, or `null` when the entry has no usable URL.
 */
function mapJob(raw: ArbeitnowJob): CreateJobInput | null {
  const jobUrl = asString(raw.url);
  if (!jobUrl) return null;

  const tags = Array.isArray(raw.tags)
    ? raw.tags.filter((t): t is string => typeof t === "string" && t.length > 0)
    : [];

  const jobTypes = Array.isArray(raw.job_types)
    ? raw.job_types
        .filter((t): t is string => typeof t === "string" && t.length > 0)
        .join(", ")
    : undefined;

  // `created_at` is multiplied by 1000 to get milliseconds. A value outside
  // the range `Date` can represent yields an invalid date, and calling
  // toISOString() on it throws RangeError, which would abort the whole run.
  // Treat such values as "no date" instead.
  const createdAt = new Date(
    typeof raw.created_at === "number" ? raw.created_at * 1000 : Number.NaN,
  );
  const datePosted = Number.isNaN(createdAt.getTime())
    ? undefined
    : createdAt.toISOString();

  return {
    source: "arbeitnow",
    sourceJobId: asString(raw.slug),
    title: asString(raw.title) ?? "Unknown Title",
    employer: asString(raw.company_name) ?? "Unknown Employer",
    jobUrl,
    applicationLink: jobUrl,
    location: asString(raw.location) ?? "Unknown",
    isRemote: raw.remote === true,
    // An empty join result (no valid job types) is reported as undefined.
    jobType: jobTypes || undefined,
    datePosted,
    jobDescription: asString(raw.description),
    disciplines: tags.length > 0 ? tags.join(", ") : undefined,
  };
}
/**
 * Downloads one page of the Arbeitnow feed.
 *
 * @param page - Page number placed in the `page` query parameter.
 * @returns The parsed JSON body.
 * @throws Error when the HTTP response status is not OK.
 */
async function fetchPage(page: number): Promise<ArbeitnowResponse> {
  const endpoint = `${API_URL}?page=${page}`;
  const res = await fetch(endpoint, {
    headers: { Accept: "application/json" },
  });
  if (res.ok) {
    return (await res.json()) as ArbeitnowResponse;
  }
  throw new Error(`Arbeitnow request failed with status ${res.status}`);
}
/**
 * Arbeitnow extractor manifest.
 *
 * `run` walks the paginated feed (at most MAX_PAGES pages), keeps the jobs
 * matching at least one search term (or every job when no terms are given),
 * de-duplicates them by slug/URL and returns the mapped results. On a fetch
 * failure it returns `success: false` together with the jobs collected so far.
 */
export const manifest: ExtractorManifest = {
  id: "arbeitnow",
  displayName: "Arbeitnow",
  providesSources: ["arbeitnow"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting makes parseInt return NaN, and every comparison
    // against NaN is false, which would silently disable the cap. Fall back
    // to the default instead.
    const rawMax = context.settings.arbeitnowMaxJobsPerTerm;
    const parsedMax = rawMax ? Number.parseInt(rawMax, 10) : Number.NaN;
    const maxJobs = Number.isNaN(parsedMax) ? 100 : parsedMax;

    const terms = context.searchTerms;
    // The feed is fetched once for all terms, so the overall budget is the
    // per-term cap multiplied by the number of terms (at least one).
    const maxTotal = maxJobs * Math.max(terms.length, 1);

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let page = 1; page <= MAX_PAGES; page += 1) {
        if (context.shouldCancel?.()) break;
        if (out.length >= maxTotal) break;
        context.onProgress?.({
          phase: "list",
          termsProcessed: 0,
          termsTotal: 1,
          currentUrl: `page ${page}`,
          detail: `Arbeitnow: fetching page ${page}`,
        });
        const body = await fetchPage(page);
        const jobs = Array.isArray(body.data) ? body.data : [];
        if (jobs.length === 0) break;
        for (const raw of jobs) {
          // Enforce the budget inside the page too, so one large page
          // cannot push the result past the configured cap.
          if (out.length >= maxTotal) break;
          if (terms.length > 0 && !terms.some((t) => matchesTerm(raw, t))) {
            continue;
          }
          const mapped = mapJob(raw);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
        }
        context.onProgress?.({
          phase: "list",
          termsProcessed: 0,
          termsTotal: 1,
          currentUrl: `page ${page}`,
          jobPagesProcessed: out.length,
          detail: `Arbeitnow: page ${page} done (${out.length} matched so far)`,
        });
        if (!body.links?.next) break;
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "arbeitnow-extractor",
"version": "0.0.1",
"type": "module",
"description": "Arbeitnow public job board API extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,8 @@
# ashby-extractor
Public Ashby job-board feeds via
`GET https://api.ashbyhq.com/posting-api/job-board/{company}`.
- No auth.
- Configure target slugs via `ashbyCompanies` (comma/newline) or
`ASHBY_COMPANIES` env.

View File

@ -0,0 +1,185 @@
/**
* Ashby public job board API.
*
* https://developers.ashbyhq.com/reference/posting-api-job-board
* GET https://api.ashbyhq.com/posting-api/job-board/{company}
*
* No auth. Each entry in `ashbyCompanies` is fetched independently.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/**
 * Structured postal address attached to a posting.
 * NOTE(review): no mapping code visible in this file reads it — confirm
 * whether it is still needed.
 */
interface AshbyAddress {
postalAddress?: {
addressLocality?: string;
addressRegion?: string;
addressCountry?: string;
};
}
/**
 * One posting as declared for the Ashby job-board response. All fields are
 * optional, so consumers validate each value before using it.
 */
interface AshbyJob {
// Used as the source job id and as the preferred de-duplication key.
id?: string;
title?: string;
// Preferred posting URL; `applyUrl` is the fallback when it is missing.
jobUrl?: string;
applyUrl?: string;
publishedAt?: string;
employmentType?: string;
isRemote?: boolean;
team?: string;
department?: string;
// `locationName` takes precedence over `location` when both are present.
location?: string;
locationName?: string;
secondaryLocations?: Array<{ location?: string; locationName?: string }>;
address?: AshbyAddress;
// The plain-text description is preferred over the HTML one.
descriptionPlain?: string;
descriptionHtml?: string;
}
/** Envelope returned by the job-board endpoint for one company. */
interface AshbyResponse {
jobs?: AshbyJob[];
apiVersion?: string;
}
/**
 * Returns the trimmed text of `value` when it is a non-blank string.
 *
 * @param value - Any value taken from an untyped API payload.
 * @returns The trimmed string, or `undefined` for non-strings and blanks.
 */
function asString(value: unknown): string | undefined {
  const cleaned = typeof value === "string" ? value.trim() : "";
  return cleaned === "" ? undefined : cleaned;
}
/**
 * Parses the configured list of company slugs.
 *
 * Accepts either a JSON array (non-string and blank entries are dropped) or
 * a plain string separated by newlines, commas, semicolons or pipes.
 *
 * @param raw - Raw setting value, possibly undefined or empty.
 * @returns Trimmed, non-empty slugs in their original order.
 */
function readCompanies(raw: string | undefined): string[] {
  if (!raw) return [];

  let decoded: unknown;
  try {
    decoded = JSON.parse(raw);
  } catch {
    // Not JSON: handled by the delimiter-based path below.
    decoded = undefined;
  }

  const slugs: string[] = [];
  if (Array.isArray(decoded)) {
    for (const item of decoded) {
      if (typeof item !== "string") continue;
      const slug = item.trim();
      if (slug) slugs.push(slug);
    }
    return slugs;
  }

  for (const piece of raw.split(/[\n,;|]+/)) {
    const slug = piece.trim();
    if (slug) slugs.push(slug);
  }
  return slugs;
}
/**
 * Builds a single location string from a posting's primary and secondary
 * locations.
 *
 * @param job - Raw Ashby posting.
 * @returns Location names joined with "; " (primary first), or `undefined`
 *   when none is available.
 */
function locationFor(job: AshbyJob): string | undefined {
  const names: string[] = [];

  const primary = asString(job.locationName) ?? asString(job.location);
  if (primary) names.push(primary);

  // For every entry the display name wins over the raw location value.
  const extras = job.secondaryLocations?.map(
    (entry) => asString(entry.locationName) ?? asString(entry.location),
  );
  for (const name of extras ?? []) {
    if (name) names.push(name);
  }

  return names.length === 0 ? undefined : names.join("; ");
}
/**
 * Converts a raw Ashby posting into the pipeline's job input shape.
 *
 * The employer name is derived from the company slug: it is split on "-" and
 * "_", each part is capitalized, and the parts are joined with spaces.
 *
 * @param job - Raw posting from the job-board response.
 * @param company - Company slug the posting was fetched for.
 * @returns The mapped job, or `null` when the posting has no usable URL.
 */
function mapJob(job: AshbyJob, company: string): CreateJobInput | null {
  const applyUrl = asString(job.applyUrl);
  const jobUrl = asString(job.jobUrl) ?? applyUrl;
  if (!jobUrl) return null;

  const words: string[] = [];
  for (const part of company.split(/[-_]/)) {
    if (!part) continue;
    words.push(part.charAt(0).toUpperCase() + part.slice(1));
  }
  const employer = words.join(" ");

  const remote = typeof job.isRemote === "boolean" ? job.isRemote : undefined;
  const description =
    asString(job.descriptionPlain) ?? asString(job.descriptionHtml);

  return {
    source: "ashby",
    sourceJobId: asString(job.id),
    title: asString(job.title) ?? "Unknown Title",
    // A slug consisting only of separators falls back to the slug itself.
    employer: employer || company,
    jobUrl,
    applicationLink: applyUrl ?? jobUrl,
    location: locationFor(job),
    isRemote: remote,
    jobType: asString(job.employmentType),
    jobFunction: asString(job.team),
    companyIndustry: asString(job.department),
    datePosted: asString(job.publishedAt),
    jobDescription: description,
  };
}
/**
 * Fetches every public posting of one company's Ashby job board.
 *
 * @param company - Company slug; URL-encoded before being placed in the path.
 * @returns The postings, or an empty array when the board responds with 404
 *   or the body has no `jobs` array.
 * @throws Error for any other non-OK HTTP status.
 */
async function fetchCompany(company: string): Promise<AshbyJob[]> {
  const slug = encodeURIComponent(company);
  const endpoint = `https://api.ashbyhq.com/posting-api/job-board/${slug}`;
  const res = await fetch(endpoint, {
    headers: { Accept: "application/json" },
  });

  // A 404 is treated as "no postings" rather than as a failure.
  if (res.status === 404) return [];
  if (!res.ok) {
    throw new Error(
      `Ashby request for "${company}" failed with status ${res.status}`,
    );
  }

  const payload = (await res.json()) as AshbyResponse;
  if (Array.isArray(payload.jobs)) return payload.jobs;
  return [];
}
/**
 * Ashby extractor manifest.
 *
 * `run` fetches the public job board of every configured company slug, maps
 * each posting and de-duplicates by posting id (or URL). With no companies
 * configured it returns an empty, successful result that carries an
 * explanatory `error` message. A failed request stops the run and returns
 * `success: false` together with the jobs collected so far.
 */
export const manifest: ExtractorManifest = {
  id: "ashby",
  displayName: "Ashby (ATS)",
  providesSources: ["ashby"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };
    const companies = readCompanies(context.settings.ashbyCompanies);
    if (companies.length === 0) {
      return {
        success: true,
        jobs: [],
        error:
          "No Ashby companies configured. Set ASHBY_COMPANIES or the ashbyCompanies setting (comma- or newline-separated slugs).",
      };
    }
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let i = 0; i < companies.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const company = companies[i];
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: companies.length,
          currentUrl: company,
          detail: `Ashby: ${company} (${i + 1}/${companies.length})`,
        });
        let added = 0;
        const jobs = await fetchCompany(company);
        for (const job of jobs) {
          const mapped = mapJob(job, company);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
          added += 1;
        }
        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: companies.length,
          currentUrl: company,
          jobPagesProcessed: out.length,
          // Separator between the slug and the count keeps the message
          // readable ("ramp: 12 jobs" rather than "ramp12 jobs").
          detail: `Ashby: ${company}: ${added} jobs (${out.length} total)`,
        });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "ashby-extractor",
"version": "0.0.1",
"type": "module",
"description": "Ashby public ATS extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,12 @@
# careerjet-extractor
[Careerjet publisher API v4](https://www.careerjet.com/partners/api/) (`https://search.api.careerjet.net/v4/query`).
## Required configuration
- **`CAREERJET_AFFID`** — Your publisher **API key** (settings key `careerjetAffid`). Used as the Basic auth **username**; password is empty.
- **`CAREERJET_REFERER`** — The `Referer` header Careerjet requires: the full URL of the job-search page on your registered site (e.g. `https://yoursite.com/find-jobs/`).
- **`CAREERJET_USER_IP`** — The `user_ip` query parameter. In the [publisher dashboard](https://www.careerjet.com/partners/), add your **server's outbound IP** (and any dev machine IP) under “Server IP address”; this value should match an allowlisted address.
- **`CAREERJET_USER_AGENT`** (optional) — Override the default `user_agent` param if Careerjet asks for a specific string.
`selectedCountry` maps to `locale_code`; the first `searchCities` token is sent as `location`. Capped per term via `careerjetMaxJobsPerTerm` (default 100). The v4 API allows up to **10** pages per query.

View File

@ -0,0 +1,267 @@
/**
* Careerjet publisher search API (v4).
*
* https://www.careerjet.com/partners/api/
* GET https://search.api.careerjet.net/v4/query
*
* Uses Basic auth (username = publisher API key, password empty). Requires a
* Referer header and `user_ip` / `user_agent` query params. Register your
* server's outbound IP(s) in the Careerjet publisher dashboard.
*
* Env: CAREERJET_AFFID (API key), CAREERJET_REFERER (job-search page URL),
* CAREERJET_USER_IP (must match an allowlisted IP), optional CAREERJET_USER_AGENT.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// Endpoint of the Careerjet v4 search API; query parameters are added per request.
const API_URL = "https://search.api.careerjet.net/v4/query";
// Sent as the `user_agent` query parameter when neither the setting nor the
// CAREERJET_USER_AGENT environment variable provides a value.
const DEFAULT_USER_AGENT =
"Mozilla/5.0 (compatible; JobOps/1.0; job-search pipeline)";
/**
 * One job entry as declared for the search response. All fields are
 * optional, so consumers validate each value before using it.
 */
interface CareerjetJob {
title?: string;
description?: string;
company?: string;
salary?: string;
date?: string;
// Required by mapJob and used as the de-duplication key.
url?: string;
// Mapped to `companyDescription` by mapJob.
site?: string;
locations?: string;
}
/** Envelope of one page of search results. */
interface CareerjetResponse {
type?: string;
jobs?: CareerjetJob[];
hits?: number;
// When numeric, bounds the page loop for the current search term.
pages?: number;
}
/**
 * Coerces an untyped payload value to a trimmed, non-empty string.
 *
 * @param value - Any value taken from an API response.
 * @returns The trimmed string, or `undefined` when the value is not a string
 *   or is blank.
 */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const stripped = value.trim();
  if (stripped.length === 0) {
    return undefined;
  }
  return stripped;
}
/**
 * Converts a raw Careerjet result into the pipeline's job input shape.
 *
 * @param raw - Job entry from the search response.
 * @returns The mapped job, or `null` when the entry has no usable URL.
 */
function mapJob(raw: CareerjetJob): CreateJobInput | null {
  const link = asString(raw.url);
  if (link === undefined) return null;

  const title = asString(raw.title) ?? "Unknown Title";
  const employer = asString(raw.company) ?? "Unknown Employer";

  const mapped: CreateJobInput = {
    source: "careerjet",
    title,
    employer,
    jobUrl: link,
    // The listing URL doubles as the application link.
    applicationLink: link,
    location: asString(raw.locations),
    salary: asString(raw.salary),
    datePosted: asString(raw.date),
    jobDescription: asString(raw.description),
    companyDescription: asString(raw.site),
  };
  return mapped;
}
/** Lower-cased country names/aliases and the Careerjet `locale_code` each maps to. */
const LOCALE_BY_COUNTRY: ReadonlyMap<string, string> = new Map([
  ["united kingdom", "en_GB"],
  ["uk", "en_GB"],
  ["united states", "en_US"],
  ["usa", "en_US"],
  ["us", "en_US"],
  ["canada", "en_CA"],
  ["australia", "en_AU"],
  ["germany", "de_DE"],
  ["france", "fr_FR"],
  ["spain", "es_ES"],
  ["italy", "it_IT"],
  ["netherlands", "nl_NL"],
]);

/**
 * Resolves a country name to a Careerjet locale code.
 *
 * @param country - Country name or alias; matched case-insensitively after
 *   trimming.
 * @returns The matching locale code, or "en_GB" for unknown or empty input.
 */
function localeForCountry(country: string): string {
  const normalized = country.trim().toLowerCase();
  return LOCALE_BY_COUNTRY.get(normalized) ?? "en_GB";
}
/**
 * Builds the HTTP Basic `Authorization` header value for Careerjet, using
 * the API key as the username and an empty password.
 *
 * @param apiKey - Publisher API key.
 * @returns The header value in the form `Basic <base64("apiKey:")>`.
 */
function basicAuthorizationHeader(apiKey: string): string {
  const token = Buffer.from(`${apiKey}:`, "utf8").toString("base64");
  return `Basic ${token}`;
}
/**
 * Requests one page of Careerjet search results.
 *
 * @param args - Credentials, search criteria and paging values. `location`
 *   is only sent when it is non-empty.
 * @returns The parsed JSON body.
 * @throws Error when the response status is not OK; the message carries the
 *   status and up to 200 characters of the response body.
 */
async function fetchPage(args: {
  apiKey: string;
  keywords: string;
  location?: string;
  page: number;
  pageSize: number;
  localeCode: string;
  referer: string;
  userIp: string;
  userAgent: string;
}): Promise<CareerjetResponse> {
  const endpoint = new URL(API_URL);
  const query = endpoint.searchParams;
  query.set("locale_code", args.localeCode);
  query.set("keywords", args.keywords);
  if (args.location) {
    query.set("location", args.location);
  }
  query.set("page", String(args.page));
  query.set("page_size", String(args.pageSize));
  query.set("user_ip", args.userIp);
  query.set("user_agent", args.userAgent);

  const res = await fetch(endpoint.toString(), {
    headers: {
      Accept: "application/json",
      Authorization: basicAuthorizationHeader(args.apiKey),
      Referer: args.referer,
    },
  });

  if (res.ok) {
    return (await res.json()) as CareerjetResponse;
  }

  // Include a short excerpt of the body to make API errors diagnosable.
  const snippet = (await res.text()).slice(0, 200);
  throw new Error(
    `Careerjet request failed with status ${res.status}${snippet ? `: ${snippet}` : ""}`,
  );
}
/**
 * Careerjet extractor manifest.
 *
 * `run` resolves credentials from settings (falling back to environment
 * variables), then queries the v4 search API once per search term, paging
 * until the per-term cap, the reported page count, or the 10-page limit is
 * reached. Results are de-duplicated by job URL across all terms. Missing
 * credentials and request failures return `success: false`; a request
 * failure still returns the jobs collected so far.
 */
export const manifest: ExtractorManifest = {
  id: "careerjet",
  displayName: "Careerjet",
  providesSources: ["careerjet"],
  requiredEnvVars: [
    "CAREERJET_AFFID",
    "CAREERJET_REFERER",
    "CAREERJET_USER_IP",
  ],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };
    const apiKey =
      context.settings.careerjetAffid?.trim() ||
      process.env.CAREERJET_AFFID?.trim();
    const referer =
      context.settings.careerjetReferer?.trim() ||
      process.env.CAREERJET_REFERER?.trim();
    const userIp =
      context.settings.careerjetUserIp?.trim() ||
      process.env.CAREERJET_USER_IP?.trim();
    const userAgent =
      context.settings.careerjetUserAgent?.trim() ||
      process.env.CAREERJET_USER_AGENT?.trim() ||
      DEFAULT_USER_AGENT;
    if (!apiKey) {
      return {
        success: false,
        jobs: [],
        error:
          "Careerjet requires CAREERJET_AFFID (publisher API key for Basic auth).",
      };
    }
    if (!referer) {
      return {
        success: false,
        jobs: [],
        error:
          "Careerjet v4 requires CAREERJET_REFERER (the Referer URL of your job-search page, per Careerjet docs).",
      };
    }
    if (!userIp) {
      return {
        success: false,
        jobs: [],
        error:
          "Careerjet v4 requires CAREERJET_USER_IP. Use an IP you have allowlisted in the Careerjet publisher dashboard (typically your server's public egress IP).",
      };
    }

    // A non-numeric setting makes parseInt return NaN; `collected < NaN` is
    // always false, so no page would ever be fetched and the run would report
    // success with zero jobs. Fall back to the default instead.
    const rawMax = context.settings.careerjetMaxJobsPerTerm;
    const parsedMax = rawMax ? Number.parseInt(rawMax, 10) : Number.NaN;
    const maxJobsPerTerm = Number.isNaN(parsedMax) ? 100 : parsedMax;

    const pageSize = 50;
    // Hard upper bound on the pages requested for one search term.
    const maxPagesPerQuery = 10;
    const localeCode = localeForCountry(context.selectedCountry || "");
    // With no search terms, a single empty-keyword query is issued.
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    // Only the first `|`-separated city is sent as the location filter.
    const location =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `Careerjet: term ${i + 1}/${terms.length}`,
        });
        let collected = 0;
        let page = 1;
        // Unknown until the first response reports `pages`.
        let totalPages = Number.POSITIVE_INFINITY;
        while (
          collected < maxJobsPerTerm &&
          page <= totalPages &&
          page <= maxPagesPerQuery
        ) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            keywords: term,
            location,
            page,
            pageSize,
            localeCode,
            referer,
            userIp,
            userAgent,
          });
          if (typeof body.pages === "number") totalPages = body.pages;
          const items = Array.isArray(body.jobs) ? body.jobs : [];
          if (items.length === 0) break;
          for (const raw of items) {
            const mapped = mapJob(raw);
            if (!mapped) continue;
            const key = mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }
          page += 1;
        }
        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `Careerjet: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "careerjet-extractor",
"version": "0.0.1",
"type": "module",
"description": "Careerjet public search API extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,10 @@
# fourdayweek-extractor
Pulls listings from the public [4 Day Week API](https://4dayweek.io/api/jobs).
- No authentication required.
- Paginated JSON (up to 3 pages). Filters client-side by title + stack tags
against pipeline search terms.
- No description in listings — links point to `https://4dayweek.io/job/{slug}`.
- Rich metadata: schedule type, work-life score, salary, tech stack, level.
- Caps results per term via the `fourdayweekMaxJobsPerTerm` setting (default 100).

View File

@ -0,0 +1,226 @@
/**
* 4 Day Week public jobs API.
*
* https://4dayweek.io/api/jobs?page=N
*
* No auth. Paginated JSON. No description in listing response, so
* we link to https://4dayweek.io/job/{slug} for details.
* Supports category filtering server-side; we also filter
* client-side by title + stack tags against pipeline search terms.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// Base endpoint of the 4 Day Week jobs API; fetchPage appends `?page=N`.
const API_URL = "https://4dayweek.io/api/jobs";
// Upper bound on the number of pages requested during a single run.
const MAX_PAGES = 3;
/** Company details embedded in a job entry. */
interface FdwCompany {
name?: string;
slug?: string;
logo_url?: string;
}
/** One region from which remote work is allowed for a job. */
interface FdwRemoteAllowed {
// Country names are joined with ", " to form the job's location string.
country?: string;
continent?: string;
is_primary?: boolean;
}
/** One technology in a job's stack; `name` is matched against search terms. */
interface FdwStackItem {
name?: string;
slug?: string;
}
/**
 * One job entry as declared for the listing response. All fields are
 * optional, so consumers validate each value before using it.
 */
interface FdwJob {
id?: string;
title?: string;
// Required by mapJob: the public job URL is built from it.
slug?: string;
company_name?: string;
company?: FdwCompany;
// Compared against "remote" to derive the remote flag and fallback location.
work_arrangement?: string;
remote_allowed?: FdwRemoteAllowed[];
timezones?: string[];
// Multiplied by 1000 before being handed to `Date`, i.e. treated as epoch seconds.
posted?: number;
// Underscores are replaced with spaces for display.
schedule_type?: string;
stack?: FdwStackItem[];
category?: string;
level?: string;
// Preformatted salary text; when present it is used verbatim.
salary?: string;
// Divided by 100 when formatted — presumably minor currency units; TODO confirm.
salary_lower?: number;
salary_upper?: number;
salary_currency?: string;
salary_period?: string;
is_expired?: boolean;
work_life_score?: number;
}
/** Envelope of one page of results. */
interface FdwResponse {
jobs?: FdwJob[];
total?: number;
page?: number;
has_more?: boolean;
}
/**
 * Returns `value` trimmed when it is a string with visible content.
 *
 * @param value - Any value taken from an untyped API payload.
 * @returns The trimmed string, or `undefined` for non-strings and blanks.
 */
function asString(value: unknown): string | undefined {
  return typeof value === "string" ? value.trim() || undefined : undefined;
}
/**
 * Case-insensitive substring match of a search term against a job's title,
 * category and stack item names.
 *
 * @param job - Raw job entry from the listing.
 * @param term - Pipeline search term.
 * @returns `true` when any of the three sources contains the term.
 */
function matchesTerm(job: FdwJob, term: string): boolean {
  const needle = term.toLowerCase();
  const contains = (text: string): boolean =>
    text.toLowerCase().includes(needle);

  if (job.title != null && contains(job.title)) return true;
  if (job.category != null && contains(job.category)) return true;
  if (!Array.isArray(job.stack)) return false;
  // Stack items without a string name are ignored.
  return job.stack.some(
    (item) => typeof item.name === "string" && contains(item.name),
  );
}
/**
 * Produces a display label for a schedule type.
 *
 * @param raw - Schedule identifier from the API, possibly undefined or empty.
 * @returns The identifier with underscores replaced by spaces, or
 *   "4-day week" when no identifier is given.
 */
function formatSchedule(raw: string | undefined): string {
  return raw ? raw.split("_").join(" ") : "4-day week";
}
/**
 * Derives a location string for a job.
 *
 * @param job - Raw job entry from the listing.
 * @returns The allowed remote countries joined with ", "; otherwise "Remote"
 *   for remote jobs and "Unknown" for everything else.
 */
function formatLocation(job: FdwJob): string {
  const countries = Array.isArray(job.remote_allowed)
    ? job.remote_allowed.flatMap((region) =>
        typeof region.country === "string" ? [region.country] : [],
      )
    : [];
  if (countries.length > 0) {
    return countries.join(", ");
  }
  if (job.work_arrangement === "remote") {
    return "Remote";
  }
  return "Unknown";
}
/**
 * Builds a human-readable salary string for a job.
 *
 * @param job - Raw job entry from the listing.
 * @returns The preformatted `salary` text when present; otherwise a string
 *   built from the numeric bounds ("CUR lower - upper / period", or a single
 *   amount when only one bound exists); `undefined` when the entry carries
 *   no salary data. Currency defaults to "USD" and period to "year".
 */
function formatSalary(job: FdwJob): string | undefined {
  if (job.salary) return job.salary;

  const lower = job.salary_lower;
  const upper = job.salary_upper;
  if (lower == null && upper == null) return undefined;

  const cur = job.salary_currency ?? "USD";
  const period = job.salary_period ?? "year";
  // Amounts are divided by 100 before display.
  const amount = (value: number): string => (value / 100).toLocaleString();

  if (lower != null && upper != null) {
    // The separator keeps the two bounds from running together
    // (e.g. "500900").
    return `${cur} ${amount(lower)} - ${amount(upper)} / ${period}`;
  }

  // Exactly one bound is present at this point.
  const only = (lower ?? upper) as number;
  return `${cur} ${amount(only)} / ${period}`;
}
/**
 * Converts a raw 4 Day Week entry into the pipeline's job input shape.
 *
 * @param raw - Job entry from the listing response.
 * @returns The mapped job, or `null` when the entry has no usable slug (the
 *   public job URL cannot be built without it).
 */
function mapJob(raw: FdwJob): CreateJobInput | null {
  const slug = asString(raw.slug);
  if (!slug) return null;
  const jobUrl = `https://4dayweek.io/job/${slug}`;

  const stackTags = Array.isArray(raw.stack)
    ? raw.stack
        .map((s) => s.name)
        .filter((n): n is string => typeof n === "string")
    : [];

  // `posted` is multiplied by 1000 to get milliseconds. A value outside the
  // range `Date` can represent yields an invalid date, and toISOString() on
  // it throws RangeError, which would abort the whole run. Treat such values
  // as "no date" instead.
  const postedAt = new Date(
    typeof raw.posted === "number" ? raw.posted * 1000 : Number.NaN,
  );
  const datePosted = Number.isNaN(postedAt.getTime())
    ? undefined
    : postedAt.toISOString();

  return {
    source: "fourdayweek",
    sourceJobId: raw.id ?? slug,
    title: asString(raw.title) ?? "Unknown Title",
    // Normalized like the title, so a blank company name falls through to
    // the next candidate instead of producing an empty employer.
    employer:
      asString(raw.company?.name) ??
      asString(raw.company_name) ??
      "Unknown Employer",
    jobUrl,
    applicationLink: jobUrl,
    location: formatLocation(raw),
    isRemote: raw.work_arrangement === "remote",
    jobType: formatSchedule(raw.schedule_type),
    companyLogo: raw.company?.logo_url ?? undefined,
    datePosted,
    salary: formatSalary(raw),
    disciplines: stackTags.length > 0 ? stackTags.join(", ") : undefined,
    companyIndustry: asString(raw.category),
  };
}
/**
 * Fetches one page of listings from the 4 Day Week API.
 *
 * @param page - Page number, sent as the `page` query parameter.
 * @returns The parsed JSON body, cast to `FdwResponse` without validation.
 * @throws Error when the HTTP status is not OK.
 */
async function fetchPage(page: number): Promise<FdwResponse> {
  const response = await fetch(`${API_URL}?page=${page}`, {
    headers: { Accept: "application/json" },
  });
  if (response.ok) {
    return (await response.json()) as FdwResponse;
  }
  throw new Error(`4 Day Week request failed with status ${response.status}`);
}
/**
 * Extractor manifest for the 4 Day Week job board.
 *
 * `run` walks the paginated feed (at most `MAX_PAGES` pages), skips expired
 * listings, keeps only jobs matching at least one search term (when terms are
 * supplied), de-duplicates by `sourceJobId` / `jobUrl`, and returns the mapped
 * jobs. If a request fails it returns `success: false` together with the jobs
 * collected before the failure.
 */
export const manifest: ExtractorManifest = {
  id: "fourdayweek",
  displayName: "4 Day Week",
  providesSources: ["fourdayweek"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting parses to NaN, and `out.length >= NaN` is never
    // true, which would silently disable the cap. Fall back to the default.
    const rawMax = context.settings.fourdayweekMaxJobsPerTerm;
    const parsedMax = rawMax ? Number.parseInt(rawMax, 10) : Number.NaN;
    const maxJobs = Number.isNaN(parsedMax) ? 100 : parsedMax;

    const terms = context.searchTerms;
    // Terms are matched client-side against one shared feed, so the per-term
    // cap is applied as a single overall budget.
    const cap = maxJobs * Math.max(terms.length, 1);

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let page = 1; page <= MAX_PAGES; page += 1) {
        if (context.shouldCancel?.()) break;
        if (out.length >= cap) break;
        context.onProgress?.({
          phase: "list",
          termsProcessed: 0,
          termsTotal: 1,
          currentUrl: `page ${page}`,
          detail: `4 Day Week: fetching page ${page}`,
        });
        const body = await fetchPage(page);
        const jobs = Array.isArray(body.jobs) ? body.jobs : [];
        if (jobs.length === 0) break;
        for (const raw of jobs) {
          // Enforce the cap mid-page too, so a page cannot overshoot it.
          if (out.length >= cap) break;
          if (raw.is_expired) continue;
          if (terms.length > 0 && !terms.some((t) => matchesTerm(raw, t))) {
            continue;
          }
          const mapped = mapJob(raw);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
        }
        context.onProgress?.({
          phase: "list",
          termsProcessed: 0,
          termsTotal: 1,
          currentUrl: `page ${page}`,
          jobPagesProcessed: out.length,
          detail: `4 Day Week: page ${page} done (${out.length} matched so far)`,
        });
        if (!body.has_more) break;
      }
    } catch (error) {
      // Keep whatever was gathered before the failure.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "fourdayweek-extractor",
"version": "0.0.1",
"type": "module",
"description": "4 Day Week public jobs API extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,8 @@
# greenhouse-extractor
Public Greenhouse Job Boards via
`GET https://boards-api.greenhouse.io/v1/boards/{company}/jobs?content=true`.
- No auth.
- Configure target slugs via `greenhouseCompanies` (comma/newline) or
`GREENHOUSE_COMPANIES` env (e.g. `airbnb,stripe`).

View File

@ -0,0 +1,188 @@
/**
* Greenhouse public job boards API.
*
* https://developers.greenhouse.io/job-board.html
* GET https://boards-api.greenhouse.io/v1/boards/{company}/jobs?content=true
*
* No auth. Each entry in `greenhouseCompanies` is fetched independently.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/** Department attached to a Greenhouse job; `mapJob` joins the names into `jobFunction`. */
interface GhDepartment {
id?: number;
name?: string;
}
/** Custom metadata entry on a job. Declared for completeness; not read in this file. */
interface GhMetadata {
name?: string;
value?: unknown;
}
/**
 * One job from the Greenhouse Job Board API. Every field is optional because
 * the payload is untyped JSON cast to this shape.
 */
interface GhJob {
// Stringified into `sourceJobId` by `mapJob`.
id?: number;
title?: string;
// Required by `mapJob`: jobs without it are dropped.
absolute_url?: string;
internal_job_id?: number;
// Passed through unchanged as `datePosted`.
updated_at?: string;
requisition_id?: string | null;
// Preferred location text; `offices` is the fallback.
location?: { name?: string };
content?: string; // HTML, may be entity-encoded
metadata?: GhMetadata[];
departments?: GhDepartment[];
// Joined with "; " when `location.name` is missing.
offices?: Array<{ name?: string }>;
}
/** Response envelope; `fetchCompany` treats a missing or non-array `jobs` as empty. */
interface GhResponse {
jobs?: GhJob[];
}
/**
 * Returns `value` trimmed when it is a non-blank string.
 *
 * @param value - Any value from the untyped API payload.
 * @returns The trimmed string, or `undefined` for non-strings and blank strings.
 */
function asString(value: unknown): string | undefined {
  const text = typeof value === "string" ? value.trim() : "";
  return text.length > 0 ? text : undefined;
}
/**
 * Parses the configured list of Greenhouse board slugs.
 *
 * Two input formats are accepted: a JSON array of strings, or a plain list
 * delimited by newlines, commas, semicolons or pipes. Entries are trimmed and
 * blanks are dropped; non-string JSON array entries are ignored.
 *
 * @param raw - Raw setting value.
 * @returns The slugs in input order (empty when `raw` is missing or empty).
 */
function readCompanies(raw: string | undefined): string[] {
  if (!raw) return [];

  let decoded: unknown;
  try {
    decoded = JSON.parse(raw);
  } catch {
    // Not JSON: handled as a delimited list below.
    decoded = undefined;
  }

  const slugs: string[] = [];
  if (Array.isArray(decoded)) {
    for (const entry of decoded) {
      if (typeof entry !== "string") continue;
      const slug = entry.trim();
      if (slug) slugs.push(slug);
    }
    return slugs;
  }

  for (const piece of raw.split(/[\n,;|]+/)) {
    const slug = piece.trim();
    if (slug) slugs.push(slug);
  }
  return slugs;
}
/**
 * Decodes one level of HTML entity encoding.
 *
 * Handles the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`,
 * `&nbsp;` (decoded to a plain space) and decimal / hexadecimal numeric
 * references such as `&#39;` or `&#x27;`. Unknown entities are left untouched.
 *
 * Decoding is done in a single pass on purpose: chained replacements that
 * decode `&amp;` first turn `&amp;lt;` into `&lt;` and then into `<`,
 * unescaping the text twice.
 *
 * @param value - Entity-encoded text.
 * @returns The text with one level of entities decoded.
 */
function decodeHtmlEntities(value: string): string {
  // A Map (not an object literal) so names like "constructor" cannot hit
  // inherited prototype properties.
  const named = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
  ]);
  return value.replace(
    /&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g,
    (match: string, body: string): string => {
      if (body.startsWith("#")) {
        const isHex = body[1] === "x" || body[1] === "X";
        const codePoint = Number.parseInt(
          body.slice(isHex ? 2 : 1),
          isHex ? 16 : 10,
        );
        // String.fromCodePoint throws a RangeError outside the Unicode range.
        if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return match;
        return String.fromCodePoint(codePoint);
      }
      return named.get(body) ?? match;
    },
  );
}
/**
 * Converts a Greenhouse job into the pipeline's `CreateJobInput`.
 *
 * The employer name is derived from the board slug: it is split on `-` / `_`
 * and each part is capitalized (e.g. `acme-labs` → `Acme Labs`).
 *
 * @param job - Raw job from the board API.
 * @param company - Board slug the job was fetched from.
 * @returns The mapped job, or `null` when the job has no `absolute_url`.
 */
function mapJob(job: GhJob, company: string): CreateJobInput | null {
  const postingUrl = asString(job.absolute_url);
  if (!postingUrl) return null;

  const capitalize = (word: string): string =>
    word.charAt(0).toUpperCase() + word.slice(1);
  const displayName = company
    .split(/[-_]/)
    .filter(Boolean)
    .map((word) => capitalize(word))
    .join(" ");

  // Shared by offices and departments: keep only non-blank, trimmed names.
  const collectNames = (
    items: Array<{ name?: string }> | undefined,
  ): string[] =>
    items
      ?.map((item) => asString(item.name))
      .filter((name): name is string => Boolean(name)) ?? [];

  const offices = collectNames(job.offices);
  const departments = collectNames(job.departments);
  const body = job.content ? decodeHtmlEntities(job.content) : undefined;

  // Prefer the job's own location; fall back to the joined office names.
  let location = asString(job.location?.name);
  if (location === undefined && offices.length > 0) {
    location = offices.join("; ");
  }

  const mapped: CreateJobInput = {
    source: "greenhouse",
    sourceJobId: job.id != null ? String(job.id) : undefined,
    title: asString(job.title) ?? "Unknown Title",
    employer: displayName || company,
    jobUrl: postingUrl,
    applicationLink: postingUrl,
    location,
    jobFunction: departments.length > 0 ? departments.join(", ") : undefined,
    datePosted: asString(job.updated_at),
    jobDescription: body,
  };
  return mapped;
}
/**
 * Fetches every job on one company's public Greenhouse board.
 *
 * @param company - Board slug; URL-encoded before use.
 * @returns The board's jobs; an empty list when the board responds 404 or
 *   the body has no `jobs` array.
 * @throws Error for any other non-OK HTTP status.
 */
async function fetchCompany(company: string): Promise<GhJob[]> {
  const slug = encodeURIComponent(company);
  const response = await fetch(
    `https://boards-api.greenhouse.io/v1/boards/${slug}/jobs?content=true`,
    { headers: { Accept: "application/json" } },
  );
  // A 404 is treated as "no jobs" rather than a failure.
  if (response.status === 404) return [];
  if (!response.ok) {
    throw new Error(
      `Greenhouse request for "${company}" failed with status ${response.status}`,
    );
  }
  const payload = (await response.json()) as GhResponse;
  return Array.isArray(payload.jobs) ? payload.jobs : [];
}
/**
 * Extractor manifest for public Greenhouse job boards.
 *
 * `run` reads the board slugs from the `greenhouseCompanies` setting, fetches
 * each board in turn, and returns the mapped jobs de-duplicated by
 * `sourceJobId` / `jobUrl`. With no slugs configured it returns an empty,
 * successful result carrying an explanatory `error` message. If a request
 * fails, the run stops and returns `success: false` with the jobs gathered so
 * far.
 */
export const manifest: ExtractorManifest = {
  id: "greenhouse",
  displayName: "Greenhouse (ATS)",
  providesSources: ["greenhouse"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };
    const companies = readCompanies(context.settings.greenhouseCompanies);
    if (companies.length === 0) {
      return {
        success: true,
        jobs: [],
        error:
          "No Greenhouse companies configured. Set GREENHOUSE_COMPANIES or the greenhouseCompanies setting (comma- or newline-separated slugs).",
      };
    }
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let i = 0; i < companies.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const company = companies[i];
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: companies.length,
          currentUrl: company,
          detail: `Greenhouse: ${company} (${i + 1}/${companies.length})`,
        });
        let added = 0;
        const jobs = await fetchCompany(company);
        for (const job of jobs) {
          const mapped = mapJob(job, company);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
          added += 1;
        }
        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: companies.length,
          currentUrl: company,
          jobPagesProcessed: out.length,
          // Separator added: the slug and the count used to be fused
          // together (e.g. "airbnb12 jobs").
          detail: `Greenhouse: ${company}: ${added} jobs (${out.length} total)`,
        });
      }
    } catch (error) {
      // Keep whatever was gathered before the failure.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "greenhouse-extractor",
"version": "0.0.1",
"type": "module",
"description": "Greenhouse public ATS extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,10 @@
# himalayas-extractor
Pulls listings from the public [Himalayas API](https://himalayas.app/jobs/api).
- No authentication required.
- Paginates with `limit` + `offset` (50 per page, up to 5 pages / 250 jobs).
- No server-side search — filters client-side by matching title + categories
against each pipeline search term.
- All listings are flagged `isRemote: true`.
- Caps results per term via the `himalayasMaxJobsPerTerm` setting (default 100).

View File

@ -0,0 +1,195 @@
/**
* Himalayas public remote-jobs API.
*
* https://himalayas.app/jobs/api?limit=N&offset=M
*
* No auth. Returns up to `limit` results per call. There is no server-side
* search, so we paginate and filter client-side by title + categories.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// Endpoint of the Himalayas jobs API; `fetchPage` appends `limit` and `offset`.
const API_URL = "https://himalayas.app/jobs/api";
// Number of listings requested per call (sent as `limit`).
const PAGE_SIZE = 50;
// Upper bound on the number of pages `run` requests in one extraction.
const MAX_PAGES = 5;
/**
 * One listing from the Himalayas API. Every field is optional because the
 * payload is untyped JSON cast to this shape.
 */
interface HimalayasJob {
title?: string;
excerpt?: string;
companyName?: string;
companySlug?: string;
companyLogo?: string;
employmentType?: string;
minSalary?: number | null;
maxSalary?: number | null;
// Defaults to "USD" in `formatSalary` when absent.
currency?: string;
seniority?: string[];
// Joined into the job's `location`; "Remote" is used when empty.
locationRestrictions?: string[];
timezoneRestrictions?: number[];
// Matched against search terms in `matchesTerm` (hyphens read as spaces).
categories?: string[];
parentCategories?: string[];
description?: string;
// Treated as Unix seconds: `mapJob` multiplies by 1000 before building a Date.
pubDate?: number;
expiryDate?: number;
// Preferred job URL; `mapJob` falls back to `guid` when this is absent.
applicationLink?: string;
// Used as `sourceJobId`, and as the job URL when `applicationLink` is missing.
guid?: string;
}
/** Response envelope; `fetchPage` treats a missing or non-array `jobs` as empty. */
interface HimalayasResponse {
jobs?: HimalayasJob[];
}
/**
 * Coerces an unknown payload value to a usable string.
 *
 * @param value - Any value from the untyped API payload.
 * @returns The trimmed string, or `undefined` if `value` is not a string or
 *   trims down to nothing.
 */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const cleaned = value.trim();
  return cleaned === "" ? undefined : cleaned;
}
/**
 * Case-insensitive substring match of a search term against a listing.
 *
 * The title is checked first, then each category. Categories are compared
 * with hyphens replaced by spaces, so the term "machine learning" matches the
 * category "machine-learning".
 *
 * @param job - Raw Himalayas listing.
 * @param term - Search term; compared in lower case.
 * @returns `true` when the title or any string category contains the term.
 */
function matchesTerm(job: HimalayasJob, term: string): boolean {
  const needle = term.toLowerCase();
  if (job.title?.toLowerCase().includes(needle)) return true;
  if (!Array.isArray(job.categories)) return false;
  for (const category of job.categories) {
    if (typeof category !== "string") continue;
    const readable = category.toLowerCase().replace(/-/g, " ");
    if (readable.includes(needle)) return true;
  }
  return false;
}
/**
 * Builds a human-readable salary string from a listing's numeric bounds.
 *
 * @param job - Raw Himalayas listing.
 * @returns `"<CUR> <min> - <max>"` when both bounds exist, `"<CUR> <amount>"`
 *   when only one does, or `undefined` when neither is present. The currency
 *   defaults to `USD`.
 */
function formatSalary(job: HimalayasJob): string | undefined {
  const { minSalary: min, maxSalary: max } = job;
  if (min == null && max == null) return undefined;
  const cur = job.currency ?? "USD";
  if (min != null && max != null) {
    // The two bounds need a separator; without one they run together
    // (e.g. "USD 50,00080,000").
    return `${cur} ${min.toLocaleString()} - ${max.toLocaleString()}`;
  }
  const val = min ?? max;
  return val != null ? `${cur} ${val.toLocaleString()}` : undefined;
}
/**
 * Converts a Himalayas listing into the pipeline's `CreateJobInput`.
 *
 * Every listing is flagged `isRemote: true`. The location is the joined
 * `locationRestrictions`, or `"Remote"` when there are none.
 *
 * @param raw - Listing as returned by the API.
 * @returns The mapped job, or `null` when neither `applicationLink` nor
 *   `guid` yields a URL.
 */
function mapJob(raw: HimalayasJob): CreateJobInput | null {
  const listingUrl = asString(raw.applicationLink) ?? asString(raw.guid);
  if (!listingUrl) return null;

  // Keep only non-empty string entries of a possibly malformed list.
  const nonEmptyStrings = (list: unknown): string[] =>
    Array.isArray(list)
      ? list.filter(
          (entry): entry is string =>
            typeof entry === "string" && entry.length > 0,
        )
      : [];

  const tags = nonEmptyStrings(raw.categories);
  const regions = nonEmptyStrings(raw.locationRestrictions);

  // `pubDate` is multiplied by 1000, i.e. treated as Unix seconds.
  let postedIso: string | undefined;
  if (typeof raw.pubDate === "number") {
    postedIso = new Date(raw.pubDate * 1000).toISOString();
  }

  const mapped: CreateJobInput = {
    source: "himalayas",
    sourceJobId: asString(raw.guid),
    title: asString(raw.title) ?? "Unknown Title",
    employer: asString(raw.companyName) ?? "Unknown Employer",
    jobUrl: listingUrl,
    applicationLink: listingUrl,
    location: regions.length > 0 ? regions.join(", ") : "Remote",
    isRemote: true,
    jobType: asString(raw.employmentType),
    companyLogo: asString(raw.companyLogo),
    datePosted: postedIso,
    salary: formatSalary(raw),
    jobDescription: asString(raw.description),
    disciplines: tags.length > 0 ? tags.join(", ") : undefined,
  };
  return mapped;
}
/**
 * Fetches one window of listings from the Himalayas API.
 *
 * @param offset - Number of listings to skip (sent as `offset`).
 * @param limit - Maximum number of listings to return (sent as `limit`).
 * @returns The listings in the response; empty when the body has no `jobs` array.
 * @throws Error when the HTTP status is not OK.
 */
async function fetchPage(
  offset: number,
  limit: number,
): Promise<HimalayasJob[]> {
  const response = await fetch(`${API_URL}?limit=${limit}&offset=${offset}`, {
    headers: { Accept: "application/json" },
  });
  if (!response.ok) {
    throw new Error(`Himalayas request failed with status ${response.status}`);
  }
  const payload = (await response.json()) as HimalayasResponse;
  if (Array.isArray(payload.jobs)) return payload.jobs;
  return [];
}
/**
 * Extractor manifest for the Himalayas remote-jobs API.
 *
 * `run` pages through the feed (`PAGE_SIZE` listings per call, at most
 * `MAX_PAGES` calls), keeps only listings matching at least one search term
 * (when terms are supplied), de-duplicates by `sourceJobId` / `jobUrl`, and
 * returns the mapped jobs. If a request fails it returns `success: false`
 * together with the jobs collected before the failure.
 */
export const manifest: ExtractorManifest = {
  id: "himalayas",
  displayName: "Himalayas",
  providesSources: ["himalayas"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting parses to NaN, and `out.length >= NaN` is never
    // true, which would silently disable the cap. Fall back to the default.
    const rawMax = context.settings.himalayasMaxJobsPerTerm;
    const parsedMax = rawMax ? Number.parseInt(rawMax, 10) : Number.NaN;
    const maxJobs = Number.isNaN(parsedMax) ? 100 : parsedMax;

    const terms = context.searchTerms;
    // Terms are matched client-side against one shared feed, so the per-term
    // cap is applied as a single overall budget.
    const cap = maxJobs * Math.max(terms.length, 1);

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let page = 0; page < MAX_PAGES; page += 1) {
        if (context.shouldCancel?.()) break;
        if (out.length >= cap) break;
        const offset = page * PAGE_SIZE;
        context.onProgress?.({
          phase: "list",
          termsProcessed: 0,
          termsTotal: 1,
          currentUrl: `offset ${offset}`,
          detail: `Himalayas: fetching page ${page + 1}`,
        });
        const raw = await fetchPage(offset, PAGE_SIZE);
        if (raw.length === 0) break;
        for (const item of raw) {
          // Enforce the cap mid-page too, so a page cannot overshoot it.
          if (out.length >= cap) break;
          if (terms.length > 0 && !terms.some((t) => matchesTerm(item, t))) {
            continue;
          }
          const mapped = mapJob(item);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
        }
        context.onProgress?.({
          phase: "list",
          termsProcessed: 0,
          termsTotal: 1,
          currentUrl: `offset ${offset}`,
          jobPagesProcessed: out.length,
          detail: `Himalayas: page ${page + 1} done (${out.length} matched so far)`,
        });
        // A short page means the feed is exhausted.
        if (raw.length < PAGE_SIZE) break;
      }
    } catch (error) {
      // Keep whatever was gathered before the failure.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "himalayas-extractor",
"version": "0.0.1",
"type": "module",
"description": "Himalayas public remote-jobs API extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,8 @@
# jobicy-extractor
Pulls remote jobs from the public [Jobicy v2 feed](https://jobicy.com/api/v2/remote-jobs).
- No authentication required.
- Each pipeline `searchTerm` is sent as a `tag`; without terms we fetch the
generic remote feed.
- Caps results via the `jobicyMaxJobsPerTerm` setting (default 100).

View File

@ -0,0 +1,186 @@
/**
* Jobicy remote-jobs feed.
*
* Public, unauthenticated JSON endpoint:
* https://jobicy.com/api/v2/remote-jobs?count=50
*
* The feed is intentionally remote-only; we still pass each `searchTerm` as a
* `tag` so the same pipeline-level term iteration drives results. We do *not*
* try to invent a country filter: Jobicy postings are remote-friendly by
* design and the registry already restricts ukOnly extractors elsewhere.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// Endpoint of the Jobicy v2 remote-jobs feed; `fetchJobicy` adds `count` and `tag`.
const API_URL = "https://jobicy.com/api/v2/remote-jobs";
/**
 * One listing from the Jobicy feed. Every field is optional because the
 * payload is untyped JSON cast to this shape.
 */
interface JobicyRawJob {
// Stringified into `sourceJobId` by `mapJob`.
id?: number | string;
// Required by `mapJob`: listings without it are dropped.
url?: string;
jobTitle?: string;
companyName?: string;
companyLogo?: string;
// Either a single string or a list; flattened with `joinList`.
jobIndustry?: string[] | string;
// Either a single string or a list; flattened with `joinList`.
jobType?: string[] | string;
// Used as the job's location; "Remote" when absent.
jobGeo?: string;
jobLevel?: string;
// Fallback description when `jobDescription` is missing.
jobExcerpt?: string;
jobDescription?: string;
pubDate?: string;
// Number or numeric string; parsed with `toNumberOrUndefined`.
annualSalaryMin?: number | string;
// Number or numeric string; parsed with `toNumberOrUndefined`.
annualSalaryMax?: number | string;
salaryCurrency?: string;
}
/** Response envelope; `fetchJobicy` treats a missing or non-array `jobs` as empty. */
interface JobicyResponse {
jobs?: JobicyRawJob[];
}
/**
 * Returns a trimmed copy of `value` when it is a non-blank string.
 *
 * @param value - Any value from the untyped API payload.
 * @returns The trimmed string, otherwise `undefined`.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const stripped = value.trim();
    if (stripped) return stripped;
  }
  return undefined;
}
/**
 * Flattens a field that may be a string or a list of strings into one string.
 *
 * @param value - A string, an array, or anything else from the payload.
 * @returns For arrays: the trimmed, non-blank string entries joined with
 *   `", "` (or `undefined` if none remain). For anything else: the result of
 *   `asString(value)`.
 */
function joinList(value: unknown): string | undefined {
  if (!Array.isArray(value)) return asString(value);

  const parts: string[] = [];
  for (const item of value) {
    if (typeof item !== "string") continue;
    const text = item.trim();
    if (text) parts.push(text);
  }
  return parts.length > 0 ? parts.join(", ") : undefined;
}
/**
 * Parses a payload value that may be a number or a numeric string.
 *
 * @param value - Any value from the untyped API payload.
 * @returns The finite number (strings are parsed with `Number.parseFloat`),
 *   or `undefined` for non-finite numbers, unparsable strings and any other type.
 */
function toNumberOrUndefined(value: unknown): number | undefined {
  let candidate: number;
  switch (typeof value) {
    case "number":
      candidate = value;
      break;
    case "string":
      candidate = Number.parseFloat(value);
      break;
    default:
      return undefined;
  }
  return Number.isFinite(candidate) ? candidate : undefined;
}
/**
 * Converts a Jobicy listing into the pipeline's `CreateJobInput`.
 *
 * Every listing is flagged `isRemote: true`; the salary interval is reported
 * as `"yearly"` whenever a non-zero minimum or maximum salary is present.
 *
 * @param raw - Listing as returned by the feed.
 * @returns The mapped job, or `null` when the listing has no `url`.
 */
function mapJob(raw: JobicyRawJob): CreateJobInput | null {
  const listingUrl = asString(raw.url);
  if (!listingUrl) return null;

  const employer = asString(raw.companyName) ?? "Unknown Employer";
  const title = asString(raw.jobTitle) ?? "Unknown Title";
  const salaryFloor = toNumberOrUndefined(raw.annualSalaryMin);
  const salaryCeiling = toNumberOrUndefined(raw.annualSalaryMax);
  const hasSalary = Boolean(salaryFloor || salaryCeiling);

  const mapped: CreateJobInput = {
    source: "jobicy",
    sourceJobId: raw.id != null ? String(raw.id) : undefined,
    title,
    employer,
    jobUrl: listingUrl,
    applicationLink: listingUrl,
    location: asString(raw.jobGeo) ?? "Remote",
    isRemote: true,
    jobType: joinList(raw.jobType),
    jobLevel: asString(raw.jobLevel),
    companyIndustry: joinList(raw.jobIndustry),
    companyLogo: asString(raw.companyLogo),
    datePosted: asString(raw.pubDate),
    // Fall back to the excerpt when no full description is provided.
    jobDescription: asString(raw.jobDescription) ?? asString(raw.jobExcerpt),
    salaryMinAmount: salaryFloor,
    salaryMaxAmount: salaryCeiling,
    salaryCurrency: asString(raw.salaryCurrency),
    salaryInterval: hasSalary ? "yearly" : undefined,
  };
  return mapped;
}
/**
 * Requests listings from the Jobicy remote-jobs feed.
 *
 * @param tag - Optional tag filter; omitted from the request when null or empty.
 * @param count - Desired number of listings; clamped to the range 1–50.
 * @returns The listings in the response; empty when the body has no `jobs` array.
 * @throws Error when the HTTP status is not OK.
 */
async function fetchJobicy(
  tag: string | null,
  count: number,
): Promise<JobicyRawJob[]> {
  const clampedCount = Math.min(Math.max(count, 1), 50);
  const endpoint = new URL(API_URL);
  endpoint.searchParams.set("count", String(clampedCount));
  if (tag) {
    endpoint.searchParams.set("tag", tag);
  }

  const response = await fetch(endpoint.toString(), {
    headers: { Accept: "application/json" },
  });
  if (!response.ok) {
    throw new Error(`Jobicy request failed with status ${response.status}`);
  }

  const payload = (await response.json()) as JobicyResponse;
  if (Array.isArray(payload.jobs)) return payload.jobs;
  return [];
}
/**
 * Extractor manifest for the Jobicy remote-jobs feed.
 *
 * `run` queries the feed once per search term (sent lower-cased as the `tag`
 * filter), or once without a tag when no terms are given. Jobs are
 * de-duplicated across terms by `sourceJobId` / `jobUrl` and capped per term
 * by the `jobicyMaxJobsPerTerm` setting (default 100). If a request fails it
 * returns `success: false` together with the jobs collected before the failure.
 */
export const manifest: ExtractorManifest = {
  id: "jobicy",
  displayName: "Jobicy (Remote)",
  providesSources: ["jobicy"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting parses to NaN, and `collected < NaN` is always
    // false, so the fetch loop would never run. Fall back to the default.
    const rawMax = context.settings.jobicyMaxJobsPerTerm;
    const parsedMax = rawMax ? Number.parseInt(rawMax, 10) : Number.NaN;
    const maxJobsPerTerm = Number.isNaN(parsedMax) ? 100 : parsedMax;

    const terms = context.searchTerms.length > 0 ? context.searchTerms : [null];
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i];
        const tag = term ? term.trim().toLowerCase() : null;
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: tag ?? "(all remote)",
          detail: `Jobicy: term ${i + 1}/${terms.length}`,
        });
        // Jobicy caps `count` at 50 per call; loop until we hit the requested
        // cap, the feed runs out (length < take), or a call adds nothing new.
        let collected = 0;
        let safetyHops = 0;
        while (collected < maxJobsPerTerm && safetyHops < 10) {
          const take = Math.min(50, maxJobsPerTerm - collected);
          const raw = await fetchJobicy(tag, take);
          if (raw.length === 0) break;
          let addedThisHop = 0;
          for (const item of raw) {
            const mapped = mapJob(item);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            addedThisHop += 1;
            if (collected >= maxJobsPerTerm) break;
          }
          if (raw.length < take) break;
          // `fetchJobicy` sends no offset or page, so once a call yields only
          // already-seen jobs, repeating it cannot surface anything new.
          // Stop here rather than re-issuing it until `safetyHops` runs out.
          if (addedThisHop === 0) break;
          safetyHops += 1;
        }
        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: tag ?? "(all remote)",
          jobPagesProcessed: out.length,
          detail: `Jobicy: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      // Keep whatever was gathered before the failure.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "jobicy-extractor",
"version": "0.0.1",
"type": "module",
"description": "Jobicy remote-jobs feed extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,7 @@
# jooble-extractor
[Jooble](https://jooble.org/api/about) aggregator API extractor.
- Requires `JOOBLE_API_KEY` (`joobleApiKey` setting).
- Iterates `searchTerms`; uses the first `searchCities` token as `location`.
- Capped per term via `joobleMaxJobsPerTerm` (default 100).

View File

@ -0,0 +1,177 @@
/**
* Jooble aggregator API.
*
* https://jooble.org/api/about — `POST https://jooble.org/api/{key}` with a
* JSON body of `{ keywords, location, page, ResultOnPage }`.
*
* Requires JOOBLE_API_KEY (`joobleApiKey` setting).
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// Base URL of the Jooble API; `fetchPage` appends the API key as a path segment.
const API_BASE = "https://jooble.org/api";
/**
 * One result from the Jooble search API. Every field is optional because the
 * payload is untyped JSON cast to this shape.
 */
interface JoobleJob {
// Stringified into `sourceJobId` by `mapJob`.
id?: number | string;
title?: string;
location?: string;
// Used as the job description.
snippet?: string;
salary?: string;
// Mapped to `companyDescription` by `mapJob`.
source?: string;
type?: string;
// Required by `mapJob`: results without it are dropped.
link?: string;
company?: string;
// Passed through unchanged as `datePosted`.
updated?: string;
}
/** Response envelope; `run` treats a missing or non-array `jobs` as empty. */
interface JoobleResponse {
// Not read in this file.
totalCount?: number;
jobs?: JoobleJob[];
}
/**
 * Extracts a usable string from an unknown payload value.
 *
 * @param value - Any value from the untyped API payload.
 * @returns The trimmed string, or `undefined` when `value` is not a string
 *   or contains only whitespace.
 */
function asString(value: unknown): string | undefined {
  const isText = typeof value === "string";
  if (!isText) return undefined;
  const compact = value.trim();
  return compact.length === 0 ? undefined : compact;
}
/**
 * Converts a Jooble search result into the pipeline's `CreateJobInput`.
 *
 * @param raw - Result as returned by the API.
 * @returns The mapped job, or `null` when the result has no `link`.
 */
function mapJob(raw: JoobleJob): CreateJobInput | null {
  const listingUrl = asString(raw.link);
  if (!listingUrl) return null;

  const listingId = raw.id != null ? String(raw.id) : undefined;
  const mapped: CreateJobInput = {
    source: "jooble",
    sourceJobId: listingId,
    title: asString(raw.title) ?? "Unknown Title",
    employer: asString(raw.company) ?? "Unknown Employer",
    jobUrl: listingUrl,
    applicationLink: listingUrl,
    location: asString(raw.location),
    jobType: asString(raw.type),
    salary: asString(raw.salary),
    datePosted: asString(raw.updated),
    // Jooble only provides a snippet, which stands in for the description.
    jobDescription: asString(raw.snippet),
    companyDescription: asString(raw.source),
  };
  return mapped;
}
/**
 * Requests one page of search results from the Jooble API.
 *
 * The API key is URL-encoded into the request path; the search parameters
 * are sent as a JSON body, with `page` and `ResultOnPage` serialized as
 * strings and a missing `location` sent as an empty string.
 *
 * @param args - API key, keywords, optional location, page number and page size.
 * @returns The parsed JSON body, cast to `JoobleResponse` without validation.
 * @throws Error when the HTTP status is not OK.
 */
async function fetchPage(args: {
  apiKey: string;
  keywords: string;
  location?: string;
  page: number;
  resultOnPage: number;
}): Promise<JoobleResponse> {
  const endpoint = `${API_BASE}/${encodeURIComponent(args.apiKey)}`;
  const payload = JSON.stringify({
    keywords: args.keywords,
    location: args.location ?? "",
    page: String(args.page),
    ResultOnPage: String(args.resultOnPage),
  });

  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Accept: "application/json",
    },
    body: payload,
  });
  if (response.ok) {
    return (await response.json()) as JoobleResponse;
  }
  throw new Error(`Jooble request failed with status ${response.status}`);
}
/**
 * Extractor manifest for the Jooble aggregator API.
 *
 * `run` requires an API key (the `joobleApiKey` setting or the
 * `JOOBLE_API_KEY` environment variable). It pages through the results for
 * each search term (or one empty-keyword search when no terms are given),
 * using the first `|`-separated entry of `searchCities` as the location.
 * Jobs are de-duplicated by `sourceJobId` / `jobUrl` and capped per term by
 * `joobleMaxJobsPerTerm` (default 100). If a request fails it returns
 * `success: false` together with the jobs collected before the failure.
 */
export const manifest: ExtractorManifest = {
  id: "jooble",
  displayName: "Jooble",
  providesSources: ["jooble"],
  requiredEnvVars: ["JOOBLE_API_KEY"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };
    const apiKey =
      context.settings.joobleApiKey?.trim() ||
      process.env.JOOBLE_API_KEY?.trim();
    if (!apiKey) {
      return {
        success: false,
        jobs: [],
        error: "Jooble extractor requires JOOBLE_API_KEY",
      };
    }

    // A non-numeric setting parses to NaN, and `collected < NaN` is always
    // false, so the paging loop would never run and the extractor would
    // report success with zero jobs. Fall back to the default.
    const rawMax = context.settings.joobleMaxJobsPerTerm;
    const parsedMax = rawMax ? Number.parseInt(rawMax, 10) : Number.NaN;
    const maxJobsPerTerm = Number.isNaN(parsedMax) ? 100 : parsedMax;

    const resultOnPage = 50;
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    const location =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `Jooble: term ${i + 1}/${terms.length}`,
        });
        let collected = 0;
        let page = 1;
        // `page < 50` is a hard stop so a misbehaving API cannot loop forever.
        while (collected < maxJobsPerTerm && page < 50) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            keywords: term,
            location,
            page,
            resultOnPage,
          });
          const items = Array.isArray(body.jobs) ? body.jobs : [];
          if (items.length === 0) break;
          for (const raw of items) {
            const mapped = mapJob(raw);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }
          // A short page means there are no further results for this term.
          if (items.length < resultOnPage) break;
          page += 1;
        }
        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `Jooble: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      // Keep whatever was gathered before the failure.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "jooble-extractor",
"version": "0.0.1",
"type": "module",
"description": "Jooble aggregator API extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,9 @@
# lever-extractor
Public Lever ATS feeds via `GET https://api.lever.co/v0/postings/{company}?mode=json`.
- No auth.
- Configure target slugs through `leverCompanies` (comma/newline list) or
`LEVER_COMPANIES` env (e.g. `figma,plaid,ramp`).
- Pulls every active posting from each company; pipeline filters handle terms
and country gating.

View File

@ -0,0 +1,182 @@
/**
* Lever public job postings API.
*
* https://github.com/lever/postings-api/blob/master/README.md
* GET https://api.lever.co/v0/postings/{company}?mode=json
*
* No auth. We iterate `leverCompanies` (set in Settings or LEVER_COMPANIES env)
* and pull every active posting; downstream filtering by `searchTerms` /
* country happens in the pipeline.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/** Categorization block attached to a Lever posting. */
interface LeverCategories {
// Mapped to `jobFunction`.
team?: string;
// Mapped to `companyIndustry`.
department?: string;
// Mapped to `jobType`.
commitment?: string;
// Single location; used only when `allLocations` is missing or empty.
location?: string;
// Preferred location source; `locationFor` joins the entries with "; ".
allLocations?: string[];
}
/**
 * One posting from the Lever postings API. Every field is optional because
 * the payload is untyped JSON cast to this shape.
 */
interface LeverPosting {
// Used as `sourceJobId`.
id?: string;
// Posting title.
text?: string;
// Required by `mapPosting`: postings without it are dropped.
hostedUrl?: string;
// Preferred application link; falls back to `hostedUrl`.
applyUrl?: string;
// Fallback description when `descriptionPlain` is missing.
description?: string;
descriptionPlain?: string;
categories?: LeverCategories;
// Passed directly to `new Date(...)`, i.e. treated as epoch milliseconds.
createdAt?: number;
// Compared case-insensitively against "remote" to set `isRemote`.
workplaceType?: string;
}
/**
 * Returns the trimmed form of `value` if it is a string with visible content.
 *
 * @param value - Any value from the untyped API payload.
 * @returns The trimmed string, or `undefined` for non-strings and blank strings.
 */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const body = value.trim();
  if (body === "") {
    return undefined;
  }
  return body;
}
/**
 * Parses the configured list of Lever company slugs.
 *
 * Two input formats are accepted: a JSON array of strings, or a plain list
 * delimited by newlines, commas, semicolons or pipes. Entries are trimmed and
 * lower-cased, and blanks are dropped; non-string JSON array entries are ignored.
 *
 * @param raw - Raw setting value.
 * @returns The normalized slugs in input order (empty when `raw` is missing
 *   or empty).
 */
function readCompanies(raw: string | undefined): string[] {
  if (!raw) return [];

  const normalize = (entry: string): string => entry.trim().toLowerCase();

  let decoded: unknown;
  try {
    decoded = JSON.parse(raw);
  } catch {
    // Not JSON: handled as a delimited list below.
    decoded = undefined;
  }

  const slugs: string[] = [];
  if (Array.isArray(decoded)) {
    for (const entry of decoded) {
      if (typeof entry !== "string") continue;
      const slug = normalize(entry);
      if (slug.length > 0) slugs.push(slug);
    }
    return slugs;
  }

  for (const piece of raw.split(/[\n,;|]+/)) {
    const slug = normalize(piece);
    if (slug) slugs.push(slug);
  }
  return slugs;
}
/**
 * Derives the display location for a Lever posting.
 *
 * @param posting - Raw posting.
 * @returns The non-blank `allLocations` entries joined with `"; "`; otherwise
 *   the single `location`; `undefined` when the posting has no categories or
 *   no usable location text.
 */
function locationFor(posting: LeverPosting): string | undefined {
  const cats = posting.categories;
  if (!cats) return undefined;
  if (Array.isArray(cats.allLocations)) {
    // Normalize before checking emptiness: a list holding only blank entries
    // would otherwise yield "" and hide a valid `location` fallback.
    const names = cats.allLocations
      .map((entry) => asString(entry))
      .filter((name): name is string => name !== undefined);
    if (names.length > 0) return names.join("; ");
  }
  return asString(cats.location);
}
/**
 * Converts a Lever posting into the pipeline's `CreateJobInput`.
 *
 * The employer name is derived from the company slug: it is split on `-` and
 * each part is capitalized (e.g. `acme-labs` → `Acme Labs`).
 *
 * @param posting - Raw posting from the API.
 * @param company - Company slug the posting was fetched from.
 * @returns The mapped job, or `null` when the posting has no `hostedUrl`.
 */
function mapPosting(
  posting: LeverPosting,
  company: string,
): CreateJobInput | null {
  const postingUrl = asString(posting.hostedUrl);
  if (!postingUrl) return null;

  const words: string[] = [];
  for (const part of company.split("-")) {
    if (!part) continue;
    words.push(part.charAt(0).toUpperCase() + part.slice(1));
  }
  const displayName = words.join(" ");

  const categories = posting.categories;
  // Only a positive "remote" signal is recorded; anything else stays unset.
  const remoteFlag =
    posting.workplaceType?.toLowerCase() === "remote" ? true : undefined;

  const mapped: CreateJobInput = {
    source: "lever",
    sourceJobId: asString(posting.id),
    title: asString(posting.text) ?? "Unknown Title",
    employer: displayName || company,
    jobUrl: postingUrl,
    applicationLink: asString(posting.applyUrl) ?? postingUrl,
    location: locationFor(posting),
    jobType: asString(categories?.commitment),
    jobFunction: asString(categories?.team),
    companyIndustry: asString(categories?.department),
    isRemote: remoteFlag,
    // `createdAt` goes straight into Date, i.e. it is treated as epoch milliseconds.
    datePosted:
      typeof posting.createdAt === "number"
        ? new Date(posting.createdAt).toISOString()
        : undefined,
    jobDescription:
      asString(posting.descriptionPlain) ?? asString(posting.description),
  };
  return mapped;
}
/**
 * Fetches every posting published by one company on Lever.
 *
 * @param company - Company slug; URL-encoded before use.
 * @returns The postings; an empty list when the company responds 404 or the
 *   body is not a JSON array.
 * @throws Error for any other non-OK HTTP status.
 */
async function fetchCompany(company: string): Promise<LeverPosting[]> {
  const slug = encodeURIComponent(company);
  const response = await fetch(
    `https://api.lever.co/v0/postings/${slug}?mode=json`,
    { headers: { Accept: "application/json" } },
  );
  // A 404 is treated as "no postings" rather than a failure.
  if (response.status === 404) return [];
  if (!response.ok) {
    throw new Error(
      `Lever request for "${company}" failed with status ${response.status}`,
    );
  }
  const payload = (await response.json()) as unknown;
  if (!Array.isArray(payload)) return [];
  return payload as LeverPosting[];
}
/**
 * Extractor manifest for public Lever job postings.
 *
 * `run` reads the company slugs from the `leverCompanies` setting, fetches
 * each company's postings in turn, and returns the mapped jobs de-duplicated
 * by `sourceJobId` / `jobUrl`. With no slugs configured it returns an empty,
 * successful result carrying an explanatory `error` message. If a request
 * fails, the run stops and returns `success: false` with the jobs gathered so
 * far.
 */
export const manifest: ExtractorManifest = {
  id: "lever",
  displayName: "Lever (ATS)",
  providesSources: ["lever"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };
    const companies = readCompanies(context.settings.leverCompanies);
    if (companies.length === 0) {
      return {
        success: true,
        jobs: [],
        error:
          "No Lever companies configured. Set LEVER_COMPANIES or the leverCompanies setting (comma- or newline-separated slugs).",
      };
    }
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let i = 0; i < companies.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const company = companies[i];
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: companies.length,
          currentUrl: company,
          detail: `Lever: ${company} (${i + 1}/${companies.length})`,
        });
        let added = 0;
        const postings = await fetchCompany(company);
        for (const posting of postings) {
          const mapped = mapPosting(posting, company);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
          added += 1;
        }
        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: companies.length,
          currentUrl: company,
          jobPagesProcessed: out.length,
          // Separator added: the slug and the count used to be fused
          // together (e.g. "figma12 jobs").
          detail: `Lever: ${company}: ${added} jobs (${out.length} total)`,
        });
      }
    } catch (error) {
      // Keep whatever was gathered before the failure.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "lever-extractor",
"version": "0.0.1",
"type": "module",
"description": "Lever public ATS extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,8 @@
# reed-extractor
[Reed.co.uk Jobseeker API](https://www.reed.co.uk/developers/jobseeker).
- Requires `REED_API_KEY` (`reedApiKey` setting), used as the HTTP Basic
username.
- UK-only: gated via `isSourceAllowedForCountry`.
- Capped per term via `reedMaxJobsPerTerm` (default 100).

188
extractors/reed/manifest.ts Normal file
View File

@ -0,0 +1,188 @@
/**
* Reed.co.uk Jobseeker API.
*
* https://www.reed.co.uk/developers/jobseeker
* GET https://www.reed.co.uk/api/1.0/search?...
* HTTP Basic with the API key as the username and an empty password.
*
* Requires REED_API_KEY (`reedApiKey` setting). The catalog gates this source
* to UK only via `isSourceAllowedForCountry`.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// Reed Jobseeker search endpoint; `fetchPage` adds the query parameters.
const API_URL = "https://www.reed.co.uk/api/1.0/search";
// One entry of `results` in a search response. Every field is optional because
// the JSON is only cast, not validated; `mapJob` checks each value it reads.
interface ReedJob {
jobId?: number;
jobTitle?: string;
employerName?: string;
employerProfileUrl?: string;
jobDescription?: string;
jobUrl?: string;
locationName?: string;
date?: string;
expirationDate?: string;
applications?: number;
currency?: string;
minimumSalary?: number;
maximumSalary?: number;
// Only the presence of this field is checked, to label the salary "yearly".
yearlyMinimumSalary?: number;
yearlyMaximumSalary?: number;
}
// Envelope of a search response; the extractor only reads `results`.
interface ReedResponse {
totalResults?: number;
results?: ReedJob[];
}
/**
 * Normalizes an untyped value to a non-empty, trimmed string.
 *
 * @returns The trimmed text, or `undefined` when `value` is not a string or is
 *   empty after trimming.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const text = value.trim();
    if (text.length > 0) return text;
  }
  return undefined;
}
/**
 * Converts one Reed search result into a `CreateJobInput`.
 *
 * @returns `null` when the result has no usable `jobUrl`; otherwise the mapped
 *   job. Missing title/employer fall back to placeholder text and a missing
 *   currency falls back to "GBP".
 */
function mapJob(raw: ReedJob): CreateJobInput | null {
  const url = asString(raw.jobUrl);
  if (url === undefined) return null;

  // Salary amounts are passed through only when they are real numbers.
  const amount = (candidate: unknown): number | undefined =>
    typeof candidate === "number" ? candidate : undefined;

  const hasId = raw.jobId != null;
  const hasYearlyFigure = raw.yearlyMinimumSalary != null;

  return {
    source: "reed",
    sourceJobId: hasId ? String(raw.jobId) : undefined,
    title: asString(raw.jobTitle) ?? "Unknown Title",
    employer: asString(raw.employerName) ?? "Unknown Employer",
    employerUrl: asString(raw.employerProfileUrl),
    jobUrl: url,
    applicationLink: url,
    location: asString(raw.locationName),
    datePosted: asString(raw.date),
    deadline: asString(raw.expirationDate),
    jobDescription: asString(raw.jobDescription),
    salaryMinAmount: amount(raw.minimumSalary),
    salaryMaxAmount: amount(raw.maximumSalary),
    salaryCurrency: asString(raw.currency) ?? "GBP",
    salaryInterval: hasYearlyFigure ? "yearly" : undefined,
  };
}
/**
 * Requests one page of Reed search results.
 *
 * The API key is sent via HTTP Basic auth as the username with an empty
 * password. `locationName` is only added to the query when it is non-empty.
 *
 * @throws Error when the response status is not OK.
 */
async function fetchPage(args: {
  apiKey: string;
  keywords: string;
  locationName?: string;
  resultsToTake: number;
  resultsToSkip: number;
}): Promise<ReedResponse> {
  const { apiKey, keywords, locationName, resultsToTake, resultsToSkip } = args;

  const endpoint = new URL(API_URL);
  const query = endpoint.searchParams;
  query.set("keywords", keywords);
  if (locationName) {
    query.set("locationName", locationName);
  }
  query.set("resultsToTake", String(resultsToTake));
  query.set("resultsToSkip", String(resultsToSkip));

  const credentials = Buffer.from(`${apiKey}:`).toString("base64");
  const response = await fetch(endpoint.toString(), {
    headers: {
      Accept: "application/json",
      Authorization: `Basic ${credentials}`,
    },
  });

  if (response.ok) {
    return (await response.json()) as ReedResponse;
  }
  throw new Error(`Reed request failed with status ${response.status}`);
}
/**
 * Reed extractor manifest.
 *
 * `run` searches Reed once per pipeline search term (a single empty keyword
 * when no terms are configured). For each term it pages through results until
 * the per-term cap is reached, a short or empty page comes back, or more than
 * 5000 results have been skipped. Jobs are de-duplicated across terms by
 * `sourceJobId`, falling back to `jobUrl`.
 *
 * - Without an API key (setting or `REED_API_KEY`) the run fails immediately.
 * - If a request throws, the jobs gathered so far are returned together with
 *   `success: false` and the error message.
 */
export const manifest: ExtractorManifest = {
  id: "reed",
  displayName: "Reed",
  providesSources: ["reed"],
  requiredEnvVars: ["REED_API_KEY"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const apiKey =
      context.settings.reedApiKey?.trim() || process.env.REED_API_KEY?.trim();
    if (!apiKey) {
      return {
        success: false,
        jobs: [],
        error: "Reed extractor requires REED_API_KEY",
      };
    }

    // A non-numeric setting parses to NaN. Every `collected < NaN` comparison
    // is false, so the run would silently return no jobs; use the default cap
    // instead. An unset or empty setting also resolves to the default.
    const parsedMax = Number.parseInt(
      context.settings.reedMaxJobsPerTerm ?? "",
      10,
    );
    const maxJobsPerTerm = Number.isFinite(parsedMax) ? parsedMax : 100;
    // Reed accepts up to 100 per page.
    const pageSize = Math.min(100, maxJobsPerTerm);
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    // Only the first `|`-separated city is forwarded as the location filter.
    const locationName =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `Reed: term ${i + 1}/${terms.length}`,
        });

        let collected = 0;
        let resultsToSkip = 0;
        while (collected < maxJobsPerTerm) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            keywords: term,
            locationName,
            resultsToTake: pageSize,
            resultsToSkip,
          });
          const items = Array.isArray(body.results) ? body.results : [];
          if (items.length === 0) break;

          for (const raw of items) {
            const mapped = mapJob(raw);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }

          // A short page means the result set is exhausted.
          if (items.length < pageSize) break;
          resultsToSkip += pageSize;
          // Hard stop so a misbehaving API cannot keep the loop running.
          if (resultsToSkip > 5000) break;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `Reed: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "reed-extractor",
"version": "0.0.1",
"type": "module",
"description": "Reed.co.uk Jobseeker API extractor (UK only)",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,15 @@
# remoteok-extractor
Pulls listings from the public [Remote OK feed](https://remoteok.com/api).
- No authentication required.
- The endpoint returns the entire active board in a single JSON array; the
first element is metadata/legal text. We fetch once and apply each pipeline
`searchTerm` as a case-insensitive filter over the job's `position` and
`tags`.
- Caps results per term via the `remoteokMaxJobsPerTerm` setting (default 100).
- Listings are flagged `isRemote: true` and labelled "Remote" if Remote OK
doesn't supply a city.
- Per Remote OK's TOS we send a descriptive User-Agent. The legal-notice
element at the start of the feed is not a job, so it is skipped when mapping
results. If you publish results, please link back to the original posting URLs.

View File

@ -0,0 +1,190 @@
/**
* Remote OK public feed.
*
* https://remoteok.com/api — single JSON endpoint that returns the entire
* active remote-jobs board in one shot. The first array element is metadata
* (legal/attribution); jobs follow.
*
* No auth, no server-side pagination, no per-term query — we fetch once per
* pipeline run and apply each `searchTerm` as a case-insensitive filter over
* `position` + `tags` so the orchestrator's per-term iteration still works.
*
* Per Remote OK's TOS we send a descriptive User-Agent so they can identify
* traffic. The legal/attribution element is not a job posting, so `fetchAll`
* filters it out before any mapping happens.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// Feed endpoint; `fetchAll` requests it without any query parameters.
const API_URL = "https://remoteok.com/api";
// Sent as the `User-Agent` header on the feed request.
const USER_AGENT =
"Mozilla/5.0 (compatible; JobOps/1.0; +https://github.com/) job-search pipeline";
// A job element of the feed. Every field is optional because the JSON is only
// cast, not validated; `mapJob` checks each value it reads.
interface RemoteOkJob {
id?: string | number;
slug?: string;
position?: string;
company?: string;
company_logo?: string;
logo?: string;
location?: string;
tags?: string[];
description?: string;
url?: string;
apply_url?: string;
date?: string;
epoch?: number;
// `mapJob` treats non-positive salary values as "not specified".
salary_min?: number;
salary_max?: number;
}
// The non-job metadata element of the feed (legal/attribution text).
interface RemoteOkLegalEntry {
legal?: string;
last_updated?: number;
}
// Any element of the response array; `isJobEntry` tells the two apart.
type RemoteOkResponseEntry = RemoteOkJob | RemoteOkLegalEntry;
/**
 * Returns `value` trimmed when it is a string with visible content.
 * Anything else (non-strings, empty or whitespace-only strings) maps to
 * `undefined`.
 */
function asString(value: unknown): string | undefined {
  const text = typeof value === "string" ? value.trim() : "";
  return text === "" ? undefined : text;
}
/**
 * Type guard that separates job elements from the feed's metadata element.
 *
 * An element counts as a job when it carries at least one of `id`, `position`,
 * `url` or `slug`. The response body is only cast to the expected type, so a
 * `null` or primitive element is possible at runtime; those are rejected up
 * front because the `in` operator throws a TypeError on non-objects.
 */
function isJobEntry(entry: RemoteOkResponseEntry): entry is RemoteOkJob {
  if (typeof entry !== "object" || entry === null) return false;
  return (
    "id" in entry || "position" in entry || "url" in entry || "slug" in entry
  );
}
/**
 * Checks whether a job matches a search term.
 *
 * The job's `position` and its `tags` are joined with spaces and lower-cased;
 * the job matches when that text contains `normalizedTerm` as a substring.
 *
 * @param normalizedTerm The term, already lower-cased by the caller. An empty
 *   term matches every job.
 */
function tagMatchesTerm(job: RemoteOkJob, normalizedTerm: string): boolean {
  if (normalizedTerm === "") return true;

  const parts: string[] = [job.position ?? ""];
  if (Array.isArray(job.tags)) {
    parts.push(...job.tags);
  }
  const searchable = parts.join(" ").toLowerCase();
  return searchable.includes(normalizedTerm);
}
/**
 * Converts one Remote OK feed entry into a `CreateJobInput`.
 *
 * @returns `null` when the entry has neither a usable `url` nor `apply_url`;
 *   otherwise the mapped job. Every job is flagged remote, and the location
 *   defaults to "Remote" when the entry has none.
 */
function mapJob(job: RemoteOkJob): CreateJobInput | null {
  const applyUrl = asString(job.apply_url);
  const jobUrl = asString(job.url) ?? applyUrl;
  if (!jobUrl) return null;

  // Remote OK reports salary as raw numbers; 0 means "not specified".
  const positiveAmount = (candidate: unknown): number | undefined =>
    typeof candidate === "number" && candidate > 0 ? candidate : undefined;
  const minSalary = positiveAmount(job.salary_min);
  const maxSalary = positiveAmount(job.salary_max);
  const hasSalary = Boolean(minSalary || maxSalary);

  const tags: string[] = [];
  if (Array.isArray(job.tags)) {
    for (const tag of job.tags) {
      if (typeof tag === "string") tags.push(tag);
    }
  }

  return {
    source: "remoteok",
    sourceJobId: job.id != null ? String(job.id) : asString(job.slug),
    title: asString(job.position) ?? "Unknown Title",
    employer: asString(job.company) ?? "Unknown Employer",
    jobUrl,
    applicationLink: applyUrl ?? jobUrl,
    location: asString(job.location) ?? "Remote",
    isRemote: true,
    datePosted: asString(job.date),
    jobDescription: asString(job.description),
    companyLogo: asString(job.company_logo) ?? asString(job.logo),
    disciplines: tags.length > 0 ? tags.join(", ") : undefined,
    salaryMinAmount: minSalary,
    salaryMaxAmount: maxSalary,
    salaryCurrency: hasSalary ? "USD" : undefined,
    salaryInterval: hasSalary ? "yearly" : undefined,
  };
}
/**
 * Downloads the whole Remote OK feed and returns only its job elements.
 *
 * @returns The elements accepted by `isJobEntry`, or an empty array when the
 *   body is not an array.
 * @throws Error when the response status is not OK.
 */
async function fetchAll(): Promise<RemoteOkJob[]> {
  const headers = {
    Accept: "application/json",
    "User-Agent": USER_AGENT,
  };
  const response = await fetch(API_URL, { headers });
  if (!response.ok) {
    throw new Error(`Remote OK request failed with status ${response.status}`);
  }

  const entries = (await response.json()) as RemoteOkResponseEntry[];
  return Array.isArray(entries) ? entries.filter(isJobEntry) : [];
}
/**
 * Remote OK extractor manifest.
 *
 * `run` downloads the feed once, then applies each pipeline search term as a
 * local filter (a single empty term, which matches everything, when no terms
 * are configured). Up to `remoteokMaxJobsPerTerm` new jobs are kept per term,
 * and jobs are de-duplicated across terms by `sourceJobId`, falling back to
 * `jobUrl`.
 *
 * If the feed request throws, the run fails with no jobs.
 */
export const manifest: ExtractorManifest = {
  id: "remoteok",
  displayName: "Remote OK",
  providesSources: ["remoteok"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting parses to NaN. `collected >= NaN` is always false,
    // which would silently disable the per-term cap; use the default instead.
    // An unset or empty setting also resolves to the default.
    const parsedMax = Number.parseInt(
      context.settings.remoteokMaxJobsPerTerm ?? "",
      10,
    );
    const maxJobsPerTerm = Number.isFinite(parsedMax) ? parsedMax : 100;
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];

    let allJobs: RemoteOkJob[];
    try {
      allJobs = await fetchAll();
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: [], error: message };
    }

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    for (let i = 0; i < terms.length; i += 1) {
      if (context.shouldCancel?.()) break;
      const term = terms[i].trim();
      const normalizedTerm = term.toLowerCase();
      context.onProgress?.({
        phase: "list",
        termsProcessed: i,
        termsTotal: terms.length,
        currentUrl: term || "(all remote)",
        detail: `Remote OK: term ${i + 1}/${terms.length}`,
      });

      let collected = 0;
      for (const job of allJobs) {
        if (collected >= maxJobsPerTerm) break;
        if (!tagMatchesTerm(job, normalizedTerm)) continue;
        const mapped = mapJob(job);
        if (!mapped) continue;
        const key = mapped.sourceJobId || mapped.jobUrl;
        // Jobs already emitted for an earlier term do not count toward this
        // term's cap.
        if (seen.has(key)) continue;
        seen.add(key);
        out.push(mapped);
        collected += 1;
      }

      context.onProgress?.({
        phase: "list",
        termsProcessed: i + 1,
        termsTotal: terms.length,
        currentUrl: term || "(all remote)",
        jobPagesProcessed: out.length,
        detail: `Remote OK: completed term ${i + 1}/${terms.length} (${collected} matched)`,
      });
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "remoteok-extractor",
"version": "0.0.1",
"type": "module",
"description": "Remote OK public job-feed extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,9 @@
# remotive-extractor
Pulls listings from the public [Remotive API](https://remotive.com/api/remote-jobs).
- No authentication required.
- Each pipeline `searchTerm` is passed as the `search` query parameter;
without terms we fetch the generic remote feed.
- Caps results per term via the `remotiveMaxJobsPerTerm` setting (default 100).
- All listings are flagged `isRemote: true`.

View File

@ -0,0 +1,153 @@
/**
* Remotive public remote-jobs API.
*
* https://remotive.com/api/remote-jobs?limit=N&search=term
*
* No auth. Returns up to `limit` results per call with a `search` keyword
* filter. We iterate pipeline search terms as the `search` parameter.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// Remotive listing endpoint; `fetchJobs` adds `limit` and, optionally, `search`.
const API_URL = "https://remotive.com/api/remote-jobs";
// One entry of `jobs` in a Remotive response. Every field is optional because
// the JSON is only cast, not validated; `mapJob` checks each value it reads.
interface RemotiveJob {
id?: number;
url?: string;
title?: string;
company_name?: string;
company_logo?: string;
category?: string;
tags?: string[];
// Underscores are replaced with spaces by `normalizeJobType`.
job_type?: string;
publication_date?: string;
candidate_required_location?: string;
salary?: string;
description?: string;
}
// Response envelope; the extractor only reads `jobs`.
interface RemotiveResponse {
jobs?: RemotiveJob[];
}
/**
 * Coerces an untyped value to a trimmed, non-empty string.
 *
 * @returns `undefined` for non-string input and for strings that are empty
 *   once trimmed.
 */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const text = value.trim();
  if (text.length === 0) {
    return undefined;
  }
  return text;
}
/**
 * Turns a Remotive `job_type` value into display text by replacing every
 * underscore with a space and trimming the result.
 *
 * @returns `undefined` when the input is missing or nothing is left after
 *   trimming.
 */
function normalizeJobType(raw: string | undefined): string | undefined {
  if (!raw) return undefined;
  const readable = raw.replaceAll("_", " ").trim();
  return readable === "" ? undefined : readable;
}
/**
 * Converts one Remotive listing into a `CreateJobInput`.
 *
 * @returns `null` when the listing has no usable `url`; otherwise the mapped
 *   job. Every job is flagged remote, and the location defaults to "Remote"
 *   when `candidate_required_location` is missing.
 */
function mapJob(raw: RemotiveJob): CreateJobInput | null {
  const url = asString(raw.url);
  if (url === undefined) return null;

  // Keep only non-empty string tags; anything else in the array is dropped.
  const tagList: string[] = [];
  if (Array.isArray(raw.tags)) {
    for (const tag of raw.tags) {
      if (typeof tag === "string" && tag.length > 0) tagList.push(tag);
    }
  }
  const disciplines = tagList.length > 0 ? tagList.join(", ") : undefined;
  const sourceJobId = raw.id != null ? String(raw.id) : undefined;

  return {
    source: "remotive",
    sourceJobId,
    title: asString(raw.title) ?? "Unknown Title",
    employer: asString(raw.company_name) ?? "Unknown Employer",
    jobUrl: url,
    applicationLink: url,
    location: asString(raw.candidate_required_location) ?? "Remote",
    isRemote: true,
    jobType: normalizeJobType(raw.job_type),
    companyIndustry: asString(raw.category),
    companyLogo: asString(raw.company_logo),
    datePosted: asString(raw.publication_date),
    salary: asString(raw.salary),
    jobDescription: asString(raw.description),
    disciplines,
  };
}
/**
 * Requests listings from Remotive.
 *
 * @param search Keyword for the `search` parameter; omitted when null or empty.
 * @param limit Requested result count, clamped to the range 1..100 before it
 *   is sent.
 * @returns The `jobs` array of the response, or an empty array when it is
 *   missing or not an array.
 * @throws Error when the response status is not OK.
 */
async function fetchJobs(
  search: string | null,
  limit: number,
): Promise<RemotiveJob[]> {
  const clampedLimit = Math.min(Math.max(limit, 1), 100);

  const endpoint = new URL(API_URL);
  endpoint.searchParams.set("limit", String(clampedLimit));
  if (search) {
    endpoint.searchParams.set("search", search);
  }

  const response = await fetch(endpoint.toString(), {
    headers: { Accept: "application/json" },
  });
  if (!response.ok) {
    throw new Error(`Remotive request failed with status ${response.status}`);
  }

  const payload = (await response.json()) as RemotiveResponse;
  if (Array.isArray(payload.jobs)) {
    return payload.jobs;
  }
  return [];
}
/**
 * Remotive extractor manifest.
 *
 * `run` issues one request per pipeline search term (a single unfiltered
 * request when no terms are configured) and keeps up to
 * `remotiveMaxJobsPerTerm` new jobs per term. Jobs are de-duplicated across
 * terms by `sourceJobId`, falling back to `jobUrl`.
 *
 * If a request throws, the jobs gathered so far are returned together with
 * `success: false` and the error message.
 */
export const manifest: ExtractorManifest = {
  id: "remotive",
  displayName: "Remotive",
  providesSources: ["remotive"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting parses to NaN, which would be sent to the API as
    // `limit=NaN` and would disable the local cap (`collected >= NaN` is always
    // false). Use the default instead. An unset or empty setting also resolves
    // to the default.
    const parsedMax = Number.parseInt(
      context.settings.remotiveMaxJobsPerTerm ?? "",
      10,
    );
    const maxJobsPerTerm = Number.isFinite(parsedMax) ? parsedMax : 100;
    // `null` stands for "no search filter".
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [null];

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i];
        const search = term ? term.trim() : null;
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: search ?? "(all remote)",
          detail: `Remotive: term ${i + 1}/${terms.length}`,
        });

        const raw = await fetchJobs(search, maxJobsPerTerm);
        let collected = 0;
        for (const item of raw) {
          if (collected >= maxJobsPerTerm) break;
          const mapped = mapJob(item);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
          collected += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: search ?? "(all remote)",
          jobPagesProcessed: out.length,
          detail: `Remotive: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "remotive-extractor",
"version": "0.0.1",
"type": "module",
"description": "Remotive public remote-jobs API extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,8 @@
# themuse-extractor
Pulls postings from [The Muse public jobs API](https://www.themuse.com/developers/api/v2).
- Works without auth, but `themuseApiKey` (`THEMUSE_API_KEY`) raises rate limits.
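- Each pipeline `searchTerm` is title-cased and sent as a `category`; if the
first page comes back empty, the extractor retries once without the category
filter. The first `searchCities` token is forwarded to The Muse as `location`.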
- Capped per term via `themuseMaxJobsPerTerm` (default 100).

View File

@ -0,0 +1,224 @@
/**
* The Muse public jobs API.
*
* https://www.themuse.com/api/public/jobs?page=0&category=...&location=...
*
* The endpoint works without auth but is heavily rate-limited; an API key
* (THEMUSE_API_KEY / `themuseApiKey` setting) lifts that. We pass each pipeline
* search term as a `category` to keep parity with how other extractors iterate
* search terms; if the term doesn't map to a Muse category and the first page
* comes back empty, the extractor retries once without the category filter.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// Public jobs endpoint; `fetchPage` adds page, category, location and api_key.
const API_URL = "https://www.themuse.com/api/public/jobs";
// One location of a job; `name` is the only field this extractor reads.
interface MuseLocation {
name?: string;
}
// Employer details; `mapJob` uses `name` as the employer.
interface MuseCompany {
name?: string;
short_name?: string;
}
// Links attached to a job; `landing_page` becomes the job URL.
interface MuseRefs {
landing_page?: string;
}
// One entry of `results`. Every field is optional because the JSON is only
// cast, not validated; `mapJob` checks each value it reads.
interface MuseJob {
id?: number;
name?: string;
publication_date?: string;
type?: string;
// `mapJob` prefers `contents` and falls back to `short_description`.
contents?: string;
short_description?: string;
locations?: MuseLocation[];
company?: MuseCompany;
refs?: MuseRefs;
}
// Response envelope; `page_count` bounds the paging loop when it is a number.
interface MuseResponse {
page?: number;
page_count?: number;
results?: MuseJob[];
}
/**
 * Returns the trimmed form of `value` when it is a string that still has
 * content after trimming; returns `undefined` in every other case.
 */
function asString(value: unknown): string | undefined {
  const isText = typeof value === "string";
  if (!isText) return undefined;
  const text = value.trim();
  return text.length > 0 ? text : undefined;
}
/**
 * Builds a single display string from a job's locations.
 *
 * Blank or missing names are skipped; the remaining names are joined with "; ".
 *
 * @returns `undefined` when there are no locations or none has a usable name.
 */
function joinLocations(
  locations: MuseLocation[] | undefined,
): string | undefined {
  if (!locations || locations.length === 0) return undefined;

  const names: string[] = [];
  for (const entry of locations) {
    const name = asString(entry.name);
    if (name) names.push(name);
  }
  if (names.length === 0) return undefined;
  return names.join("; ");
}
/**
 * Derives a remote flag from a job's location names.
 *
 * A job is considered remote when any location name contains the whole word
 * "flexible" or "remote" (case-insensitive).
 *
 * @returns `undefined` when there are no locations (nothing to judge by),
 *   otherwise `true`/`false`.
 */
function isRemoteFromLocations(
  locations: MuseLocation[] | undefined,
): boolean | undefined {
  if (!locations || locations.length === 0) return undefined;
  // The alternation must be grouped: `\bflexible|remote\b` parses as
  // (`\bflexible`) OR (`remote\b`), which also matches names such as
  // "Flexibleton" or "Premote".
  const remotePattern = /\b(?:flexible|remote)\b/i;
  return locations.some((loc) =>
    typeof loc.name === "string" ? remotePattern.test(loc.name) : false,
  );
}
// Search terms usually arrive as lowercase free text, while Muse category
// names are Title-Cased (e.g. "Software Engineer", "Engineering"). This helper
// lower-cases the term, capitalizes the first letter of every word and joins
// the words with single spaces, so common terms line up with real categories.
// A blank term yields `undefined`, meaning "no category filter". The caller
// drops the filter itself when a category still returns nothing.
function toMuseCategory(term: string): string | undefined {
  const cleaned = term.trim();
  if (cleaned === "") return undefined;

  const words: string[] = [];
  for (const word of cleaned.toLowerCase().split(/\s+/)) {
    words.push(word.charAt(0).toUpperCase() + word.slice(1));
  }
  return words.join(" ");
}
/**
 * Converts one Muse job into a `CreateJobInput`.
 *
 * @returns `null` when the job has no usable `refs.landing_page`; otherwise
 *   the mapped job. The description prefers `contents` and falls back to
 *   `short_description`.
 */
function mapJob(raw: MuseJob): CreateJobInput | null {
  const landingPage = asString(raw.refs?.landing_page);
  if (landingPage === undefined) return null;

  const description =
    asString(raw.contents) ?? asString(raw.short_description);
  const sourceJobId = raw.id != null ? String(raw.id) : undefined;

  return {
    source: "themuse",
    sourceJobId,
    title: asString(raw.name) ?? "Unknown Title",
    employer: asString(raw.company?.name) ?? "Unknown Employer",
    jobUrl: landingPage,
    applicationLink: landingPage,
    location: joinLocations(raw.locations),
    isRemote: isRemoteFromLocations(raw.locations),
    jobType: asString(raw.type),
    datePosted: asString(raw.publication_date),
    jobDescription: description,
  };
}
/**
 * Requests one page of Muse job results.
 *
 * `page` is always sent; `category`, `location` and `api_key` are added only
 * when the corresponding argument is non-empty.
 *
 * @throws Error when the response status is not OK.
 */
async function fetchPage(args: {
  apiKey?: string;
  page: number;
  category?: string;
  location?: string;
}): Promise<MuseResponse> {
  const { apiKey, page, category, location } = args;

  const endpoint = new URL(API_URL);
  const query = endpoint.searchParams;
  query.set("page", String(page));
  if (category) {
    query.set("category", category);
  }
  if (location) {
    query.set("location", location);
  }
  if (apiKey) {
    query.set("api_key", apiKey);
  }

  const response = await fetch(endpoint.toString(), {
    headers: { Accept: "application/json" },
  });
  if (response.ok) {
    return (await response.json()) as MuseResponse;
  }
  throw new Error(`The Muse request failed with status ${response.status}`);
}
/**
 * The Muse extractor manifest.
 *
 * `run` queries The Muse once per pipeline search term (a single unfiltered
 * query when no terms are configured). Each term is title-cased and sent as a
 * `category`; if the first page for that category is empty, the category is
 * dropped once and the term is retried unfiltered. Paging stops at the
 * per-term cap, at the reported `page_count`, or after 100 pages. Jobs are
 * de-duplicated across terms by `sourceJobId`, falling back to `jobUrl`.
 *
 * If a request throws, the jobs gathered so far are returned together with
 * `success: false` and the error message.
 */
export const manifest: ExtractorManifest = {
  id: "themuse",
  displayName: "The Muse",
  providesSources: ["themuse"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // The key is optional. Fall back to the documented THEMUSE_API_KEY
    // environment variable when the setting is empty.
    const apiKey =
      context.settings.themuseApiKey?.trim() ||
      process.env.THEMUSE_API_KEY?.trim() ||
      undefined;

    // A non-numeric setting parses to NaN. `collected < NaN` is always false,
    // so the paging loop would never run and the extractor would silently
    // return nothing; use the default cap instead. An unset or empty setting
    // also resolves to the default.
    const parsedMax = Number.parseInt(
      context.settings.themuseMaxJobsPerTerm ?? "",
      10,
    );
    const maxJobsPerTerm = Number.isFinite(parsedMax) ? parsedMax : 100;
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    // Only the first `|`-separated city is forwarded as the location filter.
    const locationHint =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `The Muse: term ${i + 1}/${terms.length}`,
        });

        let collected = 0;
        let page = 0;
        let pageCount = Number.POSITIVE_INFINITY;
        // The Muse returns pageCount; cap pages defensively to avoid runaway
        // loops if the API misbehaves. We try the term as a category first and,
        // if the very first page is empty, drop the category filter once so an
        // unknown category doesn't silently nuke the entire term.
        let categoryToUse: string | undefined = toMuseCategory(term);
        let droppedCategory = false;
        while (collected < maxJobsPerTerm && page < pageCount && page < 100) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            page,
            category: categoryToUse,
            location: locationHint,
          });
          if (typeof body.page_count === "number") {
            pageCount = body.page_count;
          }
          const results = Array.isArray(body.results) ? body.results : [];
          if (results.length === 0) {
            if (page === 0 && categoryToUse && !droppedCategory) {
              categoryToUse = undefined;
              droppedCategory = true;
              // The page count reported for the empty category no longer
              // applies to the unfiltered retry.
              pageCount = Number.POSITIVE_INFINITY;
              continue;
            }
            break;
          }

          for (const item of results) {
            const mapped = mapJob(item);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }
          page += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `The Muse: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "themuse-extractor",
"version": "0.0.1",
"type": "module",
"description": "The Muse public jobs API extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,22 @@
# usajobs-extractor
US-government job listings via the public
[USAJOBS Search API](https://developer.usajobs.gov/api-reference/get-api-search) (`GET /api/Search` on `data.usajobs.gov`).
## Getting API access
1. Open the [USAJOBS Developer Site](https://developer.usajobs.gov/) and complete **USAJOBS API Access Request** (sign in / register as required).
2. After approval, USAJOBS emails your **API key** to the address you used—use that value for `USAJOBS_API_KEY`.
3. Set `USAJOBS_USER_AGENT` to a **real contact email** (same one you used for registration is typical). This is required by their terms, not optional branding—the HTTP `User-Agent` header must identify you.
Reference material on the developer site includes API Reference, tutorials, and code lists (locations, pay plans, etc.).
## Configuration
| Env / setting | Required | Notes |
| --- | --- | --- |
| `USAJOBS_API_KEY` / `usajobsApiKey` | yes | From the email USAJOBS sends after API access is granted |
| `USAJOBS_USER_AGENT` / `usajobsUserAgent` | yes | Real contact email per USAJOBS TOS |
| `usajobsMaxJobsPerTerm` | no | Per-term cap (default 100) |
The orchestrator's country gating restricts this source to the United States.

View File

@ -0,0 +1,263 @@
/**
* USAJOBS public search API.
*
* https://developer.usajobs.gov/api-reference/get-api-search
*
* Requires:
* - USAJOBS_API_KEY (`usajobsApiKey` setting)
* - USAJOBS_USER_AGENT must be a real contact email per their TOS
*
* The orchestrator already gates this source to United States via
* `isSourceAllowedForCountry`, so we don't re-validate country here.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
// USAJOBS search endpoint; `fetchPage` adds the query parameters and headers.
const API_URL = "https://data.usajobs.gov/api/Search";
// A single location of a position. Not read directly by `mapJob`, which uses
// `PositionLocationDisplay` instead.
interface UsaJobsLocation {
LocationName?: string;
CountryCode?: string;
}
// Pay information. The range values are declared as strings and are parsed by
// `toNumberOrUndefined`; the interval code is translated by `mapInterval`.
interface UsaJobsRemuneration {
MinimumRange?: string;
MaximumRange?: string;
RateIntervalCode?: string;
}
// The job payload of a search hit. Every field is optional because the JSON is
// only cast, not validated; `mapJob` checks each value it reads.
interface UsaJobsDescriptor {
PositionID?: string;
PositionTitle?: string;
PositionURI?: string;
// `mapJob` uses the first entry as the application link.
ApplyURI?: string[];
PositionLocationDisplay?: string;
PositionLocation?: UsaJobsLocation[];
OrganizationName?: string;
DepartmentName?: string;
PublicationStartDate?: string;
PositionStartDate?: string;
PositionEndDate?: string;
// `mapJob` only looks at the first remuneration entry.
PositionRemuneration?: UsaJobsRemuneration[];
UserArea?: { Details?: { JobSummary?: string } };
// `mapJob` uses the first schedule's name as the job type.
PositionSchedule?: Array<{ Name?: string }>;
}
// One search hit; the job itself is nested under `MatchedObjectDescriptor`.
interface UsaJobsSearchResultItem {
MatchedObjectDescriptor?: UsaJobsDescriptor;
}
// Response envelope returned by the search endpoint.
interface UsaJobsSearchResult {
SearchResult?: {
SearchResultCountAll?: number;
SearchResultItems?: UsaJobsSearchResultItem[];
};
}
/**
 * Reads an untyped value as text.
 *
 * @returns The trimmed string when `value` is a string with content left after
 *   trimming; `undefined` otherwise.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const text = value.trim();
    return text !== "" ? text : undefined;
  }
  return undefined;
}
/**
 * Converts a number or numeric string to a finite number.
 *
 * Strings are parsed with `Number.parseFloat`, so a numeric prefix is accepted
 * (for example "12abc" yields 12).
 *
 * @returns `undefined` for NaN, infinities, unparseable strings and any other
 *   type.
 */
function toNumberOrUndefined(value: unknown): number | undefined {
  const candidate =
    typeof value === "number"
      ? value
      : typeof value === "string"
        ? Number.parseFloat(value)
        : Number.NaN;
  return Number.isFinite(candidate) ? candidate : undefined;
}
/**
 * Translates a rate-interval code into the pipeline's salary interval label.
 *
 * Matching is case-insensitive. The recognized codes are "py" and "pa"
 * (yearly), "ph" (hourly), "pd" (daily), "pm" (monthly) and "pw" (weekly).
 *
 * @returns `undefined` for a missing, empty or unrecognized code.
 */
function mapInterval(code: string | undefined): string | undefined {
  if (!code) return undefined;

  // A Map avoids accidental hits on inherited object properties.
  const labels = new Map<string, string>([
    ["py", "yearly"],
    ["pa", "yearly"],
    ["ph", "hourly"],
    ["pd", "daily"],
    ["pm", "monthly"],
    ["pw", "weekly"],
  ]);
  return labels.get(code.toLowerCase());
}
/**
 * Converts one USAJOBS search hit into a `CreateJobInput`.
 *
 * @returns `null` when the hit has no descriptor or no usable `PositionURI`;
 *   otherwise the mapped job. The employer falls back from organization to
 *   department to a generic label, and the application link falls back to the
 *   job URL when the first `ApplyURI` entry is missing or blank.
 */
function mapJob(item: UsaJobsSearchResultItem): CreateJobInput | null {
  const descriptor = item.MatchedObjectDescriptor;
  if (!descriptor) return null;

  const jobUrl = asString(descriptor.PositionURI);
  if (!jobUrl) return null;

  // Only the first remuneration entry is considered.
  const pay = descriptor.PositionRemuneration?.[0];
  const salaryMin = toNumberOrUndefined(pay?.MinimumRange);
  const salaryMax = toNumberOrUndefined(pay?.MaximumRange);

  const applyUris = descriptor.ApplyURI;
  const firstApplyUri = Array.isArray(applyUris)
    ? asString(applyUris[0])
    : undefined;

  const employer =
    asString(descriptor.OrganizationName) ??
    asString(descriptor.DepartmentName) ??
    "U.S. Federal Government";

  return {
    source: "usajobs",
    sourceJobId: asString(descriptor.PositionID),
    title: asString(descriptor.PositionTitle) ?? "Unknown Title",
    employer,
    jobUrl,
    applicationLink: firstApplyUri ?? jobUrl,
    location: asString(descriptor.PositionLocationDisplay),
    datePosted: asString(descriptor.PublicationStartDate),
    deadline: asString(descriptor.PositionEndDate),
    jobDescription: asString(descriptor.UserArea?.Details?.JobSummary),
    jobType: descriptor.PositionSchedule?.[0]?.Name?.trim() || undefined,
    salaryMinAmount: salaryMin,
    salaryMaxAmount: salaryMax,
    salaryCurrency: salaryMin || salaryMax ? "USD" : undefined,
    salaryInterval: mapInterval(pay?.RateIntervalCode),
  };
}
/**
 * Requests one page of USAJOBS search results, sorted by open date descending.
 *
 * `LocationName` is only sent when a location is given. The API key goes in
 * the `Authorization-Key` header and the caller-supplied contact string in
 * `User-Agent`.
 *
 * @throws Error when the response status is not OK.
 */
async function fetchPage(args: {
  apiKey: string;
  userAgent: string;
  keyword: string;
  locationName?: string;
  page: number;
  resultsPerPage: number;
}): Promise<UsaJobsSearchResult> {
  // Collect the parameters first so the query keeps a fixed order.
  const params: Array<[string, string]> = [["Keyword", args.keyword]];
  if (args.locationName) {
    params.push(["LocationName", args.locationName]);
  }
  params.push(
    ["ResultsPerPage", String(args.resultsPerPage)],
    ["Page", String(args.page)],
    ["SortField", "OpenDate"],
    ["SortDirection", "Desc"],
  );

  const endpoint = new URL(API_URL);
  for (const [name, value] of params) {
    endpoint.searchParams.set(name, value);
  }

  const response = await fetch(endpoint.toString(), {
    headers: {
      Host: "data.usajobs.gov",
      "User-Agent": args.userAgent,
      "Authorization-Key": args.apiKey,
      Accept: "application/json",
    },
  });
  if (response.ok) {
    return (await response.json()) as UsaJobsSearchResult;
  }
  throw new Error(`USAJOBS request failed with status ${response.status}`);
}
/**
 * USAJOBS extractor manifest.
 *
 * `run` pages through the USAJOBS search API once per pipeline search term
 * (or once with an empty keyword when no terms are configured), maps each
 * result with `mapJob`, and de-duplicates across terms by source job id
 * (falling back to the job URL).
 *
 * Credentials come from settings first, then from the `USAJOBS_API_KEY` /
 * `USAJOBS_USER_AGENT` environment variables. When either is missing the run
 * fails without making a request. If a request throws, the jobs collected so
 * far are returned together with `success: false` and the error message.
 */
export const manifest: ExtractorManifest = {
  id: "usajobs",
  displayName: "USAJOBS",
  providesSources: ["usajobs"],
  requiredEnvVars: ["USAJOBS_API_KEY", "USAJOBS_USER_AGENT"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const apiKey =
      context.settings.usajobsApiKey?.trim() ||
      process.env.USAJOBS_API_KEY?.trim();
    const userAgent =
      context.settings.usajobsUserAgent?.trim() ||
      process.env.USAJOBS_USER_AGENT?.trim();
    if (!apiKey || !userAgent) {
      return {
        success: false,
        jobs: [],
        error:
          "USAJOBS extractor requires USAJOBS_API_KEY and USAJOBS_USER_AGENT (a contact email)",
      };
    }

    const defaultMaxJobsPerTerm = 100;
    const configuredMax = context.settings.usajobsMaxJobsPerTerm
      ? Number.parseInt(context.settings.usajobsMaxJobsPerTerm, 10)
      : defaultMaxJobsPerTerm;
    // A non-numeric setting parses to NaN; `collected < NaN` is always false,
    // which would silently skip every term. Fall back to the default instead.
    const maxJobsPerTerm = Number.isNaN(configuredMax)
      ? defaultMaxJobsPerTerm
      : configuredMax;

    // USAJOBS caps page size at 500, but smaller pages are friendlier on retry.
    const resultsPerPage = 50;
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    // Only the first configured city is used as the location filter.
    const locationName =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `USAJOBS: term ${i + 1}/${terms.length}`,
        });

        let collected = 0;
        let page = 1;
        // Unknown until the first response reports SearchResultCountAll.
        let total = Number.POSITIVE_INFINITY;
        while (
          collected < maxJobsPerTerm &&
          (page - 1) * resultsPerPage < total &&
          page < 200
        ) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            userAgent,
            keyword: term,
            locationName,
            page,
            resultsPerPage,
          });
          if (typeof body.SearchResult?.SearchResultCountAll === "number") {
            total = body.SearchResult.SearchResultCountAll;
          }
          const items = body.SearchResult?.SearchResultItems ?? [];
          if (items.length === 0) break;

          for (const item of items) {
            const mapped = mapJob(item);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }

          // A short page means there is nothing further to fetch.
          if (items.length < resultsPerPage) break;
          page += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `USAJOBS: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "usajobs-extractor",
"version": "0.0.1",
"type": "module",
"description": "USAJOBS public search API extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,10 @@
# weworkremotely-extractor
Pulls listings from the public [We Work Remotely RSS feed](https://weworkremotely.com/remote-jobs.rss).
- No authentication required.
- Single RSS fetch returns all recent listings; we filter client-side
by matching title + skills + category against pipeline search terms.
- Uses lightweight regex-based XML parsing (no external XML library).
- All listings are flagged `isRemote: true`.
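- Caps the total number of results at `weworkremotelyMaxJobsPerTerm` (default 100) multiplied by the number of search terms; the cap is applied to the whole feed, not to each term separately.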

View File

@ -0,0 +1,192 @@
/**
* We Work Remotely public RSS feed.
*
* https://weworkremotely.com/remote-jobs.rss
*
* No auth. Returns all recent listings in a single XML feed.
* We filter client-side by matching title + skills + category
* against each pipeline search term.
*
* Title format from WWR: "Company Name: Job Title"
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
const RSS_URL = "https://weworkremotely.com/remote-jobs.rss";
/**
 * One `<item>` of the We Work Remotely RSS feed as produced by `parseItems`.
 * Each field holds the trimmed text of the same-named child element and is
 * `undefined` when that element is missing or empty.
 */
interface WwrItem {
// Raw feed title; `parseTitle` splits it into employer and job title.
title?: string;
// Preferred job URL (falls back to `guid` in `mapJob`).
link?: string;
// Preferred source job id (falls back to `link` in `mapJob`).
guid?: string;
description?: string;
// Publication date string exactly as it appears in the feed.
pubDate?: string;
region?: string;
country?: string;
skills?: string;
category?: string;
type?: string;
// `url` attribute of the item's `media:content` element, when present.
logoUrl?: string;
}
/**
 * Extract the text content of the first `<tag>…</tag>` element in `xml`.
 *
 * CDATA wrappers are unwrapped and the result is trimmed. Returns
 * `undefined` when the element is absent or its content is empty.
 */
function xmlText(xml: string, tag: string): string | undefined {
  const elementPattern = new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`);
  const inner = elementPattern.exec(xml)?.[1];
  if (!inner) return undefined;

  const unwrapped = inner.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1");
  const text = unwrapped.trim();
  return text === "" ? undefined : text;
}
/**
 * Split an RSS document into its `<item>` elements and pull out the fields
 * the extractor cares about. Uses regex matching only; no XML parser.
 */
function parseItems(xml: string): WwrItem[] {
  const itemPattern = /<item>([\s\S]*?)<\/item>/g;
  return Array.from(xml.matchAll(itemPattern), (found): WwrItem => {
    // Capture group 1 is the item body without the surrounding tags.
    const body = found[1];
    const logo = /media:content\s+url="([^"]+)"/.exec(body);
    return {
      title: xmlText(body, "title"),
      link: xmlText(body, "link"),
      guid: xmlText(body, "guid"),
      description: xmlText(body, "description"),
      pubDate: xmlText(body, "pubDate"),
      region: xmlText(body, "region"),
      country: xmlText(body, "country"),
      skills: xmlText(body, "skills"),
      category: xmlText(body, "category"),
      type: xmlText(body, "type"),
      logoUrl: logo?.[1],
    };
  });
}
/**
 * Split a feed title of the form "Company Name: Job Title" at the first
 * ": " separator. When there is no separator, or nothing precedes it, the
 * whole string is treated as the job title and the employer is unknown.
 */
function parseTitle(raw: string): { employer: string; title: string } {
  const separator = ": ";
  const at = raw.indexOf(separator);
  if (at <= 0) {
    return { employer: "Unknown Employer", title: raw.trim() };
  }
  const employer = raw.slice(0, at).trim();
  const title = raw.slice(at + separator.length).trim();
  return { employer, title };
}
/**
 * Case-insensitive substring match of `term` against the item's title,
 * skills and category. Missing fields never match.
 */
function matchesTerm(item: WwrItem, term: string): boolean {
  const needle = term.toLowerCase();
  const haystacks = [item.title, item.skills, item.category];
  return haystacks.some(
    (field) => field?.toLowerCase().includes(needle) === true,
  );
}
/**
 * Decode HTML character references in `html` exactly once.
 *
 * Handles the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`
 * and decimal / hexadecimal numeric references (e.g. `&#39;`, `&#x27;`).
 * Anything unrecognised is left untouched.
 *
 * Decoding is done in a single pass so that already-escaped text such as
 * `&amp;lt;` becomes `&lt;` rather than being double-decoded to `<`.
 */
function decodeHtmlEntities(html: string): string {
  const named: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
  };
  return html.replace(
    /&(#x[0-9a-f]+|#\d+|amp|lt|gt|quot|apos);/gi,
    (whole: string, body: string): string => {
      const key = body.toLowerCase();
      if (key.startsWith("#")) {
        const codePoint =
          key[1] === "x"
            ? Number.parseInt(key.slice(2), 16)
            : Number.parseInt(key.slice(1), 10);
        // String.fromCodePoint throws outside the Unicode range; keep the
        // original text for references we cannot represent.
        return codePoint > 0 && codePoint <= 0x10ffff
          ? String.fromCodePoint(codePoint)
          : whole;
      }
      return named[key] ?? whole;
    },
  );
}
/**
 * Convert a parsed feed item into a `CreateJobInput`.
 *
 * Returns `null` when the item has neither a link nor a guid, since there
 * would be no URL to store. Every listing is marked remote; the location
 * label is built from region and country, defaulting to "Remote".
 */
function mapJob(item: WwrItem): CreateJobInput | null {
  const url = item.link || item.guid;
  if (!url) return null;

  let headline = "Unknown Title";
  if (item.title) {
    headline = decodeHtmlEntities(item.title);
  }
  const parsedTitle = parseTitle(headline);

  const place = [item.region, item.country].filter(Boolean).join(" — ");
  const description = item.description
    ? decodeHtmlEntities(item.description)
    : undefined;

  return {
    source: "weworkremotely",
    sourceJobId: item.guid ?? item.link,
    title: parsedTitle.title,
    employer: parsedTitle.employer,
    jobUrl: url,
    applicationLink: url,
    location: place || "Remote",
    isRemote: true,
    jobType: item.type || undefined,
    companyLogo: item.logoUrl,
    datePosted: item.pubDate,
    jobDescription: description,
    disciplines: item.skills || undefined,
    companyIndustry: item.category || undefined,
  };
}
/**
 * We Work Remotely extractor manifest.
 *
 * `run` downloads the RSS feed once, keeps the items that match at least one
 * pipeline search term (all items when no terms are configured), maps them
 * with `mapJob`, and de-duplicates by source job id / URL.
 *
 * The overall result count is capped at
 * `weworkremotelyMaxJobsPerTerm × max(number of terms, 1)`.
 * Any fetch or parse failure yields `success: false` with no jobs.
 */
export const manifest: ExtractorManifest = {
  id: "weworkremotely",
  displayName: "We Work Remotely",
  providesSources: ["weworkremotely"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const defaultMaxJobs = 100;
    const configuredMax = context.settings.weworkremotelyMaxJobsPerTerm
      ? Number.parseInt(context.settings.weworkremotelyMaxJobsPerTerm, 10)
      : defaultMaxJobs;
    // A non-numeric setting parses to NaN; `out.length >= NaN` is always
    // false, which would silently disable the cap. Use the default instead.
    const maxJobs = Number.isNaN(configuredMax) ? defaultMaxJobs : configuredMax;

    const terms = context.searchTerms;
    const cap = maxJobs * Math.max(terms.length, 1);

    context.onProgress?.({
      phase: "list",
      termsProcessed: 0,
      termsTotal: 1,
      currentUrl: RSS_URL,
      detail: "We Work Remotely: fetching RSS feed",
    });

    try {
      const response = await fetch(RSS_URL, {
        headers: { Accept: "application/rss+xml, application/xml, text/xml" },
      });
      if (!response.ok) {
        throw new Error(`WWR RSS failed with status ${response.status}`);
      }
      const xml = await response.text();
      const items = parseItems(xml);

      const seen = new Set<string>();
      const out: CreateJobInput[] = [];
      for (const item of items) {
        if (out.length >= cap) break;
        if (terms.length > 0 && !terms.some((t) => matchesTerm(item, t))) {
          continue;
        }
        const mapped = mapJob(item);
        if (!mapped) continue;
        const key = mapped.sourceJobId || mapped.jobUrl;
        if (seen.has(key)) continue;
        seen.add(key);
        out.push(mapped);
      }

      context.onProgress?.({
        phase: "list",
        termsProcessed: 1,
        termsTotal: 1,
        currentUrl: RSS_URL,
        jobPagesProcessed: out.length,
        detail: `We Work Remotely: ${out.length} matched from ${items.length} total`,
      });
      return { success: true, jobs: out };
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: [], error: message };
    }
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "weworkremotely-extractor",
"version": "0.0.1",
"type": "module",
"description": "We Work Remotely RSS feed extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -0,0 +1,25 @@
# workday-extractor
Public Workday career sites via the JSON CXS endpoint
`POST {tenantUrl}/wday/cxs/{tenant}/{site}/jobs`.
## Configuration
Set `workdayTenants` (or `WORKDAY_TENANTS` env). Each entry is either:
1. A career-site URL we'll auto-parse, e.g.
`https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite`
2. A JSON object with explicit fields:
```json
{
"company": "NVIDIA",
"tenantUrl": "https://nvidia.wd5.myworkdayjobs.com",
"tenant": "nvidia",
"site": "NVIDIAExternalCareerSite",
"locale": "en-US"
}
```
Multiple entries are separated by newlines or commas. Pipeline `searchTerms`
are passed as the request `searchText`.

View File

@ -0,0 +1,263 @@
/**
* Workday public career-site extractor.
*
* Workday tenants expose their public job board over a JSON CXS endpoint:
* POST {tenantUrl}/wday/cxs/{tenant}/{site}/jobs
* { appliedFacets: {}, limit: 20, offset: 0, searchText: "..." }
*
* `workdayTenants` accepts entries shaped as JSON objects (preferred) or as
* career-page URLs we parse on a best-effort basis. When we can't recover the
* tenant + site we skip the entry and continue.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/**
 * One configured Workday career site to query.
 */
interface WorkdayTarget {
// Used as the `employer` of every job mapped from this target.
company: string;
// Scheme + host of the career site; base for both the CXS endpoint and job URLs.
tenantUrl: string;
// Tenant path segment of the CXS endpoint (`/wday/cxs/{tenant}/{site}/jobs`).
tenant: string;
// Site path segment of the CXS endpoint and of public job URLs.
site: string;
// Locale path segment of public job URLs; `mapPosting` uses "en-US" when absent.
locale?: string;
}
/**
 * The subset of a CXS `jobPostings` entry that this extractor reads.
 */
interface WorkdayJobPosting {
title?: string;
// Appended to the site URL to form the job URL; also used as the source job id.
externalPath?: string;
// Mapped to the job's `location`.
locationsText?: string;
// Mapped to `datePosted` as-is.
postedOn?: string;
// The first non-empty entry is mapped to `jobType`.
bulletFields?: string[];
}
/**
 * Shape of the CXS jobs response as consumed by the pagination loop.
 */
interface WorkdayResponse {
// When numeric, used as the upper bound for the pagination offset.
total?: number;
// Postings for the requested page; an empty or missing list ends pagination.
jobPostings?: WorkdayJobPosting[];
}
/**
 * Return `value` trimmed when it is a non-blank string; otherwise
 * `undefined` (non-strings, empty and whitespace-only strings).
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const text = value.trim();
    if (text.length > 0) return text;
  }
  return undefined;
}
/**
 * Derive the Workday tenant from a career-site host name.
 *
 * @param host host name such as `acme.wd5.myworkdayjobs.com`
 * @returns the tenant (`acme`), or `null` when the host is not of that form
 */
function inferTenantFromHost(host: string): string | null {
  // host looks like `acme.wd5.myworkdayjobs.com` → tenant "acme"
  const match = host.match(/^([^.]+)\.wd\d+\.myworkdayjobs\.com$/i);
  return match ? match[1] : null;
}

/**
 * Parse one `workdayTenants` entry into a target.
 *
 * Accepted forms:
 *  1. A JSON object with string `company`, `tenantUrl`, `tenant` and `site`
 *     (optional string `locale`). Trailing slashes on `tenantUrl` are removed.
 *  2. A career-site URL on `*.wd<N>.myworkdayjobs.com`, with or without a
 *     leading locale segment, e.g.
 *       https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite
 *       https://nvidia.wd5.myworkdayjobs.com/NVIDIAExternalCareerSite
 *     Extra path segments after the site (such as `/job/...`) are ignored.
 *
 * @returns the target, or `null` when the entry cannot be interpreted
 */
function parseTargetEntry(entry: string): WorkdayTarget | null {
  const trimmed = entry.trim();
  if (!trimmed) return null;

  // First, try JSON.
  try {
    const parsed = JSON.parse(trimmed) as Partial<WorkdayTarget>;
    if (
      parsed &&
      typeof parsed.company === "string" &&
      typeof parsed.tenantUrl === "string" &&
      typeof parsed.tenant === "string" &&
      typeof parsed.site === "string"
    ) {
      return {
        company: parsed.company,
        tenantUrl: parsed.tenantUrl.replace(/\/+$/, ""),
        tenant: parsed.tenant,
        site: parsed.site,
        locale: typeof parsed.locale === "string" ? parsed.locale : undefined,
      };
    }
  } catch {
    // Not JSON: fall through to URL parsing.
  }

  // URL form.
  try {
    const url = new URL(trimmed);
    // Use hostname (no port) so an explicit port does not defeat the match.
    const tenant = inferTenantFromHost(url.hostname);
    if (!tenant) return null;

    const segments = url.pathname.split("/").filter(Boolean);
    // The locale segment (e.g. `en-US`) is optional in Workday URLs; only
    // treat the first segment as a locale when it actually looks like one,
    // otherwise `/Site/job/...` would be read as locale=Site, site=job.
    const first = segments[0];
    const hasLocale = first !== undefined && /^[a-z]{2}-[a-z]{2}$/i.test(first);
    const locale = hasLocale ? first : undefined;
    const site = hasLocale ? segments[1] : first;
    if (!site) return null;

    return {
      company: tenant,
      tenantUrl: `${url.protocol}//${url.host}`,
      tenant,
      site,
      locale,
    };
  } catch {
    return null;
  }
}
/**
 * Break the raw `workdayTenants` value into individual entry strings.
 *
 * Supported layouts, tried in order:
 *  - a JSON array of strings and/or objects (what the settings store writes);
 *  - a single JSON object, possibly pretty-printed over several lines;
 *  - plain text with one entry per line, where a line that is not a JSON
 *    object may hold several comma-separated URLs.
 */
function splitTargetEntries(raw: string): string[] {
  const text = raw.trim();
  if (!text) return [];

  try {
    const parsed: unknown = JSON.parse(text);
    if (Array.isArray(parsed)) {
      return parsed
        .map((entry) =>
          typeof entry === "string" ? entry : JSON.stringify(entry),
        )
        .filter(Boolean);
    }
    if (parsed && typeof parsed === "object") {
      // A lone object would otherwise be split line-by-line into fragments.
      return [JSON.stringify(parsed)];
    }
  } catch {
    // Not a single JSON document: treat as line / comma separated text.
  }

  return text.split(/\n+/).flatMap((line) => {
    const item = line.trim();
    if (!item) return [];
    // JSON objects contain commas themselves, so never comma-split them;
    // only drop a trailing separator comma.
    if (item.startsWith("{")) return [item.replace(/,\s*$/, "")];
    return item
      .split(",")
      .map((part) => part.trim())
      .filter(Boolean);
  });
}

/**
 * Parse the `workdayTenants` setting into the list of targets to query.
 * Entries that cannot be interpreted are skipped.
 *
 * @param raw raw setting value; `undefined` or empty yields no targets
 */
function readTargets(raw: string | undefined): WorkdayTarget[] {
  if (!raw) return [];
  const out: WorkdayTarget[] = [];
  for (const entry of splitTargetEntries(raw)) {
    const target = parseTargetEntry(entry);
    if (target) out.push(target);
  }
  return out;
}
/**
 * Convert one Workday posting into a `CreateJobInput` for the given target.
 *
 * Returns `null` when the posting has no usable `externalPath`, because
 * both the job URL and the source job id are derived from it.
 */
function mapPosting(
  posting: WorkdayJobPosting,
  target: WorkdayTarget,
): CreateJobInput | null {
  const path = asString(posting.externalPath);
  if (path === undefined) return null;

  const localeSegment = target.locale ?? "en-US";
  const postingUrl = [
    target.tenantUrl,
    localeSegment,
    `${target.site}${path}`,
  ].join("/");

  // NOTE(review): the first non-empty bullet field is reported as the job
  // type; confirm this is what the tenant actually puts there.
  const firstBullet = posting.bulletFields?.find((field) => field?.length);

  return {
    source: "workday",
    sourceJobId: path,
    title: asString(posting.title) ?? "Unknown Title",
    employer: target.company,
    jobUrl: postingUrl,
    applicationLink: postingUrl,
    location: asString(posting.locationsText),
    datePosted: asString(posting.postedOn),
    jobType: firstBullet?.trim(),
  };
}
/**
 * POST one search request to a tenant's CXS jobs endpoint.
 *
 * @param args.target     career site to query
 * @param args.searchText free-text query sent as `searchText`
 * @param args.offset     zero-based index of the first posting to return
 * @param args.limit      page size
 * @returns the parsed JSON body
 * @throws Error naming the company when the response status is not 2xx
 */
async function fetchPage(args: {
  target: WorkdayTarget;
  searchText: string;
  offset: number;
  limit: number;
}): Promise<WorkdayResponse> {
  const { target, searchText, offset, limit } = args;
  const tenantSegment = encodeURIComponent(target.tenant);
  const siteSegment = encodeURIComponent(target.site);
  const endpoint = `${target.tenantUrl}/wday/cxs/${tenantSegment}/${siteSegment}/jobs`;

  const payload = JSON.stringify({
    appliedFacets: {},
    limit,
    offset,
    searchText,
  });
  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Accept: "application/json",
    },
    body: payload,
  });

  if (response.ok) {
    return (await response.json()) as WorkdayResponse;
  }
  throw new Error(
    `Workday request for "${target.company}" failed with status ${response.status}`,
  );
}
/**
 * Workday extractor manifest.
 *
 * `run` queries every configured tenant once per pipeline search term (or
 * once with an empty search text when no terms are configured), paging
 * through results 20 at a time up to an offset of 1000, and de-duplicates
 * postings by source job id / URL.
 *
 * Tenants are read from the `workdayTenants` setting, falling back to the
 * `WORKDAY_TENANTS` environment variable when the setting yields none.
 *
 * A failure on one tenant is recorded and the remaining tenants are still
 * processed. The run is reported as failed only when nothing was collected
 * and at least one tenant errored; otherwise errors are attached to a
 * successful result.
 */
export const manifest: ExtractorManifest = {
  id: "workday",
  displayName: "Workday (ATS)",
  providesSources: ["workday"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    let targets = readTargets(context.settings.workdayTenants);
    if (targets.length === 0) {
      // The README and the message below both advertise WORKDAY_TENANTS, so
      // honour it when the setting provides no usable entries.
      targets = readTargets(process.env.WORKDAY_TENANTS);
    }
    if (targets.length === 0) {
      return {
        success: true,
        jobs: [],
        error:
          "No Workday tenants configured. Set WORKDAY_TENANTS or the workdayTenants setting to a list of career-site URLs (or JSON entries with company/tenantUrl/tenant/site).",
      };
    }

    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    const limit = 20;
    const errors: string[] = [];

    for (let t = 0; t < targets.length; t += 1) {
      if (context.shouldCancel?.()) break;
      const target = targets[t];
      try {
        for (let i = 0; i < terms.length; i += 1) {
          if (context.shouldCancel?.()) break;
          const term = terms[i].trim();
          context.onProgress?.({
            phase: "list",
            termsProcessed: t * terms.length + i,
            termsTotal: targets.length * terms.length,
            currentUrl: `${target.company} (${term || "all"})`,
            detail: `Workday: ${target.company} term ${i + 1}/${terms.length}`,
          });

          let offset = 0;
          // Unknown until a response reports `total`.
          let total = Number.POSITIVE_INFINITY;
          while (offset < total && offset < 1000) {
            if (context.shouldCancel?.()) break;
            const body = await fetchPage({
              target,
              searchText: term,
              offset,
              limit,
            });
            if (typeof body.total === "number") total = body.total;
            const postings = Array.isArray(body.jobPostings)
              ? body.jobPostings
              : [];
            if (postings.length === 0) break;

            for (const posting of postings) {
              const mapped = mapPosting(posting, target);
              if (!mapped) continue;
              const key = mapped.sourceJobId || mapped.jobUrl;
              if (seen.has(key)) continue;
              seen.add(key);
              out.push(mapped);
            }

            offset += postings.length;
            // A short page means there is nothing further to fetch.
            if (postings.length < limit) break;
          }
        }
      } catch (error) {
        const message =
          error instanceof Error ? error.message : "Unknown error";
        errors.push(`${target.company}: ${message}`);
      }
    }

    if (out.length === 0 && errors.length > 0) {
      return { success: false, jobs: out, error: errors.join("; ") };
    }
    return {
      success: true,
      jobs: out,
      error: errors.length > 0 ? errors.join("; ") : undefined,
    };
  },
};
export default manifest;

View File

@ -0,0 +1,17 @@
{
"name": "workday-extractor",
"version": "0.0.1",
"type": "module",
"description": "Workday public career-site extractor",
"main": "manifest.ts",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
},
"scripts": {
"check:types": "tsc --noEmit"
}
}

View File

@ -0,0 +1,17 @@
{
"compilerOptions": {
"module": "ESNext",
"moduleResolution": "bundler",
"target": "ES2022",
"outDir": "dist",
"strict": true,
"noUnusedLocals": false,
"lib": ["ES2022", "DOM"],
"types": ["node"],
"baseUrl": ".",
"paths": {
"@shared/*": ["../../shared/src/*"]
}
},
"include": ["./manifest.ts", "./src/**/*"]
}

View File

@ -1,6 +1,6 @@
{ {
"name": "job-ops-orchestrator", "name": "job-ops-orchestrator",
"version": "0.2.0", "version": "0.2.1",
"type": "module", "type": "module",
"description": "Unified orchestrator for job application pipeline", "description": "Unified orchestrator for job application pipeline",
"main": "src/server/index.ts", "main": "src/server/index.ts",

View File

@ -502,11 +502,33 @@ async function fetchApi<T>(
options?: RequestInit, options?: RequestInit,
): Promise<T> { ): Promise<T> {
const method = (options?.method || "GET").toUpperCase(); const method = (options?.method || "GET").toUpperCase();
let authHeader = cachedBasicAuthCredentials const activeCreds = getActiveBasicAuthCredentials();
? encodeBasicAuthHeaderValue(cachedBasicAuthCredentials) let authHeader = activeCreds ? encodeBasicAuthHeaderValue(activeCreds) : undefined;
: undefined;
let authAttempt = 0; let authAttempt = 0;
let usernameHint = cachedBasicAuthCredentials?.username; let usernameHint = activeCreds?.username;
const shouldPromptForAuth = (args: {
method: string;
endpoint: string;
response: Response;
parsed: ApiResponse<unknown> | LegacyApiResponse<unknown>;
}): boolean => {
if (!basicAuthPromptHandler) return false;
if (authAttempt >= 2) return false;
if (!isUnauthorizedResponse(args.response, args.parsed)) return false;
// By default we only prompt for write methods. However, some parts of the UI
// (Settings/Profile) require auth even for reads. Without prompting, those
// screens appear to "forget" saved data after refresh.
if (isWriteMethod(args.method)) return true;
const readAuthEndpoints = new Set<string>([
"/settings",
"/profile",
"/profiles",
]);
return readAuthEndpoints.has(args.endpoint);
};
while (true) { while (true) {
const { response, parsed } = await fetchAndParse( const { response, parsed } = await fetchAndParse(
@ -515,12 +537,7 @@ async function fetchApi<T>(
authHeader, authHeader,
); );
if ( if (shouldPromptForAuth({ method, endpoint, response, parsed })) {
isWriteMethod(method) &&
isUnauthorizedResponse(response, parsed) &&
basicAuthPromptHandler &&
authAttempt < 2
) {
const credentials = await requestBasicAuthCredentials({ const credentials = await requestBasicAuthCredentials({
endpoint, endpoint,
method, method,

View File

@ -21,6 +21,12 @@ describe("orchestrator utils", () => {
expect(getEnabledSources(createAppSettings())).toContain("startupjobs"); expect(getEnabledSources(createAppSettings())).toContain("startupjobs");
}); });
it("enables jobicy and themuse without credentials", () => {
const enabled = getEnabledSources(createAppSettings());
expect(enabled).toContain("jobicy");
expect(enabled).toContain("themuse");
});
it("counts processing jobs in ready and discovered tabs", () => { it("counts processing jobs in ready and discovered tabs", () => {
const jobs = [ const jobs = [
createJob({ id: "ready", status: "ready", closedAt: null }), createJob({ id: "ready", status: "ready", closedAt: null }),

View File

@ -195,6 +195,21 @@ export const getEnabledSources = (
const hasAdzunaAuth = Boolean( const hasAdzunaAuth = Boolean(
settings.adzunaAppId?.trim() && settings.adzunaAppKeyHint, settings.adzunaAppId?.trim() && settings.adzunaAppKeyHint,
); );
const hasUsajobsAuth = Boolean(
settings.usajobsUserAgent?.trim() && settings.usajobsApiKeyHint,
);
const hasJoobleAuth = Boolean(settings.joobleApiKeyHint);
const hasCareerjetAuth = Boolean(
settings.careerjetAffid?.trim() &&
settings.careerjetReferer?.trim() &&
settings.careerjetUserIp?.trim(),
);
const hasReedAuth = Boolean(settings.reedApiKeyHint);
const hasLeverCompanies = (settings.leverCompanies?.value ?? []).length > 0;
const hasAshbyCompanies = (settings.ashbyCompanies?.value ?? []).length > 0;
const hasGreenhouseCompanies =
(settings.greenhouseCompanies?.value ?? []).length > 0;
const hasWorkdayTenants = (settings.workdayTenants?.value ?? []).length > 0;
for (const source of orderedSources) { for (const source of orderedSources) {
if (source === "gradcracker") { if (source === "gradcracker") {
@ -209,6 +224,22 @@ export const getEnabledSources = (
if (hasAdzunaAuth) enabled.push(source); if (hasAdzunaAuth) enabled.push(source);
continue; continue;
} }
if (source === "usajobs") {
if (hasUsajobsAuth) enabled.push(source);
continue;
}
if (source === "jooble") {
if (hasJoobleAuth) enabled.push(source);
continue;
}
if (source === "careerjet") {
if (hasCareerjetAuth) enabled.push(source);
continue;
}
if (source === "reed") {
if (hasReedAuth) enabled.push(source);
continue;
}
if (source === "hiringcafe") { if (source === "hiringcafe") {
enabled.push(source); enabled.push(source);
continue; continue;
@ -217,10 +248,45 @@ export const getEnabledSources = (
enabled.push(source); enabled.push(source);
continue; continue;
} }
if (source === "jobicy") {
enabled.push(source);
continue;
}
if (source === "themuse") {
enabled.push(source);
continue;
}
if (source === "lever") {
if (hasLeverCompanies) enabled.push(source);
continue;
}
if (source === "ashby") {
if (hasAshbyCompanies) enabled.push(source);
continue;
}
if (source === "greenhouse") {
if (hasGreenhouseCompanies) enabled.push(source);
continue;
}
if (source === "workday") {
if (hasWorkdayTenants) enabled.push(source);
continue;
}
if ( if (
source === "indeed" || source === "indeed" ||
source === "linkedin" || source === "linkedin" ||
source === "glassdoor" source === "glassdoor"
) {
enabled.push(source);
continue;
}
if (
source === "remoteok" ||
source === "remotive" ||
source === "arbeitnow" ||
source === "himalayas" ||
source === "weworkremotely" ||
source === "fourdayweek"
) { ) {
enabled.push(source); enabled.push(source);
} }

View File

@ -1,4 +1,4 @@
import { badRequest, forbidden } from "@infra/errors"; import { badRequest, forbidden, unauthorized } from "@infra/errors";
import { asyncRoute, fail, ok } from "@infra/http"; import { asyncRoute, fail, ok } from "@infra/http";
import { logger } from "@infra/logger"; import { logger } from "@infra/logger";
import { import {
@ -28,10 +28,11 @@ function profileMatchesBasicAuthUser(
function assertProfileVisibleToRequest( function assertProfileVisibleToRequest(
req: Request, req: Request,
profile: SearchProfile, profile: SearchProfile,
): boolean { ): true | "unauthorized" | "forbidden" {
if (!isBasicAuthEnabled()) return true; if (!isBasicAuthEnabled()) return true;
const username = parseBasicAuthUsername(req.headers.authorization); const username = parseBasicAuthUsername(req.headers.authorization);
return profileMatchesBasicAuthUser(profile, username); if (!username?.trim()) return "unauthorized";
return profileMatchesBasicAuthUser(profile, username) ? true : "forbidden";
} }
profilesRouter.get( profilesRouter.get(
@ -53,7 +54,11 @@ profilesRouter.get(
if (!profile) { if (!profile) {
return fail(res, badRequest("Profile not found")); return fail(res, badRequest("Profile not found"));
} }
if (!assertProfileVisibleToRequest(req, profile)) { const visible = assertProfileVisibleToRequest(req, profile);
if (visible === "unauthorized") {
return fail(res, unauthorized("Authentication required"));
}
if (visible === "forbidden") {
return fail(res, forbidden("You cannot access this profile")); return fail(res, forbidden("You cannot access this profile"));
} }
return ok(res, profile); return ok(res, profile);
@ -93,7 +98,11 @@ profilesRouter.patch(
if (!existing) { if (!existing) {
return fail(res, badRequest("Profile not found")); return fail(res, badRequest("Profile not found"));
} }
if (!assertProfileVisibleToRequest(req, existing)) { const visible = assertProfileVisibleToRequest(req, existing);
if (visible === "unauthorized") {
return fail(res, unauthorized("Authentication required"));
}
if (visible === "forbidden") {
return fail(res, forbidden("You cannot update this profile")); return fail(res, forbidden("You cannot update this profile"));
} }
const { name, data } = req.body; const { name, data } = req.body;
@ -139,7 +148,11 @@ profilesRouter.delete(
if (!existing) { if (!existing) {
return fail(res, badRequest("Profile not found")); return fail(res, badRequest("Profile not found"));
} }
if (!assertProfileVisibleToRequest(req, existing)) { const visible = assertProfileVisibleToRequest(req, existing);
if (visible === "unauthorized") {
return fail(res, unauthorized("Authentication required"));
}
if (visible === "forbidden") {
return fail(res, forbidden("You cannot delete this profile")); return fail(res, forbidden("You cannot delete this profile"));
} }
const deleted = await profilesRepo.deleteProfile(req.params.id); const deleted = await profilesRepo.deleteProfile(req.params.id);
@ -157,7 +170,11 @@ profilesRouter.post(
if (!profile) { if (!profile) {
return fail(res, badRequest("Profile not found")); return fail(res, badRequest("Profile not found"));
} }
if (!assertProfileVisibleToRequest(req, profile)) { const visible = assertProfileVisibleToRequest(req, profile);
if (visible === "unauthorized") {
return fail(res, unauthorized("Authentication required"));
}
if (visible === "forbidden") {
return fail(res, forbidden("You cannot activate this profile")); return fail(res, forbidden("You cannot activate this profile"));
} }
const basicUser = parseBasicAuthUsername(req.headers.authorization)?.trim(); const basicUser = parseBasicAuthUsername(req.headers.authorization)?.trim();

View File

@ -10,6 +10,10 @@ import { asyncRoute, fail, ok } from "@infra/http";
import { logger } from "@infra/logger"; import { logger } from "@infra/logger";
import { getRequestId } from "@infra/request-context"; import { getRequestId } from "@infra/request-context";
import { isDemoMode, sendDemoBlocked } from "@server/config/demo"; import { isDemoMode, sendDemoBlocked } from "@server/config/demo";
import { getJobOwnerProfileId } from "@server/infra/request-context";
import { DEFAULT_JOB_OWNER_PROFILE_ID } from "@server/infra/job-owner-context";
import { parseBasicAuthUsername } from "@server/infra/basic-auth-credentials";
import * as profilesRepo from "@server/repositories/profiles";
import { getSetting } from "@server/repositories/settings"; import { getSetting } from "@server/repositories/settings";
import { setBackupSettings } from "@server/services/backup/index"; import { setBackupSettings } from "@server/services/backup/index";
import { LlmService } from "@server/services/llm/service"; import { LlmService } from "@server/services/llm/service";
@ -30,6 +34,7 @@ import {
type UpdateSettingsInput, type UpdateSettingsInput,
updateSettingsSchema, updateSettingsSchema,
} from "@shared/settings-schema"; } from "@shared/settings-schema";
import { jobSearchProfileSchema } from "@shared/settings-registry";
import { type Request, type Response, Router } from "express"; import { type Request, type Response, Router } from "express";
export const settingsRouter = Router(); export const settingsRouter = Router();
@ -232,6 +237,28 @@ settingsRouter.patch(
clearProfileCache(); clearProfileCache();
} }
// When Basic Auth is enabled, the effective job search profile comes from
// the authenticated user's saved SearchProfile row (basicAuthUser), not the
// `jobSearchProfile` setting override. If the client is saving a
// jobSearchProfile, persist it directly to the request owner profile so the
// next settings fetch immediately reflects the changes (and doesn't "snap back").
if (Object.hasOwn(input, "jobSearchProfile") && input.jobSearchProfile) {
const ownerId = getJobOwnerProfileId();
if (
ownerId &&
ownerId !== DEFAULT_JOB_OWNER_PROFILE_ID &&
ownerId !== "__unmapped__"
) {
const parsed = jobSearchProfileSchema.safeParse(input.jobSearchProfile);
if (parsed.success) {
const username = parseBasicAuthUsername(req.headers.authorization)?.trim();
const dataWithOwner =
username ? { ...parsed.data, basicAuthUser: username } : parsed.data;
await profilesRepo.updateProfile(ownerId, { data: dataWithOwner });
}
}
}
const data = await getEffectiveSettings(); const data = await getEffectiveSettings();
if (plan.shouldRefreshBackupScheduler) { if (plan.shouldRefreshBackupScheduler) {

View File

@ -256,6 +256,22 @@ export const DEMO_SOURCE_BASE_URLS: Record<JobSource, string> = {
adzuna: "https://www.adzuna.com", adzuna: "https://www.adzuna.com",
hiringcafe: "https://hiring.cafe", hiringcafe: "https://hiring.cafe",
startupjobs: "https://startup.jobs", startupjobs: "https://startup.jobs",
usajobs: "https://www.usajobs.gov",
jobicy: "https://jobicy.com",
themuse: "https://www.themuse.com",
jooble: "https://jooble.org",
careerjet: "https://www.careerjet.com",
reed: "https://www.reed.co.uk",
remoteok: "https://remoteok.com",
remotive: "https://remotive.com",
arbeitnow: "https://www.arbeitnow.com",
himalayas: "https://himalayas.app",
weworkremotely: "https://weworkremotely.com",
fourdayweek: "https://4dayweek.io",
ashby: "https://jobs.ashbyhq.com",
lever: "https://jobs.lever.co",
greenhouse: "https://boards.greenhouse.io",
workday: "https://workday.com",
manual: "https://example.com", manual: "https://example.com",
}; };

View File

@ -781,6 +781,13 @@ const migrations = [
`CREATE INDEX IF NOT EXISTS idx_jobs_status_discovered_at ON jobs(status, discovered_at)`, `CREATE INDEX IF NOT EXISTS idx_jobs_status_discovered_at ON jobs(status, discovered_at)`,
`CREATE INDEX IF NOT EXISTS idx_jobs_owner_profile_id ON jobs(owner_profile_id)`, `CREATE INDEX IF NOT EXISTS idx_jobs_owner_profile_id ON jobs(owner_profile_id)`,
// Cross-source dedup: store a normalized (employer, title) fingerprint and
// index it per owner so import-time skip lookups are fast. Backfill happens
// lazily in the repository on next insert; existing rows just get NULL until
// they're re-imported or rewritten.
`ALTER TABLE jobs ADD COLUMN content_fingerprint TEXT`,
`CREATE INDEX IF NOT EXISTS idx_jobs_owner_profile_content_fingerprint ON jobs(owner_profile_id, content_fingerprint)`,
// Seed default job-search personas (INSERT OR IGNORE — safe on existing DBs). // Seed default job-search personas (INSERT OR IGNORE — safe on existing DBs).
sqlInsertSearchProfileSeed({ sqlInsertSearchProfileSeed({
id: "685b0000-0000-4000-8000-000000000001", id: "685b0000-0000-4000-8000-000000000001",

View File

@ -45,6 +45,12 @@ export const jobs = sqliteTable(
employer: text("employer").notNull(), employer: text("employer").notNull(),
employerUrl: text("employer_url"), employerUrl: text("employer_url"),
jobUrl: text("job_url").notNull(), jobUrl: text("job_url").notNull(),
/**
* Cross-source dedup key derived from normalized (employer, title).
* Nullable because some legacy rows / very thin postings can't produce a
* fingerprint. New imports skip when this matches an existing row.
*/
contentFingerprint: text("content_fingerprint"),
applicationLink: text("application_link"), applicationLink: text("application_link"),
disciplines: text("disciplines"), disciplines: text("disciplines"),
deadline: text("deadline"), deadline: text("deadline"),
@ -126,6 +132,9 @@ export const jobs = sqliteTable(
table.ownerProfileId, table.ownerProfileId,
table.jobUrl, table.jobUrl,
), ),
ownerContentFingerprintIndex: index(
"idx_jobs_owner_profile_content_fingerprint",
).on(table.ownerProfileId, table.contentFingerprint),
}), }),
); );

View File

@ -12,6 +12,7 @@ import {
normalizeCountryKey, normalizeCountryKey,
} from "@shared/location-support.js"; } from "@shared/location-support.js";
import { resolveBlockedCompanyKeywordsFromStoredString } from "@shared/resolve-blocked-company-keywords.js"; import { resolveBlockedCompanyKeywordsFromStoredString } from "@shared/resolve-blocked-company-keywords.js";
import { jobSearchProfileSchema } from "@shared/settings-registry.js";
import { import {
inferCountryKeyFromSearchGeography, inferCountryKeyFromSearchGeography,
matchesRequestedCity, matchesRequestedCity,
@ -67,6 +68,127 @@ function filterJobsByRequestedCities(args: {
); );
} }
/**
 * Tokens discarded when a target-role phrase is split into single-word
 * matchers. Members are compared against lower-cased tokens.
 */
const ROLE_TOKEN_STOPWORDS = new Set([
  // Articles, conjunctions and prepositions.
  ...["a", "an", "and", "the", "of", "to", "for", "in", "on", "with"],
  ...["at", "by", "from"],
  // Seniority / level qualifiers.
  ...["senior", "sr", "jr", "junior", "lead", "principal", "staff"],
  // Roman-numeral level suffixes.
  ...["i", "ii", "iii", "iv", "v"],
  // Work-arrangement words.
  ...["remote", "hybrid", "onsite"],
  // These are too generic and cause massive false positives.
  ...["software", "development", "developer", "engineer", "engineering"],
]);
/**
 * Canonicalizes free text for case-insensitive substring comparison.
 *
 * @param value Text to normalize; `null` / `undefined` are treated as "".
 * @returns The lower-cased text with every whitespace run collapsed to a
 *   single space and leading/trailing whitespace removed.
 */
function normalizeText(value: string | null | undefined): string {
  if (value == null) return "";
  const lowered = value.toLowerCase();
  const singleSpaced = lowered.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
/**
 * Builds the substring matchers used to decide whether a posting fits the
 * requested target roles.
 *
 * @param phrases Raw target-role phrases, in any casing / spacing.
 * @returns `phraseMatchers` — the normalized, non-empty phrases;
 *   `tokenMatchers` — the distinct tokens of those phrases, excluding
 *   stopwords and tokens shorter than two characters.
 */
function buildRoleMatchers(phrases: string[]): {
  phraseMatchers: string[];
  tokenMatchers: string[];
} {
  const phraseMatchers = phrases
    .map((p) => normalizeText(p))
    .filter(Boolean);

  const tokenSet = new Set<string>();
  for (const phrase of phraseMatchers) {
    // Keep "+", "." and "#" inside tokens so "c++", "c#" and "node.js" survive.
    for (const token of phrase.split(/[^a-z0-9+.#]+/g)) {
      const cleaned = token.trim();
      if (cleaned.length < 2) continue;
      if (ROLE_TOKEN_STOPWORDS.has(cleaned)) continue;
      tokenSet.add(cleaned);
    }
  }

  // Expand QA shorthand only when the requested roles are QA/testing related.
  // Adding these tokens unconditionally made every profile (e.g. "Data
  // Scientist") accept any posting that merely mentions "test" or
  // "automation", which defeats the strict role filter.
  const qaHint =
    /\b(?:qa|sdet|quality assurance|test(?:s|er|ers|ing)?|automation)\b/;
  if (phraseMatchers.some((phrase) => qaHint.test(phrase))) {
    for (const token of ["qa", "sdet", "test", "testing", "automation"]) {
      tokenSet.add(token);
    }
  }

  return { phraseMatchers, tokenMatchers: [...tokenSet] };
}
/**
 * Reports whether `text` contains at least one of `needles` as a substring.
 *
 * @param text Haystack; an empty haystack never matches.
 * @param needles Candidate substrings; empty entries are ignored.
 * @returns `true` on the first non-empty needle found in `text`, else `false`.
 */
function matchesAny(text: string, needles: string[]): boolean {
  if (!text) return false;
  return needles.some(
    (candidate) => Boolean(candidate) && text.includes(candidate),
  );
}
/**
 * Applies the search profile's role, must-have and deal-breaker rules to a
 * list of discovered jobs. All comparisons are substring checks on the
 * normalized title and description.
 *
 * A job is kept only when:
 *  - no deal-breaker appears in its title or description,
 *  - if any target-role phrase was supplied, its title or description contains
 *    a role phrase or role token,
 *  - if any must-have skill was supplied, at least one of them appears in its
 *    title or description.
 *
 * @returns The surviving jobs and how many were removed.
 */
function filterJobsBySearchProfile(args: {
  jobs: CreateJobInput[];
  targetRolePhrases: string[];
  mustHaveSkills: string[];
  dealBreakers: string[];
}): { jobs: CreateJobInput[]; dropped: number } {
  const { phraseMatchers, tokenMatchers } = buildRoleMatchers(
    args.targetRolePhrases,
  );
  const requiredSkills = args.mustHaveSkills.map(normalizeText).filter(Boolean);
  const blockedTerms = args.dealBreakers.map(normalizeText).filter(Boolean);
  // Role matching is enforced only when at least one usable phrase was given.
  const enforceRoleMatch = phraseMatchers.length > 0;

  const kept: CreateJobInput[] = [];
  for (const job of args.jobs) {
    const title = normalizeText(job.title);
    const body = normalizeText(job.jobDescription);
    const haystack = `${title}\n${body}`;

    // An empty deal-breaker list can never match, so no length guard is needed.
    if (matchesAny(haystack, blockedTerms)) continue;

    if (enforceRoleMatch) {
      const fitsRole = [title, body].some(
        (text) =>
          matchesAny(text, phraseMatchers) || matchesAny(text, tokenMatchers),
      );
      if (!fitsRole) continue;
    }

    // Must-haves only filter when the list is non-empty.
    const lacksRequiredSkill =
      requiredSkills.length > 0 && !matchesAny(haystack, requiredSkills);
    if (lacksRequiredSkill) continue;

    kept.push(job);
  }

  return { jobs: kept, dropped: args.jobs.length - kept.length };
}
export async function discoverJobsStep(args: { export async function discoverJobsStep(args: {
mergedConfig: PipelineConfig; mergedConfig: PipelineConfig;
shouldCancel?: () => boolean; shouldCancel?: () => boolean;
@ -98,6 +220,9 @@ export async function discoverJobsStep(args: {
const ownerProfileId = const ownerProfileId =
args.mergedConfig.ownerProfileId ?? DEFAULT_JOB_OWNER_PROFILE_ID; args.mergedConfig.ownerProfileId ?? DEFAULT_JOB_OWNER_PROFILE_ID;
let searchProfileTargetRoles: string[] = [];
let searchProfileMustHaveSkills: string[] = [];
let searchProfileDealBreakers: string[] = [];
const mergeTargetRoles = (targetRoles: unknown) => { const mergeTargetRoles = (targetRoles: unknown) => {
if (!Array.isArray(targetRoles) || targetRoles.length === 0) return; if (!Array.isArray(targetRoles) || targetRoles.length === 0) return;
@ -120,19 +245,38 @@ export async function discoverJobsStep(args: {
if (ownerProfileId && ownerProfileId !== DEFAULT_JOB_OWNER_PROFILE_ID) { if (ownerProfileId && ownerProfileId !== DEFAULT_JOB_OWNER_PROFILE_ID) {
const row = await getProfileById(ownerProfileId); const row = await getProfileById(ownerProfileId);
if (row?.data?.targetRoles?.length) { if (row?.data) {
mergeTargetRoles(row.data.targetRoles); const parsed = jobSearchProfileSchema.safeParse(row.data);
if (parsed.success) {
searchProfileTargetRoles = parsed.data.targetRoles ?? [];
searchProfileMustHaveSkills = parsed.data.mustHaveSkills ?? [];
searchProfileDealBreakers = parsed.data.dealBreakers ?? [];
if (searchProfileTargetRoles.length > 0) {
mergeTargetRoles(searchProfileTargetRoles);
}
} else if (row.data.targetRoles?.length) {
// Legacy profile shapes: keep augmenting terms but we won't enforce strict filtering.
mergeTargetRoles(row.data.targetRoles);
}
} }
} else { } else {
const profileSetting = settings.jobSearchProfile; const profileSetting = settings.jobSearchProfile;
if (profileSetting) { if (profileSetting) {
try { try {
const profile = JSON.parse(profileSetting); const profile = JSON.parse(profileSetting);
if ( const parsed = jobSearchProfileSchema.safeParse(profile);
Array.isArray(profile.targetRoles) && if (parsed.success) {
profile.targetRoles.length > 0 searchProfileTargetRoles = parsed.data.targetRoles ?? [];
searchProfileMustHaveSkills = parsed.data.mustHaveSkills ?? [];
searchProfileDealBreakers = parsed.data.dealBreakers ?? [];
if (searchProfileTargetRoles.length > 0) {
mergeTargetRoles(searchProfileTargetRoles);
}
} else if (
Array.isArray((profile as { targetRoles?: unknown }).targetRoles) &&
(profile as { targetRoles: unknown[] }).targetRoles.length > 0
) { ) {
mergeTargetRoles(profile.targetRoles); mergeTargetRoles((profile as { targetRoles: unknown }).targetRoles);
} }
} catch { } catch {
// malformed profile JSON, continue with existing terms // malformed profile JSON, continue with existing terms
@ -406,7 +550,32 @@ export async function discoverJobsStep(args: {
return { discoveredJobs: filteredDiscoveredJobs, sourceErrors }; return { discoveredJobs: filteredDiscoveredJobs, sourceErrors };
} }
if (filteredDiscoveredJobs.length === 0 && sourceErrors.length > 0) { const strictProfileFilteringEnabled =
searchProfileTargetRoles.length > 0 ||
searchProfileMustHaveSkills.length > 0 ||
searchProfileDealBreakers.length > 0;
const profileFiltered = strictProfileFilteringEnabled
? filterJobsBySearchProfile({
jobs: filteredDiscoveredJobs,
targetRolePhrases: searchProfileTargetRoles.length
? searchProfileTargetRoles
: searchTerms,
mustHaveSkills: searchProfileMustHaveSkills,
dealBreakers: searchProfileDealBreakers,
})
: { jobs: filteredDiscoveredJobs, dropped: 0 };
if (profileFiltered.dropped > 0) {
logger.info("Dropped discovered jobs that didn't match search profile", {
step: "discover-jobs",
droppedCount: profileFiltered.dropped,
targetRolesCount: searchProfileTargetRoles.length,
mustHaveSkillsCount: searchProfileMustHaveSkills.length,
dealBreakersCount: searchProfileDealBreakers.length,
});
}
if (profileFiltered.jobs.length === 0 && sourceErrors.length > 0) {
throw new Error(`All sources failed: ${sourceErrors.join("; ")}`); throw new Error(`All sources failed: ${sourceErrors.join("; ")}`);
} }
@ -414,9 +583,9 @@ export async function discoverJobsStep(args: {
logger.warn("Some discovery sources failed", { sourceErrors }); logger.warn("Some discovery sources failed", { sourceErrors });
} }
progressHelpers.crawlingComplete(filteredDiscoveredJobs.length); progressHelpers.crawlingComplete(profileFiltered.jobs.length);
const stamped = filteredDiscoveredJobs.map((job) => ({ const stamped = profileFiltered.jobs.map((job) => ({
...job, ...job,
ownerProfileId, ownerProfileId,
})); }));

View File

@ -5,6 +5,7 @@
import { randomUUID } from "node:crypto"; import { randomUUID } from "node:crypto";
import { getJobOwnerProfileId } from "@infra/request-context"; import { getJobOwnerProfileId } from "@infra/request-context";
import { DEFAULT_JOB_OWNER_PROFILE_ID } from "@server/infra/job-owner-context"; import { DEFAULT_JOB_OWNER_PROFILE_ID } from "@server/infra/job-owner-context";
import { buildJobContentFingerprint } from "@shared/job-fingerprint";
import { canonicalizeJobUrl } from "@shared/job-url-canonical"; import { canonicalizeJobUrl } from "@shared/job-url-canonical";
import type { import type {
CreateJobInput, CreateJobInput,
@ -40,12 +41,16 @@ function resolveOwnerForCreate(input: CreateJobInput): string {
async function loadJobDedupIndexes(ownerProfileId: string): Promise<{ async function loadJobDedupIndexes(ownerProfileId: string): Promise<{
existingCanonicalSet: Set<string>; existingCanonicalSet: Set<string>;
existingSourceJobKeySet: Set<string>; existingSourceJobKeySet: Set<string>;
existingContentFingerprintSet: Set<string>;
}> { }> {
const rows = await db const rows = await db
.select({ .select({
jobUrl: jobs.jobUrl, jobUrl: jobs.jobUrl,
source: jobs.source, source: jobs.source,
sourceJobId: jobs.sourceJobId, sourceJobId: jobs.sourceJobId,
contentFingerprint: jobs.contentFingerprint,
employer: jobs.employer,
title: jobs.title,
}) })
.from(jobs) .from(jobs)
.where(eq(jobs.ownerProfileId, ownerProfileId)); .where(eq(jobs.ownerProfileId, ownerProfileId));
@ -60,7 +65,29 @@ async function loadJobDedupIndexes(ownerProfileId: string): Promise<{
) )
.map((r) => sourceJobKey(r.source, String(r.sourceJobId))), .map((r) => sourceJobKey(r.source, String(r.sourceJobId))),
); );
return { existingCanonicalSet, existingSourceJobKeySet }; // Cross-source dedup: prefer the persisted fingerprint, but fall back to
// recomputing it from (employer, title) so legacy rows participate in
// dedup until they're rewritten.
const existingContentFingerprintSet = new Set<string>();
for (const row of rows) {
const stored = row.contentFingerprint?.trim();
if (stored) {
existingContentFingerprintSet.add(stored);
continue;
}
const recomputed = buildJobContentFingerprint({
employer: row.employer,
title: row.title,
});
if (recomputed) {
existingContentFingerprintSet.add(recomputed);
}
}
return {
existingCanonicalSet,
existingSourceJobKeySet,
existingContentFingerprintSet,
};
} }
async function findJobByCanonicalUrl( async function findJobByCanonicalUrl(
@ -87,6 +114,46 @@ async function findJobByCanonicalUrl(
return null; return null;
} }
/**
 * Looks up an owner's job by its cross-source content fingerprint.
 *
 * First queries for a row whose stored `contentFingerprint` equals
 * `fingerprint`. If none exists, loads the owner's rows whose stored
 * fingerprint is NULL, recomputes each one from (employer, title) and returns
 * the first whose recomputed value equals `fingerprint`.
 *
 * @param fingerprint Fingerprint to search for.
 * @param ownerProfileId Owner whose jobs are searched.
 * @returns The matching job, or `null` when neither lookup finds one.
 */
async function findJobByContentFingerprint(
  fingerprint: string,
  ownerProfileId: string,
): Promise<Job | null> {
  // Fast path: a row that already persists this fingerprint.
  const persistedMatches = await db
    .select()
    .from(jobs)
    .where(
      and(
        eq(jobs.ownerProfileId, ownerProfileId),
        eq(jobs.contentFingerprint, fingerprint),
      ),
    )
    .limit(1);
  const persisted = persistedMatches[0];
  if (persisted) return mapRowToJob(persisted);

  // Legacy path: rows without a persisted fingerprint are compared in memory.
  const legacyRows = await db
    .select()
    .from(jobs)
    .where(
      and(
        eq(jobs.ownerProfileId, ownerProfileId),
        isNull(jobs.contentFingerprint),
      ),
    );
  const legacyMatch = legacyRows.find(
    (row) =>
      buildJobContentFingerprint({
        employer: row.employer,
        title: row.title,
      }) === fingerprint,
  );
  return legacyMatch ? mapRowToJob(legacyMatch) : null;
}
async function getJobBySourceAndExternalId( async function getJobBySourceAndExternalId(
source: string, source: string,
sourceJobId: string, sourceJobId: string,
@ -302,6 +369,10 @@ async function insertJob(input: CreateJobInput): Promise<Job> {
const now = new Date().toISOString(); const now = new Date().toISOString();
const ownerProfileId = resolveOwnerForCreate(input); const ownerProfileId = resolveOwnerForCreate(input);
const contentFingerprint = buildJobContentFingerprint({
employer: input.employer,
title: input.title,
});
await db.insert(jobs).values({ await db.insert(jobs).values({
id, id,
@ -314,6 +385,7 @@ async function insertJob(input: CreateJobInput): Promise<Job> {
employer: input.employer, employer: input.employer,
employerUrl: input.employerUrl ?? null, employerUrl: input.employerUrl ?? null,
jobUrl: input.jobUrl, jobUrl: input.jobUrl,
contentFingerprint,
applicationLink: input.applicationLink ?? null, applicationLink: input.applicationLink ?? null,
disciplines: input.disciplines ?? null, disciplines: input.disciplines ?? null,
deadline: input.deadline ?? null, deadline: input.deadline ?? null,
@ -395,8 +467,11 @@ export async function createJobs(
...normalized, ...normalized,
ownerProfileId, ownerProfileId,
}; };
const { existingCanonicalSet, existingSourceJobKeySet } = const {
await loadJobDedupIndexes(ownerProfileId); existingCanonicalSet,
existingSourceJobKeySet,
existingContentFingerprintSet,
} = await loadJobDedupIndexes(ownerProfileId);
const sid = normalized.sourceJobId?.trim(); const sid = normalized.sourceJobId?.trim();
if (sid) { if (sid) {
@ -419,6 +494,18 @@ export async function createJobs(
if (existing) return existing; if (existing) return existing;
} }
const fingerprint = buildJobContentFingerprint({
employer: normalized.employer,
title: normalized.title,
});
if (fingerprint && existingContentFingerprintSet.has(fingerprint)) {
const existing = await findJobByContentFingerprint(
fingerprint,
ownerProfileId,
);
if (existing) return existing;
}
const inserted = await tryInsertJob(normalizedWithOwner); const inserted = await tryInsertJob(normalizedWithOwner);
if (inserted) return inserted; if (inserted) return inserted;
@ -437,14 +524,18 @@ export async function createJobs(
} }
const ownerProfileId = resolveOwnerForCreate(inputOrInputs[0] ?? {}); const ownerProfileId = resolveOwnerForCreate(inputOrInputs[0] ?? {});
const { existingCanonicalSet, existingSourceJobKeySet } = const {
await loadJobDedupIndexes(ownerProfileId); existingCanonicalSet,
existingSourceJobKeySet,
existingContentFingerprintSet,
} = await loadJobDedupIndexes(ownerProfileId);
const batchBuckets = new Map< const batchBuckets = new Map<
string, string,
{ {
input: CreateJobInput; input: CreateJobInput;
count: number; count: number;
fingerprint: string | null;
} }
>(); >();
@ -454,21 +545,30 @@ export async function createJobs(
ownerProfileId, ownerProfileId,
}); });
const sidForKey = normalized.sourceJobId?.trim(); const sidForKey = normalized.sourceJobId?.trim();
const batchKey = sidForKey const fingerprint = buildJobContentFingerprint({
? `sid:${sourceJobKey(normalized.source, sidForKey)}` employer: normalized.employer,
: `url:${normalized.jobUrl}`; title: normalized.title,
});
// Coalesce duplicates within a single batch, preferring fingerprint when
// available so two different feeds posting the same role merge into one
// bucket. Fall back to source-job-id, then canonical URL.
const batchKey = fingerprint
? `fp:${fingerprint}`
: sidForKey
? `sid:${sourceJobKey(normalized.source, sidForKey)}`
: `url:${normalized.jobUrl}`;
const prev = batchBuckets.get(batchKey); const prev = batchBuckets.get(batchKey);
if (prev) { if (prev) {
prev.count += 1; prev.count += 1;
} else { } else {
batchBuckets.set(batchKey, { input: normalized, count: 1 }); batchBuckets.set(batchKey, { input: normalized, count: 1, fingerprint });
} }
} }
let created = 0; let created = 0;
let skipped = 0; let skipped = 0;
for (const { input, count } of batchBuckets.values()) { for (const { input, count, fingerprint } of batchBuckets.values()) {
const canonical = input.jobUrl; const canonical = input.jobUrl;
const sid = input.sourceJobId?.trim(); const sid = input.sourceJobId?.trim();
const sk = sid ? sourceJobKey(input.source, sid) : null; const sk = sid ? sourceJobKey(input.source, sid) : null;
@ -481,6 +581,10 @@ export async function createJobs(
skipped += count; skipped += count;
continue; continue;
} }
if (fingerprint && existingContentFingerprintSet.has(fingerprint)) {
skipped += count;
continue;
}
const inserted = await tryInsertJob(input); const inserted = await tryInsertJob(input);
if (!inserted) { if (!inserted) {
@ -494,6 +598,9 @@ export async function createJobs(
if (sk) { if (sk) {
existingSourceJobKeySet.add(sk); existingSourceJobKeySet.add(sk);
} }
if (fingerprint) {
existingContentFingerprintSet.add(fingerprint);
}
} }
return { created, skipped }; return { created, skipped };

View File

@ -151,6 +151,20 @@ export async function getEffectiveSettings(): Promise<AppSettings> {
...envSettings, ...envSettings,
}; };
// In Basic Auth mode, the "active" search profile is derived from the request
// context (basic auth username → profile.basicAuthUser), not from the stored
// activeProfileId setting. Expose that derived id so the UI updates the right
// profile record on save.
const requestOwnerProfileId = getJobOwnerProfileId();
if (
tenantJobSearchProfile &&
requestOwnerProfileId &&
requestOwnerProfileId !== DEFAULT_JOB_OWNER_PROFILE_ID &&
requestOwnerProfileId !== "__unmapped__"
) {
result.activeProfileId = requestOwnerProfileId;
}
const rawModel = overrides.model; const rawModel = overrides.model;
const modelDef = settingsRegistry.model; const modelDef = settingsRegistry.model;
const overrideModel = normalizeModelForProviderCompatibility( const overrideModel = normalizeModelForProviderCompatibility(

425
package-lock.json generated
View File

@ -13,6 +13,7 @@
], ],
"devDependencies": { "devDependencies": {
"@types/node": "^25.2.3", "@types/node": "^25.2.3",
"dotenv": "^17.2.3",
"knip": "^5.83.1", "knip": "^5.83.1",
"tsx": "^4.19.2", "tsx": "^4.19.2",
"typescript": "^5.9.3" "typescript": "^5.9.3"
@ -103,6 +104,90 @@
"undici-types": "~7.16.0" "undici-types": "~7.16.0"
} }
}, },
"extractors/arbeitnow": {
"name": "arbeitnow-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/arbeitnow/node_modules/@types/node": {
"version": "24.12.4",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.4.tgz",
"integrity": "sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/ashby": {
"name": "ashby-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/ashby/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/careerjet": {
"name": "careerjet-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/careerjet/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/fourdayweek": {
"name": "fourdayweek-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/fourdayweek/node_modules/@types/node": {
"version": "24.12.4",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.4.tgz",
"integrity": "sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/gradcracker": { "extractors/gradcracker": {
"name": "gradcracker-extractor", "name": "gradcracker-extractor",
"version": "0.0.1", "version": "0.0.1",
@ -154,6 +239,48 @@
"undici-types": "~7.16.0" "undici-types": "~7.16.0"
} }
}, },
"extractors/greenhouse": {
"name": "greenhouse-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/greenhouse/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/himalayas": {
"name": "himalayas-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/himalayas/node_modules/@types/node": {
"version": "24.12.4",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.4.tgz",
"integrity": "sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/hiringcafe": { "extractors/hiringcafe": {
"name": "hiringcafe-extractor", "name": "hiringcafe-extractor",
"version": "0.0.1", "version": "0.0.1",
@ -181,6 +308,132 @@
"undici-types": "~7.16.0" "undici-types": "~7.16.0"
} }
}, },
"extractors/jobicy": {
"name": "jobicy-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/jobicy/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/jooble": {
"name": "jooble-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/jooble/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/lever": {
"name": "lever-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/lever/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/reed": {
"name": "reed-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/reed/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/remoteok": {
"name": "remoteok-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/remoteok/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/remotive": {
"name": "remotive-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/remotive/node_modules/@types/node": {
"version": "24.12.4",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.4.tgz",
"integrity": "sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/startupjobs": { "extractors/startupjobs": {
"name": "startupjobs-extractor", "name": "startupjobs-extractor",
"version": "0.0.1", "version": "0.0.1",
@ -202,6 +455,27 @@
"undici-types": "~7.16.0" "undici-types": "~7.16.0"
} }
}, },
"extractors/themuse": {
"name": "themuse-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/themuse/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/ukvisajobs": { "extractors/ukvisajobs": {
"name": "ukvisajobs-extractor", "name": "ukvisajobs-extractor",
"version": "0.0.1", "version": "0.0.1",
@ -234,6 +508,69 @@
"undici-types": "~7.16.0" "undici-types": "~7.16.0"
} }
}, },
"extractors/usajobs": {
"name": "usajobs-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/usajobs/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/weworkremotely": {
"name": "weworkremotely-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/weworkremotely/node_modules/@types/node": {
"version": "24.12.4",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.4.tgz",
"integrity": "sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"extractors/workday": {
"name": "workday-extractor",
"version": "0.0.1",
"dependencies": {
"job-ops-shared": "^1.0.0"
},
"devDependencies": {
"@types/node": "^24.0.0",
"typescript": "~5.9.0"
}
},
"extractors/workday/node_modules/@types/node": {
"version": "24.12.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.3.tgz",
"integrity": "sha512-8oljBDGun9cIsZRJR6fkihn0TSXJI0UDOOhncYaERq6M0JMDoPLxyscwruJcb4GKS6dvK/d8xebYBg27h/duaQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~7.16.0"
}
},
"node_modules/@algolia/abtesting": { "node_modules/@algolia/abtesting": {
"version": "1.14.1", "version": "1.14.1",
"resolved": "https://registry.npmjs.org/@algolia/abtesting/-/abtesting-1.14.1.tgz", "resolved": "https://registry.npmjs.org/@algolia/abtesting/-/abtesting-1.14.1.tgz",
@ -9384,6 +9721,10 @@
"type-fest": "^4.0.0" "type-fest": "^4.0.0"
} }
}, },
"node_modules/arbeitnow-extractor": {
"resolved": "extractors/arbeitnow",
"link": true
},
"node_modules/arg": { "node_modules/arg": {
"version": "5.0.2", "version": "5.0.2",
"resolved": "https://registry.npmjs.org/arg/-/arg-5.0.2.tgz", "resolved": "https://registry.npmjs.org/arg/-/arg-5.0.2.tgz",
@ -9423,6 +9764,10 @@
"node": ">=8" "node": ">=8"
} }
}, },
"node_modules/ashby-extractor": {
"resolved": "extractors/ashby",
"link": true
},
"node_modules/asn1js": { "node_modules/asn1js": {
"version": "3.0.7", "version": "3.0.7",
"resolved": "https://registry.npmjs.org/asn1js/-/asn1js-3.0.7.tgz", "resolved": "https://registry.npmjs.org/asn1js/-/asn1js-3.0.7.tgz",
@ -10208,6 +10553,10 @@
], ],
"license": "CC-BY-4.0" "license": "CC-BY-4.0"
}, },
"node_modules/careerjet-extractor": {
"resolved": "extractors/careerjet",
"link": true
},
"node_modules/ccount": { "node_modules/ccount": {
"version": "2.0.1", "version": "2.0.1",
"resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz", "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz",
@ -11899,6 +12248,18 @@
"url": "https://github.com/sponsors/sindresorhus" "url": "https://github.com/sponsors/sindresorhus"
} }
}, },
"node_modules/dotenv": {
"version": "17.4.2",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.4.2.tgz",
"integrity": "sha512-nI4U3TottKAcAD9LLud4Cb7b2QztQMUEfHbvhTH09bqXTxnSie8WnjPALV/WMCrJZ6UV/qHJ6L03OqO3LcdYZw==",
"license": "BSD-2-Clause",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://dotenvx.com"
}
},
"node_modules/dunder-proto": { "node_modules/dunder-proto": {
"version": "1.0.1", "version": "1.0.1",
"resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
@ -13168,6 +13529,10 @@
"node": ">= 0.6" "node": ">= 0.6"
} }
}, },
"node_modules/fourdayweek-extractor": {
"resolved": "extractors/fourdayweek",
"link": true
},
"node_modules/fraction.js": { "node_modules/fraction.js": {
"version": "5.3.4", "version": "5.3.4",
"resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-5.3.4.tgz", "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-5.3.4.tgz",
@ -13695,6 +14060,10 @@
"js-yaml": "bin/js-yaml.js" "js-yaml": "bin/js-yaml.js"
} }
}, },
"node_modules/greenhouse-extractor": {
"resolved": "extractors/greenhouse",
"link": true
},
"node_modules/gzip-size": { "node_modules/gzip-size": {
"version": "6.0.0", "version": "6.0.0",
"resolved": "https://registry.npmjs.org/gzip-size/-/gzip-size-6.0.0.tgz", "resolved": "https://registry.npmjs.org/gzip-size/-/gzip-size-6.0.0.tgz",
@ -13995,6 +14364,10 @@
"node": ">=16.0.0" "node": ">=16.0.0"
} }
}, },
"node_modules/himalayas-extractor": {
"resolved": "extractors/himalayas",
"link": true
},
"node_modules/hiringcafe-extractor": { "node_modules/hiringcafe-extractor": {
"resolved": "extractors/hiringcafe", "resolved": "extractors/hiringcafe",
"link": true "link": true
@ -15261,6 +15634,10 @@
"resolved": "shared", "resolved": "shared",
"link": true "link": true
}, },
"node_modules/jobicy-extractor": {
"resolved": "extractors/jobicy",
"link": true
},
"node_modules/joi": { "node_modules/joi": {
"version": "17.13.3", "version": "17.13.3",
"resolved": "https://registry.npmjs.org/joi/-/joi-17.13.3.tgz", "resolved": "https://registry.npmjs.org/joi/-/joi-17.13.3.tgz",
@ -15274,6 +15651,10 @@
"@sideway/pinpoint": "^2.0.0" "@sideway/pinpoint": "^2.0.0"
} }
}, },
"node_modules/jooble-extractor": {
"resolved": "extractors/jooble",
"link": true
},
"node_modules/jquery": { "node_modules/jquery": {
"version": "3.7.1", "version": "3.7.1",
"resolved": "https://registry.npmjs.org/jquery/-/jquery-3.7.1.tgz", "resolved": "https://registry.npmjs.org/jquery/-/jquery-3.7.1.tgz",
@ -15573,6 +15954,10 @@
"node": ">=6" "node": ">=6"
} }
}, },
"node_modules/lever-extractor": {
"resolved": "extractors/lever",
"link": true
},
"node_modules/lightningcss-android-arm64": { "node_modules/lightningcss-android-arm64": {
"version": "1.30.2", "version": "1.30.2",
"resolved": "https://registry.npmjs.org/lightningcss-android-arm64/-/lightningcss-android-arm64-1.30.2.tgz", "resolved": "https://registry.npmjs.org/lightningcss-android-arm64/-/lightningcss-android-arm64-1.30.2.tgz",
@ -21836,6 +22221,10 @@
"url": "https://opencollective.com/unified" "url": "https://opencollective.com/unified"
} }
}, },
"node_modules/reed-extractor": {
"resolved": "extractors/reed",
"link": true
},
"node_modules/reflect-metadata": { "node_modules/reflect-metadata": {
"version": "0.2.2", "version": "0.2.2",
"resolved": "https://registry.npmjs.org/reflect-metadata/-/reflect-metadata-0.2.2.tgz", "resolved": "https://registry.npmjs.org/reflect-metadata/-/reflect-metadata-0.2.2.tgz",
@ -22089,6 +22478,14 @@
"url": "https://opencollective.com/unified" "url": "https://opencollective.com/unified"
} }
}, },
"node_modules/remoteok-extractor": {
"resolved": "extractors/remoteok",
"link": true
},
"node_modules/remotive-extractor": {
"resolved": "extractors/remotive",
"link": true
},
"node_modules/renderkid": { "node_modules/renderkid": {
"version": "3.0.0", "version": "3.0.0",
"resolved": "https://registry.npmjs.org/renderkid/-/renderkid-3.0.0.tgz", "resolved": "https://registry.npmjs.org/renderkid/-/renderkid-3.0.0.tgz",
@ -23752,6 +24149,10 @@
"integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/themuse-extractor": {
"resolved": "extractors/themuse",
"link": true
},
"node_modules/thingies": { "node_modules/thingies": {
"version": "2.5.0", "version": "2.5.0",
"resolved": "https://registry.npmjs.org/thingies/-/thingies-2.5.0.tgz", "resolved": "https://registry.npmjs.org/thingies/-/thingies-2.5.0.tgz",
@ -24601,6 +25002,10 @@
"url": "https://opencollective.com/webpack" "url": "https://opencollective.com/webpack"
} }
}, },
"node_modules/usajobs-extractor": {
"resolved": "extractors/usajobs",
"link": true
},
"node_modules/use-callback-ref": { "node_modules/use-callback-ref": {
"version": "1.3.3", "version": "1.3.3",
"resolved": "https://registry.npmjs.org/use-callback-ref/-/use-callback-ref-1.3.3.tgz", "resolved": "https://registry.npmjs.org/use-callback-ref/-/use-callback-ref-1.3.3.tgz",
@ -25191,6 +25596,10 @@
"node": ">=0.8.0" "node": ">=0.8.0"
} }
}, },
"node_modules/weworkremotely-extractor": {
"resolved": "extractors/weworkremotely",
"link": true
},
"node_modules/whatwg-encoding": { "node_modules/whatwg-encoding": {
"version": "3.1.1", "version": "3.1.1",
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
@ -25324,6 +25733,10 @@
"integrity": "sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==", "integrity": "sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/workday-extractor": {
"resolved": "extractors/workday",
"link": true
},
"node_modules/wrap-ansi": { "node_modules/wrap-ansi": {
"version": "6.2.0", "version": "6.2.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-6.2.0.tgz", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-6.2.0.tgz",
@ -25640,7 +26053,7 @@
}, },
"orchestrator": { "orchestrator": {
"name": "job-ops-orchestrator", "name": "job-ops-orchestrator",
"version": "0.2.0", "version": "0.2.1",
"dependencies": { "dependencies": {
"@hookform/resolvers": "^5.2.2", "@hookform/resolvers": "^5.2.2",
"@paralleldrive/cuid2": "^3.0.6", "@paralleldrive/cuid2": "^3.0.6",
@ -27412,16 +27825,6 @@
"csstype": "^3.0.2" "csstype": "^3.0.2"
} }
}, },
"orchestrator/node_modules/dotenv": {
"version": "17.2.3",
"license": "BSD-2-Clause",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://dotenvx.com"
}
},
"orchestrator/node_modules/drizzle-kit": { "orchestrator/node_modules/drizzle-kit": {
"version": "0.30.6", "version": "0.30.6",
"dev": true, "dev": true,

View File

@ -24,6 +24,7 @@
"knip": "knip" "knip": "knip"
}, },
"devDependencies": { "devDependencies": {
"dotenv": "^17.2.3",
"@types/node": "^25.2.3", "@types/node": "^25.2.3",
"knip": "^5.83.1", "knip": "^5.83.1",
"tsx": "^4.19.2", "tsx": "^4.19.2",

View File

@ -0,0 +1,24 @@
#
# Jobber cron env — cherepaha
# Copy to /root/.jobber-cron-cherepaha.env (chmod 600)
#
# Used by: scripts/jobber-pipeline-telegram.sh
#
# JobOps base URL (where the app is reachable from the cron host)
JOBOPS_URL=http://127.0.0.1:3005
# Optional: limit number of jobs linked in Telegram message
JOB_TELEGRAM_MAX_JOBS=25
# Optional: comma-separated sources to run (leave empty to use server defaults)
# JOBBER_PIPELINE_SOURCES=adzuna,gradcracker,ukvisajobs
# App-level Basic Auth (enables per-user separation when set on the server)
BASIC_AUTH_USER=cherepaha
BASIC_AUTH_PASSWORD=CHANGEME
# Telegram bot + chat destination
TELEGRAM_BOT_TOKEN=CHANGEME
TELEGRAM_CHAT_ID=CHANGEME

View File

@ -0,0 +1,24 @@
#
# Jobber cron env — dobkin
# Copy to /root/.jobber-cron-dobkin.env (chmod 600)
#
# Used by: scripts/jobber-pipeline-telegram.sh
#
# JobOps base URL (where the app is reachable from the cron host)
JOBOPS_URL=http://127.0.0.1:3005
# Optional: limit number of jobs linked in Telegram message
JOB_TELEGRAM_MAX_JOBS=25
# Optional: comma-separated sources to run (leave empty to use server defaults)
# JOBBER_PIPELINE_SOURCES=adzuna,gradcracker,ukvisajobs
# App-level Basic Auth (enables per-user separation when set on the server)
BASIC_AUTH_USER=dobkin
BASIC_AUTH_PASSWORD=CHANGEME
# Telegram bot + chat destination
TELEGRAM_BOT_TOKEN=CHANGEME
TELEGRAM_CHAT_ID=CHANGEME

219
scripts/smoke-extractors.ts Normal file
View File

@ -0,0 +1,219 @@
/**
* Tiny smoke-test for new extractors: imports each manifest, runs it with a
* minimal context, and prints the count of mapped jobs + a few samples.
*
* Run from repo root: npx tsx scripts/smoke-extractors.ts [comma,separated,ids]
*
* Loads repo-root `.env` so keyed extractors match orchestrator behavior (plain
* `tsx` does not read `.env` automatically).
*/
import path from "node:path";
import { fileURLToPath } from "node:url";
import { config as loadEnv } from "dotenv";
import type {
ExtractorManifest,
ExtractorRuntimeContext,
} from "../shared/src/types/extractors";
// Locate the repository root relative to this script's own directory (one
// level up), then load the root `.env` so keyed extractors can see their
// credentials.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.resolve(scriptDir, "..");
const envFile = path.join(repoRoot, ".env");
loadEnv({ path: envFile });
/** One extractor the smoke script knows how to import and run. */
interface Target {
  /** Extractor id; matched against the CLI filter and passed to the run context as `source`. */
  id: string;
  /** Module specifier handed to dynamic `import()` to load the manifest. */
  importPath: string;
  /** Env vars that must be non-empty for the target to run; it is reported as SKIP otherwise. */
  needs?: string[]; // env vars required to run; skipped if missing
  /** Settings forwarded verbatim to the extractor through the runtime context. */
  settings?: Record<string, string>;
}
/** Module specifier of the manifest for extractor `id`. */
const manifestPathFor = (id: string): string => `../extractors/${id}/manifest`;

/**
 * Target for a keyword-search extractor, capped at 10 jobs per term via its
 * `<id>MaxJobsPerTerm` setting. `needs` lists env vars required to run it.
 */
function cappedTarget(id: string, needs?: string[]): Target {
  const importPath = manifestPathFor(id);
  const settings = { [`${id}MaxJobsPerTerm`]: "10" };
  return needs
    ? { id, importPath, needs, settings }
    : { id, importPath, settings };
}

/**
 * Target for a list-driven extractor: `entries` is JSON-encoded into the
 * single setting named `settingKey`.
 */
function listTarget(id: string, settingKey: string, entries: string[]): Target {
  return {
    id,
    importPath: manifestPathFor(id),
    settings: { [settingKey]: JSON.stringify(entries) },
  };
}

// Order matters: targets are run (and reported) sequentially in this order.
const ALL_TARGETS: Target[] = [
  cappedTarget("jobicy"),
  cappedTarget("themuse"),
  cappedTarget("usajobs", ["USAJOBS_API_KEY", "USAJOBS_USER_AGENT"]),
  cappedTarget("jooble", ["JOOBLE_API_KEY"]),
  cappedTarget("careerjet", [
    "CAREERJET_AFFID",
    "CAREERJET_REFERER",
    "CAREERJET_USER_IP",
  ]),
  cappedTarget("reed", ["REED_API_KEY"]),
  // Known active public Lever board used purely as a connectivity check.
  listTarget("lever", "leverCompanies", ["palantir", "netflix"]),
  listTarget("ashby", "ashbyCompanies", ["ramp", "linear"]),
  listTarget("greenhouse", "greenhouseCompanies", ["stripe", "airbnb"]),
  listTarget("workday", "workdayTenants", [
    "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite",
  ]),
  cappedTarget("remoteok"),
  cappedTarget("remotive"),
  cappedTarget("arbeitnow"),
  cappedTarget("himalayas"),
  cappedTarget("weworkremotely"),
  cappedTarget("fourdayweek"),
];
/**
 * Build the minimal runtime context an extractor needs for a smoke run:
 * a single selected source, one fixed search term, a fixed country, no
 * previously-known job URLs, no cancellation and a no-op progress sink.
 */
function buildContext(
  source: string,
  settings: Record<string, string>,
): ExtractorRuntimeContext {
  const noKnownUrls = async () => [];
  const neverCancel = () => false;
  const ignoreProgress = () => {};

  const context: ExtractorRuntimeContext = {
    source,
    selectedSources: [source],
    settings,
    searchTerms: ["software engineer"],
    selectedCountry: "United States",
    getExistingJobUrls: noKnownUrls,
    shouldCancel: neverCancel,
    onProgress: ignoreProgress,
  };
  return context;
}
/**
 * Right-pad `s` with spaces up to `n` characters; strings already at least
 * `n` long are returned unchanged (never truncated).
 */
function pad(s: string, n: number): string {
  return s.padEnd(n);
}
/**
 * Best-effort human-readable message for anything that can be thrown.
 * `throw` accepts any value, so a bare `(err as Error).message` would print
 * "undefined" for strings or plain objects.
 */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Import one extractor manifest, run it with a minimal context and print a
 * single status line (SKIP / FAIL / OK / ERR / CRASH).
 *
 * Never throws: every failure mode is reported on stdout. Any outcome other
 * than SKIP or OK also sets `process.exitCode = 1`, so the script as a whole
 * exits non-zero when at least one extractor is broken.
 *
 * @param target Extractor to exercise; skipped when any env var listed in
 *   `target.needs` is unset or empty.
 */
async function runOne(target: Target): Promise<void> {
  const label = pad(target.id, 12);

  const missing = (target.needs ?? []).filter((k) => !process.env[k]);
  if (missing.length > 0) {
    // Missing credentials are an expected local condition, not a failure.
    console.log(`${label} SKIP missing env: ${missing.join(", ")}`);
    return;
  }

  let mod: { manifest?: ExtractorManifest; default?: ExtractorManifest };
  try {
    mod = await import(target.importPath);
  } catch (err) {
    console.log(`${label} FAIL import error: ${describeError(err)}`);
    process.exitCode = 1;
    return;
  }

  // Accept either a named `manifest` export or a default export.
  const manifest = mod.manifest ?? mod.default;
  if (!manifest) {
    console.log(`${label} FAIL manifest export missing`);
    process.exitCode = 1;
    return;
  }

  const started = Date.now();
  try {
    const ctx = buildContext(target.id, target.settings ?? {});
    const result = await manifest.run(ctx);
    const ms = Date.now() - started;
    const status = result.success ? "OK " : "ERR ";
    if (!result.success) process.exitCode = 1;
    const sample = result.jobs[0];
    const sampleStr = sample
      ? ` | first: "${sample.title}" @ ${sample.employer}`
      : "";
    console.log(
      `${label} ${status} jobs=${result.jobs.length} ${ms}ms${result.error ? ` | error: ${result.error}` : ""}${sampleStr}`,
    );
  } catch (err) {
    // The extractor threw instead of returning `{ success: false }`.
    const ms = Date.now() - started;
    console.log(`${label} CRASH ${ms}ms ${describeError(err)}`);
    process.exitCode = 1;
  }
}
/**
 * Entry point. Reads an optional comma-separated list of extractor ids from
 * `process.argv[2]`, runs the matching targets one at a time (sequentially,
 * so output lines stay in a stable order) and prints one line per target.
 *
 * Ids that do not match any known target are reported on stderr and make the
 * script exit non-zero, so a typo does not look like a clean run of
 * "0 extractor(s)".
 */
async function main() {
  const requested = (process.argv[2] ?? "").trim();
  const filter = requested
    ? new Set(
        requested
          .split(",")
          .map((s) => s.trim())
          .filter(Boolean),
      )
    : null;

  if (filter) {
    const knownIds = ALL_TARGETS.map((t) => t.id);
    const unknownIds = Array.from(filter).filter(
      (id) => !knownIds.includes(id),
    );
    if (unknownIds.length > 0) {
      console.warn(
        `Unknown extractor id(s): ${unknownIds.join(", ")} (known: ${knownIds.join(", ")})`,
      );
      process.exitCode = 1;
    }
  }

  const targets = filter
    ? ALL_TARGETS.filter((t) => filter.has(t.id))
    : ALL_TARGETS;
  console.log(`Smoke testing ${targets.length} extractor(s)...\n`);
  for (const t of targets) {
    await runOne(t);
  }
}

// Anything that escapes main() is unexpected: print it and fail the process.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});

View File

@ -9,6 +9,24 @@ export const EXTRACTOR_SOURCE_IDS = [
"adzuna", "adzuna",
"hiringcafe", "hiringcafe",
"startupjobs", "startupjobs",
// --- Public APIs / feeds ---
"usajobs",
"jobicy",
"themuse",
"jooble",
"careerjet",
"reed",
"remoteok",
"remotive",
"arbeitnow",
"himalayas",
"weworkremotely",
"fourdayweek",
// --- Public ATS / career-page sources ---
"ashby",
"lever",
"greenhouse",
"workday",
"manual", "manual",
] as const; ] as const;
@ -20,6 +38,10 @@ export interface ExtractorSourceMetadata {
category: "pipeline" | "manual"; category: "pipeline" | "manual";
requiresCredentials?: boolean; requiresCredentials?: boolean;
ukOnly?: boolean; ukOnly?: boolean;
/** Country gating: when set, only run/show this source for these country keys. */
countryAllowlist?: readonly string[];
/** Region tag for grouping / filtering in the UI. */
region?: "us" | "uk" | "global" | "remote";
} }
export const EXTRACTOR_SOURCE_METADATA: Record< export const EXTRACTOR_SOURCE_METADATA: Record<
@ -31,26 +53,157 @@ export const EXTRACTOR_SOURCE_METADATA: Record<
order: 10, order: 10,
category: "pipeline", category: "pipeline",
ukOnly: true, ukOnly: true,
region: "uk",
},
indeed: {
label: "Indeed",
order: 20,
category: "pipeline",
region: "global",
},
linkedin: {
label: "LinkedIn",
order: 30,
category: "pipeline",
region: "global",
},
glassdoor: {
label: "Glassdoor",
order: 40,
category: "pipeline",
region: "global",
}, },
indeed: { label: "Indeed", order: 20, category: "pipeline" },
linkedin: { label: "LinkedIn", order: 30, category: "pipeline" },
glassdoor: { label: "Glassdoor", order: 40, category: "pipeline" },
ukvisajobs: { ukvisajobs: {
label: "UK Visa Jobs", label: "UK Visa Jobs",
order: 50, order: 50,
category: "pipeline", category: "pipeline",
requiresCredentials: true, requiresCredentials: true,
ukOnly: true, ukOnly: true,
region: "uk",
}, },
adzuna: { adzuna: {
label: "Adzuna", label: "Adzuna",
order: 60, order: 60,
category: "pipeline", category: "pipeline",
requiresCredentials: true, requiresCredentials: true,
region: "global",
}, },
hiringcafe: { label: "Hiring Cafe", order: 70, category: "pipeline" }, hiringcafe: {
startupjobs: { label: "startup.jobs", order: 80, category: "pipeline" }, label: "Hiring Cafe",
manual: { label: "Manual", order: 90, category: "manual" }, order: 70,
category: "pipeline",
region: "global",
},
startupjobs: {
label: "startup.jobs",
order: 80,
category: "pipeline",
region: "global",
},
usajobs: {
label: "USAJOBS",
order: 110,
category: "pipeline",
requiresCredentials: true,
countryAllowlist: ["united states", "usa", "us"],
region: "us",
},
jobicy: {
label: "Jobicy (Remote)",
order: 120,
category: "pipeline",
region: "remote",
},
themuse: {
label: "The Muse",
order: 130,
category: "pipeline",
region: "global",
},
jooble: {
label: "Jooble",
order: 140,
category: "pipeline",
requiresCredentials: true,
region: "global",
},
careerjet: {
label: "Careerjet",
order: 150,
category: "pipeline",
requiresCredentials: true,
region: "global",
},
reed: {
label: "Reed",
order: 160,
category: "pipeline",
requiresCredentials: true,
ukOnly: true,
countryAllowlist: ["united kingdom", "uk", "great britain", "england"],
region: "uk",
},
remoteok: {
label: "Remote OK",
order: 170,
category: "pipeline",
region: "remote",
},
remotive: {
label: "Remotive",
order: 175,
category: "pipeline",
region: "remote",
},
arbeitnow: {
label: "Arbeitnow",
order: 180,
category: "pipeline",
region: "global",
},
himalayas: {
label: "Himalayas",
order: 185,
category: "pipeline",
region: "remote",
},
weworkremotely: {
label: "We Work Remotely",
order: 190,
category: "pipeline",
region: "remote",
},
fourdayweek: {
label: "4 Day Week",
order: 195,
category: "pipeline",
region: "remote",
},
ashby: {
label: "Ashby (ATS)",
order: 210,
category: "pipeline",
region: "global",
},
lever: {
label: "Lever (ATS)",
order: 220,
category: "pipeline",
region: "global",
},
greenhouse: {
label: "Greenhouse (ATS)",
order: 230,
category: "pipeline",
region: "global",
},
workday: {
label: "Workday (ATS)",
order: 240,
category: "pipeline",
region: "global",
},
manual: { label: "Manual", order: 900, category: "manual" },
}; };
export const PIPELINE_EXTRACTOR_SOURCE_IDS = EXTRACTOR_SOURCE_IDS.filter( export const PIPELINE_EXTRACTOR_SOURCE_IDS = EXTRACTOR_SOURCE_IDS.filter(

View File

@ -1,4 +1,5 @@
export * from "./extractors"; export * from "./extractors";
export * from "./job-fingerprint";
export * from "./job-url-canonical"; export * from "./job-url-canonical";
export * from "./location-support"; export * from "./location-support";
export * from "./types"; export * from "./types";

View File

@ -0,0 +1,80 @@
import { describe, expect, it } from "vitest";
import {
buildJobContentFingerprint,
normalizeEmployerForFingerprint,
normalizeTitleForFingerprint,
} from "./job-fingerprint";
describe("buildJobContentFingerprint", () => {
  // Positional shorthand so each case reads as (employer, title).
  const fingerprint = (employer: string, title: string) =>
    buildJobContentFingerprint({ employer, title });

  it("collapses the same role across sources", () => {
    const decorated = fingerprint(
      "Stripe, Inc.",
      "Senior Software Engineer (Backend) - Toronto, ON",
    );
    const plain = fingerprint(
      "stripe inc",
      "Senior Software Engineer (Backend)",
    );
    expect(decorated).toBe(plain);
    expect(decorated).not.toBeNull();
  });

  it("ignores trailing location decorations on titles", () => {
    const withSuffix = fingerprint("Acme", "Software Engineer — Remote");
    const bare = fingerprint("Acme", "Software Engineer");
    expect(withSuffix).toBe(bare);
  });

  it("strips diacritics and punctuation", () => {
    const accented = fingerprint("Café Münchën", "Étudiant Stage");
    const ascii = fingerprint("cafe munchen", "Etudiant Stage");
    expect(accented).toBe(ascii);
  });

  it("returns null when employer or title is empty", () => {
    const missingEmployer = fingerprint("", "Engineer");
    expect(missingEmployer).toBeNull();
    const missingTitle = fingerprint("Acme", "");
    expect(missingTitle).toBeNull();
  });

  it("does not collapse different roles at the same employer", () => {
    const engineer = fingerprint("Acme", "Software Engineer");
    const designer = fingerprint("Acme", "Product Designer");
    expect(engineer).not.toBe(designer);
  });

  describe("normalizers", () => {
    it("normalizeEmployerForFingerprint strips legal suffixes", () => {
      // Every spelling must reduce to the bare company name.
      for (const raw of ["Acme Corporation", "Acme, LLC", "Acme GmbH"]) {
        expect(normalizeEmployerForFingerprint(raw)).toBe("acme");
      }
    });

    it("normalizeTitleForFingerprint drops leading repost markers", () => {
      const normalized = normalizeTitleForFingerprint(
        "[Reposted] Software Engineer",
      );
      expect(normalized).toBe("softwareengineer");
    });
  });
});

View File

@ -0,0 +1,77 @@
/**
* Cross-source duplicate detection.
*
* Two postings from different sources almost always describe the same role
* when their employer + title agree once you strip noise (case, punctuation,
* tracking suffixes, common decorations like "(Remote)", "- Toronto, ON" etc).
* The fingerprint is intentionally coarse so we err on the side of skipping
* a duplicate rather than re-showing it from a second source.
*/
const PUNCTUATION_RE = /[\p{P}\p{S}]+/gu;
const WHITESPACE_RE = /\s+/g;

// Leading marketing / repost markers ("Reposted:", "New -", "Urgent ...").
// The `\b` after the keyword group is required: without it the pattern eats
// the first letters of real words ("Hotel Manager" -> "el manager",
// "Newsletter Editor" -> "sletter editor").
const LEADING_NOISE_RE = /^(?:re-?post(?:ed)?|new|hot|urgent)\b\s*[-:]?\s*/i;

// Any (...) or [...] group, together with the whitespace before it.
const PARENS_RE = /\s*[([][^)\]]*[)\]]/g;

// Trailing decorations we know are location / arrangement metadata, not role
// suffix. Matched after the title body and stripped before fingerprinting.
// Examples we want to strip:
//   "Software Engineer — Remote"
//   "Senior Engineer - Toronto, ON"
//   "Designer | Hybrid"
// Examples we must NOT strip (otherwise we'd collide unrelated roles):
//   "Etudiant Stage"    (Stage is a role qualifier in French postings)
//   "Designer — Senior" (level qualifier)
const TRAILING_LOCATION_KEYWORDS_RE =
  /\s+[-|–—]\s+(?:remote|hybrid|on[\s-]?site|wfh|telework|anywhere)\s*$/i;

// Escape the ASCII hyphen explicitly so it doesn't form a character range
// with the surrounding delimiters (which would silently swallow letters).
const TRAILING_CITY_REGION_RE = /\s+[-|–—]\s+[^,\-|–—]+,\s*[^,\-|–—]+\s*$/;

// Legal-entity words removed from employer names. Two alternatives:
//  1. Dotted abbreviations ("s.a.", "s.r.l."). These end in ".", so a
//     trailing `\b` can never match before a space or end of input; a
//     negative lookahead is used instead. The lookbehind rejects a preceding
//     word character or "." so acronyms such as "u.s.a." are left intact.
//  2. Plain words, delimited by ordinary word boundaries. A trailing "." as
//     in "inc." is removed later together with all other punctuation.
// Input is expected to be lower-cased already (no `i` flag).
const COMPANY_LEGAL_SUFFIX_RE =
  /(?<![\w.])(?:s\.a|s\.r\.l)\.?(?!\w)|\b(?:inc|ltd|llc|gmbh|sa|nv|bv|plc|corp|corporation|co|company|holdings|holding)\b/g;

/** Decompose accented characters (NFKD) and drop the combining marks. */
function stripDiacritics(input: string): string {
  return input.normalize("NFKD").replace(/[\u0300-\u036f]/g, "");
}

/**
 * Final squeeze shared by both normalizers: punctuation and symbols become
 * separators, then ALL whitespace is removed, so the key contains no spaces
 * ("software engineer" -> "softwareengineer").
 */
function squeezeToKey(value: string): string {
  return value.replace(PUNCTUATION_RE, " ").replace(WHITESPACE_RE, "");
}

/**
 * Reduce an employer name to a comparison key: lower-cased, diacritics
 * removed, bracketed groups and legal-entity words dropped, punctuation and
 * whitespace removed.
 *
 * @param employer Raw employer name; `null`, `undefined` and `""` yield `""`.
 * @returns The normalized key, or `""` if nothing is left after stripping
 *   (e.g. the name consisted only of legal-entity words).
 */
export function normalizeEmployerForFingerprint(
  employer: string | null | undefined,
): string {
  if (!employer) return "";
  let value = stripDiacritics(employer.toLowerCase()).trim();
  value = value.replace(PARENS_RE, " ");
  value = value.replace(COMPANY_LEGAL_SUFFIX_RE, " ");
  return squeezeToKey(value);
}

/**
 * Reduce a job title to a comparison key: lower-cased, diacritics removed,
 * leading repost/marketing markers, bracketed groups and trailing location
 * decorations dropped, punctuation and whitespace removed.
 *
 * @param title Raw posting title; `null`, `undefined` and `""` yield `""`.
 * @returns The normalized key, possibly `""`.
 */
export function normalizeTitleForFingerprint(
  title: string | null | undefined,
): string {
  if (!title) return "";
  // The trim matters here: the noise/location patterns are anchored with ^/$.
  let value = stripDiacritics(title.toLowerCase()).trim();
  value = value.replace(LEADING_NOISE_RE, "");
  value = value.replace(PARENS_RE, " ");
  value = value.replace(TRAILING_LOCATION_KEYWORDS_RE, " ");
  value = value.replace(TRAILING_CITY_REGION_RE, " ");
  return squeezeToKey(value);
}

/**
 * Build a stable, source-agnostic fingerprint for a posting.
 *
 * Returns `null` when employer or title is empty after normalization, so
 * callers fall back to URL/sourceJobId equality and don't accidentally
 * collapse unrelated rows under the empty key.
 *
 * @returns `"<employerKey>::<titleKey>"`, or `null` as described above.
 */
export function buildJobContentFingerprint(args: {
  employer: string | null | undefined;
  title: string | null | undefined;
}): string | null {
  const employer = normalizeEmployerForFingerprint(args.employer);
  const title = normalizeTitleForFingerprint(args.title);
  if (!employer || !title) return null;
  return `${employer}::${title}`;
}

View File

@ -99,7 +99,12 @@ export const SUPPORTED_COUNTRY_INPUTS = [
"worldwide", "worldwide",
] as const; ] as const;
const UK_ONLY_SOURCES = new Set<JobSource>(["gradcracker", "ukvisajobs"]); const UK_ONLY_SOURCES = new Set<JobSource>([
"gradcracker",
"ukvisajobs",
"reed",
]);
const US_ONLY_SOURCES = new Set<JobSource>(["usajobs"]);
const GLASSDOOR_SUPPORTED_COUNTRIES = new Set( const GLASSDOOR_SUPPORTED_COUNTRIES = new Set(
[ [
"australia", "australia",
@ -170,6 +175,10 @@ export function isUkCountry(country: string | null | undefined): boolean {
return normalizeCountryKey(country) === "united kingdom"; return normalizeCountryKey(country) === "united kingdom";
} }
/** True when `country` normalizes to the "united states" country key. */
export function isUsCountry(country: string | null | undefined): boolean {
  const key = normalizeCountryKey(country);
  return key === "united states";
}
export function isGlassdoorCountry( export function isGlassdoorCountry(
country: string | null | undefined, country: string | null | undefined,
): boolean { ): boolean {
@ -187,6 +196,7 @@ export function isSourceAllowedForCountry(
country: string | null | undefined, country: string | null | undefined,
): boolean { ): boolean {
if (UK_ONLY_SOURCES.has(source)) return isUkCountry(country); if (UK_ONLY_SOURCES.has(source)) return isUkCountry(country);
if (US_ONLY_SOURCES.has(source)) return isUsCountry(country);
if (source === "glassdoor") return isGlassdoorCountry(country); if (source === "glassdoor") return isGlassdoorCountry(country);
if (source === "adzuna") return getAdzunaCountryCode(country) !== null; if (source === "adzuna") return getAdzunaCountryCode(country) !== null;
return true; return true;

View File

@ -28,6 +28,24 @@ function parseJsonArrayOrNull(raw: string | undefined): string[] | null {
} }
} }
/**
 * Split a delimited list into trimmed, non-empty, de-duplicated entries,
 * keeping first-seen order. Recognized delimiters: comma, semicolon, pipe
 * and newline (runs of delimiters count as one). De-duplication is exact and
 * case-sensitive. Used for env-backed defaults like
 * LEVER_COMPANIES="acme,stripe".
 */
function parseCompanyList(raw: string | undefined | null): string[] {
  if (!raw) return [];
  const entries = raw
    .split(/[\n,;|]+/)
    .map((piece) => piece.trim())
    .filter((value) => value.length > 0);
  // A Set iterates in insertion order, so the first occurrence wins.
  return Array.from(new Set(entries));
}
function parseBitBoolOrNull(raw: string | undefined): boolean | null { function parseBitBoolOrNull(raw: string | undefined): boolean | null {
if (!raw) return null; if (!raw) return null;
return raw === "true" || raw === "1"; return raw === "true" || raw === "1";
@ -336,6 +354,145 @@ export const settingsRegistry = {
parse: parseIntOrNull, parse: parseIntOrNull,
serialize: serializeNullableNumber, serialize: serializeNullableNumber,
}, },
// --- New extractor caps & per-source target lists ---
usajobsMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number =>
parseInt(
typeof process !== "undefined"
? process.env.USAJOBS_MAX_JOBS_PER_TERM || "100"
: "100",
10,
),
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
jobicyMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
themuseMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
joobleMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
careerjetMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
reedMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
remoteokMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
remotiveMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
arbeitnowMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
himalayasMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
weworkremotelyMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
fourdayweekMaxJobsPerTerm: {
kind: "typed" as const,
schema: z.number().int().min(1).max(1000),
default: (): number => 100,
parse: parseIntOrNull,
serialize: serializeNullableNumber,
},
/**
* Comma- or newline-separated company slugs to fetch from public ATS feeds.
* `lever`, `ashby`, and `greenhouse` each take one entry per company.
*/
leverCompanies: {
kind: "typed" as const,
schema: z.array(z.string().trim().min(1).max(100)).max(200),
default: (): string[] =>
parseCompanyList(
typeof process !== "undefined" ? process.env.LEVER_COMPANIES : "",
),
parse: parseJsonArrayOrNull,
serialize: serializeNullableJsonArray,
},
ashbyCompanies: {
kind: "typed" as const,
schema: z.array(z.string().trim().min(1).max(100)).max(200),
default: (): string[] =>
parseCompanyList(
typeof process !== "undefined" ? process.env.ASHBY_COMPANIES : "",
),
parse: parseJsonArrayOrNull,
serialize: serializeNullableJsonArray,
},
greenhouseCompanies: {
kind: "typed" as const,
schema: z.array(z.string().trim().min(1).max(100)).max(200),
default: (): string[] =>
parseCompanyList(
typeof process !== "undefined" ? process.env.GREENHOUSE_COMPANIES : "",
),
parse: parseJsonArrayOrNull,
serialize: serializeNullableJsonArray,
},
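/**
* Workday targets, stored as a JSON array of strings (one entry per tenant /
* career site), e.g.
* `["https://acme.wd1.myworkdayjobs.com/en-US/External"]`.
* The env default (WORKDAY_TENANTS) is parsed as a plain delimited list.
* NOTE(review): this comment previously showed object entries
* (`{"company","tenantUrl","sites"}`), which the string-array schema below
* does not accept — confirm the expected entry format against the workday
* extractor.
*/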
workdayTenants: {
kind: "typed" as const,
schema: z.array(z.string().trim().min(1).max(2000)).max(50),
default: (): string[] =>
parseCompanyList(
typeof process !== "undefined" ? process.env.WORKDAY_TENANTS : "",
),
parse: parseJsonArrayOrNull,
serialize: serializeNullableJsonArray,
},
searchTerms: { searchTerms: {
kind: "typed" as const, kind: "typed" as const,
schema: z.array(z.string().trim().min(1).max(200)).max(100), schema: z.array(z.string().trim().min(1).max(200)).max(100),
@ -626,6 +783,40 @@ export const settingsRegistry = {
envKey: "ADZUNA_APP_ID", envKey: "ADZUNA_APP_ID",
schema: z.string().trim().max(200), schema: z.string().trim().max(200),
}, },
// --- New extractor keys / identifiers (non-secret) ---
usajobsUserAgent: {
kind: "string" as const,
envKey: "USAJOBS_USER_AGENT",
schema: z.string().trim().max(200),
},
themuseApiKey: {
kind: "string" as const,
envKey: "THEMUSE_API_KEY",
schema: z.string().trim().max(200),
},
/** Publisher API key (Basic auth user); Careerjet labels this “API key” in the dashboard. */
careerjetAffid: {
kind: "string" as const,
envKey: "CAREERJET_AFFID",
schema: z.string().trim().max(200),
},
/** Required Referer URL for v4 (your job-search page that triggers API use). */
careerjetReferer: {
kind: "string" as const,
envKey: "CAREERJET_REFERER",
schema: z.string().trim().max(500),
},
/** Must match an IP allowlisted in Careerjet (usually your server egress IP). */
careerjetUserIp: {
kind: "string" as const,
envKey: "CAREERJET_USER_IP",
schema: z.string().trim().max(80),
},
careerjetUserAgent: {
kind: "string" as const,
envKey: "CAREERJET_USER_AGENT",
schema: z.string().trim().max(500),
},
basicAuthUser: { basicAuthUser: {
kind: "string" as const, kind: "string" as const,
envKey: "BASIC_AUTH_USER", envKey: "BASIC_AUTH_USER",
@ -658,6 +849,22 @@ export const settingsRegistry = {
envKey: "ADZUNA_APP_KEY", envKey: "ADZUNA_APP_KEY",
schema: z.string().trim().max(2000), schema: z.string().trim().max(2000),
}, },
// --- Secrets for new extractors ---
usajobsApiKey: {
kind: "secret" as const,
envKey: "USAJOBS_API_KEY",
schema: z.string().trim().max(2000),
},
joobleApiKey: {
kind: "secret" as const,
envKey: "JOOBLE_API_KEY",
schema: z.string().trim().max(2000),
},
reedApiKey: {
kind: "secret" as const,
envKey: "REED_API_KEY",
schema: z.string().trim().max(2000),
},
basicAuthPassword: { basicAuthPassword: {
kind: "secret" as const, kind: "secret" as const,
envKey: "BASIC_AUTH_PASSWORD", envKey: "BASIC_AUTH_PASSWORD",

View File

@ -188,6 +188,22 @@ export const createAppSettings = (
adzunaMaxJobsPerTerm: { value: 50, default: 50, override: null }, adzunaMaxJobsPerTerm: { value: 50, default: 50, override: null },
gradcrackerMaxJobsPerTerm: { value: 50, default: 50, override: null }, gradcrackerMaxJobsPerTerm: { value: 50, default: 50, override: null },
startupjobsMaxJobsPerTerm: { value: 50, default: 50, override: null }, startupjobsMaxJobsPerTerm: { value: 50, default: 50, override: null },
usajobsMaxJobsPerTerm: { value: 50, default: 50, override: null },
jobicyMaxJobsPerTerm: { value: 50, default: 50, override: null },
themuseMaxJobsPerTerm: { value: 50, default: 50, override: null },
joobleMaxJobsPerTerm: { value: 50, default: 50, override: null },
careerjetMaxJobsPerTerm: { value: 50, default: 50, override: null },
reedMaxJobsPerTerm: { value: 50, default: 50, override: null },
remoteokMaxJobsPerTerm: { value: 50, default: 50, override: null },
remotiveMaxJobsPerTerm: { value: 50, default: 50, override: null },
arbeitnowMaxJobsPerTerm: { value: 50, default: 50, override: null },
himalayasMaxJobsPerTerm: { value: 50, default: 50, override: null },
weworkremotelyMaxJobsPerTerm: { value: 50, default: 50, override: null },
fourdayweekMaxJobsPerTerm: { value: 50, default: 50, override: null },
leverCompanies: { value: [], default: [], override: null },
ashbyCompanies: { value: [], default: [], override: null },
greenhouseCompanies: { value: [], default: [], override: null },
workdayTenants: { value: [], default: [], override: null },
searchTerms: { searchTerms: {
value: ["Software Engineer"], value: ["Software Engineer"],
default: ["Software Engineer"], default: ["Software Engineer"],
@ -256,6 +272,15 @@ export const createAppSettings = (
adzunaAppId: null, adzunaAppId: null,
adzunaAppKeyHint: null, adzunaAppKeyHint: null,
webhookSecretHint: null, webhookSecretHint: null,
usajobsUserAgent: null,
themuseApiKey: null,
careerjetAffid: null,
careerjetReferer: null,
careerjetUserIp: null,
careerjetUserAgent: null,
usajobsApiKeyHint: null,
joobleApiKeyHint: null,
reedApiKeyHint: null,
basicAuthActive: false, basicAuthActive: false,
localResumeFileConfigured: false, localResumeFileConfigured: false,
backupEnabled: { value: false, default: false, override: null }, backupEnabled: { value: false, default: false, override: null },

View File

@ -201,6 +201,22 @@ export interface AppSettings {
adzunaMaxJobsPerTerm: Resolved<number>; adzunaMaxJobsPerTerm: Resolved<number>;
gradcrackerMaxJobsPerTerm: Resolved<number>; gradcrackerMaxJobsPerTerm: Resolved<number>;
startupjobsMaxJobsPerTerm: Resolved<number>; startupjobsMaxJobsPerTerm: Resolved<number>;
usajobsMaxJobsPerTerm: Resolved<number>;
jobicyMaxJobsPerTerm: Resolved<number>;
themuseMaxJobsPerTerm: Resolved<number>;
joobleMaxJobsPerTerm: Resolved<number>;
careerjetMaxJobsPerTerm: Resolved<number>;
reedMaxJobsPerTerm: Resolved<number>;
remoteokMaxJobsPerTerm: Resolved<number>;
remotiveMaxJobsPerTerm: Resolved<number>;
arbeitnowMaxJobsPerTerm: Resolved<number>;
himalayasMaxJobsPerTerm: Resolved<number>;
weworkremotelyMaxJobsPerTerm: Resolved<number>;
fourdayweekMaxJobsPerTerm: Resolved<number>;
leverCompanies: Resolved<string[]>;
ashbyCompanies: Resolved<string[]>;
greenhouseCompanies: Resolved<string[]>;
workdayTenants: Resolved<string[]>;
searchTerms: Resolved<string[]>; searchTerms: Resolved<string[]>;
workplaceTypes: Resolved<Array<"remote" | "hybrid" | "onsite">>; workplaceTypes: Resolved<Array<"remote" | "hybrid" | "onsite">>;
blockedCompanyKeywords: Resolved<string[]>; blockedCompanyKeywords: Resolved<string[]>;
@ -241,6 +257,12 @@ export interface AppSettings {
ukvisajobsEmail: string | null; ukvisajobsEmail: string | null;
adzunaAppId: string | null; adzunaAppId: string | null;
basicAuthUser: string | null; basicAuthUser: string | null;
usajobsUserAgent: string | null;
themuseApiKey: string | null;
careerjetAffid: string | null;
careerjetReferer: string | null;
careerjetUserIp: string | null;
careerjetUserAgent: string | null;
// Secret hints: // Secret hints:
llmApiKeyHint: string | null; llmApiKeyHint: string | null;
@ -250,6 +272,9 @@ export interface AppSettings {
adzunaAppKeyHint: string | null; adzunaAppKeyHint: string | null;
basicAuthPasswordHint: string | null; basicAuthPasswordHint: string | null;
webhookSecretHint: string | null; webhookSecretHint: string | null;
usajobsApiKeyHint: string | null;
joobleApiKeyHint: string | null;
reedApiKeyHint: string | null;
// Computed: // Computed:
basicAuthActive: boolean; basicAuthActive: boolean;