ilia 7b3dfb002a
Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
feat(extractors): add 17 job source extractors and cross-source dedup
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.

Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
  new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources

Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
  templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 20:17:52 -04:00

173 lines
4.8 KiB
TypeScript

/**
* Arbeitnow public job board API.
*
* https://www.arbeitnow.com/api/job-board-api?page=N
*
* No auth. Returns 100 results per page, sorted by creation date.
* No server-side search — we paginate and filter client-side by
* title + tags against each pipeline search term.
*
* Aggregates listings from Greenhouse, SmartRecruiters, Join,
* TeamTailor, Recruitee, and Comeet.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/** Base endpoint of the Arbeitnow job board API; requests append a `page` query parameter to it. */
const API_URL = "https://www.arbeitnow.com/api/job-board-api";
/** Upper bound on the number of result pages requested during a single extractor run. */
const MAX_PAGES = 5;
/**
 * Shape of a single job entry in an Arbeitnow API page.
 *
 * Every field is optional: the JSON payload is cast to this type without
 * runtime validation, so consumers check each field before using it.
 */
interface ArbeitnowJob {
/** Mapped to `sourceJobId`; preferred over the URL as the de-duplication key. */
slug?: string;
/** Mapped to `employer`. */
company_name?: string;
/** Mapped to `title`; also matched against the pipeline search terms. */
title?: string;
/** Mapped to `jobDescription`. */
description?: string;
/** Mapped to `isRemote`; only a literal `true` counts as remote. */
remote?: boolean;
/** Mapped to both `jobUrl` and `applicationLink`; entries without it are dropped. */
url?: string;
/** Matched against the pipeline search terms and joined into `disciplines`. */
tags?: string[];
/** Joined with ", " into `jobType`. */
job_types?: string[];
/** Mapped to `location`. */
location?: string;
/** Multiplied by 1000 before being passed to `Date`, i.e. treated as Unix seconds. */
created_at?: number;
}
/** Envelope of one page returned by the Arbeitnow API. */
interface ArbeitnowResponse {
/** Job entries on this page; a missing or empty list ends pagination. */
data?: ArbeitnowJob[];
/** Pagination links; paging stops when `next` is absent, null, or empty. */
links?: { next?: string | null };
/** Pagination metadata; not read by the extractor logic visible in this file. */
meta?: { current_page?: number };
}
/**
 * Normalizes an arbitrary value into a trimmed, non-empty string.
 *
 * @param value - Any value, typically a field read from an unvalidated payload.
 * @returns The trimmed text, or `undefined` when `value` is not a string or
 *   consists only of whitespace.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const text = value.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return undefined;
}
/**
 * Reports whether a job is relevant to a search term.
 *
 * The comparison is a case-insensitive substring test against the job title
 * first, then against each string entry of the job's tags.
 *
 * @param job - Raw job entry from the API.
 * @param term - Search term to look for.
 * @returns `true` when the title or any tag contains `term`.
 */
function matchesTerm(job: ArbeitnowJob, term: string): boolean {
  const needle = term.toLowerCase();
  const contains = (text: string): boolean =>
    text.toLowerCase().includes(needle);

  if (job.title != null && contains(job.title)) {
    return true;
  }

  // Tags come from an unvalidated payload, so skip anything that is not a string.
  return (
    Array.isArray(job.tags) &&
    job.tags.some((tag) => typeof tag === "string" && contains(tag))
  );
}
/**
 * Converts a raw Arbeitnow job entry into the pipeline's `CreateJobInput`.
 *
 * @param raw - Job entry as returned by the API (unvalidated).
 * @returns The mapped job, or `null` when the entry has no usable URL.
 *   Missing title, employer and location fall back to placeholder strings;
 *   other missing fields are left `undefined`.
 */
function mapJob(raw: ArbeitnowJob): CreateJobInput | null {
  const jobUrl = asString(raw.url);
  if (!jobUrl) return null;

  const tags = Array.isArray(raw.tags)
    ? raw.tags.filter((t): t is string => typeof t === "string" && t.length > 0)
    : [];

  const jobTypes = Array.isArray(raw.job_types)
    ? raw.job_types
        .filter((t): t is string => typeof t === "string" && t.length > 0)
        .join(", ")
    : undefined;

  // `created_at` is treated as Unix seconds. NaN, Infinity and out-of-range
  // values all pass a `typeof === "number"` check but produce an invalid Date,
  // on which toISOString() throws RangeError. Drop the date instead of letting
  // one malformed record abort the caller.
  let datePosted: string | undefined;
  if (typeof raw.created_at === "number") {
    const posted = new Date(raw.created_at * 1000);
    if (!Number.isNaN(posted.getTime())) {
      datePosted = posted.toISOString();
    }
  }

  return {
    source: "arbeitnow",
    sourceJobId: asString(raw.slug),
    title: asString(raw.title) ?? "Unknown Title",
    employer: asString(raw.company_name) ?? "Unknown Employer",
    jobUrl,
    applicationLink: jobUrl,
    location: asString(raw.location) ?? "Unknown",
    isRemote: raw.remote === true,
    // An empty join result (no valid job types) is reported as undefined.
    jobType: jobTypes || undefined,
    datePosted,
    jobDescription: asString(raw.description),
    disciplines: tags.length > 0 ? tags.join(", ") : undefined,
  };
}
/**
 * Requests one page of listings from the Arbeitnow API.
 *
 * @param page - Page number sent as the `page` query parameter.
 * @returns The parsed JSON body, cast to `ArbeitnowResponse` without validation.
 * @throws Error when the HTTP response status is not OK.
 */
async function fetchPage(page: number): Promise<ArbeitnowResponse> {
  const endpoint = `${API_URL}?page=${page}`;
  const res = await fetch(endpoint, {
    headers: { Accept: "application/json" },
  });

  if (res.ok) {
    const payload = (await res.json()) as ArbeitnowResponse;
    return payload;
  }

  throw new Error(`Arbeitnow request failed with status ${res.status}`);
}
/**
 * Extractor manifest for the Arbeitnow job board.
 *
 * `run` requests up to `MAX_PAGES` pages, keeps the jobs whose title or tags
 * match at least one search term (or every job when no terms are supplied),
 * de-duplicates them by slug (falling back to URL) and returns the mapped
 * records.
 *
 * The job budget for a run is the `arbeitnowMaxJobsPerTerm` setting (default
 * 100) multiplied by the number of search terms (minimum 1). The budget is
 * enforced per job, so the result never exceeds it.
 *
 * If a request fails, the jobs collected so far are returned together with
 * `success: false` and the error message. If the run is cancelled between
 * pages, the jobs collected so far are returned with `success: true`.
 */
export const manifest: ExtractorManifest = {
  id: "arbeitnow",
  displayName: "Arbeitnow",
  providesSources: ["arbeitnow"],

  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const defaultMaxJobs = 100;
    const configuredMax = context.settings.arbeitnowMaxJobsPerTerm
      ? Number.parseInt(context.settings.arbeitnowMaxJobsPerTerm, 10)
      : defaultMaxJobs;
    // parseInt yields NaN for a non-numeric setting, and every `>= NaN`
    // comparison is false, which would silently disable the cap.
    const maxJobs = Number.isNaN(configuredMax) ? defaultMaxJobs : configuredMax;

    const terms = context.searchTerms;
    const jobBudget = maxJobs * Math.max(terms.length, 1);

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let page = 1; page <= MAX_PAGES; page += 1) {
        if (context.shouldCancel?.()) break;
        if (out.length >= jobBudget) break;

        context.onProgress?.({
          phase: "list",
          termsProcessed: 0,
          termsTotal: 1,
          currentUrl: `page ${page}`,
          detail: `Arbeitnow: fetching page ${page}`,
        });

        const body = await fetchPage(page);
        const jobs = Array.isArray(body.data) ? body.data : [];
        if (jobs.length === 0) break;

        for (const raw of jobs) {
          // Stop mid-page once the budget is reached so the result cannot
          // overshoot it by up to a full page of matches.
          if (out.length >= jobBudget) break;

          // With no search terms every job is accepted.
          if (terms.length > 0 && !terms.some((t) => matchesTerm(raw, t))) {
            continue;
          }

          const mapped = mapJob(raw);
          if (!mapped) continue;

          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: 0,
          termsTotal: 1,
          currentUrl: `page ${page}`,
          jobPagesProcessed: out.length,
          detail: `Arbeitnow: page ${page} done (${out.length} matched so far)`,
        });

        // The API signals the last page by omitting the `next` link.
        if (!body.links?.next) break;
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      // Return whatever was collected before the failure alongside the error.
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};
export default manifest;