ilia 7b3dfb002a
Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
feat(extractors): add 17 job source extractors and cross-source dedup
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.

Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
  new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources

Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
  templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 20:17:52 -04:00

189 lines
5.3 KiB
TypeScript

/**
* Greenhouse public job boards API.
*
* https://developers.greenhouse.io/job-board.html
* GET https://boards-api.greenhouse.io/v1/boards/{company}/jobs?content=true
*
* No auth. Each entry in `greenhouseCompanies` is fetched independently.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/**
* One entry of a job's `departments` array in the Greenhouse response.
* Both fields are optional; of the two, only `name` is read by the visible
* code (`mapJob` joins the names into `jobFunction`).
*/
interface GhDepartment {
id?: number;
name?: string;
}
/**
* One entry of a job's `metadata` array in the Greenhouse response.
* Declared for the response shape only; nothing in the visible code reads it.
*/
interface GhMetadata {
name?: string;
// Kept as `unknown`: no visible code inspects the value, so no shape is assumed.
value?: unknown;
}
/**
* A single job posting as returned in the `jobs` array of the board endpoint.
* Every field is optional, so consumers must cope with absent values.
*
* How `mapJob` uses the fields:
* - `id` becomes `sourceJobId` (stringified) when present.
* - `absolute_url` is mandatory in practice: postings without it are skipped;
*   it is used for both `jobUrl` and `applicationLink`.
* - `updated_at` is passed through as `datePosted`.
* - `location.name` is the preferred location; `offices` names are the fallback.
* - `departments` names are joined into `jobFunction`.
* - `content` is entity-decoded into `jobDescription`.
*
* `internal_job_id`, `requisition_id` and `metadata` are not read by the
* visible code.
*/
interface GhJob {
id?: number;
title?: string;
absolute_url?: string;
internal_job_id?: number;
updated_at?: string;
requisition_id?: string | null;
location?: { name?: string };
content?: string; // HTML, may be entity-encoded
metadata?: GhMetadata[];
departments?: GhDepartment[];
offices?: Array<{ name?: string }>;
}
/** Top-level JSON body of the board jobs endpoint, as consumed by `fetchCompany`. */
interface GhResponse {
// Optional on purpose: `fetchCompany` falls back to an empty list when this is not an array.
jobs?: GhJob[];
}
/**
 * Normalizes an arbitrary value to a trimmed, non-empty string.
 *
 * @param value - Any value; only strings are accepted.
 * @returns The trimmed text, or `undefined` when `value` is not a string or
 *   is empty / whitespace-only.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const text = value.trim();
    if (text.length > 0) return text;
  }
  return undefined;
}
/**
 * Parses the configured list of Greenhouse board slugs.
 *
 * Accepted forms:
 * - a JSON array of strings (non-string entries are ignored);
 * - a JSON string, whose content is then split like plain text;
 * - plain text separated by newlines, commas, semicolons or pipes.
 *
 * Entries are trimmed, blank entries are dropped, and duplicates are removed
 * (first occurrence wins) so that no board is requested more than once.
 *
 * @param raw - Raw setting value; may be `undefined` or empty.
 * @returns Unique slugs in their original order; an empty array when nothing
 *   usable is configured.
 */
function readCompanies(raw: string | undefined): string[] {
  if (!raw) return [];

  let text = raw;
  let fromJsonArray: string[] | undefined;
  try {
    const parsed: unknown = JSON.parse(raw);
    if (Array.isArray(parsed)) {
      fromJsonArray = parsed.filter(
        (entry): entry is string => typeof entry === "string",
      );
    } else if (typeof parsed === "string") {
      // A quoted scalar such as `"stripe"`: split the unquoted content instead
      // of the raw text, which would keep the quote characters in the slug.
      text = parsed;
    }
  } catch {
    // Not JSON: treat the raw value as delimiter-separated text below.
  }

  const candidates = fromJsonArray ?? text.split(/[\n,;|]+/);

  // A Set keeps insertion order, giving stable de-duplication.
  const unique = new Set<string>();
  for (const candidate of candidates) {
    const slug = candidate.trim();
    if (slug) unique.add(slug);
  }
  return [...unique];
}
/** Named entities recognised by `decodeHtmlEntities`, keyed by name without `&` and `;`. */
const NAMED_HTML_ENTITIES: Readonly<Record<string, string>> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  // Decoded to a plain space rather than U+00A0.
  nbsp: " ",
};

/**
 * Decodes exactly one level of HTML entity encoding.
 *
 * All entities are replaced in a single pass, so the output of one
 * replacement is never decoded again: `&amp;lt;` becomes the text `&lt;`,
 * not `<`. Chaining separate replacements with `&amp;` first would unescape
 * such input twice.
 *
 * Handles the named entities `&amp; &lt; &gt; &quot; &nbsp;` plus decimal
 * (`&#39;`) and hexadecimal (`&#x27;`) numeric references. Unknown names and
 * numeric references that are not valid Unicode scalar values are left as-is.
 *
 * @param value - Text that may contain HTML entities.
 * @returns The text with one level of entities decoded.
 */
function decodeHtmlEntities(value: string): string {
  return value.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|nbsp));/g,
    (match: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) return NAMED_HTML_ENTITIES[name] ?? match;

      const codePoint =
        dec !== undefined
          ? Number.parseInt(dec, 10)
          : Number.parseInt(hex ?? "", 16);
      // Reject NUL, surrogates and out-of-range values; String.fromCodePoint
      // would throw or produce ill-formed text for them.
      const isScalarValue =
        codePoint > 0 &&
        codePoint <= 0x10ffff &&
        (codePoint < 0xd800 || codePoint > 0xdfff);
      return isScalarValue ? String.fromCodePoint(codePoint) : match;
    },
  );
}
/**
 * Converts one Greenhouse posting into the pipeline's job input shape.
 *
 * The employer name is derived from the board slug: it is split on `-` and
 * `_`, each part is capitalised, and the parts are joined with spaces. The
 * raw slug is used when that yields nothing.
 *
 * @param job - Posting as returned by the Greenhouse board endpoint.
 * @param company - Board slug the posting was fetched from.
 * @returns The mapped job, or `null` when the posting has no usable
 *   `absolute_url`.
 */
function mapJob(job: GhJob, company: string): CreateJobInput | null {
  const url = asString(job.absolute_url);
  if (url === undefined) return null;

  // Collects the non-empty, trimmed `name` values of a list of named entries.
  const namesOf = (items: Array<{ name?: string }> | undefined): string[] =>
    (items ?? []).reduce<string[]>((names, item) => {
      const name = asString(item.name);
      if (name) names.push(name);
      return names;
    }, []);

  const words: string[] = [];
  for (const part of company.split(/[-_]/)) {
    if (part) words.push(part.charAt(0).toUpperCase() + part.slice(1));
  }
  const prettyName = words.join(" ");

  const offices = namesOf(job.offices);
  const departments = namesOf(job.departments);

  let html: string | undefined;
  if (job.content) html = decodeHtmlEntities(job.content);

  // The job's own location wins; office names are only a fallback.
  const primaryLocation = asString(job.location?.name);
  const fallbackLocation = offices.length > 0 ? offices.join("; ") : undefined;

  return {
    source: "greenhouse",
    sourceJobId: job.id == null ? undefined : String(job.id),
    title: asString(job.title) ?? "Unknown Title",
    employer: prettyName || company,
    jobUrl: url,
    applicationLink: url,
    location: primaryLocation ?? fallbackLocation,
    jobFunction: departments.join(", ") || undefined,
    datePosted: asString(job.updated_at),
    jobDescription: html,
  };
}
/**
 * Downloads the job list (with descriptions) of one Greenhouse board.
 *
 * @param company - Board slug; it is URL-encoded before being placed in the path.
 * @returns The `jobs` array of the response; an empty array when the board
 *   responds with 404 or the body carries no `jobs` array.
 * @throws Error when the response has any other non-OK status.
 */
async function fetchCompany(company: string): Promise<GhJob[]> {
  const slug = encodeURIComponent(company);
  const endpoint = `https://boards-api.greenhouse.io/v1/boards/${slug}/jobs?content=true`;
  const res = await fetch(endpoint, { headers: { Accept: "application/json" } });

  // A 404 is treated as "no jobs", not as a failure.
  if (res.status === 404) {
    return [];
  }
  if (!res.ok) {
    const reason = `Greenhouse request for "${company}" failed with status ${res.status}`;
    throw new Error(reason);
  }

  const payload = (await res.json()) as GhResponse;
  if (Array.isArray(payload.jobs)) {
    return payload.jobs;
  }
  return [];
}
/**
 * Extractor manifest for Greenhouse-hosted job boards.
 *
 * `run` reads the board slugs from `context.settings.greenhouseCompanies`,
 * fetches each board in sequence and maps its postings to job inputs.
 *
 * - Postings are de-duplicated across boards by `sourceJobId`, falling back
 *   to `jobUrl`.
 * - Cancellation is checked before starting and before each board; a
 *   cancelled run still reports success with the jobs gathered so far.
 * - With no boards configured the run succeeds with no jobs and carries an
 *   explanatory `error` message.
 * - Any thrown error stops the run; the result is `success: false` with the
 *   jobs gathered so far and the error message.
 */
export const manifest: ExtractorManifest = {
  id: "greenhouse",
  displayName: "Greenhouse (ATS)",
  providesSources: ["greenhouse"],

  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const companies = readCompanies(context.settings.greenhouseCompanies);
    if (companies.length === 0) {
      return {
        success: true,
        jobs: [],
        error:
          "No Greenhouse companies configured. Set GREENHOUSE_COMPANIES or the greenhouseCompanies setting (comma- or newline-separated slugs).",
      };
    }

    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (const [i, company] of companies.entries()) {
        if (context.shouldCancel?.()) break;

        // Announce the board before the request so progress is visible
        // while the fetch is still in flight.
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: companies.length,
          currentUrl: company,
          detail: `Greenhouse: ${company} (${i + 1}/${companies.length})`,
        });

        let added = 0;
        const jobs = await fetchCompany(company);
        for (const job of jobs) {
          const mapped = mapJob(job, company);
          if (!mapped) continue;

          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
          added += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: companies.length,
          currentUrl: company,
          jobPagesProcessed: out.length,
          // Separator between slug and count: without it the message reads
          // like "stripe12 jobs".
          detail: `Greenhouse: ${company} - ${added} jobs (${out.length} total)`,
        });
      }
    } catch (error) {
      // Keep whatever was collected before the failure.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};

export default manifest;