ilia 7b3dfb002a
Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
feat(extractors): add 17 job source extractors and cross-source dedup
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.

Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
  new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources

Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
  templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 20:17:52 -04:00

264 lines
7.7 KiB
TypeScript

/**
* Workday public career-site extractor.
*
* Workday tenants expose their public job board over a JSON CXS endpoint:
* POST {tenantUrl}/wday/cxs/{tenant}/{site}/jobs
* { appliedFacets: {}, limit: 20, offset: 0, searchText: "..." }
*
* `workdayTenants` accepts entries shaped as JSON objects (preferred) or as
* career-page URLs we parse on a best-effort basis. When we can't recover the
* tenant + site we skip the entry and continue.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/**
 * One configured Workday career site to crawl.
 *
 * Produced by `parseTargetEntry` from either a JSON settings entry or a
 * career-page URL.
 */
interface WorkdayTarget {
/** Label copied into each job's `employer` field and used in progress and error messages. */
company: string;
/** Prefix for both the CXS endpoint and public job URLs; a trailing slash is trimmed when the entry is parsed. */
tenantUrl: string;
/** Tenant path segment of the CXS endpoint (`/wday/cxs/{tenant}/{site}/jobs`). */
tenant: string;
/** Site path segment of the CXS endpoint; also part of every public job URL. */
site: string;
/** Locale path segment for public job URLs; `mapPosting` falls back to "en-US" when it is absent. */
locale?: string;
}
/**
 * The fields of a CXS job posting that this extractor reads.
 *
 * Everything is optional because the response is cast, not validated.
 */
interface WorkdayJobPosting {
/** Job title; `mapPosting` substitutes "Unknown Title" when it is missing or blank. */
title?: string;
/** Path appended to the site URL to form the job URL and reused as `sourceJobId`; postings without it are dropped. */
externalPath?: string;
/** Free-text location, copied to the job's `location`. */
locationsText?: string;
/** Posting date text, copied to `datePosted` without parsing. */
postedOn?: string;
/** Extra display strings; the first non-empty entry is used as `jobType`. */
bulletFields?: string[];
}
/** Shape of the CXS `/jobs` response body as consumed by the pagination loop. */
interface WorkdayResponse {
/** Result count reported by the endpoint; used as the upper bound for paging. */
total?: number;
/** Postings for the requested page; an empty or missing list ends paging. */
jobPostings?: WorkdayJobPosting[];
}
/**
 * Normalizes an arbitrary value to a trimmed, non-empty string.
 *
 * @param value - Any value, typically a field of an unvalidated payload.
 * @returns The trimmed text, or `undefined` for non-strings and blank strings.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const text = value.trim();
    if (text.length > 0) return text;
  }
  return undefined;
}
/** Recognizes a leading path segment that is a locale tag such as `en` or `en-US`. */
const LOCALE_SEGMENT = /^[a-z]{2}(?:[-_][a-z0-9]{2,4})*$/i;

/**
 * Derives the tenant name from a Workday-hosted career-site host name.
 *
 * @param host - Host name without a port, e.g. `acme.wd5.myworkdayjobs.com`.
 * @returns The leading label ("acme"), or `null` when the host does not have
 *   the `{tenant}.wd{N}.myworkdayjobs.com` shape.
 */
function inferTenantFromHost(host: string): string | null {
  // host looks like `acme.wd5.myworkdayjobs.com` → tenant "acme"
  const match = host.match(/^([^.]+)\.wd\d+\.myworkdayjobs\.com$/i);
  // Optional chaining keeps this sound under `noUncheckedIndexedAccess`.
  return match?.[1] ?? null;
}

/**
 * Parses one `workdayTenants` entry into a crawl target.
 *
 * Two forms are accepted:
 * - a JSON object with non-blank string `company`, `tenantUrl`, `tenant` and
 *   `site` (plus an optional `locale`);
 * - a career-page URL such as
 *   `https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite`,
 *   with or without the locale segment, the scheme, or a trailing job path.
 *
 * @param entry - Raw entry text.
 * @returns The parsed target, or `null` when tenant and site cannot be
 *   recovered (callers skip such entries).
 */
function parseTargetEntry(entry: string): WorkdayTarget | null {
  const trimmed = entry.trim();
  if (!trimmed) return null;

  // First, try JSON.
  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmed);
  } catch {
    // Not JSON; fall through to URL parsing.
    parsed = undefined;
  }
  if (typeof parsed === "object" && parsed !== null) {
    const record = parsed as Record<string, unknown>;
    // Blank strings are treated as missing so they cannot produce URLs with
    // empty path segments later on.
    const field = (key: string): string | undefined => {
      const value = record[key];
      if (typeof value !== "string") return undefined;
      const text = value.trim();
      return text ? text : undefined;
    };
    const company = field("company");
    const tenantUrl = field("tenantUrl")?.replace(/\/+$/, "");
    const tenant = field("tenant");
    const site = field("site");
    if (company && tenantUrl && tenant && site) {
      return { company, tenantUrl, tenant, site, locale: field("locale") };
    }
  }

  // URL form. Entries pasted without a scheme are assumed to be https.
  const candidate = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed)
    ? trimmed
    : `https://${trimmed}`;
  let url: URL;
  try {
    url = new URL(candidate);
  } catch {
    return null;
  }

  // `hostname` excludes the port, which would otherwise defeat the host match.
  const tenant = inferTenantFromHost(url.hostname);
  if (!tenant) return null;

  const [first, second] = url.pathname.split("/").filter(Boolean);
  if (!first) return null;

  const tenantUrl = `${url.protocol}//${url.host}`;
  if (second !== undefined && LOCALE_SEGMENT.test(first)) {
    // /{locale}/{site}[/...]
    return { company: tenant, tenantUrl, tenant, site: second, locale: first };
  }
  // /{site}[/...] — no locale in the URL; consumers apply their own default.
  return { company: tenant, tenantUrl, tenant, site: first, locale: undefined };
}
/**
 * Expands the raw `workdayTenants` setting into crawl targets.
 *
 * Accepted shapes, tried in this order:
 * 1. a JSON array whose items are entry strings or target objects;
 * 2. a single JSON target object (possibly pretty-printed over several lines);
 * 3. plain text with one entry per line.
 *
 * Entries that `parseTargetEntry` cannot understand are skipped.
 *
 * @param raw - Setting value; `undefined` or empty yields no targets.
 * @returns Parsed targets in input order.
 */
function readTargets(raw: string | undefined): WorkdayTarget[] {
  if (!raw) return [];

  // settings store stringifies JSON arrays; if we got a JSON array of strings
  // we still need to parse each entry individually.
  let entries: string[] = [];
  try {
    const parsed: unknown = JSON.parse(raw);
    if (Array.isArray(parsed)) {
      entries = parsed
        .map((entry) =>
          typeof entry === "string" ? entry : JSON.stringify(entry),
        )
        .filter(Boolean);
    } else if (typeof parsed === "object" && parsed !== null) {
      // A lone object must stay in one piece: splitting a pretty-printed
      // object on newlines would turn it into unparseable fragments.
      entries = [JSON.stringify(parsed)];
    }
  } catch {
    // Not JSON as a whole; handled by the line-based fallback below.
  }

  if (entries.length === 0) {
    entries = raw
      .split(/\n+/)
      .map((line) => line.trim())
      .filter(Boolean);
  }

  const out: WorkdayTarget[] = [];
  for (const entry of entries) {
    const target = parseTargetEntry(entry);
    if (target) out.push(target);
  }
  return out;
}
/**
 * Converts one CXS posting into the pipeline's job input.
 *
 * @param posting - Posting as returned by the CXS endpoint (unvalidated).
 * @param target - Career site the posting came from; supplies the employer
 *   label and the URL prefix.
 * @returns The job input, or `null` when the posting has no usable
 *   `externalPath` (there is then neither an id nor a URL for it).
 */
function mapPosting(
  posting: WorkdayJobPosting,
  target: WorkdayTarget,
): CreateJobInput | null {
  const externalPath = asString(posting.externalPath);
  if (!externalPath) return null;

  // A blank locale must not defeat the default, or the URL gets an empty segment.
  const locale = asString(target.locale) ?? "en-US";
  // Guarantee exactly one separator between the site and the posting path.
  const pathSuffix = externalPath.startsWith("/")
    ? externalPath
    : `/${externalPath}`;
  const jobUrl = `${target.tenantUrl}/${locale}/${target.site}${pathSuffix}`;

  // NOTE(review): the first non-blank bullet field is treated as the job type;
  // confirm tenants do not put other data (e.g. a requisition id) there.
  let jobType: string | undefined;
  if (Array.isArray(posting.bulletFields)) {
    for (const field of posting.bulletFields) {
      jobType = asString(field);
      if (jobType) break;
    }
  }

  return {
    source: "workday",
    // Kept exactly as received so stored ids remain stable.
    sourceJobId: externalPath,
    title: asString(posting.title) ?? "Unknown Title",
    employer: target.company,
    jobUrl,
    applicationLink: jobUrl,
    location: asString(posting.locationsText),
    datePosted: asString(posting.postedOn),
    jobType,
  };
}
/** Upper bound for one CXS request, including reading the response body. */
const WORKDAY_REQUEST_TIMEOUT_MS = 30000;

/**
 * Requests one page of postings from a tenant's CXS `/jobs` endpoint.
 *
 * @param args.target - Career site to query.
 * @param args.searchText - Free-text filter; an empty string lists everything.
 * @param args.offset - Zero-based index of the first posting to return.
 * @param args.limit - Page size to request.
 * @returns The response body, cast to `WorkdayResponse` without validation.
 * @throws Error when the endpoint answers with a non-2xx status or does not
 *   complete within `WORKDAY_REQUEST_TIMEOUT_MS`; network and JSON parsing
 *   errors propagate unchanged.
 */
async function fetchPage(args: {
  target: WorkdayTarget;
  searchText: string;
  offset: number;
  limit: number;
}): Promise<WorkdayResponse> {
  const { target, searchText, offset, limit } = args;
  const url = `${target.tenantUrl}/wday/cxs/${encodeURIComponent(target.tenant)}/${encodeURIComponent(target.site)}/jobs`;

  // Without a deadline a stalled tenant would block the whole run, and the
  // caller could not even poll for cancellation.
  const controller = new AbortController();
  const timer = setTimeout(
    () => controller.abort(),
    WORKDAY_REQUEST_TIMEOUT_MS,
  );
  try {
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Accept: "application/json",
      },
      body: JSON.stringify({
        appliedFacets: {},
        limit,
        offset,
        searchText,
      }),
      signal: controller.signal,
    });
    if (!response.ok) {
      throw new Error(
        `Workday request for "${target.company}" failed with status ${response.status}`,
      );
    }
    return (await response.json()) as WorkdayResponse;
  } catch (error) {
    // Replace the generic abort error with one that names the tenant.
    if (controller.signal.aborted) {
      throw new Error(
        `Workday request for "${target.company}" timed out after ${WORKDAY_REQUEST_TIMEOUT_MS}ms`,
      );
    }
    throw error;
  } finally {
    clearTimeout(timer);
  }
}
/** Page size requested from the CXS endpoint. */
const WORKDAY_PAGE_SIZE = 20;
/** Paging stops once the offset reaches this value, per target and search term. */
const WORKDAY_MAX_OFFSET = 1000;

/**
 * Extractor manifest for Workday-hosted career sites.
 *
 * `run` crawls every configured target once per search term (or once with an
 * empty search when no terms are given), paging through the CXS endpoint and
 * collecting mapped postings.
 *
 * Result contract:
 * - no targets configured → `success: true`, no jobs, and an explanatory `error`;
 * - a failing target is recorded and the crawl continues with the next one;
 * - `success` is `false` only when errors occurred and no job was collected.
 *
 * Cancellation (`context.shouldCancel`) is polled before every target, term
 * and page; whatever was collected up to that point is returned.
 */
export const manifest: ExtractorManifest = {
  id: "workday",
  displayName: "Workday (ATS)",
  providesSources: ["workday"],

  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const targets = readTargets(context.settings.workdayTenants);
    if (targets.length === 0) {
      return {
        success: true,
        jobs: [],
        error:
          "No Workday tenants configured. Set WORKDAY_TENANTS or the workdayTenants setting to a list of career-site URLs (or JSON entries with company/tenantUrl/tenant/site).",
      };
    }

    // An empty term means "list everything" for the tenant.
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];
    const errors: string[] = [];

    for (const [targetIndex, target] of targets.entries()) {
      if (context.shouldCancel?.()) break;
      try {
        for (const [termIndex, rawTerm] of terms.entries()) {
          if (context.shouldCancel?.()) break;
          const term = rawTerm.trim();
          context.onProgress?.({
            phase: "list",
            termsProcessed: targetIndex * terms.length + termIndex,
            termsTotal: targets.length * terms.length,
            currentUrl: `${target.company} (${term || "all"})`,
            detail: `Workday: ${target.company} term ${termIndex + 1}/${terms.length}`,
          });

          let offset = 0;
          let total = Number.POSITIVE_INFINITY;
          while (offset < total && offset < WORKDAY_MAX_OFFSET) {
            if (context.shouldCancel?.()) break;
            const body = await fetchPage({
              target,
              searchText: term,
              offset,
              limit: WORKDAY_PAGE_SIZE,
            });

            // Trust `total` from the first page only. NOTE(review): Workday
            // reportedly answers later pages with `total: 0`; taking that
            // value would stop paging after the second page — confirm. The
            // empty-page and short-page checks below still end the loop.
            if (
              offset === 0 &&
              typeof body.total === "number" &&
              body.total > 0
            ) {
              total = body.total;
            }

            const postings = Array.isArray(body.jobPostings)
              ? body.jobPostings
              : [];
            if (postings.length === 0) break;

            for (const posting of postings) {
              const mapped = mapPosting(posting, target);
              if (!mapped) continue;
              // NOTE(review): `sourceJobId` is the tenant-relative path, so two
              // tenants could in principle share a key; left unchanged because
              // downstream storage may rely on unique ids per run — confirm.
              const key = mapped.sourceJobId || mapped.jobUrl;
              if (seen.has(key)) continue;
              seen.add(key);
              out.push(mapped);
            }

            offset += postings.length;
            // A short page is the last page.
            if (postings.length < WORKDAY_PAGE_SIZE) break;
          }
        }
      } catch (error) {
        // One failing tenant must not abort the others; remember and move on.
        const message =
          error instanceof Error ? error.message : "Unknown error";
        errors.push(`${target.company}: ${message}`);
      }
    }

    if (out.length === 0 && errors.length > 0) {
      return { success: false, jobs: out, error: errors.join("; ") };
    }
    return {
      success: true,
      jobs: out,
      error: errors.length > 0 ? errors.join("; ") : undefined,
    };
  },
};

export default manifest;