Jobber/extractors/usajobs/manifest.ts
ilia 7b3dfb002a
Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
feat(extractors): add 17 job source extractors and cross-source dedup
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.

Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
  new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources

Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
  templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 20:17:52 -04:00

264 lines
8.0 KiB
TypeScript

/**
* USAJOBS public search API.
*
* https://developer.usajobs.gov/api-reference/get-api-search
*
* Requires:
* - USAJOBS_API_KEY (`usajobsApiKey` setting)
* - USAJOBS_USER_AGENT (`usajobsUserAgent` setting) — must be a real contact email per their TOS
*
* The orchestrator already gates this source to United States via
* `isSourceAllowedForCountry`, so we don't re-validate country here.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/** Endpoint that `fetchPage` sends every search request to. */
const API_URL = "https://data.usajobs.gov/api/Search";
/**
 * One entry of a descriptor's `PositionLocation` list.
 *
 * Both fields are optional because the response JSON is cast, not validated.
 * `mapJob` does not read these entries; it uses `PositionLocationDisplay`.
 */
interface UsaJobsLocation {
  CountryCode?: string;
  LocationName?: string;
}
/**
 * One entry of a descriptor's `PositionRemuneration` list.
 *
 * The range bounds are typed as strings here; `mapJob` converts them with
 * `toNumberOrUndefined` and translates the interval code with `mapInterval`.
 */
interface UsaJobsRemuneration {
  /** Code describing the pay period the range applies to. */
  RateIntervalCode?: string;
  /** Lower bound of the pay range, as text. */
  MinimumRange?: string;
  /** Upper bound of the pay range, as text. */
  MaximumRange?: string;
}
/**
 * The `MatchedObjectDescriptor` payload of a single search hit.
 *
 * Every field is optional because the response JSON is cast rather than
 * validated. The comments note how `mapJob` consumes each field.
 */
interface UsaJobsDescriptor {
  // --- identity and links ---
  /** Becomes `sourceJobId`. */
  PositionID?: string;
  /** Becomes `title`. */
  PositionTitle?: string;
  /** Becomes `jobUrl`; hits without it are dropped. */
  PositionURI?: string;
  /** First entry becomes `applicationLink`. */
  ApplyURI?: string[];

  // --- employer ---
  /** Preferred source for `employer`. */
  OrganizationName?: string;
  /** Fallback source for `employer`. */
  DepartmentName?: string;

  // --- location ---
  /** Becomes `location`. */
  PositionLocationDisplay?: string;
  /** Not read by `mapJob`. */
  PositionLocation?: UsaJobsLocation[];

  // --- dates ---
  /** Becomes `datePosted`. */
  PublicationStartDate?: string;
  /** Not read by `mapJob`. */
  PositionStartDate?: string;
  /** Becomes `deadline`. */
  PositionEndDate?: string;

  // --- pay, schedule, description ---
  /** First entry supplies the salary range and interval. */
  PositionRemuneration?: UsaJobsRemuneration[];
  /** First entry's `Name` becomes `jobType`. */
  PositionSchedule?: { Name?: string }[];
  /** `Details.JobSummary` becomes `jobDescription`. */
  UserArea?: { Details?: { JobSummary?: string } };
}
/**
 * One element of `SearchResult.SearchResultItems`.
 *
 * `mapJob` skips items whose descriptor is missing.
 */
interface UsaJobsSearchResultItem {
  /** The posting's details. */
  MatchedObjectDescriptor?: UsaJobsDescriptor;
}
/**
 * Top-level shape of the search response as `fetchPage` returns it.
 *
 * The JSON is cast to this type without validation, hence every level is
 * optional.
 */
interface UsaJobsSearchResult {
  SearchResult?: {
    /** Hits on the current page. */
    SearchResultItems?: UsaJobsSearchResultItem[];
    /** Total hit count for the query; `run` uses it to stop paging. */
    SearchResultCountAll?: number;
  };
}
/**
 * Normalizes an arbitrary value into a trimmed, non-empty string.
 *
 * @param value - Any value; only strings are accepted.
 * @returns The trimmed text, or `undefined` when `value` is not a string or
 *   contains nothing but whitespace.
 */
function asString(value: unknown): string | undefined {
  const text = typeof value === "string" ? value.trim() : "";
  return text.length > 0 ? text : undefined;
}
/**
 * Coerces a number, or a string parseable by `Number.parseFloat`, into a
 * finite number.
 *
 * @param value - Any value; only numbers and strings can yield a result.
 * @returns The finite number, or `undefined` for other types and for
 *   NaN/±Infinity results.
 */
function toNumberOrUndefined(value: unknown): number | undefined {
  let numeric: number;
  switch (typeof value) {
    case "number":
      numeric = value;
      break;
    case "string":
      numeric = Number.parseFloat(value);
      break;
    default:
      return undefined;
  }
  return Number.isFinite(numeric) ? numeric : undefined;
}
/**
 * Lower-cased rate-interval tokens mapped to normalized salary intervals.
 *
 * Covers the two-letter codes plus the spelled-out labels, in case the
 * response carries the label rather than the code.
 *
 * NOTE(review): "pw" is deliberately absent. In the USAJOBS remuneration
 * rate-interval code list PW denotes piece work, not a weekly rate, so mapping
 * it to "weekly" would mislabel the salary. Bi-weekly ("bw") and the
 * non-periodic codes are also left unmapped — confirm against the code list
 * before adding any.
 */
const USAJOBS_INTERVALS: ReadonlyMap<string, string> = new Map([
  ["pa", "yearly"],
  ["py", "yearly"],
  ["per year", "yearly"],
  ["ph", "hourly"],
  ["per hour", "hourly"],
  ["pd", "daily"],
  ["per day", "daily"],
  ["pm", "monthly"],
  ["per month", "monthly"],
]);

/**
 * Translates a USAJOBS rate-interval code into a normalized interval name.
 *
 * @param code - Raw `RateIntervalCode` value; case and surrounding whitespace
 *   are ignored.
 * @returns "yearly", "hourly", "daily" or "monthly", or `undefined` when the
 *   code is missing or not a recognized periodic rate.
 */
function mapInterval(code: string | undefined): string | undefined {
  if (!code) return undefined;
  return USAJOBS_INTERVALS.get(code.trim().toLowerCase());
}
/**
 * Converts one USAJOBS search hit into a `CreateJobInput`.
 *
 * The response is not validated at runtime, so every text field is read
 * through `asString`. That includes the schedule name: calling `.trim()` on
 * it directly would throw on a non-string value and abort the whole run.
 *
 * @param item - A search result item as returned by the API.
 * @returns The mapped job, or `null` when the item has no descriptor or no
 *   usable `PositionURI`.
 */
function mapJob(item: UsaJobsSearchResultItem): CreateJobInput | null {
  const descriptor = item.MatchedObjectDescriptor;
  if (!descriptor) return null;

  // The posting URL is the one field we cannot do without.
  const jobUrl = asString(descriptor.PositionURI);
  if (!jobUrl) return null;

  // Only the first remuneration entry is considered.
  const remuneration = descriptor.PositionRemuneration?.[0];
  const min = toNumberOrUndefined(remuneration?.MinimumRange);
  const max = toNumberOrUndefined(remuneration?.MaximumRange);
  const interval = mapInterval(remuneration?.RateIntervalCode);

  // Prefer the first apply link; fall back to the posting URL.
  const applyUris = descriptor.ApplyURI;
  const applicationLink =
    (Array.isArray(applyUris) ? asString(applyUris[0]) : undefined) ?? jobUrl;

  return {
    source: "usajobs",
    sourceJobId: asString(descriptor.PositionID),
    title: asString(descriptor.PositionTitle) ?? "Unknown Title",
    employer:
      asString(descriptor.OrganizationName) ??
      asString(descriptor.DepartmentName) ??
      "U.S. Federal Government",
    jobUrl,
    applicationLink,
    location: asString(descriptor.PositionLocationDisplay),
    datePosted: asString(descriptor.PublicationStartDate),
    deadline: asString(descriptor.PositionEndDate),
    jobDescription: asString(descriptor.UserArea?.Details?.JobSummary),
    jobType: asString(descriptor.PositionSchedule?.[0]?.Name),
    salaryMinAmount: min,
    salaryMaxAmount: max,
    // Currency is only reported when at least one non-zero bound is present.
    salaryCurrency: (min || max) ? "USD" : undefined,
    salaryInterval: interval,
  };
}
/**
 * Requests one page of USAJOBS search results, newest postings first.
 *
 * @param args.apiKey - Sent as the `Authorization-Key` header.
 * @param args.userAgent - Sent as the `User-Agent` header.
 * @param args.keyword - Search keyword; always sent, even when empty.
 * @param args.locationName - Optional location filter; omitted when falsy.
 * @param args.page - Page number to request.
 * @param args.resultsPerPage - Page size to request.
 * @returns The parsed JSON body, cast (not validated) to `UsaJobsSearchResult`.
 * @throws Error when the response status is not OK.
 */
async function fetchPage(args: {
  apiKey: string;
  userAgent: string;
  keyword: string;
  locationName?: string;
  page: number;
  resultsPerPage: number;
}): Promise<UsaJobsSearchResult> {
  const { apiKey, userAgent, keyword, locationName, page, resultsPerPage } =
    args;

  // Query parameters in the order they appear in the final URL.
  const query: Array<[string, string]> = [["Keyword", keyword]];
  if (locationName) {
    query.push(["LocationName", locationName]);
  }
  query.push(
    ["ResultsPerPage", String(resultsPerPage)],
    ["Page", String(page)],
    ["SortField", "OpenDate"],
    ["SortDirection", "Desc"],
  );

  const endpoint = new URL(API_URL);
  for (const [name, value] of query) {
    endpoint.searchParams.set(name, value);
  }

  const requestHeaders = {
    Host: "data.usajobs.gov",
    "User-Agent": userAgent,
    "Authorization-Key": apiKey,
    Accept: "application/json",
  };
  const response = await fetch(endpoint.toString(), {
    headers: requestHeaders,
  });

  if (response.ok) {
    const payload = (await response.json()) as UsaJobsSearchResult;
    return payload;
  }
  throw new Error(`USAJOBS request failed with status ${response.status}`);
}
/** Per-term job cap used when `usajobsMaxJobsPerTerm` is unset, empty, or not numeric. */
const DEFAULT_MAX_JOBS_PER_TERM = 100;

/**
 * Reads the per-term job cap from its string setting.
 *
 * A value that does not parse as an integer falls back to the default. Left
 * as NaN it would make every `collected < cap` comparison false, so the
 * extractor would report success with zero jobs.
 *
 * @param raw - The raw `usajobsMaxJobsPerTerm` setting.
 * @returns The parsed cap, or `DEFAULT_MAX_JOBS_PER_TERM` when `raw` is
 *   missing, empty, or not numeric.
 */
function parseMaxJobsPerTerm(raw: string | null | undefined): number {
  if (!raw) return DEFAULT_MAX_JOBS_PER_TERM;
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? DEFAULT_MAX_JOBS_PER_TERM : parsed;
}

/**
 * Extractor manifest for USAJOBS.
 *
 * `run` pages through the search API once per search term, maps each hit with
 * `mapJob`, and de-duplicates across terms by `sourceJobId` (falling back to
 * `jobUrl`). On a request failure it returns `success: false` together with
 * whatever jobs were collected before the error.
 */
export const manifest: ExtractorManifest = {
  id: "usajobs",
  displayName: "USAJOBS",
  providesSources: ["usajobs"],
  requiredEnvVars: ["USAJOBS_API_KEY", "USAJOBS_USER_AGENT"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // Settings take precedence over environment variables.
    const apiKey =
      context.settings.usajobsApiKey?.trim() ||
      process.env.USAJOBS_API_KEY?.trim();
    const userAgent =
      context.settings.usajobsUserAgent?.trim() ||
      process.env.USAJOBS_USER_AGENT?.trim();
    if (!apiKey || !userAgent) {
      return {
        success: false,
        jobs: [],
        error:
          "USAJOBS extractor requires USAJOBS_API_KEY and USAJOBS_USER_AGENT (a contact email)",
      };
    }

    const maxJobsPerTerm = parseMaxJobsPerTerm(
      context.settings.usajobsMaxJobsPerTerm,
    );
    // USAJOBS caps page size at 500, but smaller pages are friendlier on retry.
    const resultsPerPage = 50;
    // With no search terms, run a single query with an empty keyword.
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    // Only the first "|"-separated city is used as the location filter.
    const locationName =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;

    // De-duplication keys seen so far, shared across all terms.
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `USAJOBS: term ${i + 1}/${terms.length}`,
        });

        let collected = 0;
        let page = 1;
        // Unknown until the first response reports the total hit count.
        let total = Number.POSITIVE_INFINITY;
        while (
          collected < maxJobsPerTerm &&
          (page - 1) * resultsPerPage < total &&
          page < 200 // hard stop on the number of pages per term
        ) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            userAgent,
            keyword: term,
            locationName,
            page,
            resultsPerPage,
          });
          if (typeof body.SearchResult?.SearchResultCountAll === "number") {
            total = body.SearchResult.SearchResultCountAll;
          }
          const items = body.SearchResult?.SearchResultItems ?? [];
          if (items.length === 0) break;

          for (const item of items) {
            const mapped = mapJob(item);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }

          // A short page means this was the last one.
          if (items.length < resultsPerPage) break;
          page += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `USAJOBS: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      // Report the failure but keep the jobs gathered so far.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};

export default manifest;