ilia 7b3dfb002a
Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
feat(extractors): add 17 job source extractors and cross-source dedup
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.

Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
  new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources

Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
  templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 20:17:52 -04:00

196 lines
5.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Himalayas public remote-jobs API.
*
* https://himalayas.app/jobs/api?limit=N&offset=M
*
* No auth. Returns up to `limit` results per call. No server-side
* search — we paginate and filter client-side by title + categories.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/** Base endpoint of the Himalayas jobs API; `limit` and `offset` query parameters are appended per request. */
const API_URL = "https://himalayas.app/jobs/api";
/** Number of records requested per API call; also the step between consecutive page offsets. */
const PAGE_SIZE = 50;
/** Upper bound on how many pages a single extractor run will request. */
const MAX_PAGES = 5;
/**
 * Shape of one job record in the Himalayas API response.
 *
 * Every field is optional because the response body is cast to this type
 * without runtime validation; consumers in this file re-check types before use.
 */
interface HimalayasJob {
title?: string;
excerpt?: string;
companyName?: string;
companySlug?: string;
companyLogo?: string;
employmentType?: string;
// Salary bounds and currency feed formatSalary, which falls back to "USD" when currency is absent.
minSalary?: number | null;
maxSalary?: number | null;
currency?: string;
seniority?: string[];
// Joined with ", " to form the job location; an empty list is reported as "Remote".
locationRestrictions?: string[];
timezoneRestrictions?: number[];
// Matched against search terms (hyphens treated as spaces) and joined into the job's disciplines.
categories?: string[];
parentCategories?: string[];
description?: string;
// mapJob multiplies this by 1000 before building a Date — presumably epoch seconds; confirm against the API docs.
pubDate?: number;
expiryDate?: number;
// Preferred job URL; mapJob falls back to guid when this is missing or blank.
applicationLink?: string;
// Used as the source job id, and as the URL fallback when applicationLink is unusable.
guid?: string;
}
/**
 * Top-level body returned by the Himalayas jobs endpoint, as far as this
 * module reads it: only the `jobs` array is consumed.
 */
interface HimalayasResponse {
jobs?: HimalayasJob[];
}
/**
 * Normalizes an arbitrary value to a trimmed, non-empty string.
 *
 * @param value - Any value, typically a field taken from an unvalidated payload.
 * @returns The trimmed text, or `undefined` when `value` is not a string or
 *   is empty / whitespace-only.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const text = value.trim();
    if (text.length > 0) return text;
  }
  return undefined;
}
/**
 * Case-insensitive check of whether a job is relevant to a search term.
 *
 * A job matches when the term is a substring of its title, or of any of its
 * category names after hyphens in the category have been replaced by spaces.
 *
 * @param job - Raw job record; its fields come from unvalidated JSON, so
 *   their runtime types are checked before use.
 * @param term - Search term to look for.
 * @returns `true` when the title or at least one category contains the term.
 */
function matchesTerm(job: HimalayasJob, term: string): boolean {
  const needle = term.toLowerCase();

  // Guard the title the same way categories are guarded: a non-string title
  // must not throw and abort the whole extraction run.
  if (typeof job.title === "string" && job.title.toLowerCase().includes(needle)) {
    return true;
  }

  if (!Array.isArray(job.categories)) return false;

  return job.categories.some(
    (category) =>
      typeof category === "string" &&
      category.toLowerCase().replace(/-/g, " ").includes(needle),
  );
}
/**
 * Builds a human-readable salary string from a job's salary bounds.
 *
 * @param job - Raw job record; reads `minSalary`, `maxSalary` and `currency`.
 * @returns `"<currency> <min>–<max>"` when both bounds are present,
 *   `"<currency> <value>"` when only one is, or `undefined` when neither is.
 *   The currency defaults to `"USD"` when the record has none. Numbers are
 *   formatted with `toLocaleString()` in the runtime's default locale.
 */
function formatSalary(job: HimalayasJob): string | undefined {
  const { minSalary, maxSalary } = job;
  if (minSalary == null && maxSalary == null) return undefined;

  const currency = job.currency ?? "USD";

  if (minSalary != null && maxSalary != null) {
    // The bounds need a separator, otherwise 50 and 90 render as "5090".
    // \u2013 is an en dash, written as an escape so the source stays free of
    // look-alike Unicode characters.
    return `${currency} ${minSalary.toLocaleString()}\u2013${maxSalary.toLocaleString()}`;
  }

  const onlyBound = minSalary ?? maxSalary;
  return onlyBound != null ? `${currency} ${onlyBound.toLocaleString()}` : undefined;
}
/**
 * Converts a raw Himalayas record into the pipeline's job input shape.
 *
 * The job URL is taken from `applicationLink`, falling back to `guid`; it is
 * reported as both `jobUrl` and `applicationLink`. Records with neither are
 * dropped. Every mapped job is marked remote, and `pubDate` is multiplied by
 * 1000 before being turned into an ISO timestamp.
 *
 * @param raw - One record from the API response.
 * @returns The mapped job, or `null` when no usable URL could be found.
 */
function mapJob(raw: HimalayasJob): CreateJobInput | null {
  // Keeps only the non-empty string entries of a possibly malformed list.
  const nonEmptyStrings = (list: unknown): string[] =>
    Array.isArray(list)
      ? list.filter(
          (entry): entry is string =>
            typeof entry === "string" && entry.length > 0,
        )
      : [];

  const link = asString(raw.applicationLink) ?? asString(raw.guid);
  if (link === undefined) return null;

  const categoryNames = nonEmptyStrings(raw.categories);
  const locationNames = nonEmptyStrings(raw.locationRestrictions);

  let postedAt: string | undefined;
  if (typeof raw.pubDate === "number") {
    postedAt = new Date(raw.pubDate * 1000).toISOString();
  }

  return {
    source: "himalayas",
    sourceJobId: asString(raw.guid),
    title: asString(raw.title) ?? "Unknown Title",
    employer: asString(raw.companyName) ?? "Unknown Employer",
    jobUrl: link,
    applicationLink: link,
    // No location restrictions means the role is open anywhere.
    location: locationNames.length === 0 ? "Remote" : locationNames.join(", "),
    isRemote: true,
    jobType: asString(raw.employmentType),
    companyLogo: asString(raw.companyLogo),
    datePosted: postedAt,
    salary: formatSalary(raw),
    jobDescription: asString(raw.description),
    disciplines:
      categoryNames.length === 0 ? undefined : categoryNames.join(", "),
  };
}
/**
 * Fetches one page of raw job records from the Himalayas API.
 *
 * @param offset - Value sent as the `offset` query parameter.
 * @param limit - Value sent as the `limit` query parameter.
 * @returns The response's `jobs` array, or an empty array when that field is
 *   missing or not an array.
 * @throws Error when the HTTP response status is not OK.
 */
async function fetchPage(
  offset: number,
  limit: number,
): Promise<HimalayasJob[]> {
  const query = `limit=${limit}&offset=${offset}`;
  const res = await fetch(`${API_URL}?${query}`, {
    headers: { Accept: "application/json" },
  });

  if (res.ok) {
    const payload = (await res.json()) as HimalayasResponse;
    return Array.isArray(payload.jobs) ? payload.jobs : [];
  }

  throw new Error(`Himalayas request failed with status ${res.status}`);
}
/**
 * Extractor manifest for the Himalayas remote-jobs source.
 */
export const manifest: ExtractorManifest = {
  id: "himalayas",
  displayName: "Himalayas",
  providesSources: ["himalayas"],

  /**
   * Pages through the Himalayas API (at most `MAX_PAGES` pages of `PAGE_SIZE`
   * records), keeps the records matching any search term, and de-duplicates
   * them by source job id (or URL when there is no id).
   *
   * The result is capped at `himalayasMaxJobsPerTerm` (default 100) times the
   * number of search terms; with no search terms every record is accepted and
   * the cap is the per-term value itself.
   *
   * @param context - Run context supplying settings, search terms, and the
   *   optional `shouldCancel` / `onProgress` callbacks.
   * @returns `success: true` with the collected jobs (also when cancelled), or
   *   `success: false` with the jobs collected so far and the error message
   *   when a request or mapping step throws.
   */
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const defaultMaxJobs = 100;
    const configuredMax = context.settings.himalayasMaxJobsPerTerm
      ? Number.parseInt(context.settings.himalayasMaxJobsPerTerm, 10)
      : defaultMaxJobs;
    // parseInt yields NaN for non-numeric input, and `length >= NaN` is never
    // true, which would silently disable the cap — fall back to the default.
    const maxJobs = Number.isNaN(configuredMax) ? defaultMaxJobs : configuredMax;

    const terms = context.searchTerms;
    const cap = maxJobs * Math.max(terms.length, 1);
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let page = 0; page < MAX_PAGES; page += 1) {
        if (context.shouldCancel?.()) break;
        if (out.length >= cap) break;

        const offset = page * PAGE_SIZE;
        context.onProgress?.({
          phase: "list",
          termsProcessed: 0,
          termsTotal: 1,
          currentUrl: `offset ${offset}`,
          detail: `Himalayas: fetching page ${page + 1}`,
        });

        const raw = await fetchPage(offset, PAGE_SIZE);
        if (raw.length === 0) break;

        for (const item of raw) {
          // The API has no server-side search, so filtering happens here.
          if (terms.length > 0 && !terms.some((t) => matchesTerm(item, t))) {
            continue;
          }
          const mapped = mapJob(item);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
          // Enforce the cap per job, not just per page, so a run never
          // returns more than the configured maximum.
          if (out.length >= cap) break;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: 0,
          termsTotal: 1,
          currentUrl: `offset ${offset}`,
          jobPagesProcessed: out.length,
          detail: `Himalayas: page ${page + 1} done (${out.length} matched so far)`,
        });

        // A short page means the API has no further results.
        if (raw.length < PAGE_SIZE) break;
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      // Return what was gathered before the failure alongside the error.
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};
export default manifest;