ilia 7b3dfb002a
Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
feat(extractors): add 17 job source extractors and cross-source dedup
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.

Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
  new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources

Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
  templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 20:17:52 -04:00

225 lines
7.2 KiB
TypeScript

/**
* The Muse public jobs API.
*
* https://www.themuse.com/api/public/jobs?page=0&category=...&location=...
*
* The endpoint works without auth but is heavily rate-limited; an API key
* (THEMUSE_API_KEY / `themuseApiKey` setting) lifts that. We pass each pipeline
* search term as a `category` to keep parity with how other extractors iterate
* search terms; if a term doesn't map to a Muse category the first page comes
* back empty, and the extractor retries that term once without the category
* filter so it still yields generic listings.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/** Base URL of The Muse public jobs endpoint; query parameters are added per request. */
const API_URL = "https://www.themuse.com/api/public/jobs";
/**
 * One page of the jobs listing as this module reads it. The payload is cast,
 * not validated, so every field is optional and checked at the point of use.
 */
interface MuseResponse {
  /** Present in the payload shape; not read by this module. */
  page?: number;
  /** When numeric, bounds how many pages the run loop will request. */
  page_count?: number;
  /** Job entries for this page; treated as empty when missing or not an array. */
  results?: MuseJob[];
}

/** A single job entry; the fields below are the ones mapped into a job input. */
interface MuseJob {
  /** Stringified into the source job id when present. */
  id?: number;
  /** Used as the job title. */
  name?: string;
  /** Supplies the employer name. */
  company?: MuseCompany;
  /** Joined into the location string and scanned for remote markers. */
  locations?: MuseLocation[];
  /** Forwarded as the job type. */
  type?: string;
  /** Forwarded as the posted date. */
  publication_date?: string;
  /** Preferred source for the job description. */
  contents?: string;
  /** Description fallback when `contents` is missing or blank. */
  short_description?: string;
  /** Holds the landing page URL; entries without one are dropped. */
  refs?: MuseRefs;
}

/** Link block of a job entry. */
interface MuseRefs {
  landing_page?: string;
}

/** Company block of a job entry; only `name` is read by this module. */
interface MuseCompany {
  name?: string;
  short_name?: string;
}

/** One location entry of a job. */
interface MuseLocation {
  name?: string;
}
/**
 * Normalizes an untyped value into a trimmed, non-empty string.
 *
 * @param value - Any value, typically a field read from an API payload.
 * @returns The trimmed text, or `undefined` when `value` is not a string or
 *   contains only whitespace.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const text = value.trim();
    if (text.length > 0) return text;
  }
  return undefined;
}
/**
 * Builds a single display string from a job's location entries.
 *
 * @param locations - Location entries of a job; may be missing or empty.
 * @returns The non-blank location names joined with `"; "`, or `undefined`
 *   when there are no entries or none has a usable name.
 */
function joinLocations(
  locations: MuseLocation[] | undefined,
): string | undefined {
  const names: string[] = [];
  for (const location of locations ?? []) {
    // asString drops non-string and whitespace-only names.
    const label = asString(location.name);
    if (label) names.push(label);
  }
  return names.length === 0 ? undefined : names.join("; ");
}
/**
 * Infers whether a job is remote from its location names.
 *
 * @param locations - Location entries of a job; may be missing or empty.
 * @returns `undefined` when there are no location entries (nothing to infer
 *   from); otherwise `true` if any name contains the whole word "flexible" or
 *   "remote" (case-insensitive), else `false`.
 */
function isRemoteFromLocations(
  locations: MuseLocation[] | undefined,
): boolean | undefined {
  if (!locations || locations.length === 0) return undefined;
  // The alternation is grouped so both word boundaries apply to both words.
  // Ungrouped, `\bflexible|remote\b` anchors each word on one side only and
  // matches fragments such as "Flexibleton" or "Premote".
  const remotePattern = /\b(?:flexible|remote)\b/i;
  return locations.some(
    (loc) => typeof loc.name === "string" && remotePattern.test(loc.name),
  );
}
/**
 * Converts a free-text search term into the Title Case form used for The
 * Muse's `category` filter (e.g. "software engineer" -> "Software Engineer").
 *
 * The filter expects an exact, Title-Cased category name, while user-supplied
 * terms are commonly lowercase; a non-matching value makes the API return zero
 * results. Title-casing lets common terms line up with real categories. Terms
 * that still match nothing are handled by the caller, which retries without
 * the filter.
 *
 * @param term - Raw search term; surrounding whitespace is ignored.
 * @returns The term lowercased, with each whitespace-separated word
 *   capitalized and joined by single spaces, or `undefined` for a blank term.
 */
function toMuseCategory(term: string): string | undefined {
  const normalized = term.trim().toLowerCase();
  if (normalized.length === 0) return undefined;

  const words: string[] = [];
  for (const word of normalized.split(/\s+/)) {
    words.push(`${word.charAt(0).toUpperCase()}${word.slice(1)}`);
  }
  return words.join(" ");
}
/**
 * Translates one raw Muse job entry into the pipeline's job input shape.
 *
 * @param raw - Job entry as returned by the API (unvalidated).
 * @returns The mapped job, or `null` when the entry has no usable landing page
 *   URL, since that URL serves as both the job URL and the application link.
 */
function mapJob(raw: MuseJob): CreateJobInput | null {
  const landingPage = asString(raw.refs?.landing_page);
  if (landingPage === undefined) return null;

  const { id, locations } = raw;
  // Prefer the full contents; fall back to the short description.
  const description =
    asString(raw.contents) ?? asString(raw.short_description);

  const job: CreateJobInput = {
    source: "themuse",
    sourceJobId: id == null ? undefined : String(id),
    title: asString(raw.name) ?? "Unknown Title",
    employer: asString(raw.company?.name) ?? "Unknown Employer",
    jobUrl: landingPage,
    applicationLink: landingPage,
    location: joinLocations(locations),
    isRemote: isRemoteFromLocations(locations),
    jobType: asString(raw.type),
    datePosted: asString(raw.publication_date),
    jobDescription: description,
  };
  return job;
}
/**
 * Requests a single page of job listings from The Muse.
 *
 * @param args.page - Page index sent as the `page` query parameter.
 * @param args.category - Optional `category` filter; omitted when empty.
 * @param args.location - Optional `location` filter; omitted when empty.
 * @param args.apiKey - Optional key sent as `api_key`; omitted when empty.
 * @returns The parsed JSON body, cast (not validated) to `MuseResponse`.
 * @throws Error when the HTTP response status is not OK.
 */
async function fetchPage(args: {
  apiKey?: string;
  page: number;
  category?: string;
  location?: string;
}): Promise<MuseResponse> {
  const { apiKey, page, category, location } = args;

  const endpoint = new URL(API_URL);
  const query = endpoint.searchParams;
  query.set("page", String(page));

  // Optional filters are only sent when they carry a non-empty value.
  const optionalParams: Array<[string, string | undefined]> = [
    ["category", category],
    ["location", location],
    ["api_key", apiKey],
  ];
  for (const [key, value] of optionalParams) {
    if (value) query.set(key, value);
  }

  const response = await fetch(endpoint.toString(), {
    headers: { Accept: "application/json" },
  });
  if (response.ok) {
    return (await response.json()) as MuseResponse;
  }
  throw new Error(`The Muse request failed with status ${response.status}`);
}
/**
 * Extractor manifest for The Muse.
 *
 * `run` walks every pipeline search term, pages through the listing endpoint
 * for that term, maps each entry with `mapJob`, and de-duplicates across all
 * terms by source job id (falling back to the job URL).
 *
 * Settings read from `context.settings`:
 * - `themuseApiKey`: optional API key; blank is treated as absent.
 * - `themuseMaxJobsPerTerm`: per-term cap on newly collected jobs, parsed as a
 *   base-10 integer; defaults to 100 when unset or not a number.
 * - `searchCities`: `|`-separated list; only the first entry is used as the
 *   location filter.
 *
 * The run honours `context.shouldCancel` before starting, before each term and
 * before each page request. Progress is reported through `context.onProgress`
 * before and after each term. If a request throws, the jobs gathered so far
 * are returned with `success: false` and the error message.
 */
export const manifest: ExtractorManifest = {
  id: "themuse",
  displayName: "The Muse",
  providesSources: ["themuse"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const defaultMaxJobsPerTerm = 100;
    // Hard ceiling on page requests per term, independent of what the API
    // reports, so a misbehaving `page_count` cannot cause a runaway loop.
    const maxPagesPerTerm = 100;

    const apiKey = context.settings.themuseApiKey?.trim() || undefined;
    const rawMaxJobs = context.settings.themuseMaxJobsPerTerm;
    const parsedMaxJobs = rawMaxJobs
      ? Number.parseInt(rawMaxJobs, 10)
      : defaultMaxJobsPerTerm;
    // A non-numeric setting parses to NaN, and `collected < NaN` is always
    // false, which would silently skip every term while still reporting
    // success. Fall back to the default instead.
    const maxJobsPerTerm = Number.isNaN(parsedMaxJobs)
      ? defaultMaxJobsPerTerm
      : parsedMaxJobs;
    // With no search terms, run once with an empty term (no category filter).
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    const locationHint =
      context.settings.searchCities?.split("|")[0]?.trim() || undefined;

    // Keys of jobs already emitted, shared across terms.
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (let i = 0; i < terms.length; i += 1) {
        if (context.shouldCancel?.()) break;
        const term = terms[i].trim();
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          detail: `The Muse: term ${i + 1}/${terms.length}`,
        });

        let collected = 0;
        let page = 0;
        let pageCount = Number.POSITIVE_INFINITY;
        // We try the term as a category first and, if the very first page is
        // empty, drop the category filter once so an unknown category doesn't
        // silently nuke the entire term.
        let categoryToUse: string | undefined = toMuseCategory(term);
        let droppedCategory = false;

        while (
          collected < maxJobsPerTerm &&
          page < pageCount &&
          page < maxPagesPerTerm
        ) {
          if (context.shouldCancel?.()) break;
          const body = await fetchPage({
            apiKey,
            page,
            category: categoryToUse,
            location: locationHint,
          });
          if (typeof body.page_count === "number") {
            pageCount = body.page_count;
          }
          const results = Array.isArray(body.results) ? body.results : [];
          if (results.length === 0) {
            if (page === 0 && categoryToUse && !droppedCategory) {
              // Retry page 0 unfiltered; the page count reported for the
              // filtered query no longer applies.
              categoryToUse = undefined;
              droppedCategory = true;
              pageCount = Number.POSITIVE_INFINITY;
              continue;
            }
            break;
          }
          for (const item of results) {
            const mapped = mapJob(item);
            if (!mapped) continue;
            const key = mapped.sourceJobId || mapped.jobUrl;
            if (seen.has(key)) continue;
            seen.add(key);
            out.push(mapped);
            collected += 1;
            if (collected >= maxJobsPerTerm) break;
          }
          page += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: terms.length,
          currentUrl: term || "(all)",
          jobPagesProcessed: out.length,
          detail: `The Muse: completed term ${i + 1}/${terms.length} (${collected} found)`,
        });
      }
    } catch (error) {
      // Surface the failure but keep whatever was collected before it.
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }

    return { success: true, jobs: out };
  },
};
export default manifest;