Jobber/scripts/smoke-extractors.ts
ilia 7b3dfb002a
Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
feat(extractors): add 17 job source extractors and cross-source dedup
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.

Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
  new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources

Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
  templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 20:17:52 -04:00

220 lines
5.7 KiB
TypeScript

/**
* Tiny smoke-test for new extractors: imports each manifest, runs it with a
* minimal context, and prints the count of mapped jobs + the first job as a sample.
*
* Run from repo root: npx tsx scripts/smoke-extractors.ts [comma,separated,ids]
*
* Loads repo-root `.env` so keyed extractors match orchestrator behavior (plain
* `tsx` does not read `.env` automatically).
*/
import path from "node:path";
import { fileURLToPath } from "node:url";
import { config as loadEnv } from "dotenv";
import type {
ExtractorManifest,
ExtractorRuntimeContext,
} from "../shared/src/types/extractors";
// Repository root = parent of the directory that contains this script.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.resolve(scriptDir, "..");

// Populate process.env from the repo-root `.env`; extractor manifests are only
// imported later (dynamically), so they see these values.
loadEnv({ path: path.join(repoRoot, ".env") });
/**
* One extractor to smoke-test.
*/
interface Target {
/** Extractor id: used as the row label, the CLI filter key and the context `source`. */
id: string;
/** Module specifier handed to dynamic `import()` to load the extractor manifest. */
importPath: string;
/** Env vars required to run; the target is skipped (not failed) if any is missing. */
needs?: string[]; // env vars required to run; skipped if missing
/** Settings passed to the extractor via the runtime context; defaults to `{}` when omitted. */
settings?: Record<string, string>;
}
/** Module specifier of an extractor's manifest, relative to this script. */
function manifestPath(id: string): string {
  return `../extractors/${id}/manifest`;
}

/**
 * Target for a search-style extractor whose result count is limited through
 * its `<id>MaxJobsPerTerm` setting (always "10" here).
 *
 * @param id extractor id
 * @param needs env vars that must be set for the target to run, if any
 */
function cappedTarget(id: string, needs?: string[]): Target {
  const target: Target = { id, importPath: manifestPath(id) };
  if (needs) {
    target.needs = needs;
  }
  target.settings = { [`${id}MaxJobsPerTerm`]: "10" };
  return target;
}

/**
 * Target for an extractor configured with a JSON-encoded list of strings
 * (company slugs or tenant URLs) stored under `settingKey`.
 */
function listTarget(id: string, settingKey: string, values: string[]): Target {
  return {
    id,
    importPath: manifestPath(id),
    settings: { [settingKey]: JSON.stringify(values) },
  };
}

/** Every extractor the smoke test knows about, in the order they are run. */
const ALL_TARGETS: Target[] = [
  cappedTarget("jobicy"),
  cappedTarget("themuse"),
  cappedTarget("usajobs", ["USAJOBS_API_KEY", "USAJOBS_USER_AGENT"]),
  cappedTarget("jooble", ["JOOBLE_API_KEY"]),
  cappedTarget("careerjet", [
    "CAREERJET_AFFID",
    "CAREERJET_REFERER",
    "CAREERJET_USER_IP",
  ]),
  cappedTarget("reed", ["REED_API_KEY"]),
  // Known active public Lever board used purely as a connectivity check.
  listTarget("lever", "leverCompanies", ["palantir", "netflix"]),
  listTarget("ashby", "ashbyCompanies", ["ramp", "linear"]),
  listTarget("greenhouse", "greenhouseCompanies", ["stripe", "airbnb"]),
  listTarget("workday", "workdayTenants", [
    "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite",
  ]),
  cappedTarget("remoteok"),
  cappedTarget("remotive"),
  cappedTarget("arbeitnow"),
  cappedTarget("himalayas"),
  cappedTarget("weworkremotely"),
  cappedTarget("fourdayweek"),
];
/**
 * Builds the minimal runtime context handed to an extractor during a smoke run.
 *
 * The context selects only `source`, searches for the single term
 * "software engineer" in "United States", reports no already-known job URLs,
 * never requests cancellation and discards progress updates.
 *
 * @param source extractor id being exercised
 * @param settings settings passed through to the extractor unchanged
 * @returns the context object for `manifest.run`
 */
function buildContext(
  source: string,
  settings: Record<string, string>,
): ExtractorRuntimeContext {
  const context: ExtractorRuntimeContext = {
    source,
    selectedSources: [source],
    settings,
    searchTerms: ["software engineer"],
    selectedCountry: "United States",
    // Nothing is known up front, so no job is filtered out as a duplicate.
    async getExistingJobUrls() {
      return [];
    },
    shouldCancel() {
      return false;
    },
    onProgress() {},
  };
  return context;
}
/**
 * Right-pads `s` with spaces to a width of `n` characters.
 * Strings already `n` characters or longer are returned unchanged (never truncated).
 */
function pad(s: string, n: number): string {
  return s.padEnd(n, " ");
}
/**
 * Best-effort text for anything that can be thrown.
 * Non-`Error` values (strings, plain objects, ...) are stringified instead of
 * being read through a `.message` property they do not have.
 */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Smoke-tests one extractor and prints exactly one status line for it:
 *
 * - `SKIP`  a required env var from `target.needs` is unset or empty
 * - `FAIL`  the manifest module could not be imported or exports no manifest
 * - `OK` / `ERR`  the extractor ran; reflects `result.success`, with job count,
 *   duration, optional error text and the first job as a sample
 * - `CRASH` running the extractor (or reading its result) threw
 *
 * Never rejects because of an extractor failure; all outcomes are reported
 * through the printed line.
 *
 * @param target extractor to exercise
 */
async function runOne(target: Target): Promise<void> {
  const label = pad(target.id, 12);

  const missing = (target.needs ?? []).filter((k) => !process.env[k]);
  if (missing.length > 0) {
    console.log(`${label} SKIP missing env: ${missing.join(", ")}`);
    return;
  }

  // Manifests may be exposed either as a named `manifest` export or as default.
  let mod: { manifest?: ExtractorManifest; default?: ExtractorManifest };
  try {
    mod = await import(target.importPath);
  } catch (err) {
    console.log(`${label} FAIL import error: ${describeError(err)}`);
    return;
  }

  const manifest = mod.manifest ?? mod.default;
  if (!manifest) {
    console.log(`${label} FAIL manifest export missing`);
    return;
  }

  const started = Date.now();
  try {
    const ctx = buildContext(target.id, target.settings ?? {});
    const result = await manifest.run(ctx);
    const ms = Date.now() - started;
    const status = result.success ? "OK " : "ERR ";
    const errorStr = result.error ? ` | error: ${result.error}` : "";
    const sample = result.jobs[0];
    const sampleStr = sample
      ? ` | first: "${sample.title}" @ ${sample.employer}`
      : "";
    console.log(
      `${label} ${status} jobs=${result.jobs.length} ${ms}ms${errorStr}${sampleStr}`,
    );
  } catch (err) {
    const ms = Date.now() - started;
    console.log(`${label} CRASH ${ms}ms ${describeError(err)}`);
  }
}
/**
 * Runs the smoke test for every known extractor, or only for those whose ids
 * appear in the optional comma-separated first CLI argument.
 *
 * Requested ids that match no known extractor are reported on stderr instead
 * of being dropped silently, so a typo does not look like an empty run.
 * Extractors are run one after another, in `ALL_TARGETS` order.
 */
async function main(): Promise<void> {
  const requested = (process.argv[2] ?? "").trim();
  const filter = requested
    ? new Set(
        requested
          .split(",")
          .map((s) => s.trim())
          .filter(Boolean),
      )
    : null;

  if (filter) {
    const knownIds = new Set(ALL_TARGETS.map((t) => t.id));
    const unknownIds = [...filter].filter((id) => !knownIds.has(id));
    if (unknownIds.length > 0) {
      console.warn(`Unknown extractor id(s) ignored: ${unknownIds.join(", ")}`);
    }
  }

  const targets = filter
    ? ALL_TARGETS.filter((t) => filter.has(t.id))
    : ALL_TARGETS;
  console.log(`Smoke testing ${targets.length} extractor(s)...\n`);
  // Sequential on purpose: one output line per extractor, in a stable order.
  for (const t of targets) {
    await runOne(t);
  }
}
/** Logs a failure that escaped `main()` and terminates with exit status 1. */
function exitOnFatal(reason: unknown): never {
  console.error(reason);
  process.exit(1);
}

main().catch(exitOnFatal);