Jobber/scripts/smoke-extractors.ts
ilia c840f289e1
Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m8s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m11s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m8s
CI / Type Check (orchestrator) (push) Successful in 1m23s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m6s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m7s
CI / Documentation (push) Successful in 1m54s
feat(extractors): expand catalog, smoke coverage, and sourcing docs
Adds Arc.dev, BC T-Net, Eluta, iCIMS tenants, QAJobsBoard, and SmartRecruiters
manifests with registry/settings/UI wiring; registers full extractor list in
smoke-extractors and documents supplementary board access paths. Aligns Careerjet
v4 with the url query parameter and fixes strict typing in QAJobsBoard.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-15 22:36:23 -04:00

359 lines
9.7 KiB
TypeScript

/**
* Smoke-test helper for extractor manifests: imports each manifest, runs it with a
* minimal context, and prints mapped job counts + a sample row.
*
* Run from repo root:
* npx tsx scripts/smoke-extractors.ts
* npx tsx scripts/smoke-extractors.ts arcdev,icims
* npx tsx scripts/smoke-extractors.ts indeed # alias → `jobspy` (same manifest)
*
* Keep `ALL_TARGETS` aligned with every shipped manifest under each
* `extractors/<name>/` package (`manifest.ts` or `src/manifest.ts`).
*
* Loads repo-root `.env` so keyed extractors match orchestrator behavior (plain
* `tsx` does not read `.env` automatically).
*/
import path from "node:path";
import { fileURLToPath } from "node:url";
import { config as loadEnv } from "dotenv";
import type {
ExtractorManifest,
ExtractorRuntimeContext,
} from "job-ops-shared/types/extractors";
// The repository root is the parent of the directory that contains this script.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.resolve(scriptDir, "..");

// Load the repo-root `.env` at module load, before any target's required env vars are checked.
loadEnv({ path: path.join(repoRoot, ".env") });

/** Width the id column is padded to in each log line (the longest pipeline source id today). */
const ID_COL = 15;
/** Site ids served by the JobSpy manifest; the CLI filter accepts each of them as an alias for `jobspy`. */
const JOBSPY_SITE_IDS = ["indeed", "linkedin", "glassdoor"] as const;

/**
 * Expands a CLI filter so that JobSpy site aliases select the `jobspy` target.
 *
 * @param ids Ids requested on the command line. The set is not mutated.
 * @returns A new set holding every requested id, plus `"jobspy"` when at least
 *   one of the JobSpy site aliases was requested.
 */
function expandSmokeFilter(ids: Set<string>): Set<string> {
  const expanded = new Set(ids);
  const mentionsJobspySite = JOBSPY_SITE_IDS.some((alias) =>
    expanded.has(alias),
  );
  if (mentionsJobspySite) {
    expanded.add("jobspy");
  }
  return expanded;
}
/** One smoke-test entry: which manifest module to load and the inputs to run it with. */
interface Target {
  /** Extractor id: matched against the CLI filter, passed as the run's `source`, and printed in the log. */
  id: string;
  /** Module specifier handed to dynamic `import()` to load the manifest. */
  importPath: string;
  /** Env vars required to run; the target is skipped if any of them is missing. */
  needs?: string[];
  /** When set, replaces the default smoke search terms (use [] for sources that filter client-side). */
  searchTerms?: string[];
  /** Geography passed as `selectedCountry` (must match what each extractor expects). */
  selectedCountry?: string;
  /** String-valued settings passed through to the extractor's runtime context. */
  settings?: Record<string, string>;
}
/**
 * Every extractor the smoke run can exercise, listed alphabetically by `id`.
 *
 * Each entry names the manifest module to import plus the env vars, settings,
 * search terms, and country used for that run. Entries that omit
 * `selectedCountry` run with "United States", and entries that omit
 * `searchTerms` run with the default smoke term (both defaults are applied in
 * `buildContext`). Setting values are strings; list-valued settings are
 * JSON-encoded with `JSON.stringify`.
 */
const ALL_TARGETS: Target[] = [
{
id: "adzuna",
importPath: "../extractors/adzuna/manifest",
// Keyed source: skipped with a SKIP line when either env var is unset.
needs: ["ADZUNA_APP_ID", "ADZUNA_APP_KEY"],
selectedCountry: "United States",
settings: {
adzunaMaxJobsPerTerm: "10",
searchCities: "United States",
},
},
{
id: "arbeitnow",
importPath: "../extractors/arbeitnow/manifest",
settings: { arbeitnowMaxJobsPerTerm: "10" },
},
{
id: "arcdev",
importPath: "../extractors/arcdev/manifest",
settings: {
arcRemoteJobsPaths: JSON.stringify(["/remote-jobs/playwright"]),
arcMaxJobsPerPath: "20",
},
},
{
id: "ashby",
importPath: "../extractors/ashby/manifest",
settings: {
ashbyCompanies: JSON.stringify(["ramp", "linear"]),
},
},
{
id: "bctenet",
importPath: "../extractors/bctenet/manifest",
selectedCountry: "Canada",
settings: {
bctenetMaxJobsPerTerm: "25",
},
},
{
id: "careerjet",
importPath: "../extractors/careerjet/manifest",
// Keyed source: skipped unless all three env vars are set.
needs: ["CAREERJET_AFFID", "CAREERJET_REFERER", "CAREERJET_USER_IP"],
settings: { careerjetMaxJobsPerTerm: "10", searchCities: "United States" },
},
{
id: "eluta",
importPath: "../extractors/eluta/manifest",
selectedCountry: "Canada",
settings: {
elutaRssLocations: JSON.stringify(["Toronto, ON"]),
elutaMaxJobsPerTerm: "15",
},
},
{
id: "fourdayweek",
importPath: "../extractors/fourdayweek/manifest",
settings: { fourdayweekMaxJobsPerTerm: "10" },
},
{
id: "gradcracker",
importPath: "../extractors/gradcracker/manifest",
selectedCountry: "United Kingdom",
settings: { gradcrackerMaxJobsPerTerm: "10" },
},
{
id: "greenhouse",
importPath: "../extractors/greenhouse/manifest",
settings: {
greenhouseCompanies: JSON.stringify(["stripe", "airbnb"]),
},
},
{
id: "himalayas",
importPath: "../extractors/himalayas/manifest",
settings: { himalayasMaxJobsPerTerm: "10" },
},
{
id: "hiringcafe",
importPath: "../extractors/hiringcafe/manifest",
selectedCountry: "United Kingdom",
settings: {
searchCities: "UK",
// NOTE(review): this entry uses the `jobspyResultsWanted` key rather than a
// hiringcafe-prefixed one — confirm it is the setting the hiringcafe manifest reads.
jobspyResultsWanted: "10",
workplaceTypes: JSON.stringify(["remote", "hybrid", "onsite"]),
},
},
{
id: "icims",
importPath: "../extractors/icims/manifest",
// Explicit empty list: overrides the default smoke search term for this source.
searchTerms: [],
settings: {
icimsTenants: JSON.stringify(["careers-appliedsystems.icims.com"]),
icimsMaxJobsPerTenant: "15",
icimsMaxPagesPerSearch: "2",
},
},
{
id: "jobicy",
importPath: "../extractors/jobicy/manifest",
settings: { jobicyMaxJobsPerTerm: "10" },
},
{
// Also selected when the CLI filter names indeed, linkedin, or glassdoor.
id: "jobspy",
importPath: "../extractors/jobspy/manifest",
selectedCountry: "United Kingdom",
settings: {
searchCities: "UK",
jobspyCountryIndeed: "UK",
jobspyResultsWanted: "5",
workplaceTypes: JSON.stringify(["remote", "hybrid", "onsite"]),
},
},
{
id: "jooble",
importPath: "../extractors/jooble/manifest",
// Keyed source: skipped when the API key is unset.
needs: ["JOOBLE_API_KEY"],
settings: { joobleMaxJobsPerTerm: "10", searchCities: "United States" },
},
{
id: "lever",
importPath: "../extractors/lever/manifest",
settings: {
leverCompanies: JSON.stringify(["palantir", "netflix"]),
},
},
{
id: "qajobsboard",
importPath: "../extractors/qajobsboard/manifest",
settings: { qajobsboardMaxJobsPerTerm: "25" },
},
{
id: "reed",
importPath: "../extractors/reed/manifest",
// Keyed source: skipped when the API key is unset.
needs: ["REED_API_KEY"],
selectedCountry: "United Kingdom",
settings: { reedMaxJobsPerTerm: "10" },
},
{
id: "remoteok",
importPath: "../extractors/remoteok/manifest",
settings: { remoteokMaxJobsPerTerm: "10" },
},
{
id: "remotive",
importPath: "../extractors/remotive/manifest",
settings: { remotiveMaxJobsPerTerm: "10" },
},
{
id: "smartrecruiters",
importPath: "../extractors/smartrecruiters/manifest",
settings: {
smartrecruitersCompanies: JSON.stringify(["smartrecruiters"]),
smartrecruitersMaxJobsPerCompany: "5",
},
},
{
id: "startupjobs",
// Unlike the other entries, this manifest is imported from the package's `src/` directory.
importPath: "../extractors/startupjobs/src/manifest",
selectedCountry: "United Kingdom",
settings: {
searchCities: "UK",
startupjobsMaxJobsPerTerm: "10",
workplaceTypes: JSON.stringify(["remote", "hybrid", "onsite"]),
},
},
{
id: "themuse",
importPath: "../extractors/themuse/manifest",
settings: { themuseMaxJobsPerTerm: "10" },
},
{
id: "ukvisajobs",
importPath: "../extractors/ukvisajobs/manifest",
// Keyed source: skipped unless both login env vars are set.
needs: ["UKVISAJOBS_EMAIL", "UKVISAJOBS_PASSWORD"],
selectedCountry: "United Kingdom",
settings: { ukvisajobsMaxJobs: "10" },
},
{
id: "usajobs",
importPath: "../extractors/usajobs/manifest",
// Keyed source: skipped unless both env vars are set.
needs: ["USAJOBS_API_KEY", "USAJOBS_USER_AGENT"],
settings: { usajobsMaxJobsPerTerm: "10" },
},
{
id: "weworkremotely",
importPath: "../extractors/weworkremotely/manifest",
settings: { weworkremotelyMaxJobsPerTerm: "10" },
},
{
id: "workday",
importPath: "../extractors/workday/manifest",
settings: {
workdayTenants: JSON.stringify([
"https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite",
]),
},
},
];
/**
 * Assembles the minimal runtime context handed to a manifest's `run`.
 *
 * @param args.source Extractor id; also becomes the only entry of `selectedSources`.
 * @param args.settings Settings passed through unchanged.
 * @param args.searchTerms Search terms; when omitted, `["software engineer"]` is used.
 *   An explicit empty array is kept as-is.
 * @param args.selectedCountry Country; defaults to "United States" when absent.
 * @returns A context whose callbacks are inert stubs: no existing job URLs,
 *   never cancelled, and progress updates discarded.
 */
function buildContext(args: {
  source: string;
  settings: Record<string, string>;
  searchTerms?: string[];
  selectedCountry?: string;
}): ExtractorRuntimeContext {
  // The destructuring default applies only to `undefined`, so `[]` survives.
  const { source, settings, searchTerms = ["software engineer"] } = args;
  const selectedCountry = args.selectedCountry ?? "United States";

  return {
    source,
    selectedSources: [source],
    settings,
    searchTerms,
    selectedCountry,
    getExistingJobUrls: async () => [],
    shouldCancel: () => false,
    onProgress: () => {},
  };
}
/**
 * Right-pads `s` with spaces to a width of `n` characters.
 * Strings that are already `n` characters or longer are returned unchanged.
 */
function pad(s: string, n: number): string {
  return s.padEnd(n);
}
/**
 * Turns any thrown value into a printable message.
 *
 * `throw` and promise rejections are not limited to `Error` objects, so the
 * value is narrowed instead of being asserted to be an `Error`.
 */
function describeError(err: unknown): string {
  if (err instanceof Error) {
    return err.message;
  }
  // Error-like objects that fail `instanceof` still carry a usable message.
  if (typeof err === "object" && err !== null && "message" in err) {
    return String((err as { message: unknown }).message);
  }
  return String(err);
}

/**
 * Runs a single smoke target and prints exactly one result line for it.
 *
 * Outcomes, in the order they are checked:
 * - `SKIP`  — one or more env vars listed in `target.needs` are unset.
 * - `FAIL`  — the manifest module could not be imported, or it exports
 *             neither `manifest` nor `default`.
 * - `OK` / `ERR` — the manifest ran; shows the job count, elapsed time, the
 *             reported error (if any), and the first job as a sample.
 * - `CRASH` — building the context or running the manifest threw.
 *
 * Never rejects for extractor failures: they are reported on stdout only.
 *
 * @param target The smoke target to exercise.
 */
async function runOne(target: Target): Promise<void> {
  const label = pad(target.id, ID_COL);

  const missing = (target.needs ?? []).filter((k) => !process.env[k]);
  if (missing.length > 0) {
    console.log(`${label} SKIP missing env: ${missing.join(", ")}`);
    return;
  }

  let mod: { manifest?: ExtractorManifest; default?: ExtractorManifest };
  try {
    mod = await import(target.importPath);
  } catch (err) {
    console.log(`${label} FAIL import error: ${describeError(err)}`);
    return;
  }

  // Accept either a named `manifest` export or a default export.
  const manifest = mod.manifest ?? mod.default;
  if (!manifest) {
    console.log(`${label} FAIL manifest export missing`);
    return;
  }

  const started = Date.now();
  try {
    const ctx = buildContext({
      source: target.id,
      settings: target.settings ?? {},
      searchTerms: target.searchTerms,
      selectedCountry: target.selectedCountry,
    });
    const result = await manifest.run(ctx);
    const ms = Date.now() - started;
    const status = result.success ? "OK " : "ERR ";
    const sample = result.jobs[0];
    const sampleStr = sample
      ? ` | first: "${sample.title}" @ ${sample.employer}`
      : "";
    const errorStr = result.error ? ` | error: ${result.error}` : "";
    console.log(
      `${label} ${status} jobs=${result.jobs.length} ${ms}ms${errorStr}${sampleStr}`,
    );
  } catch (err) {
    const ms = Date.now() - started;
    console.log(`${label} CRASH ${ms}ms ${describeError(err)}`);
  }
}
/**
 * CLI entry point: selects the targets to run and smoke-tests them one after
 * another, in `ALL_TARGETS` order.
 *
 * Ids may be passed comma-separated, as separate arguments, or both
 * (`arcdev,icims` and `arcdev icims` are equivalent). With no ids, every
 * target runs. JobSpy site aliases are expanded by `expandSmokeFilter`.
 * Ids that match neither a target nor an alias are reported on stderr, so a
 * typo does not look like a clean run of zero extractors.
 */
async function main(): Promise<void> {
  // Joining every argument keeps the single-argument form working unchanged.
  const requested = process.argv.slice(2).join(",").trim();
  const requestedIds = requested
    .split(",")
    .map((s) => s.trim())
    .filter(Boolean);
  const filter = requested ? expandSmokeFilter(new Set(requestedIds)) : null;

  if (filter) {
    const knownIds = new Set<string>([
      ...ALL_TARGETS.map((t) => t.id),
      ...JOBSPY_SITE_IDS,
    ]);
    const unknownIds = Array.from(new Set(requestedIds)).filter(
      (id) => !knownIds.has(id),
    );
    if (unknownIds.length > 0) {
      console.warn(
        `WARN unknown extractor id(s) ignored: ${unknownIds.join(", ")}`,
      );
    }
  }

  const targets = filter
    ? ALL_TARGETS.filter((t) => filter.has(t.id))
    : ALL_TARGETS;
  console.log(`Smoke testing ${targets.length} extractor(s)...\n`);
  // Sequential on purpose: each target prints one line and the output stays ordered.
  for (const t of targets) {
    await runOne(t);
  }
}

// Anything that escapes `main` is unexpected: print it and exit non-zero.
main().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});