Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m8s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m11s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m8s
CI / Type Check (orchestrator) (push) Successful in 1m23s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m6s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m7s
CI / Documentation (push) Successful in 1m54s
Adds Arc.dev, BC T-Net, Eluta, iCIMS tenants, QAJobsBoard, and SmartRecruiters manifests with registry/settings/UI wiring; registers full extractor list in smoke-extractors and documents supplementary board access paths. Aligns Careerjet v4 with the url query parameter and fixes strict typing in QAJobsBoard. Co-authored-by: Cursor <cursoragent@cursor.com>
359 lines
9.7 KiB
TypeScript
359 lines
9.7 KiB
TypeScript
/**
 * Smoke-test helper for extractor manifests: imports each manifest, runs it with a
 * minimal context, and prints mapped job counts + a sample row.
 *
 * Run from repo root:
 *   npx tsx scripts/smoke-extractors.ts
 *   npx tsx scripts/smoke-extractors.ts arcdev,icims
 *   npx tsx scripts/smoke-extractors.ts indeed   # alias → `jobspy` (same manifest)
 *
 * Keep `ALL_TARGETS` aligned with every shipped manifest under each
 * `extractors/<name>/` package (`manifest.ts` or `src/manifest.ts`).
 *
 * Loads repo-root `.env` so keyed extractors match orchestrator behavior (plain
 * `tsx` does not read `.env` automatically).
 */
|
|
|
|
import path from "node:path";
|
|
import { fileURLToPath } from "node:url";
|
|
import { config as loadEnv } from "dotenv";
|
|
import type {
|
|
ExtractorManifest,
|
|
ExtractorRuntimeContext,
|
|
} from "job-ops-shared/types/extractors";
|
|
|
|
// Directory that contains this script file.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));

/** Absolute path of the repository root, i.e. the parent of this script's directory. */
const repoRoot = path.resolve(scriptDir, "..");

// Populate process.env from the repo-root `.env` before any target is run.
loadEnv({ path: path.join(repoRoot, ".env") });

/**
 * Width of the left-hand id column in the log output. Ids shorter than this are
 * padded so the status columns line up (sized to the longest source id today).
 */
const ID_COL = 15;

/**
 * Site ids that are all served by the JobSpy manifest. The CLI filter accepts any
 * of them as an alias for the `jobspy` target.
 */
const JOBSPY_SITE_IDS = ["indeed", "linkedin", "glassdoor"] as const;
|
|
|
|
/**
 * Expands a CLI id filter so that JobSpy site aliases select the `jobspy` target.
 *
 * @param ids - Ids requested on the command line.
 * @returns A new set holding every id from `ids`, plus `"jobspy"` when at least one
 *   entry of `JOBSPY_SITE_IDS` is present. The input set is not modified and the
 *   alias ids themselves are kept.
 */
function expandSmokeFilter(ids: Set<string>): Set<string> {
  const expanded = new Set(ids);
  const usesJobspyAlias = JOBSPY_SITE_IDS.some((alias) => expanded.has(alias));
  if (usesJobspyAlias) {
    expanded.add("jobspy");
  }
  return expanded;
}
|
|
|
|
/** One smoke-test entry: which manifest module to load and the context to run it with. */
interface Target {
  /** Source id; also the key matched by the CLI filter and the label printed in the log. */
  id: string;

  /** Module specifier handed to dynamic `import()` to load the manifest. */
  importPath: string;

  /** Env vars required to run; the target is skipped if any is unset or empty. */
  needs?: string[];

  /** Settings handed to the extractor; an empty object is used when omitted. */
  settings?: Record<string, string>;

  /** When set, replaces the default smoke search terms (use [] for sources that filter client-side). */
  searchTerms?: string[];

  /** Geography passed as `selectedCountry` (must match what each extractor expects). */
  selectedCountry?: string;
}
|
|
|
|
// JSON-encoded workplace-type list shared by the targets that take a `workplaceTypes` setting.
const ALL_WORKPLACE_TYPES = JSON.stringify(["remote", "hybrid", "onsite"]);

/**
 * Every extractor the smoke run knows about, listed alphabetically by id.
 * Each entry carries small per-source limits plus whatever env vars, search
 * terms, or country the source needs for a run.
 */
const ALL_TARGETS: Target[] = [
  {
    id: "adzuna",
    importPath: "../extractors/adzuna/manifest",
    needs: ["ADZUNA_APP_ID", "ADZUNA_APP_KEY"],
    selectedCountry: "United States",
    settings: { adzunaMaxJobsPerTerm: "10", searchCities: "United States" },
  },
  {
    id: "arbeitnow",
    importPath: "../extractors/arbeitnow/manifest",
    settings: { arbeitnowMaxJobsPerTerm: "10" },
  },
  {
    id: "arcdev",
    importPath: "../extractors/arcdev/manifest",
    settings: {
      arcRemoteJobsPaths: JSON.stringify(["/remote-jobs/playwright"]),
      arcMaxJobsPerPath: "20",
    },
  },
  {
    id: "ashby",
    importPath: "../extractors/ashby/manifest",
    settings: { ashbyCompanies: JSON.stringify(["ramp", "linear"]) },
  },
  {
    id: "bctenet",
    importPath: "../extractors/bctenet/manifest",
    selectedCountry: "Canada",
    settings: { bctenetMaxJobsPerTerm: "25" },
  },
  {
    id: "careerjet",
    importPath: "../extractors/careerjet/manifest",
    needs: ["CAREERJET_AFFID", "CAREERJET_REFERER", "CAREERJET_USER_IP"],
    settings: { careerjetMaxJobsPerTerm: "10", searchCities: "United States" },
  },
  {
    id: "eluta",
    importPath: "../extractors/eluta/manifest",
    selectedCountry: "Canada",
    settings: {
      elutaRssLocations: JSON.stringify(["Toronto, ON"]),
      elutaMaxJobsPerTerm: "15",
    },
  },
  {
    id: "fourdayweek",
    importPath: "../extractors/fourdayweek/manifest",
    settings: { fourdayweekMaxJobsPerTerm: "10" },
  },
  {
    id: "gradcracker",
    importPath: "../extractors/gradcracker/manifest",
    selectedCountry: "United Kingdom",
    settings: { gradcrackerMaxJobsPerTerm: "10" },
  },
  {
    id: "greenhouse",
    importPath: "../extractors/greenhouse/manifest",
    settings: { greenhouseCompanies: JSON.stringify(["stripe", "airbnb"]) },
  },
  {
    id: "himalayas",
    importPath: "../extractors/himalayas/manifest",
    settings: { himalayasMaxJobsPerTerm: "10" },
  },
  {
    id: "hiringcafe",
    importPath: "../extractors/hiringcafe/manifest",
    selectedCountry: "United Kingdom",
    settings: {
      searchCities: "UK",
      jobspyResultsWanted: "10",
      workplaceTypes: ALL_WORKPLACE_TYPES,
    },
  },
  {
    id: "icims",
    importPath: "../extractors/icims/manifest",
    searchTerms: [],
    settings: {
      icimsTenants: JSON.stringify(["careers-appliedsystems.icims.com"]),
      icimsMaxJobsPerTenant: "15",
      icimsMaxPagesPerSearch: "2",
    },
  },
  {
    id: "jobicy",
    importPath: "../extractors/jobicy/manifest",
    settings: { jobicyMaxJobsPerTerm: "10" },
  },
  {
    id: "jobspy",
    importPath: "../extractors/jobspy/manifest",
    selectedCountry: "United Kingdom",
    settings: {
      searchCities: "UK",
      jobspyCountryIndeed: "UK",
      jobspyResultsWanted: "5",
      workplaceTypes: ALL_WORKPLACE_TYPES,
    },
  },
  {
    id: "jooble",
    importPath: "../extractors/jooble/manifest",
    needs: ["JOOBLE_API_KEY"],
    settings: { joobleMaxJobsPerTerm: "10", searchCities: "United States" },
  },
  {
    id: "lever",
    importPath: "../extractors/lever/manifest",
    settings: { leverCompanies: JSON.stringify(["palantir", "netflix"]) },
  },
  {
    id: "qajobsboard",
    importPath: "../extractors/qajobsboard/manifest",
    settings: { qajobsboardMaxJobsPerTerm: "25" },
  },
  {
    id: "reed",
    importPath: "../extractors/reed/manifest",
    needs: ["REED_API_KEY"],
    selectedCountry: "United Kingdom",
    settings: { reedMaxJobsPerTerm: "10" },
  },
  {
    id: "remoteok",
    importPath: "../extractors/remoteok/manifest",
    settings: { remoteokMaxJobsPerTerm: "10" },
  },
  {
    id: "remotive",
    importPath: "../extractors/remotive/manifest",
    settings: { remotiveMaxJobsPerTerm: "10" },
  },
  {
    id: "smartrecruiters",
    importPath: "../extractors/smartrecruiters/manifest",
    settings: {
      smartrecruitersCompanies: JSON.stringify(["smartrecruiters"]),
      smartrecruitersMaxJobsPerCompany: "5",
    },
  },
  {
    id: "startupjobs",
    importPath: "../extractors/startupjobs/src/manifest",
    selectedCountry: "United Kingdom",
    settings: {
      searchCities: "UK",
      startupjobsMaxJobsPerTerm: "10",
      workplaceTypes: ALL_WORKPLACE_TYPES,
    },
  },
  {
    id: "themuse",
    importPath: "../extractors/themuse/manifest",
    settings: { themuseMaxJobsPerTerm: "10" },
  },
  {
    id: "ukvisajobs",
    importPath: "../extractors/ukvisajobs/manifest",
    needs: ["UKVISAJOBS_EMAIL", "UKVISAJOBS_PASSWORD"],
    selectedCountry: "United Kingdom",
    settings: { ukvisajobsMaxJobs: "10" },
  },
  {
    id: "usajobs",
    importPath: "../extractors/usajobs/manifest",
    needs: ["USAJOBS_API_KEY", "USAJOBS_USER_AGENT"],
    settings: { usajobsMaxJobsPerTerm: "10" },
  },
  {
    id: "weworkremotely",
    importPath: "../extractors/weworkremotely/manifest",
    settings: { weworkremotelyMaxJobsPerTerm: "10" },
  },
  {
    id: "workday",
    importPath: "../extractors/workday/manifest",
    settings: {
      workdayTenants: JSON.stringify([
        "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite",
      ]),
    },
  },
];
|
|
|
|
/**
 * Builds the minimal runtime context handed to a manifest's `run` during a smoke test.
 *
 * @param args.source - Source id; also becomes the only entry of `selectedSources`.
 * @param args.settings - Settings object, passed through by reference.
 * @param args.searchTerms - Search terms; defaults to `["software engineer"]` when
 *   omitted. An explicit empty array is kept as-is.
 * @param args.selectedCountry - Country; defaults to `"United States"` when omitted.
 * @returns A context whose callbacks are inert stubs: no known job URLs, never
 *   cancelled, progress updates ignored.
 */
function buildContext(args: {
  source: string;
  settings: Record<string, string>;
  searchTerms?: string[];
  selectedCountry?: string;
}): ExtractorRuntimeContext {
  return {
    source: args.source,
    selectedSources: [args.source],
    settings: args.settings,
    // `??` rather than `||` or a truthiness test: `[]` must survive so a target can
    // deliberately run without search terms.
    searchTerms: args.searchTerms ?? ["software engineer"],
    selectedCountry: args.selectedCountry ?? "United States",
    getExistingJobUrls: async () => [],
    shouldCancel: () => false,
    onProgress: () => {},
  };
}
|
|
|
|
/**
 * Right-pads `s` with spaces so the result is at least `n` characters long.
 * Strings that are already `n` characters or longer are returned unchanged
 * (never truncated).
 *
 * @param s - Text to pad.
 * @param n - Minimum width of the result.
 * @returns `s` followed by as many spaces as needed to reach width `n`.
 */
function pad(s: string, n: number): string {
  return s.padEnd(n, " ");
}
|
|
|
|
/** Extracts a printable message from anything thrown, not only `Error` instances. */
function describeThrown(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Smoke-runs one target and prints a single status line for it. Failures are
 * reported on stdout rather than rethrown.
 *
 * Status tokens:
 * - `SKIP`  a required env var is unset or empty; nothing is imported or run
 * - `FAIL`  the module could not be imported, or exports no manifest
 * - `OK` / `ERR`  the run returned; chosen from `result.success`
 * - `CRASH` the run threw
 *
 * @param target - The extractor to load and run.
 */
async function runOne(target: Target): Promise<void> {
  const label = pad(target.id, ID_COL);

  const missing = (target.needs ?? []).filter((k) => !process.env[k]);
  if (missing.length > 0) {
    console.log(`${label} SKIP missing env: ${missing.join(", ")}`);
    return;
  }

  let mod: { manifest?: ExtractorManifest; default?: ExtractorManifest };
  try {
    mod = await import(target.importPath);
  } catch (err) {
    console.log(`${label} FAIL import error: ${describeThrown(err)}`);
    return;
  }

  // Accept either a named `manifest` export or a default export.
  const manifest = mod.manifest ?? mod.default;
  if (!manifest) {
    console.log(`${label} FAIL manifest export missing`);
    return;
  }

  const started = Date.now();
  try {
    const ctx = buildContext({
      source: target.id,
      settings: target.settings ?? {},
      searchTerms: target.searchTerms,
      selectedCountry: target.selectedCountry,
    });
    const result = await manifest.run(ctx);
    const ms = Date.now() - started;
    const status = result.success ? "OK " : "ERR ";
    const sample = result.jobs[0];
    const sampleStr = sample
      ? ` | first: "${sample.title}" @ ${sample.employer}`
      : "";
    const errorStr = result.error ? ` | error: ${result.error}` : "";
    console.log(
      `${label} ${status} jobs=${result.jobs.length} ${ms}ms${errorStr}${sampleStr}`,
    );
  } catch (err) {
    // Anything thrown may be a string or plain object rather than an Error, so the
    // message is derived defensively instead of printing "undefined".
    const ms = Date.now() - started;
    console.log(`${label} CRASH ${ms}ms ${describeThrown(err)}`);
  }
}
|
|
|
|
/**
 * CLI entry point. Reads an optional comma-separated id list from the first
 * command-line argument, narrows `ALL_TARGETS` to the matching entries (JobSpy
 * site aliases select `jobspy`), and runs the selected targets one after another.
 * With no argument, every target is run.
 */
async function main() {
  const rawArg = (process.argv[2] ?? "").trim();

  let selected = ALL_TARGETS;
  if (rawArg) {
    const requestedIds = rawArg
      .split(",")
      .map((part) => part.trim())
      .filter(Boolean);
    const wanted = expandSmokeFilter(new Set(requestedIds));
    selected = ALL_TARGETS.filter((target) => wanted.has(target.id));
  }

  console.log(`Smoke testing ${selected.length} extractor(s)...\n`);

  // Sequential on purpose: each target's status line is printed before the next starts.
  for (const target of selected) {
    await runOne(target);
  }
}
|
|
|
|
// Start the run; a rejection from main() is logged and ends the process with exit code 1.
main().catch((fatal) => {
  console.error(fatal);
  process.exit(1);
});
|