Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 5m27s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m13s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m9s
CI / Type Check (orchestrator) (push) Successful in 1m24s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m8s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m9s
CI / Documentation (push) Successful in 1m59s
Add blockedCountries in Settings so pipeline discovery drops jobs whose location mentions listed countries (existing discovered rows are kept). Document the feature, fix smoke tsconfig inheritance for nested extractors, and run smoke via an absolute-tsconfig wrapper. Co-authored-by: Cursor <cursoragent@cursor.com>
397 lines
11 KiB
TypeScript
397 lines
11 KiB
TypeScript
/**
 * Smoke-test helper for extractor manifests: imports each manifest, runs it with a
 * minimal context, and prints mapped job counts + a sample row.
 *
 * Run from repo root (`run-smoke-extractors.mjs` applies repo tsconfig for `@shared/*`):
 *   npm run smoke:extractors
 *   npm run smoke:extractors -- arcdev,icims
 *   npm run smoke:extractors -- indeed   # alias → `jobspy` (same manifest)
 *
 * Keep `ALL_TARGETS` aligned with every shipped manifest under each
 * `extractors/<name>/` package (`manifest.ts` or `src/manifest.ts`).
 *
 * Loads repo-root `.env` so keyed extractors match orchestrator behavior (plain
 * `tsx` does not read `.env` automatically).
 *
 * Resolves `@shared/*` imports in extractor manifests via `scripts/smoke-resolve-shared.mjs`.
 */
|
|
|
|
import { existsSync } from "node:fs";
|
|
import { register } from "node:module";
|
|
import { homedir } from "node:os";
|
|
import path from "node:path";
|
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
import { config as loadEnv } from "dotenv";
|
|
|
|
// Install the module hook that lives next to this script before any manifest is
// dynamically imported; the hook file is addressed as a file:// URL built from
// this module's own location.
const smokeResolverUrl = pathToFileURL(
  path.join(
    path.dirname(fileURLToPath(import.meta.url)),
    "smoke-resolve-shared.mjs",
  ),
).href;
register(smokeResolverUrl, import.meta.url);
|
|
|
|
import type {
|
|
ExtractorManifest,
|
|
ExtractorRuntimeContext,
|
|
} from "job-ops-shared/types/extractors";
|
|
|
|
// Directory containing this script; the repository root is its parent.
const scriptsDir = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.resolve(scriptsDir, "..");

// Populate process.env from the repo-root `.env` file before any target's
// `needs` list is checked against the environment.
const envFilePath = path.join(repoRoot, ".env");
loadEnv({ path: envFilePath });
|
|
|
|
/**
 * Width, in characters, that extractor ids are padded to at the start of each
 * status line so the columns after the id line up. Sized for the longest
 * pipeline source id currently in use.
 */
const ID_COL = 15;
|
|
|
|
/**
 * Site ids that are all served by the single JobSpy manifest. The CLI filter
 * accepts any of them as an alias for `jobspy`.
 */
const JOBSPY_SITE_IDS = ["indeed", "linkedin", "glassdoor"] as const;

/**
 * Returns a copy of the requested id set, with `jobspy` added whenever at least
 * one JobSpy site alias was requested. The input set is never mutated and the
 * alias ids themselves are kept in the result.
 */
function expandSmokeFilter(ids: Set<string>): Set<string> {
  const expanded = new Set(ids);
  const wantsJobspySite = JOBSPY_SITE_IDS.some((siteId) => ids.has(siteId));
  if (wantsJobspySite) {
    expanded.add("jobspy");
  }
  return expanded;
}
|
|
|
|
/** One extractor manifest to smoke-test, plus the inputs it should be run with. */
interface Target {
  /** Extractor id; printed in the status line and matched against the CLI filter. */
  id: string;
  /** Module specifier handed to dynamic `import()` to load the manifest. */
  importPath: string;
  /** Environment variables required to run; the target is skipped if any is missing. */
  needs?: string[];
  /** When set, skip with this message (e.g. local Camoufox not installed). */
  skipReason?: string;
  /** String-valued settings passed to the extractor through the runtime context. */
  settings?: Record<string, string>;
  /** When set, replaces the default smoke search terms (use [] for sources that filter client-side). */
  searchTerms?: string[];
  /** Geography passed as `selectedCountry` (must match what each extractor expects). */
  selectedCountry?: string;
}
|
|
|
|
/**
 * Reports whether a Camoufox `version.json` marker exists in either of the two
 * per-user cache locations this script knows about:
 * `~/Library/Caches/camoufox` and `~/.cache/camoufox`.
 *
 * @returns `true` if at least one marker file exists, otherwise `false`.
 */
function camoufoxInstalled(): boolean {
  const userHome = homedir();
  // Cache roots relative to the home directory, checked in this order.
  const cacheRoots: string[][] = [["Library", "Caches"], [".cache"]];

  for (const rootSegments of cacheRoots) {
    const marker = path.join(userHome, ...rootSegments, "camoufox", "version.json");
    if (existsSync(marker)) {
      return true;
    }
  }
  return false;
}
|
|
|
|
/**
 * Skip message shared by every Camoufox-backed target, or `undefined` when
 * Camoufox is available. Computed once so the filesystem probe is not repeated
 * per target and the message cannot drift between entries.
 */
const CAMOUFOX_SKIP_REASON: string | undefined = camoufoxInstalled()
  ? undefined
  : "Camoufox not installed (run: npx camoufox-js fetch)";

/**
 * Every extractor the smoke run knows about, in the order they are executed.
 *
 * Each entry names the manifest module to import and the small, bounded inputs
 * (settings, optional search terms, optional country) to run it with. Entries
 * with `needs` are skipped when any listed environment variable is unset;
 * entries with a `skipReason` are skipped unconditionally.
 */
const ALL_TARGETS: Target[] = [
  {
    id: "adzuna",
    importPath: "../extractors/adzuna/manifest",
    needs: ["ADZUNA_APP_ID", "ADZUNA_APP_KEY"],
    selectedCountry: "United States",
    settings: {
      adzunaMaxJobsPerTerm: "10",
      searchCities: "United States",
    },
  },
  {
    id: "arbeitnow",
    importPath: "../extractors/arbeitnow/manifest",
    settings: { arbeitnowMaxJobsPerTerm: "10" },
  },
  {
    id: "arcdev",
    importPath: "../extractors/arcdev/manifest",
    settings: {
      arcRemoteJobsPaths: JSON.stringify(["/remote-jobs/playwright"]),
      arcMaxJobsPerPath: "20",
    },
  },
  {
    id: "ashby",
    importPath: "../extractors/ashby/manifest",
    settings: {
      ashbyCompanies: JSON.stringify(["ramp", "linear"]),
    },
  },
  {
    id: "bctenet",
    importPath: "../extractors/bctenet/manifest",
    selectedCountry: "Canada",
    settings: {
      bctenetMaxJobsPerTerm: "25",
    },
  },
  {
    id: "careerjet",
    importPath: "../extractors/careerjet/manifest",
    needs: ["CAREERJET_AFFID", "CAREERJET_REFERER", "CAREERJET_USER_IP"],
    settings: { careerjetMaxJobsPerTerm: "10", searchCities: "United States" },
  },
  {
    id: "eluta",
    importPath: "../extractors/eluta/manifest",
    selectedCountry: "Canada",
    settings: {
      elutaRssLocations: JSON.stringify(["Toronto, ON"]),
      elutaMaxJobsPerTerm: "15",
    },
  },
  {
    id: "fourdayweek",
    importPath: "../extractors/fourdayweek/manifest",
    settings: { fourdayweekMaxJobsPerTerm: "10" },
  },
  {
    id: "gradcracker",
    importPath: "../extractors/gradcracker/manifest",
    selectedCountry: "United Kingdom",
    settings: { gradcrackerMaxJobsPerTerm: "10" },
    skipReason: CAMOUFOX_SKIP_REASON,
  },
  {
    id: "greenhouse",
    importPath: "../extractors/greenhouse/manifest",
    settings: {
      greenhouseCompanies: JSON.stringify(["stripe", "airbnb"]),
    },
  },
  {
    id: "himalayas",
    importPath: "../extractors/himalayas/manifest",
    settings: { himalayasMaxJobsPerTerm: "10" },
  },
  {
    id: "hiringcafe",
    importPath: "../extractors/hiringcafe/manifest",
    selectedCountry: "United Kingdom",
    settings: {
      searchCities: "UK",
      // NOTE(review): a `jobspy*` key on the hiringcafe target — presumably the
      // extractor reads this shared setting; confirm against its manifest.
      jobspyResultsWanted: "10",
      workplaceTypes: JSON.stringify(["remote", "hybrid", "onsite"]),
    },
    skipReason: CAMOUFOX_SKIP_REASON,
  },
  {
    id: "icims",
    importPath: "../extractors/icims/manifest",
    // Empty on purpose: replaces the default smoke search terms for this source.
    searchTerms: [],
    settings: {
      icimsTenants: JSON.stringify(["careers-appliedsystems.icims.com"]),
      icimsMaxJobsPerTenant: "15",
      icimsMaxPagesPerSearch: "2",
    },
  },
  {
    id: "jobicy",
    importPath: "../extractors/jobicy/manifest",
    settings: { jobicyMaxJobsPerTerm: "10" },
  },
  {
    id: "jobspy",
    importPath: "../extractors/jobspy/manifest",
    selectedCountry: "United Kingdom",
    settings: {
      searchCities: "UK",
      jobspyCountryIndeed: "UK",
      jobspyResultsWanted: "5",
      workplaceTypes: JSON.stringify(["remote", "hybrid", "onsite"]),
    },
  },
  {
    id: "jooble",
    importPath: "../extractors/jooble/manifest",
    needs: ["JOOBLE_API_KEY"],
    settings: { joobleMaxJobsPerTerm: "10", searchCities: "United States" },
  },
  {
    id: "lever",
    importPath: "../extractors/lever/manifest",
    settings: {
      leverCompanies: JSON.stringify(["palantir", "netflix"]),
    },
  },
  {
    id: "qajobsboard",
    importPath: "../extractors/qajobsboard/manifest",
    settings: { qajobsboardMaxJobsPerTerm: "25" },
  },
  {
    id: "reed",
    importPath: "../extractors/reed/manifest",
    needs: ["REED_API_KEY"],
    selectedCountry: "United Kingdom",
    settings: { reedMaxJobsPerTerm: "10" },
  },
  {
    id: "remoteok",
    importPath: "../extractors/remoteok/manifest",
    settings: { remoteokMaxJobsPerTerm: "10" },
  },
  {
    id: "remotive",
    importPath: "../extractors/remotive/manifest",
    settings: { remotiveMaxJobsPerTerm: "10" },
  },
  {
    id: "smartrecruiters",
    importPath: "../extractors/smartrecruiters/manifest",
    settings: {
      smartrecruitersCompanies: JSON.stringify(["smartrecruiters"]),
      smartrecruitersMaxJobsPerCompany: "5",
    },
  },
  {
    id: "startupjobs",
    // Unlike the other entries, this manifest lives under `src/`.
    importPath: "../extractors/startupjobs/src/manifest",
    selectedCountry: "United Kingdom",
    settings: {
      searchCities: "UK",
      startupjobsMaxJobsPerTerm: "10",
      workplaceTypes: JSON.stringify(["remote", "hybrid", "onsite"]),
    },
  },
  {
    id: "themuse",
    importPath: "../extractors/themuse/manifest",
    settings: { themuseMaxJobsPerTerm: "10" },
  },
  {
    id: "ukvisajobs",
    importPath: "../extractors/ukvisajobs/manifest",
    needs: ["UKVISAJOBS_EMAIL", "UKVISAJOBS_PASSWORD"],
    selectedCountry: "United Kingdom",
    settings: { ukvisajobsMaxJobs: "10" },
  },
  {
    id: "usajobs",
    importPath: "../extractors/usajobs/manifest",
    needs: ["USAJOBS_API_KEY", "USAJOBS_USER_AGENT"],
    settings: { usajobsMaxJobsPerTerm: "10" },
  },
  {
    id: "weworkremotely",
    importPath: "../extractors/weworkremotely/manifest",
    settings: { weworkremotelyMaxJobsPerTerm: "10" },
  },
  {
    id: "workday",
    importPath: "../extractors/workday/manifest",
    settings: {
      workdayTenants: JSON.stringify([
        "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite",
      ]),
    },
  },
];
|
|
|
|
/**
 * Assembles the minimal runtime context a manifest is run with during a smoke test.
 *
 * @param args.source Extractor id; also becomes the only entry of `selectedSources`.
 * @param args.settings Settings map passed through unchanged.
 * @param args.searchTerms Search terms to use; when omitted, `["software engineer"]`
 *   is used. An explicit empty array is kept as-is.
 * @param args.selectedCountry Country to use; defaults to `"United States"`.
 * @returns A context whose callbacks are inert: no existing job URLs, never
 *   cancelled, progress updates discarded.
 */
function buildContext(args: {
  source: string;
  settings: Record<string, string>;
  searchTerms?: string[];
  selectedCountry?: string;
}): ExtractorRuntimeContext {
  const { source, settings, searchTerms, selectedCountry } = args;

  // Only a missing value triggers the default; `[]` is a deliberate choice by the caller.
  const effectiveTerms =
    searchTerms === undefined ? ["software engineer"] : searchTerms;
  const effectiveCountry = selectedCountry ?? "United States";

  const context: ExtractorRuntimeContext = {
    source,
    selectedSources: [source],
    settings,
    searchTerms: effectiveTerms,
    selectedCountry: effectiveCountry,
    getExistingJobUrls: async () => [],
    shouldCancel: () => false,
    onProgress: () => {},
  };
  return context;
}
|
|
|
|
/**
 * Right-pads `s` with spaces until it is `n` characters long.
 * Strings that are already `n` characters or longer are returned unchanged
 * (never truncated).
 */
function pad(s: string, n: number): string {
  return s.padEnd(n);
}
|
|
|
|
/**
 * Turns any thrown value into a printable message. `Error` instances yield
 * their `message`; anything else (a thrown string, a plain object, ...) is
 * stringified instead of printing `undefined`.
 */
function describeThrown(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Smoke-tests a single extractor and prints one status line for it.
 *
 * Outcomes, in the order they are checked:
 * - `SKIP`  — the target carries a `skipReason`, or a required env var is unset.
 * - `FAIL`  — the manifest module could not be imported, or it exports neither
 *             `manifest` nor a default export.
 * - `OK` / `ERR` — the manifest ran; the label follows `result.success`, and the
 *             line includes the job count, elapsed milliseconds, any reported
 *             error, and the first job's title and employer when there is one.
 * - `CRASH` — the run threw; the line includes elapsed milliseconds and the message.
 *
 * Import and run failures are caught and reported as lines rather than rethrown.
 *
 * @param target The extractor to run and the inputs to run it with.
 */
async function runOne(target: Target): Promise<void> {
  const label = pad(target.id, ID_COL);

  if (target.skipReason) {
    console.log(`${label} SKIP ${target.skipReason}`);
    return;
  }

  // Keyed extractors cannot run without their credentials; report which are absent.
  const missing = (target.needs ?? []).filter((k) => !process.env[k]);
  if (missing.length > 0) {
    console.log(`${label} SKIP missing env: ${missing.join(", ")}`);
    return;
  }

  let mod: { manifest?: ExtractorManifest; default?: ExtractorManifest };
  try {
    mod = await import(target.importPath);
  } catch (err) {
    console.log(`${label} FAIL import error: ${describeThrown(err)}`);
    return;
  }

  // Manifests may be exposed as a named `manifest` export or as the default export.
  const manifest = mod.manifest ?? mod.default;
  if (!manifest) {
    console.log(`${label} FAIL manifest export missing`);
    return;
  }

  const started = Date.now();
  try {
    const ctx = buildContext({
      source: target.id,
      settings: target.settings ?? {},
      searchTerms: target.searchTerms,
      selectedCountry: target.selectedCountry,
    });
    const result = await manifest.run(ctx);
    const ms = Date.now() - started;
    const status = result.success ? "OK " : "ERR ";
    const sample = result.jobs[0];
    const sampleStr = sample
      ? ` | first: "${sample.title}" @ ${sample.employer}`
      : "";
    console.log(
      `${label} ${status} jobs=${result.jobs.length} ${ms}ms${result.error ? ` | error: ${result.error}` : ""}${sampleStr}`,
    );
  } catch (err) {
    const ms = Date.now() - started;
    console.log(`${label} CRASH ${ms}ms ${describeThrown(err)}`);
  }
}
|
|
|
|
/**
 * Entry point: picks the targets to run and smoke-tests them one after another.
 *
 * The optional first CLI argument (`process.argv[2]`) is a comma-separated list
 * of extractor ids. JobSpy site aliases are expanded to `jobspy`. With no
 * argument, every entry of `ALL_TARGETS` is run.
 *
 * Requested ids that match neither a target nor a JobSpy alias are reported on
 * stderr, so a typo does not silently produce an empty run.
 */
async function main(): Promise<void> {
  const requested = (process.argv[2] ?? "").trim();
  const filter = requested
    ? expandSmokeFilter(
        new Set(
          requested
            .split(",")
            .map((s) => s.trim())
            .filter(Boolean),
        ),
      )
    : null;

  if (filter) {
    const knownIds = new Set<string>([
      ...ALL_TARGETS.map((t) => t.id),
      ...JOBSPY_SITE_IDS,
    ]);
    const unknownIds = [...filter].filter((id) => !knownIds.has(id));
    if (unknownIds.length > 0) {
      console.warn(`Unknown extractor id(s) ignored: ${unknownIds.join(", ")}`);
    }
  }

  const targets = filter
    ? ALL_TARGETS.filter((t) => filter.has(t.id))
    : ALL_TARGETS;

  console.log(`Smoke testing ${targets.length} extractor(s)...\n`);
  // Run sequentially so each extractor's status line is printed in list order.
  for (const t of targets) {
    await runOne(t);
  }
}
|
|
|
|
/** Last-resort handler: print whatever escaped `main` and exit with status 1. */
function exitOnFatal(fatal: unknown): void {
  console.error(fatal);
  process.exit(1);
}

main().catch(exitOnFatal);
|