Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.
Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources
Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)
Co-authored-by: Cursor <cursoragent@cursor.com>
220 lines
5.7 KiB
TypeScript
220 lines
5.7 KiB
TypeScript
/**
 * Tiny smoke-test for new extractors: imports each manifest, runs it with a
 * minimal context, and prints one status line per extractor with the count of
 * mapped jobs and the first job as a sample.
 *
 * Run from repo root: npx tsx scripts/smoke-extractors.ts [comma,separated,ids]
 *
 * Loads repo-root `.env` so keyed extractors match orchestrator behavior (plain
 * `tsx` does not read `.env` automatically).
 */
|
|
|
|
import path from "node:path";
|
|
import { fileURLToPath } from "node:url";
|
|
import { config as loadEnv } from "dotenv";
|
|
import type {
|
|
ExtractorManifest,
|
|
ExtractorRuntimeContext,
|
|
} from "../shared/src/types/extractors";
|
|
|
|
// This script lives one directory below the repository root, so the root is
// the parent of the directory containing this module.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.resolve(scriptDir, "..");

// Load the repo-root `.env` explicitly, before any extractor reads process.env.
const envFile = path.join(repoRoot, ".env");
loadEnv({ path: envFile });
|
|
|
|
/** One extractor the smoke test knows how to exercise. */
interface Target {
  /** Extractor id; used as the CLI filter key and as the context `source`. */
  id: string;
  /** Module specifier handed to dynamic `import()` to load the manifest. */
  importPath: string;
  /** Env vars required to run; the target is skipped if any is missing. */
  needs?: string[];
  /** Settings passed to the extractor through the runtime context. */
  settings?: Record<string, string>;
}
|
|
|
|
/** Per-term job cap applied to every search-style extractor in the smoke run. */
const SMOKE_MAX_JOBS_PER_TERM = "10";

/** Module specifier of an extractor's manifest, relative to this script. */
function manifestPath(id: string): string {
  return `../extractors/${id}/manifest`;
}

/**
 * Target for an extractor that is driven by search terms and capped through
 * its `<id>MaxJobsPerTerm` setting. `needs` is only set when env vars are given.
 */
function searchTarget(id: string, needs?: string[]): Target {
  const target: Target = { id, importPath: manifestPath(id) };
  if (needs) {
    target.needs = needs;
  }
  target.settings = { [`${id}MaxJobsPerTerm`]: SMOKE_MAX_JOBS_PER_TERM };
  return target;
}

/**
 * Target for an extractor that is driven by a list of boards/tenants, passed
 * as a JSON-encoded array under `settingKey`.
 */
function boardTarget(id: string, settingKey: string, boards: string[]): Target {
  return {
    id,
    importPath: manifestPath(id),
    settings: { [settingKey]: JSON.stringify(boards) },
  };
}

/** Every extractor covered by the smoke test, in the order they are run. */
const ALL_TARGETS: Target[] = [
  searchTarget("jobicy"),
  searchTarget("themuse"),
  searchTarget("usajobs", ["USAJOBS_API_KEY", "USAJOBS_USER_AGENT"]),
  searchTarget("jooble", ["JOOBLE_API_KEY"]),
  searchTarget("careerjet", [
    "CAREERJET_AFFID",
    "CAREERJET_REFERER",
    "CAREERJET_USER_IP",
  ]),
  searchTarget("reed", ["REED_API_KEY"]),
  // Known active public Lever board used purely as a connectivity check.
  boardTarget("lever", "leverCompanies", ["palantir", "netflix"]),
  boardTarget("ashby", "ashbyCompanies", ["ramp", "linear"]),
  boardTarget("greenhouse", "greenhouseCompanies", ["stripe", "airbnb"]),
  boardTarget("workday", "workdayTenants", [
    "https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite",
  ]),
  searchTarget("remoteok"),
  searchTarget("remotive"),
  searchTarget("arbeitnow"),
  searchTarget("himalayas"),
  searchTarget("weworkremotely"),
  searchTarget("fourdayweek"),
];
|
|
|
|
/**
 * Builds the minimal runtime context used for a smoke run of one extractor.
 *
 * The context selects only `source`, searches for a single fixed term in a
 * fixed country, reports no already-known job URLs, never requests
 * cancellation, and discards progress events.
 *
 * @param source - Extractor id; also the only entry of `selectedSources`.
 * @param settings - Extractor settings, passed through unchanged.
 * @returns The context object handed to the manifest's `run`.
 */
function buildContext(
  source: string,
  settings: Record<string, string>,
): ExtractorRuntimeContext {
  const context: ExtractorRuntimeContext = {
    source,
    selectedSources: [source],
    settings,
    searchTerms: ["software engineer"],
    selectedCountry: "United States",
    getExistingJobUrls: () => Promise.resolve([]),
    shouldCancel: () => false,
    onProgress: () => undefined,
  };
  return context;
}
|
|
|
|
/**
 * Right-pads `s` with spaces to a width of `n` characters.
 *
 * @param s - Text to pad.
 * @param n - Minimum width; strings already at least this long are returned
 *   unchanged (never truncated).
 * @returns `s` followed by as many spaces as needed to reach `n` characters.
 */
function pad(s: string, n: number): string {
  return s.padEnd(n);
}
|
|
|
|
/**
 * Turns a caught value into a printable message. Anything can be thrown, so
 * values that are not `Error` instances are stringified instead of reading a
 * `message` property that may not exist.
 */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Smoke-tests a single extractor and prints exactly one status line for it.
 *
 * Outcomes, in the order they are checked:
 * - SKIP: one or more env vars listed in `target.needs` are unset or empty.
 * - FAIL: the manifest module cannot be imported, or it exports neither
 *   `manifest` nor a default export.
 * - OK / ERR: the manifest ran; printed with job count, elapsed time, the
 *   reported error (if any) and the first job as a sample.
 * - CRASH: `manifest.run` threw or rejected.
 *
 * Never throws: every failure is reported on stdout so the remaining targets
 * still run.
 *
 * @param target - The extractor to exercise.
 */
async function runOne(target: Target): Promise<void> {
  const missing = (target.needs ?? []).filter((k) => !process.env[k]);
  if (missing.length > 0) {
    console.log(
      `${pad(target.id, 12)} SKIP missing env: ${missing.join(", ")}`,
    );
    return;
  }

  let mod: { manifest?: ExtractorManifest; default?: ExtractorManifest };
  try {
    mod = await import(target.importPath);
  } catch (err) {
    console.log(
      `${pad(target.id, 12)} FAIL import error: ${describeError(err)}`,
    );
    return;
  }

  // Accept either a named `manifest` export or a default export.
  const manifest = mod.manifest ?? mod.default;
  if (!manifest) {
    console.log(`${pad(target.id, 12)} FAIL manifest export missing`);
    return;
  }

  const started = Date.now();
  try {
    const ctx = buildContext(target.id, target.settings ?? {});
    const result = await manifest.run(ctx);
    const ms = Date.now() - started;
    const status = result.success ? "OK " : "ERR ";
    const sample = result.jobs[0];
    const sampleStr = sample
      ? ` | first: "${sample.title}" @ ${sample.employer}`
      : "";
    console.log(
      `${pad(target.id, 12)} ${status} jobs=${result.jobs.length} ${ms}ms${result.error ? ` | error: ${result.error}` : ""}${sampleStr}`,
    );
  } catch (err) {
    const ms = Date.now() - started;
    console.log(
      `${pad(target.id, 12)} CRASH ${ms}ms ${describeError(err)}`,
    );
  }
}
|
|
|
|
/**
 * Entry point: runs the smoke test for every known extractor, or only for the
 * ids given as a comma-separated list in the first CLI argument.
 *
 * Targets are run one at a time, in `ALL_TARGETS` order. Requested ids that
 * match no known target are reported on stderr instead of being dropped
 * silently, so a typo does not look like a successful empty run.
 */
async function main(): Promise<void> {
  const requested = (process.argv[2] ?? "").trim();
  const filter = requested
    ? new Set(
        requested
          .split(",")
          .map((s) => s.trim())
          .filter(Boolean),
      )
    : null;

  if (filter) {
    const knownIds = new Set(ALL_TARGETS.map((t) => t.id));
    const unknownIds = [...filter].filter((id) => !knownIds.has(id));
    if (unknownIds.length > 0) {
      console.warn(`Unknown extractor id(s) ignored: ${unknownIds.join(", ")}`);
    }
  }

  const targets = filter
    ? ALL_TARGETS.filter((t) => filter.has(t.id))
    : ALL_TARGETS;

  console.log(`Smoke testing ${targets.length} extractor(s)...\n`);
  for (const t of targets) {
    await runOne(t);
  }
}
|
|
|
|
/** Logs an unexpected top-level failure and exits with status 1. */
function exitOnFatal(reason: unknown): never {
  console.error(reason);
  process.exit(1);
}

main().catch(exitOnFatal);
|