ilia f5179304c1
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 5m27s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m13s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m9s
CI / Type Check (orchestrator) (push) Successful in 1m24s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m8s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m9s
CI / Documentation (push) Successful in 1m59s
feat(discovery): blocked countries filter and smoke subprocess fixes
Add blockedCountries in Settings so pipeline discovery drops jobs whose
location mentions listed countries (existing discovered rows are kept).
Document the feature, fix smoke tsconfig inheritance for nested extractors,
and run smoke via an absolute-tsconfig wrapper.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-16 11:41:29 -04:00

187 lines
5.7 KiB
TypeScript

import { spawn } from "node:child_process";
import { mkdir, readdir, readFile, rm, writeFile } from "node:fs/promises";
import { dirname, join } from "node:path";
import { createInterface } from "node:readline";
import { fileURLToPath } from "node:url";
import { envForExtractorSubprocess } from "@shared/extractor-subprocess-env.js";
/**
 * A single job record as returned by this extractor.
 *
 * `source`, `title`, `employer` and `jobUrl` are always present; every other
 * field is optional and is copied from the crawler's dataset record when the
 * record provides it.
 */
type CreateJobInput = {
  /** Constant tag identifying which extractor produced the record. */
  source: "gradcracker";
  /** Job title; `readCrawledJobs` substitutes "Unknown Title" when absent. */
  title: string;
  /** Employer name; `readCrawledJobs` substitutes "Unknown Employer" when absent. */
  employer: string;
  /** Job URL, taken from the dataset record's `url` (or `jobUrl`) field. */
  jobUrl: string;

  employerUrl?: string;
  applicationLink?: string;
  /** Single string; array values in the dataset are joined with ", ". */
  disciplines?: string;
  deadline?: string;
  salary?: string;
  location?: string;
  degreeRequired?: string;
  starting?: string;
  jobDescription?: string;
};
// Directory containing this module file, derived from its `import.meta.url`.
const srcDir = dirname(fileURLToPath(import.meta.url));
// Parent directory of `srcDir`; `runCrawler` uses it as the subprocess working directory.
const EXTRACTOR_DIR = join(srcDir, "..");
// Dataset directory: wiped by `clearStorageDataset` and read (`*.json` files) by `readCrawledJobs`.
const STORAGE_DIR = join(EXTRACTOR_DIR, "storage/datasets/default");
// Directory in which `writeExistingJobUrlsFile` stores `existing-job-urls.json`.
const JOBOPS_STORAGE_DIR = join(EXTRACTOR_DIR, "storage/jobops");
// Line prefix (including the trailing space) that marks a subprocess output
// line as a JSON progress update rather than ordinary log output.
const JOBOPS_PROGRESS_PREFIX = "JOBOPS_PROGRESS ";
/**
 * Outcome of a `runCrawler` invocation.
 *
 * On failure `success` is `false`, `jobs` is empty and `error` carries the
 * failure message; on success `error` is omitted.
 */
export interface CrawlerResult {
  /** Whether the crawl completed and its output was read. */
  success: boolean;
  /** Jobs collected from the crawler's dataset (empty on failure). */
  jobs: CreateJobInput[];
  /** Human-readable failure message; only set when `success` is `false`. */
  error?: string;
}
/** Optional knobs accepted by `runCrawler`; every field may be omitted. */
export interface RunCrawlerOptions {
  /**
   * Job URLs already known to the caller. When non-empty they are written to
   * a JSON file whose path is handed to the subprocess through the
   * `JOBOPS_EXISTING_JOB_URLS_FILE` environment variable.
   */
  existingJobUrls?: string[];
  /** Invoked with each progress update parsed from the subprocess output. */
  onProgress?: (update: JobExtractorProgress) => void;
  /** Passed to the subprocess JSON-encoded in `GRADCRACKER_SEARCH_TERMS`. */
  searchTerms?: string[];
  /**
   * Passed to the subprocess as a string in `GRADCRACKER_MAX_JOBS_PER_TERM`;
   * a falsy value (including `0`) results in an empty string being sent.
   */
  maxJobsPerTerm?: number;
}
/**
 * Progress update emitted by the crawler subprocess.
 *
 * `runCrawler` obtains it by JSON-parsing the remainder of any output line
 * that starts with the `JOBOPS_PROGRESS ` prefix; the payload is not validated,
 * so every field is optional. The precise meaning of each counter is defined
 * by the crawler process, which is not part of this module.
 */
interface JobExtractorProgress {
  /** Which kind of page the update refers to. */
  phase?: "list" | "job";
  currentUrl?: string;

  // List-page counters.
  listPagesProcessed?: number;
  listPagesTotal?: number;
  jobCardsFound?: number;

  // Job-page counters.
  jobPagesEnqueued?: number;
  jobPagesSkipped?: number;
  jobPagesProcessed?: number;

  /** Timestamp string supplied by the emitter (format not visible here). */
  ts?: string;
}
/**
 * Persists the already-known job URLs so the crawler subprocess can read them.
 *
 * @param existingJobUrls URLs to persist; `undefined` or an empty list means
 *   there is nothing to write.
 * @returns The path of the written `existing-job-urls.json` file, or `null`
 *   when no file was written.
 */
async function writeExistingJobUrlsFile(
  existingJobUrls: string[] | undefined,
): Promise<string | null> {
  const urlCount = existingJobUrls ? existingJobUrls.length : 0;
  if (urlCount === 0) {
    return null;
  }

  const targetPath = join(JOBOPS_STORAGE_DIR, "existing-job-urls.json");

  // The directory may not exist yet on a fresh checkout; create it on demand.
  await mkdir(JOBOPS_STORAGE_DIR, { recursive: true });

  const serializedUrls = JSON.stringify(existingJobUrls);
  await writeFile(targetPath, serializedUrls, "utf-8");

  return targetPath;
}
/**
 * Runs the crawler as an `npm run start` subprocess inside `EXTRACTOR_DIR`
 * and returns the jobs it wrote to the dataset directory.
 *
 * Steps: wipe the previous dataset, persist `options.existingJobUrls` (if
 * any), spawn the subprocess, relay its output, then read the dataset back.
 * Output lines starting with `JOBOPS_PROGRESS ` are parsed as JSON and passed
 * to `options.onProgress`; all other lines are echoed to this process's
 * stdout/stderr.
 *
 * @param options Optional crawl settings; see `RunCrawlerOptions`.
 * @returns `{ success: true, jobs }` when the subprocess exits with code 0;
 *   otherwise `{ success: false, jobs: [], error }`. Failures are reported
 *   through the result object rather than thrown.
 */
export async function runCrawler(
  options: RunCrawlerOptions = {},
): Promise<CrawlerResult> {
  try {
    await clearStorageDataset();
    const existingJobUrlsFile = await writeExistingJobUrlsFile(
      options.existingJobUrls,
    );

    await new Promise<void>((resolve, reject) => {
      const child = spawn("npm", ["run", "start"], {
        cwd: EXTRACTOR_DIR,
        shell: true,
        // stdin is unused; stdout/stderr are piped so lines can be inspected.
        stdio: ["ignore", "pipe", "pipe"],
        env: envForExtractorSubprocess({
          ...process.env,
          JOBOPS_SKIP_APPLY_FOR_EXISTING: "1",
          JOBOPS_EMIT_PROGRESS: "1",
          GRADCRACKER_SEARCH_TERMS: options.searchTerms
            ? JSON.stringify(options.searchTerms)
            : "",
          GRADCRACKER_MAX_JOBS_PER_TERM: options.maxJobsPerTerm
            ? String(options.maxJobsPerTerm)
            : "",
          // Only advertise the URL file when one was actually written.
          ...(existingJobUrlsFile
            ? { JOBOPS_EXISTING_JOB_URLS_FILE: existingJobUrlsFile }
            : {}),
        }),
      });

      // Routes one output line: progress lines go to the callback, anything
      // else is forwarded verbatim to the matching stream of this process.
      const handleLine = (line: string, stream: NodeJS.WriteStream) => {
        if (line.startsWith(JOBOPS_PROGRESS_PREFIX)) {
          const raw = line.slice(JOBOPS_PROGRESS_PREFIX.length).trim();
          try {
            const parsed = JSON.parse(raw) as JobExtractorProgress;
            options.onProgress?.(parsed);
          } catch {
            // Ignore malformed progress lines. Note that an exception thrown
            // by `onProgress` itself is swallowed here as well.
          }
          return;
        }
        stream.write(`${line}\n`);
      };

      const stdoutRl = child.stdout
        ? createInterface({ input: child.stdout })
        : null;
      const stderrRl = child.stderr
        ? createInterface({ input: child.stderr })
        : null;
      stdoutRl?.on("line", (line) => handleLine(line, process.stdout));
      stderrRl?.on("line", (line) => handleLine(line, process.stderr));

      child.on("close", (code, signal) => {
        stdoutRl?.close();
        stderrRl?.close();
        if (code === 0) {
          resolve();
          return;
        }
        // `code` is null when the child was terminated by a signal; report
        // the signal instead of the misleading "exited with code null".
        const reason =
          code === null && signal
            ? `was terminated by signal ${signal}`
            : `exited with code ${code}`;
        reject(new Error(`Crawler ${reason}`));
      });
      // Emitted e.g. when the process could not be spawned at all.
      child.on("error", reject);
    });

    const jobs = await readCrawledJobs();
    return { success: true, jobs };
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    return { success: false, jobs: [], error: message };
  }
}
/**
 * Reads every `*.json` record in `STORAGE_DIR` and maps it to a
 * `CreateJobInput`.
 *
 * Each file is handled independently: a file that cannot be read, is not
 * valid JSON, or does not contain a JSON object is skipped (with a note on
 * stderr) instead of discarding the jobs from all other files. Field values
 * are checked at runtime, so non-string values never leak into string fields.
 * Records without a usable `url`/`jobUrl` are skipped because `jobUrl` is
 * required.
 *
 * @returns The parsed jobs; an empty array when the dataset directory cannot
 *   be listed (for example because the crawler produced no output).
 */
async function readCrawledJobs(): Promise<CreateJobInput[]> {
  let fileNames: string[];
  try {
    fileNames = await readdir(STORAGE_DIR);
  } catch {
    // Best-effort: a missing or unreadable dataset directory means "no jobs".
    return [];
  }

  // Returns the value only when it really is a string.
  const stringOrUndefined = (value: unknown): string | undefined =>
    typeof value === "string" ? value : undefined;

  // Disciplines may be stored as one string or as a list; non-string list
  // entries are dropped and the rest joined with ", ".
  const normalizeDisciplines = (value: unknown): string | undefined => {
    if (typeof value === "string") return value;
    if (Array.isArray(value)) {
      return value
        .filter((entry): entry is string => typeof entry === "string")
        .join(", ");
    }
    return undefined;
  };

  const jobs: CreateJobInput[] = [];
  for (const fileName of fileNames) {
    if (!fileName.endsWith(".json")) continue;

    let parsed: unknown;
    try {
      const content = await readFile(join(STORAGE_DIR, fileName), "utf-8");
      parsed = JSON.parse(content);
    } catch {
      process.stderr.write(
        `Skipping unreadable crawler dataset file: ${fileName}\n`,
      );
      continue;
    }

    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      process.stderr.write(
        `Skipping crawler dataset file without a JSON object: ${fileName}\n`,
      );
      continue;
    }
    const data = parsed as Record<string, unknown>;

    // Prefer `url`, fall back to `jobUrl`; empty strings count as missing.
    const jobUrl =
      stringOrUndefined(data.url) || stringOrUndefined(data.jobUrl);
    if (!jobUrl) {
      process.stderr.write(
        `Skipping crawler dataset record without a job URL: ${fileName}\n`,
      );
      continue;
    }

    jobs.push({
      source: "gradcracker",
      title: stringOrUndefined(data.title) || "Unknown Title",
      employer: stringOrUndefined(data.employer) || "Unknown Employer",
      employerUrl: stringOrUndefined(data.employerUrl),
      jobUrl,
      applicationLink: stringOrUndefined(data.applicationLink),
      disciplines: normalizeDisciplines(data.disciplines),
      deadline: stringOrUndefined(data.deadline),
      salary: stringOrUndefined(data.salary),
      location: stringOrUndefined(data.location),
      degreeRequired: stringOrUndefined(data.degreeRequired),
      starting: stringOrUndefined(data.starting),
      jobDescription: stringOrUndefined(data.jobDescription),
    });
  }
  return jobs;
}
/**
 * Removes the dataset directory (`STORAGE_DIR`) and everything in it so a new
 * crawl starts from an empty dataset.
 *
 * Best-effort: `force: true` already tolerates a missing directory, and any
 * other removal failure is deliberately ignored, so this never rejects.
 */
async function clearStorageDataset(): Promise<void> {
  const removal = rm(STORAGE_DIR, { recursive: true, force: true });
  await removal.catch(() => {
    // ignore
  });
}