Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 5m27s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m13s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m9s
CI / Type Check (orchestrator) (push) Successful in 1m24s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m8s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m9s
CI / Documentation (push) Successful in 1m59s
Add blockedCountries in Settings so pipeline discovery drops jobs whose location mentions listed countries (existing discovered rows are kept). Document the feature, fix smoke tsconfig inheritance for nested extractors, and run smoke via an absolute-tsconfig wrapper. Co-authored-by: Cursor <cursoragent@cursor.com>
295 lines
8.9 KiB
TypeScript
295 lines
8.9 KiB
TypeScript
import { spawn, spawnSync } from "node:child_process";
|
|
import { readFile } from "node:fs/promises";
|
|
import { createRequire } from "node:module";
|
|
import { dirname, join } from "node:path";
|
|
import { createInterface } from "node:readline";
|
|
import { fileURLToPath } from "node:url";
|
|
import { envForExtractorSubprocess } from "@shared/extractor-subprocess-env.js";
|
|
import { normalizeCountryKey } from "@shared/location-support.js";
|
|
import {
|
|
resolveSearchCities,
|
|
shouldApplyStrictCityFilter,
|
|
} from "@shared/search-cities.js";
|
|
import type { CreateJobInput } from "@shared/types/jobs";
|
|
import {
|
|
toNumberOrNull,
|
|
toStringOrNull,
|
|
} from "@shared/utils/type-conversion.js";
|
|
|
|
/**
 * Marker that identifies a machine-readable progress line in the extractor's
 * output; `parseAdzunaProgressLine` only decodes lines that start with it.
 */
const JOBOPS_PROGRESS_PREFIX = "JOBOPS_PROGRESS ";

// Directory containing this module, derived from its file URL.
const srcDir = dirname(fileURLToPath(import.meta.url));

// Parent of the source directory; used as the working directory of the
// extractor subprocess.
const EXTRACTOR_DIR = join(srcDir, "..");

// JSON file handed to the subprocess as ADZUNA_OUTPUT_JSON and read back by
// `readDataset` after each run.
const DATASET_PATH = join(EXTRACTOR_DIR, "storage/datasets/default/jobs.json");

// CommonJS-style resolver scoped to this module; it must be initialised before
// TSX_CLI_PATH because `resolveTsxCliPath` calls `require.resolve`.
const require = createRequire(import.meta.url);

// Path to the tsx CLI entry point, or null when it cannot be resolved.
const TSX_CLI_PATH = resolveTsxCliPath();
|
|
|
|
/** Untyped row from the extractor's dataset file: string keys, unknown values. */
type AdzunaRawJob = { [field: string]: unknown };
|
|
|
|
/**
 * Progress notification decoded from an extractor output line.
 *
 * Every event carries the term position (`termIndex` of `termTotal`) and the
 * `searchTerm` it refers to; `type` discriminates the event-specific fields.
 */
export type AdzunaProgressEvent = {
  termIndex: number;
  termTotal: number;
  searchTerm: string;
} & (
  | { type: "term_start" }
  | {
      type: "page_fetched";
      pageNo: number;
      resultsOnPage: number;
      totalCollected: number;
    }
  | {
      type: "term_complete";
      jobsFoundTerm: number;
    }
);
|
|
|
|
/** Optional inputs accepted by `runAdzuna`; every field may be omitted. */
export interface RunAdzunaOptions {
  /** Search terms to query; `runAdzuna` uses `["web developer"]` when missing or empty. */
  searchTerms?: string[];
  /** Locations passed to `resolveSearchCities` to decide how many extractor runs happen. */
  locations?: string[];
  /** Country value forwarded as ADZUNA_COUNTRY; `runAdzuna` falls back to `"gb"`. */
  country?: string;
  /** Country key, normalised with `normalizeCountryKey` before the strict-city check. */
  countryKey?: string;
  /** Forwarded as ADZUNA_MAX_JOBS_PER_TERM; `runAdzuna` uses 50 when undefined. */
  maxJobsPerTerm?: number;
  /** Invoked for each progress event parsed from the extractor's output. */
  onProgress?: (event: AdzunaProgressEvent) => void;
}
|
|
|
|
/** Outcome returned by `runAdzuna`. */
export interface AdzunaResult {
  /** Collected jobs; `runAdzuna` returns an empty array on failure. */
  jobs: CreateJobInput[];
  /** True when every extractor run finished without error. */
  success: boolean;
  /** Failure description; set by `runAdzuna` only when `success` is false. */
  error?: string;
}
|
|
|
|
/**
 * Looks up the tsx CLI entry point through the module-scoped `require`.
 *
 * @returns The resolved path of `tsx/dist/cli.mjs`, or `null` when resolution throws.
 */
function resolveTsxCliPath(): string | null {
  let cliPath: string | null = null;
  try {
    cliPath = require.resolve("tsx/dist/cli.mjs");
  } catch {
    // Resolution failed; callers treat null as "tsx unavailable".
  }
  return cliPath;
}
|
|
|
|
/**
 * Probes whether `npm` can be executed by running `npm --version` synchronously.
 *
 * @returns True only when the process spawned without error and exited with status 0.
 */
function canRunNpmCommand(): boolean {
  const { error, status } = spawnSync("npm", ["--version"], {
    stdio: "ignore",
  });
  if (error) return false;
  return status === 0;
}
|
|
|
|
/**
 * Decodes one line of extractor output into a progress event.
 *
 * A progress line is `JOBOPS_PROGRESS_PREFIX` followed by a JSON object with
 * at least `event`, `termIndex` and `termTotal`.
 *
 * @param line A single output line without its trailing newline.
 * @returns The decoded event, or `null` when the line lacks the prefix, is not
 *   valid JSON, is not a JSON object, misses a required field, or names an
 *   unrecognised event.
 */
function parseAdzunaProgressLine(line: string): AdzunaProgressEvent | null {
  if (!line.startsWith(JOBOPS_PROGRESS_PREFIX)) return null;
  const raw = line.slice(JOBOPS_PROGRESS_PREFIX.length).trim();

  let decoded: unknown;
  try {
    decoded = JSON.parse(raw);
  } catch {
    return null;
  }

  // JSON.parse happily returns null, numbers, strings or arrays. Reading a
  // property off null would throw, so only plain objects are accepted.
  if (
    typeof decoded !== "object" ||
    decoded === null ||
    Array.isArray(decoded)
  ) {
    return null;
  }
  const payload = decoded as Record<string, unknown>;

  const event = toStringOrNull(payload.event);
  const termIndex = toNumberOrNull(payload.termIndex);
  const termTotal = toNumberOrNull(payload.termTotal);
  const searchTerm = toStringOrNull(payload.searchTerm) ?? "";
  if (!event || termIndex === null || termTotal === null) return null;

  switch (event) {
    case "term_start":
      return { type: "term_start", termIndex, termTotal, searchTerm };

    case "page_fetched": {
      // The page number is mandatory; the two counters default to 0.
      const pageNo = toNumberOrNull(payload.pageNo);
      if (pageNo === null) return null;
      return {
        type: "page_fetched",
        termIndex,
        termTotal,
        searchTerm,
        pageNo,
        resultsOnPage: toNumberOrNull(payload.resultsOnPage) ?? 0,
        totalCollected: toNumberOrNull(payload.totalCollected) ?? 0,
      };
    }

    case "term_complete":
      return {
        type: "term_complete",
        termIndex,
        termTotal,
        searchTerm,
        jobsFoundTerm: toNumberOrNull(payload.jobsFoundTerm) ?? 0,
      };

    default:
      return null;
  }
}
|
|
|
|
/**
 * Converts one raw dataset row into a `CreateJobInput` for the "adzuna" source.
 *
 * @param row Untyped row read from the extractor's dataset file.
 * @returns The mapped job, or `null` when the row has no usable `jobUrl`.
 */
function mapAdzunaRow(row: AdzunaRawJob): CreateJobInput | null {
  // Optional fields are emitted as undefined rather than null.
  const optionalText = (value: unknown): string | undefined =>
    toStringOrNull(value) ?? undefined;

  const jobUrl = toStringOrNull(row.jobUrl);
  if (!jobUrl) return null;

  // Title and employer get placeholder text when missing.
  const title = toStringOrNull(row.title) ?? "Unknown Title";
  const employer = toStringOrNull(row.employer) ?? "Unknown Employer";
  // Without a dedicated application link, the listing URL is used.
  const applicationLink = optionalText(row.applicationLink) ?? jobUrl;

  return {
    source: "adzuna",
    sourceJobId: optionalText(row.sourceJobId),
    title,
    employer,
    jobUrl,
    applicationLink,
    location: optionalText(row.location),
    salary: optionalText(row.salary),
    datePosted: optionalText(row.datePosted),
    jobDescription: optionalText(row.jobDescription),
    jobType: optionalText(row.jobType),
  };
}
|
|
|
|
/**
 * Loads the extractor's dataset file and maps it to de-duplicated jobs.
 *
 * Rows that are not objects or that `mapAdzunaRow` rejects are skipped. Jobs
 * are keyed by `sourceJobId` (falling back to `jobUrl`); the first row seen
 * for a key wins and the original order is kept.
 *
 * @returns The mapped jobs, or an empty array when the file does not hold a JSON array.
 * @throws When the file cannot be read or does not contain valid JSON.
 */
async function readDataset(): Promise<CreateJobInput[]> {
  const rows: unknown = JSON.parse(await readFile(DATASET_PATH, "utf-8"));
  if (!Array.isArray(rows)) return [];

  // A Map keeps first-insertion order, which gives first-wins de-duplication.
  const uniqueJobs = new Map<string, CreateJobInput>();
  for (const row of rows) {
    if (typeof row !== "object" || row === null) continue;

    const job = mapAdzunaRow(row as AdzunaRawJob);
    if (!job) continue;

    const dedupeKey = job.sourceJobId || job.jobUrl;
    if (!uniqueJobs.has(dedupeKey)) uniqueJobs.set(dedupeKey, job);
  }
  return [...uniqueJobs.values()];
}
|
|
|
|
/**
 * Launches one extractor subprocess and settles when it closes.
 *
 * Runs `npm run start` when `useNpmCommand` is true; otherwise runs
 * `src/main.ts` through the resolved tsx CLI using the current Node binary.
 * Each stdout/stderr line is checked for a progress payload: progress lines
 * are handed to `onProgressEvent`, every other line is echoed to the parent's
 * matching stream.
 *
 * Rejects when the process cannot be spawned, when the tsx fallback is needed
 * but unavailable, or when the process exits with a non-zero code.
 */
function runAdzunaExtractorProcess(
  extractorEnv: ReturnType<typeof envForExtractorSubprocess>,
  useNpmCommand: boolean,
  onProgressEvent: (event: AdzunaProgressEvent) => void,
): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    // stdin is not needed; stdout/stderr are piped so lines can be inspected.
    const launch = (command: string, args: string[]) =>
      spawn(command, args, {
        cwd: EXTRACTOR_DIR,
        stdio: ["ignore", "pipe", "pipe"],
        env: extractorEnv,
      });

    let child: ReturnType<typeof launch>;
    if (useNpmCommand) {
      child = launch("npm", ["run", "start"]);
    } else {
      if (!TSX_CLI_PATH) {
        // Throwing inside the executor turns into a rejection of this promise.
        throw new Error(
          "Unable to execute Adzuna extractor (npm/tsx unavailable)",
        );
      }
      child = launch(process.execPath, [TSX_CLI_PATH, "src/main.ts"]);
    }

    const forwardTo = (sink: NodeJS.WriteStream) => (line: string) => {
      const progressEvent = parseAdzunaProgressLine(line);
      if (progressEvent) {
        onProgressEvent(progressEvent);
        return;
      }
      sink.write(`${line}\n`);
    };

    const stdoutRl = child.stdout
      ? createInterface({ input: child.stdout })
      : null;
    const stderrRl = child.stderr
      ? createInterface({ input: child.stderr })
      : null;
    stdoutRl?.on("line", forwardTo(process.stdout));
    stderrRl?.on("line", forwardTo(process.stderr));

    child.on("close", (code) => {
      stdoutRl?.close();
      stderrRl?.close();
      if (code === 0) resolve();
      else reject(new Error(`Adzuna extractor exited with code ${code}`));
    });
    child.on("error", reject);
  });
}

/**
 * Runs the Adzuna extractor as a subprocess and collects the jobs it writes.
 *
 * One subprocess is started per resolved location (or a single run when no
 * location resolves). After each run the dataset file is read back and merged
 * into the result, de-duplicated by `sourceJobId` (falling back to `jobUrl`).
 * Progress events are re-indexed so `termIndex`/`termTotal` span all runs.
 *
 * @param options Search terms, country, locations, per-term limit and an
 *   optional progress callback; see `RunAdzunaOptions`.
 * @returns `{ success: true, jobs }` on success. On any failure (missing
 *   ADZUNA_APP_ID / ADZUNA_APP_KEY, neither npm nor tsx available, subprocess
 *   error, unreadable dataset) resolves with `success: false`, an empty `jobs`
 *   array and an `error` message. This function does not reject.
 */
export async function runAdzuna(
  options: RunAdzunaOptions = {},
): Promise<AdzunaResult> {
  const appId = process.env.ADZUNA_APP_ID?.trim();
  const appKey = process.env.ADZUNA_APP_KEY?.trim();
  if (!appId || !appKey) {
    return {
      success: false,
      jobs: [],
      error: "Missing Adzuna credentials (ADZUNA_APP_ID / ADZUNA_APP_KEY)",
    };
  }

  // Trim before applying the default so a whitespace-only country falls back
  // to "gb" instead of being forwarded as an empty string.
  const country = (options.country?.trim() || "gb").toLowerCase();
  const countryKey = normalizeCountryKey(options.countryKey ?? "");
  const maxJobsPerTerm = options.maxJobsPerTerm ?? 50;
  const searchTerms =
    options.searchTerms && options.searchTerms.length > 0
      ? options.searchTerms
      : ["web developer"];
  const locations = resolveSearchCities({
    list: options.locations,
    env: process.env.ADZUNA_LOCATION_QUERY,
  });
  // A single null entry means "one run without a location".
  const runLocations = locations.length > 0 ? locations : [null];
  const termTotal = searchTerms.length * runLocations.length;

  const useNpmCommand = canRunNpmCommand();
  if (!useNpmCommand && !TSX_CLI_PATH) {
    return {
      success: false,
      jobs: [],
      error: "Unable to execute Adzuna extractor (npm/tsx unavailable)",
    };
  }

  try {
    const jobs: CreateJobInput[] = [];
    const seen = new Set<string>();

    for (let runIndex = 0; runIndex < runLocations.length; runIndex += 1) {
      const location = runLocations[runIndex];
      const strictLocationFilter =
        location !== null && shouldApplyStrictCityFilter(location, countryKey);

      const extractorEnv = envForExtractorSubprocess({
        ...process.env,
        JOBOPS_EMIT_PROGRESS: "1",
        ADZUNA_APP_ID: appId,
        ADZUNA_APP_KEY: appKey,
        ADZUNA_COUNTRY: country,
        ADZUNA_MAX_JOBS_PER_TERM: String(maxJobsPerTerm),
        ADZUNA_SEARCH_TERMS: JSON.stringify(searchTerms),
        ADZUNA_OUTPUT_JSON: DATASET_PATH,
        // The location is only forwarded when the strict city filter applies.
        ADZUNA_LOCATION_QUERY: strictLocationFilter ? location : "",
      });

      // Each run reports term indices relative to itself; shift them so the
      // caller sees one continuous sequence across all locations.
      const termOffset = runIndex * searchTerms.length;
      await runAdzunaExtractorProcess(
        extractorEnv,
        useNpmCommand,
        (progressEvent) => {
          options.onProgress?.({
            ...progressEvent,
            termIndex: termOffset + progressEvent.termIndex,
            termTotal,
          });
        },
      );

      // Merge this run's dataset, skipping jobs already seen in earlier runs.
      const runJobs = await readDataset();
      for (const job of runJobs) {
        const key = job.sourceJobId || job.jobUrl;
        if (seen.has(key)) continue;
        seen.add(key);
        jobs.push(job);
      }
    }

    return { success: true, jobs };
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    return { success: false, jobs: [], error: message };
  }
}
|