Shaheer Sarfaraz 3da5ea35b4
Deduplicate shared helpers and enforce aliased imports (#228)
* Deduplicate string cleanup helpers and not-found responses

* Enforce aliased imports for infra and shared modules

* Enforce @client/@server aliases for deep relative imports

* Deduplicate visa sponsor and location filter definitions

* Use shared city filter export in extractor location checks
2026-02-22 16:13:52 +00:00

294 lines
8.8 KiB
TypeScript

import { spawn, spawnSync } from "node:child_process";
import { readFile } from "node:fs/promises";
import { createRequire } from "node:module";
import { dirname, join } from "node:path";
import { createInterface } from "node:readline";
import { fileURLToPath } from "node:url";
import { normalizeCountryKey } from "@shared/location-support.js";
import {
resolveSearchCities,
shouldApplyStrictCityFilter,
} from "@shared/search-cities.js";
import type { CreateJobInput } from "@shared/types/jobs";
import {
toNumberOrNull,
toStringOrNull,
} from "@shared/utils/type-conversion.js";
// Directory that contains this module, derived from the ES module URL.
const srcDir = dirname(fileURLToPath(import.meta.url));
// Parent of the source directory; used as the working directory of the extractor child process.
const EXTRACTOR_DIR = join(srcDir, "..");
// JSON file handed to the extractor through ADZUNA_OUTPUT_JSON and loaded again by readDataset().
const DATASET_PATH = join(EXTRACTOR_DIR, "storage/datasets/default/jobs.json");
// Prefix marking a child-output line whose remainder is a JSON progress payload.
const JOBOPS_PROGRESS_PREFIX = "JOBOPS_PROGRESS ";
// ES modules have no built-in `require`; create one so require.resolve() can be used below.
const require = createRequire(import.meta.url);
// Resolved path of the tsx CLI, or null when it cannot be resolved. Used to launch the
// extractor when the npm command is not available.
const TSX_CLI_PATH = resolveTsxCliPath();
// One untyped row of the extractor's JSON dataset; mapAdzunaRow() converts its fields individually.
type AdzunaRawJob = Record<string, unknown>;
/**
 * Progress notification decoded from a `JOBOPS_PROGRESS ` line of extractor
 * output and delivered to `RunAdzunaOptions.onProgress`.
 *
 * The `type` field discriminates the variants. Before delivery, `runAdzuna`
 * shifts `termIndex` by the number of terms processed in earlier location runs
 * and replaces `termTotal` with `searchTerms.length * runLocations.length`.
 */
export type AdzunaProgressEvent =
| {
// Decoded from a payload whose `event` is "term_start".
type: "term_start";
termIndex: number;
termTotal: number;
// Empty string when the payload carries no usable `searchTerm`.
searchTerm: string;
}
| {
// Decoded from a payload whose `event` is "page_fetched"; requires a numeric `pageNo`.
type: "page_fetched";
termIndex: number;
termTotal: number;
searchTerm: string;
pageNo: number;
// Both counters fall back to 0 when they cannot be read from the payload.
resultsOnPage: number;
totalCollected: number;
}
| {
// Decoded from a payload whose `event` is "term_complete".
type: "term_complete";
termIndex: number;
termTotal: number;
searchTerm: string;
// Falls back to 0 when it cannot be read from the payload.
jobsFoundTerm: number;
};
/** Options accepted by `runAdzuna`; every field is optional. */
export interface RunAdzunaOptions {
// Search terms forwarded to the extractor; `["web developer"]` is used when missing or empty.
searchTerms?: string[];
// Forwarded as ADZUNA_COUNTRY after trimming and lower-casing; "gb" is used when missing or empty.
country?: string;
// Passed through normalizeCountryKey() and then to shouldApplyStrictCityFilter().
countryKey?: string;
// Candidate locations, resolved together with the ADZUNA_LOCATION_QUERY environment
// variable by resolveSearchCities(); the extractor runs once per resolved location.
locations?: string[];
// Forwarded as ADZUNA_MAX_JOBS_PER_TERM; defaults to 50.
maxJobsPerTerm?: number;
// Invoked for each progress line the extractor prints.
onProgress?: (event: AdzunaProgressEvent) => void;
}
/** Outcome of `runAdzuna`. */
export interface AdzunaResult {
// True when every extractor run finished and its dataset was read.
success: boolean;
// Collected jobs; `runAdzuna` returns an empty array whenever `success` is false.
jobs: CreateJobInput[];
// Failure description; set by `runAdzuna` only when `success` is false.
error?: string;
}
/**
 * Looks up the tsx command-line entry point via the module-scoped `require`.
 *
 * @returns The resolved path of `tsx/dist/cli.mjs`, or `null` when resolution throws.
 */
function resolveTsxCliPath(): string | null {
  let cliPath: string | null = null;
  try {
    cliPath = require.resolve("tsx/dist/cli.mjs");
  } catch {
    // Resolution failed; keep the null default.
  }
  return cliPath;
}
/**
 * Probes for a usable `npm` executable by running `npm --version` synchronously.
 *
 * @returns `true` only when the process could be spawned and exited with status 0.
 */
function canRunNpmCommand(): boolean {
  const { error, status } = spawnSync("npm", ["--version"], {
    stdio: "ignore",
  });
  if (error) {
    return false;
  }
  return status === 0;
}
/**
 * Decodes one line of extractor output into a progress event.
 *
 * A progress line is the `JOBOPS_PROGRESS ` prefix followed by a JSON object
 * carrying an `event` name, `termIndex`, `termTotal` and event-specific fields.
 *
 * @param line - A single line read from the extractor's stdout or stderr.
 * @returns The decoded event, or `null` when the line lacks the prefix, the
 *   payload is not valid JSON or not a JSON object, `event`/`termIndex`/
 *   `termTotal` cannot be converted, `pageNo` is missing on a `page_fetched`
 *   event, or the event name is not one of the three known ones.
 */
function parseAdzunaProgressLine(line: string): AdzunaProgressEvent | null {
  if (!line.startsWith(JOBOPS_PROGRESS_PREFIX)) return null;

  const raw = line.slice(JOBOPS_PROGRESS_PREFIX.length).trim();
  let payload: unknown;
  try {
    payload = JSON.parse(raw);
  } catch {
    return null;
  }
  // JSON.parse happily returns `null` (or a primitive) for valid JSON such as
  // "null". Reading a property off `null` would throw from inside the stream
  // line listener, so reject anything that is not an object up front.
  if (payload === null || typeof payload !== "object") return null;
  const parsed = payload as Record<string, unknown>;

  const event = toStringOrNull(parsed.event);
  const termIndex = toNumberOrNull(parsed.termIndex);
  const termTotal = toNumberOrNull(parsed.termTotal);
  // A missing search term is tolerated and reported as an empty string.
  const searchTerm = toStringOrNull(parsed.searchTerm) ?? "";
  if (!event || termIndex === null || termTotal === null) return null;

  switch (event) {
    case "term_start":
      return { type: "term_start", termIndex, termTotal, searchTerm };
    case "page_fetched": {
      // The page number is mandatory; the two counters default to 0.
      const pageNo = toNumberOrNull(parsed.pageNo);
      if (pageNo === null) return null;
      return {
        type: "page_fetched",
        termIndex,
        termTotal,
        searchTerm,
        pageNo,
        resultsOnPage: toNumberOrNull(parsed.resultsOnPage) ?? 0,
        totalCollected: toNumberOrNull(parsed.totalCollected) ?? 0,
      };
    }
    case "term_complete":
      return {
        type: "term_complete",
        termIndex,
        termTotal,
        searchTerm,
        jobsFoundTerm: toNumberOrNull(parsed.jobsFoundTerm) ?? 0,
      };
    default:
      return null;
  }
}
/**
 * Builds a `CreateJobInput` from one raw dataset row.
 *
 * @param row - Untyped row read from the extractor's dataset.
 * @returns The mapped job with `source` set to "adzuna", or `null` when the
 *   row has no usable `jobUrl`.
 */
function mapAdzunaRow(row: AdzunaRawJob): CreateJobInput | null {
  const jobUrl = toStringOrNull(row.jobUrl);
  if (!jobUrl) {
    return null;
  }

  // Optional fields are represented as `undefined` rather than `null`.
  const optionalText = (value: unknown): string | undefined =>
    toStringOrNull(value) ?? undefined;

  const job: CreateJobInput = {
    source: "adzuna",
    sourceJobId: optionalText(row.sourceJobId),
    title: toStringOrNull(row.title) ?? "Unknown Title",
    employer: toStringOrNull(row.employer) ?? "Unknown Employer",
    jobUrl,
    // Fall back to the job URL when no dedicated application link exists.
    applicationLink:
      optionalText(row.applicationLink) ?? optionalText(row.jobUrl),
    location: optionalText(row.location),
    salary: optionalText(row.salary),
    datePosted: optionalText(row.datePosted),
    jobDescription: optionalText(row.jobDescription),
    jobType: optionalText(row.jobType),
  };
  return job;
}
/**
 * Loads the extractor's dataset file and converts it into job inputs.
 *
 * Entries that are not objects or cannot be mapped are skipped. Jobs are
 * de-duplicated by `sourceJobId` (falling back to `jobUrl`), keeping the first
 * occurrence and preserving file order.
 *
 * @returns The mapped jobs, or an empty array when the file's top-level JSON
 *   value is not an array.
 * @throws When the file cannot be read or does not contain valid JSON.
 */
async function readDataset(): Promise<CreateJobInput[]> {
  const rows: unknown = JSON.parse(await readFile(DATASET_PATH, "utf-8"));
  if (!Array.isArray(rows)) {
    return [];
  }

  // A Map iterates in insertion order, so the first job seen per key wins and
  // the output order matches the dataset order.
  const jobsByKey = new Map<string, CreateJobInput>();
  for (const row of rows) {
    if (!row || typeof row !== "object") continue;
    const job = mapAdzunaRow(row as AdzunaRawJob);
    if (!job) continue;
    const dedupeKey = job.sourceJobId || job.jobUrl;
    if (!jobsByKey.has(dedupeKey)) {
      jobsByKey.set(dedupeKey, job);
    }
  }
  return Array.from(jobsByKey.values());
}
/**
 * Starts the extractor as a child process in the extractor directory.
 *
 * Uses `npm run start` when npm is available; otherwise runs `src/main.ts`
 * through the resolved tsx CLI with the current Node binary.
 *
 * @throws Error when npm is not used and the tsx CLI path is unavailable.
 */
function spawnAdzunaExtractor(
  useNpmCommand: boolean,
  env: NodeJS.ProcessEnv,
): ReturnType<typeof spawn> {
  if (useNpmCommand) {
    return spawn("npm", ["run", "start"], {
      cwd: EXTRACTOR_DIR,
      stdio: ["ignore", "pipe", "pipe"],
      env,
    });
  }
  if (!TSX_CLI_PATH) {
    throw new Error("Unable to execute Adzuna extractor (npm/tsx unavailable)");
  }
  return spawn(process.execPath, [TSX_CLI_PATH, "src/main.ts"], {
    cwd: EXTRACTOR_DIR,
    stdio: ["ignore", "pipe", "pipe"],
    env,
  });
}

/**
 * Runs the extractor once and settles when the child process closes.
 *
 * Progress lines from either output stream are decoded and passed to
 * `onProgressEvent`; all other lines are echoed to the matching stream of the
 * current process. Rejects on a spawn error or a non-zero exit code.
 */
function runAdzunaExtractorOnce(
  useNpmCommand: boolean,
  env: NodeJS.ProcessEnv,
  onProgressEvent: (event: AdzunaProgressEvent) => void,
): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    // A throw here happens inside the executor and therefore rejects the promise.
    const child = spawnAdzunaExtractor(useNpmCommand, env);

    const handleLine = (line: string, stream: NodeJS.WriteStream): void => {
      const progressEvent = parseAdzunaProgressLine(line);
      if (progressEvent) {
        onProgressEvent(progressEvent);
        return;
      }
      stream.write(`${line}\n`);
    };

    const stdoutRl = child.stdout
      ? createInterface({ input: child.stdout })
      : null;
    const stderrRl = child.stderr
      ? createInterface({ input: child.stderr })
      : null;
    stdoutRl?.on("line", (line) => handleLine(line, process.stdout));
    stderrRl?.on("line", (line) => handleLine(line, process.stderr));

    child.on("close", (code) => {
      stdoutRl?.close();
      stderrRl?.close();
      if (code === 0) resolve();
      else reject(new Error(`Adzuna extractor exited with code ${code}`));
    });
    child.on("error", reject);
  });
}

/**
 * Runs the Adzuna extractor and returns the jobs it collected.
 *
 * The extractor is executed once per resolved location (or once without a
 * location when none resolve), sequentially. After each run the dataset file
 * is read and merged into the result, de-duplicating by `sourceJobId` and
 * falling back to `jobUrl`.
 *
 * @param options - Search configuration and optional progress callback.
 * @returns `{ success: true, jobs }` on success. Missing credentials, an
 *   unavailable launcher (neither npm nor tsx), or any error thrown while
 *   running the extractor or reading its dataset yields
 *   `{ success: false, jobs: [], error }`.
 */
export async function runAdzuna(
  options: RunAdzunaOptions = {},
): Promise<AdzunaResult> {
  const appId = process.env.ADZUNA_APP_ID?.trim();
  const appKey = process.env.ADZUNA_APP_KEY?.trim();
  if (!appId || !appKey) {
    return {
      success: false,
      jobs: [],
      error: "Missing Adzuna credentials (ADZUNA_APP_ID / ADZUNA_APP_KEY)",
    };
  }

  // Trim before applying the default so a whitespace-only country also falls
  // back to "gb" instead of being forwarded as an empty string.
  const country = options.country?.trim().toLowerCase() || "gb";
  const countryKey = normalizeCountryKey(options.countryKey ?? "");
  const maxJobsPerTerm = options.maxJobsPerTerm ?? 50;
  const searchTerms =
    options.searchTerms && options.searchTerms.length > 0
      ? options.searchTerms
      : ["web developer"];
  const locations = resolveSearchCities({
    list: options.locations,
    env: process.env.ADZUNA_LOCATION_QUERY,
  });
  // A single `null` entry means "run once without any location".
  const runLocations = locations.length > 0 ? locations : [null];
  const termTotal = searchTerms.length * runLocations.length;

  const useNpmCommand = canRunNpmCommand();
  if (!useNpmCommand && !TSX_CLI_PATH) {
    return {
      success: false,
      jobs: [],
      error: "Unable to execute Adzuna extractor (npm/tsx unavailable)",
    };
  }

  try {
    const jobs: CreateJobInput[] = [];
    const seen = new Set<string>();

    for (let runIndex = 0; runIndex < runLocations.length; runIndex += 1) {
      const location = runLocations[runIndex];
      const strictLocationFilter =
        location !== null && shouldApplyStrictCityFilter(location, countryKey);

      const extractorEnv: NodeJS.ProcessEnv = {
        ...process.env,
        JOBOPS_EMIT_PROGRESS: "1",
        ADZUNA_APP_ID: appId,
        ADZUNA_APP_KEY: appKey,
        ADZUNA_COUNTRY: country,
        ADZUNA_MAX_JOBS_PER_TERM: String(maxJobsPerTerm),
        ADZUNA_SEARCH_TERMS: JSON.stringify(searchTerms),
        ADZUNA_OUTPUT_JSON: DATASET_PATH,
        // The location is forwarded only when the strict city filter applies;
        // otherwise the extractor receives an empty location query.
        ADZUNA_LOCATION_QUERY: strictLocationFilter ? location : "",
      };

      // Each run reports term indexes relative to its own term list; shift
      // them so callers see one continuous sequence across all locations.
      const termOffset = runIndex * searchTerms.length;
      await runAdzunaExtractorOnce(useNpmCommand, extractorEnv, (event) => {
        options.onProgress?.({
          ...event,
          termIndex: termOffset + event.termIndex,
          termTotal,
        });
      });

      // Every run writes to the same dataset path, so read it before the
      // next iteration starts.
      for (const job of await readDataset()) {
        const key = job.sourceJobId || job.jobUrl;
        if (seen.has(key)) continue;
        seen.add(key);
        jobs.push(job);
      }
    }

    return { success: true, jobs };
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    return { success: false, jobs: [], error: message };
  }
}