Shaheer Sarfaraz 82e142a8a8
Auto-Registering Extractor System (#223)
* initial commit?

* Address PR feedback on extractor discovery and startup resilience

* Address latest PR review comments

* fix city resolution fallback when input parses empty

* address PR feedback on extractor registry and pipeline validation

* address copilot comments on manifests and registry startup

* fix extractor discovery export handling and env isolation in tests

* enforce duplicate manifest id failures in strict mode

* Fix remaining extractor registry and runtime review comments

* docs

* docs

* test all, logic remains in extractors

* Address PR review feedback on extractor registry and validation

* Revert extractor moduleResolution to bundler

* Enforce shared city filtering across all discovery sources

* Deduplicate extractor strict city post-filtering
2026-02-21 17:44:07 +00:00

105 lines
2.8 KiB
TypeScript

import type {
ExtractorManifest,
ExtractorProgressEvent,
ExtractorRuntimeContext,
} from "@shared/types/extractors";
import { runUkVisaJobs } from "./src/run";
/**
 * Translates a progress event emitted by the UKVisaJobs runner into an
 * `ExtractorProgressEvent`.
 *
 * Recognised `event.type` values are `"init"`, `"page_fetched"`,
 * `"term_complete"` and `"empty_page"`. Any other type produces a payload
 * that carries only a `detail` string built from `event.message`.
 *
 * Missing numeric fields (`pageNo`, `maxPages`, `totalCollected`) are reported
 * as `0`, and an empty `searchTerm` is displayed as "all jobs".
 *
 * @param event - Raw event from the runner.
 * @returns The progress payload for the given event.
 */
function toProgress(event: {
  type: string;
  termIndex: number;
  termTotal: number;
  searchTerm: string;
  pageNo?: number;
  maxPages?: number;
  totalCollected?: number;
  message?: string;
}): ExtractorProgressEvent {
  // Shared display fragments; each branch below picks the ones it needs.
  const termLabel = event.searchTerm || "all jobs";
  const termPosition = `${event.termIndex}/${event.termTotal}`;
  // While a term is in flight, only the terms before it count as processed.
  const termsBeforeCurrent = Math.max(event.termIndex - 1, 0);
  const pageNumber = event.pageNo ?? 0;
  const pageLimit = event.maxPages ?? 0;
  const collectedSoFar = event.totalCollected ?? 0;

  switch (event.type) {
    case "init":
      return {
        phase: "list",
        termsProcessed: termsBeforeCurrent,
        termsTotal: event.termTotal,
        listPagesProcessed: 0,
        listPagesTotal: pageLimit,
        currentUrl: termLabel,
        detail: `UKVisaJobs: term ${termPosition} (${termLabel})`,
      };

    case "page_fetched": {
      const pagePosition = `${pageNumber}/${pageLimit}`;
      return {
        phase: "list",
        termsProcessed: termsBeforeCurrent,
        termsTotal: event.termTotal,
        listPagesProcessed: pageNumber,
        listPagesTotal: pageLimit,
        // The collected total is reported as both enqueued and processed.
        jobPagesEnqueued: collectedSoFar,
        jobPagesProcessed: collectedSoFar,
        currentUrl: `page ${pagePosition}`,
        detail: `UKVisaJobs: term ${termPosition}, page ${pagePosition} (${collectedSoFar} collected)`,
      };
    }

    case "term_complete":
      return {
        phase: "list",
        termsProcessed: event.termIndex,
        termsTotal: event.termTotal,
        currentUrl: termLabel,
        detail: `UKVisaJobs: completed term ${termPosition} (${termLabel})`,
      };

    case "empty_page":
      return {
        detail: `UKVisaJobs: page ${pageNumber} returned no jobs`,
      };

    default:
      return {
        detail: `UKVisaJobs: ${event.message ?? "unknown event"}`,
      };
  }
}
/**
 * Manifest for the UK Visa Jobs extractor.
 *
 * Declares the extractor's id, display name, the source it provides and the
 * environment variables it lists as required, and exposes `run`, which
 * delegates the actual work to `runUkVisaJobs`.
 */
export const manifest: ExtractorManifest = {
  id: "ukvisajobs",
  displayName: "UK Visa Jobs",
  providesSources: ["ukvisajobs"],
  requiredEnvVars: ["UKVISAJOBS_EMAIL", "UKVISAJOBS_PASSWORD"],

  /**
   * Runs the extractor once.
   *
   * @param context - Runtime context; this method reads `settings`,
   *   `searchTerms`, and the optional `shouldCancel` / `onProgress` hooks.
   * @returns `{ success: true, jobs }` on success (an empty list when the run
   *   was cancelled before it started), or `{ success: false, jobs: [], error }`
   *   when the runner reports failure.
   */
  async run(context: ExtractorRuntimeContext) {
    // Nothing to do if cancellation was requested before the run began.
    if (context.shouldCancel?.()) {
      return { success: true, jobs: [] };
    }

    // The setting is stored as text. parseInt yields NaN for missing, empty or
    // non-numeric input; fall back to the default of 50 in all of those cases
    // so NaN is never handed to the runner.
    const parsedMaxJobs = Number.parseInt(
      context.settings.ukvisajobsMaxJobs ?? "",
      10,
    );
    const maxJobs = Number.isNaN(parsedMaxJobs) ? 50 : parsedMaxJobs;

    const result = await runUkVisaJobs({
      maxJobs,
      searchTerms: context.searchTerms,
      onProgress: (event) => {
        // Stop forwarding progress once cancellation has been requested.
        if (context.shouldCancel?.()) return;
        context.onProgress?.(toProgress(event));
      },
    });

    if (!result.success) {
      return {
        success: false,
        jobs: [],
        error: result.error,
      };
    }

    return {
      success: true,
      jobs: result.jobs,
    };
  },
};

export default manifest;