ilia c840f289e1
Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m8s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m11s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m8s
CI / Type Check (orchestrator) (push) Successful in 1m23s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m6s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m7s
CI / Documentation (push) Successful in 1m54s
feat(extractors): expand catalog, smoke coverage, and sourcing docs
Adds Arc.dev, BC T-Net, Eluta, iCIMS tenants, QAJobsBoard, and SmartRecruiters
manifests with registry/settings/UI wiring; registers full extractor list in
smoke-extractors and documents supplementary board access paths. Aligns Careerjet
v4 with the url query parameter and fixes strict typing in QAJobsBoard.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-15 22:36:23 -04:00

195 lines
5.5 KiB
TypeScript

/**
* BC T-Net — public RSS aggregate of BC tech jobs.
*
* Default: https://www.bctechnology.com/rss/jobs/tnetjobs.xml
*
* Feeds may embed `<![CDATA[&]]>` inside `<link>` URLs — normalized before fetch.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/**
 * One `<item>` element of the RSS feed, reduced to the child elements this
 * extractor reads. Every field is optional because a missing or empty element
 * is represented as `undefined`.
 */
interface BcItem {
  /** Text of `<title>`; used as the job title. */
  title?: string;
  /** Text of `<description>`; used as the job description. */
  description?: string;
  /** Text of `<category>`; used as the employer name when mapping a job. */
  category?: string;
  /** Text of `<link>` after link normalization; used as the job URL. */
  link?: string;
  /** Text of `<guid>`; preferred as the source job id when present. */
  guid?: string;
  /** Text of `<pubDate>`, passed through unparsed as the posting date. */
  pubDate?: string;
}
/**
 * Extract the text content of the first `<tag>…</tag>` element found in an
 * XML fragment, using a regular expression rather than a full XML parser.
 *
 * CDATA wrappers inside the content are removed (their payload is kept) and
 * the result is trimmed.
 *
 * @param xml - XML fragment to search.
 * @param tag - Element name to look for; treated literally, not as a pattern.
 * @returns The trimmed text content, or `undefined` when the element is
 *   absent or its content is empty after trimming.
 */
function xmlText(xml: string, tag: string): string | undefined {
  // Escape regex metacharacters so the tag name is always matched literally.
  const name = tag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // `(?:\s[^>]*)?` allows attributes on the opening tag but requires
  // whitespace right after the name, so looking up `link` cannot start the
  // match at a longer element name such as `<linkage>`.
  const pattern = new RegExp(`<${name}(?:\\s[^>]*)?>([\\s\\S]*?)</${name}>`);
  const match = xml.match(pattern);
  if (!match?.[1]) return undefined;
  return (
    match[1].replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1").trim() || undefined
  );
}
/**
 * Clean up a URL taken from a feed `<link>` element so it can be fetched.
 *
 * Ampersands may arrive either wrapped in their own CDATA section
 * (`<![CDATA[&]]>`) or XML-escaped (`&amp;`, `&#38;`, `&#x26;`); every form is
 * rewritten to a plain `&`. All forms are handled in a single pass so an
 * already-decoded ampersand is never decoded a second time.
 *
 * @param raw - Link text as found in the feed.
 * @returns The link with ampersands normalized and surrounding whitespace removed.
 */
function normalizeFeedLink(raw: string): string {
  return raw
    .replace(/<!\[CDATA\[&\]\]>|&(?:amp|#38|#[xX]26);/g, "&")
    .trim();
}
/**
 * Split a feed document into its `<item>` elements and pull out the child
 * elements this extractor uses.
 *
 * Items are located with a regular expression rather than a full XML parser.
 * The opening tag may carry attributes (for example an RSS 1.0
 * `<item rdf:about="…">`), matching the attribute tolerance of the per-field
 * lookup.
 *
 * @param xml - Raw feed document text.
 * @returns One entry per `<item>` element, in document order. Fields whose
 *   element is missing or empty are `undefined`.
 */
function parseItems(xml: string): BcItem[] {
  const items: BcItem[] = [];
  // Capture group 1 is the item's inner XML, so the wrapper tags never need
  // to be stripped by hand.
  for (const match of xml.matchAll(/<item(?:\s[^>]*)?>([\s\S]*?)<\/item>/g)) {
    const block = match[1] ?? "";
    const linkRaw = xmlText(block, "link");
    items.push({
      title: xmlText(block, "title"),
      link: linkRaw ? normalizeFeedLink(linkRaw) : undefined,
      guid: xmlText(block, "guid"),
      description: xmlText(block, "description"),
      pubDate: xmlText(block, "pubDate"),
      category: xmlText(block, "category"),
    });
  }
  return items;
}
/**
 * Parse the feed-URL setting into a list of non-empty, trimmed strings.
 *
 * Two formats are accepted:
 * - a JSON array, from which only string entries are kept;
 * - plain text with entries separated by newlines and/or `|` characters.
 *
 * Anything that is not a JSON array (invalid JSON, or valid JSON of another
 * shape) is handled as plain text.
 *
 * @param raw - Setting value, possibly unset.
 * @returns The URLs in their original order; empty when nothing usable is configured.
 */
function readUrls(raw: string | undefined): string[] {
  if (!raw) return [];

  let decoded: unknown;
  try {
    decoded = JSON.parse(raw);
  } catch {
    // Not JSON: treated as delimiter-separated text below.
    decoded = undefined;
  }

  const candidates: unknown[] = Array.isArray(decoded)
    ? decoded
    : raw.split(/[\n|]+/);

  const urls: string[] = [];
  for (const candidate of candidates) {
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed !== "") urls.push(trimmed);
  }
  return urls;
}
/** Named character references recognised by {@link decodeHtmlEntities}. */
const HTML_NAMED_ENTITIES: Readonly<Record<string, string>> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/**
 * Decode the common HTML/XML character references in a string.
 *
 * Handles the five predefined XML entities plus decimal (`&#38;`) and
 * hexadecimal (`&#x26;`) numeric references. Decoding happens in one pass, so
 * the output of one replacement is never decoded again: `&amp;lt;` becomes
 * the literal text `&lt;`, not `<`.
 *
 * @param html - Text that may contain character references.
 * @returns The text with recognised references replaced. Unknown names and
 *   numeric references outside the valid Unicode range are left unchanged.
 */
function decodeHtmlEntities(html: string): string {
  return html.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
    (entity: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) return HTML_NAMED_ENTITIES[name] ?? entity;
      const codePoint =
        dec !== undefined
          ? Number.parseInt(dec, 10)
          : Number.parseInt(hex ?? "", 16);
      // String.fromCodePoint throws on out-of-range input, and lone
      // surrogates would yield malformed text, so both are left undecoded.
      const isScalar =
        codePoint > 0 &&
        codePoint <= 0x10ffff &&
        !(codePoint >= 0xd800 && codePoint <= 0xdfff);
      return isScalar ? String.fromCodePoint(codePoint) : entity;
    },
  );
}
/**
 * Case-insensitive substring check of a search term against an item.
 *
 * The title, description and category are joined with single spaces (missing
 * fields count as empty strings) and the lower-cased term is searched for in
 * the lower-cased result.
 *
 * @param item - Feed item to test.
 * @param term - Search term; an empty term matches every item.
 * @returns `true` when the term occurs in the combined text.
 */
function matchesTerm(item: BcItem, term: string): boolean {
  const needle = term.toLowerCase();
  const fields = [item.title, item.description, item.category];
  const haystack = fields
    .map((field) => field ?? "")
    .join(" ")
    .toLowerCase();
  return haystack.includes(needle);
}
/**
 * Convert a parsed feed item into the job-creation payload.
 *
 * Title, employer and description are all entity-decoded, since they come
 * from the same feed text. The feed link serves as both the job URL and the
 * application link, and the location is fixed to British Columbia.
 *
 * @param item - Parsed feed item.
 * @returns The job payload, or `null` when the item has no usable link.
 */
function mapJob(item: BcItem): CreateJobInput | null {
  const jobUrl = item.link?.trim();
  if (!jobUrl) return null;
  const title = item.title ? decodeHtmlEntities(item.title) : "Unknown Title";
  // NOTE(review): assumes the feed's <category> carries the employer name — confirm.
  const employer =
    (item.category ? decodeHtmlEntities(item.category).trim() : "") ||
    "Unknown Employer";
  return {
    source: "bctenet",
    // Prefer the feed's guid; fall back to the URL so every job has an id.
    sourceJobId: item.guid ?? jobUrl,
    title,
    employer,
    jobUrl,
    applicationLink: jobUrl,
    location: "British Columbia, Canada",
    datePosted: item.pubDate,
    jobDescription: item.description
      ? decodeHtmlEntities(item.description)
      : undefined,
  };
}
/**
 * Extractor manifest for the BC T-Net RSS job feed.
 *
 * `run` fetches each configured feed URL in order (falling back to the
 * default T-Net feed when none is configured), filters items by the search
 * terms, de-duplicates them by source job id, and returns the mapped jobs.
 *
 * Settings read:
 * - `bctenetRssUrls`: feed URLs as a JSON array or newline/`|` separated text.
 * - `bctenetMaxJobsPerTerm`: per-term cap, clamped to 1–2000; defaults to 400
 *   when unset or not a number.
 *
 * The overall limit is the cap multiplied by the number of search terms (at
 * least one). When there are no search terms every item is kept. A failed
 * fetch ends the run with `success: false` and whatever jobs were collected
 * up to that point.
 */
export const manifest: ExtractorManifest = {
  id: "bctenet",
  displayName: "BC T-Net (RSS)",
  providesSources: ["bctenet"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const defaults = ["https://www.bctechnology.com/rss/jobs/tnetjobs.xml"];
    const configured = readUrls(context.settings.bctenetRssUrls);
    const urls = configured.length > 0 ? configured : defaults;

    const maxJobs = context.settings.bctenetMaxJobsPerTerm
      ? Number.parseInt(context.settings.bctenetMaxJobsPerTerm, 10)
      : 400;
    // parseInt yields NaN for non-numeric settings; fall back to the default then.
    const cap = Number.isFinite(maxJobs)
      ? Math.min(Math.max(maxJobs, 1), 2000)
      : 400;

    const terms = context.searchTerms;
    const maxTotal = cap * Math.max(terms.length, 1);
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (const [i, rssUrl] of urls.entries()) {
        if (context.shouldCancel?.()) break;
        // Once the overall limit is reached no later feed can add a job, so
        // stop before spending another network round-trip.
        if (out.length >= maxTotal) break;

        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: urls.length,
          currentUrl: rssUrl,
          detail: `BC T-Net: fetching (${i + 1}/${urls.length})`,
        });

        const response = await fetch(rssUrl, {
          headers: {
            Accept: "application/rss+xml, application/xml, text/xml",
            "User-Agent":
              "Mozilla/5.0 (compatible; JobOps/1.0) BC T-Net RSS consumer",
          },
        });
        if (!response.ok) {
          throw new Error(`BC T-Net RSS failed: ${response.status}`);
        }

        const xml = await response.text();
        const items = parseItems(xml);
        for (const item of items) {
          if (out.length >= maxTotal) break;
          // With no search terms every item passes; otherwise any one term suffices.
          if (terms.length > 0 && !terms.some((t) => matchesTerm(item, t))) {
            continue;
          }
          const mapped = mapJob(item);
          if (!mapped) continue;
          const key = mapped.sourceJobId || mapped.jobUrl;
          if (seen.has(key)) continue;
          seen.add(key);
          out.push(mapped);
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: urls.length,
          currentUrl: rssUrl,
          jobPagesProcessed: out.length,
          detail: `BC T-Net: ${items.length} items (${out.length} kept total)`,
        });
      }
      return { success: true, jobs: out };
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      // Report the failure but keep the jobs gathered before it.
      return { success: false, jobs: out, error: message };
    }
  },
};
export default manifest;