Jobber/extractors/icims/manifest.ts
ilia c840f289e1
Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m8s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m11s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m8s
CI / Type Check (orchestrator) (push) Successful in 1m23s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m6s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m7s
CI / Documentation (push) Successful in 1m54s
feat(extractors): expand catalog, smoke coverage, and sourcing docs
Adds Arc.dev, BC T-Net, Eluta, iCIMS tenants, QAJobsBoard, and SmartRecruiters
manifests with registry/settings/UI wiring; registers full extractor list in
smoke-extractors and documents supplementary board access paths. Aligns Careerjet
v4 with the url query parameter and fixes strict typing in QAJobsBoard.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-15 22:36:23 -04:00

234 lines
6.5 KiB
TypeScript

/**
* iCIMS tenant portal — anonymous HTML search (`/jobs/search`) pattern.
*
* Many tenants expose listings suitable for HTML extraction when loaded with
* `ss=1` + `in_iframe=1`. Job links typically follow `/jobs/{id}/{slug}/job`.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/** One job listing scraped from an iCIMS search results page. */
interface ParsedJobRow {
/** Job detail URL with its query string removed (see `canonicalJobUrl`). */
url: string;
/** Listing title: taken from the anchor's `title` attribute, or derived from the URL slug when that attribute is absent. */
title: string;
}
/**
 * Turns the raw tenant setting into a list of non-empty, trimmed entries.
 *
 * Two input shapes are understood:
 * - a JSON array, from which only string entries are kept;
 * - any other text, which is split on commas and/or newlines.
 *
 * @param raw - Setting value as stored; may be missing or empty.
 * @returns The entries in their original order (possibly empty).
 */
function parseHosts(raw: string | undefined): string[] {
  if (!raw) return [];

  // Try the JSON form first; anything that is not valid JSON is treated as
  // a delimiter-separated list below.
  let decoded: unknown;
  try {
    decoded = JSON.parse(raw);
  } catch {
    decoded = undefined;
  }

  const collected: string[] = [];

  if (Array.isArray(decoded)) {
    for (const item of decoded) {
      if (typeof item !== "string") continue;
      const cleaned = item.trim();
      if (cleaned) collected.push(cleaned);
    }
    return collected;
  }

  for (const piece of raw.split(/[\n,]+/)) {
    const cleaned = piece.trim();
    if (cleaned) collected.push(cleaned);
  }
  return collected;
}
/**
 * Reduces a configured tenant entry to a bare `host[:port]`.
 *
 * Accepts a full URL (`https://careers-acme.icims.com/jobs`), a
 * protocol-relative reference (`//careers-acme.icims.com`), or a plain host
 * that may carry a path, query or trailing slash
 * (`careers-acme.icims.com/jobs/search?ss=1`). Anything after the host is
 * dropped so the result can be interpolated into `https://${host}/...`.
 *
 * @param hostOrUrl - Raw entry from the tenant setting.
 * @returns The host part, or `""` for blank input. If the entry cannot be
 *   parsed as a URL it is returned trimmed, with any leading `//` removed.
 */
function normalizeHost(hostOrUrl: string): string {
  const trimmed = hostOrUrl.trim();
  if (!trimmed) return "";

  const withoutLeadingSlashes = trimmed.replace(/^\/\//, "");
  // Only a scheme at the very start counts; "://" inside a query string of a
  // scheme-less entry must not be mistaken for one.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  const candidate = hasScheme ? trimmed : `https://${withoutLeadingSlashes}`;

  try {
    // Parsing strips path, query, fragment and credentials in one step.
    return new URL(candidate).host;
  } catch {
    return withoutLeadingSlashes;
  }
}
/**
 * Produces the de-duplication key for a job link by dropping its query
 * string. Any fragment is left in place.
 *
 * @param url - Job link as found in the page.
 * @returns The serialized URL without its query; when `url` is not an
 *   absolute URL, the input with the first `?...` run (up to a `#`) removed.
 */
function canonicalJobUrl(url: string): string {
  let absolute: URL;
  try {
    absolute = new URL(url);
  } catch {
    // Not parseable as an absolute URL: strip the query textually instead.
    return url.replace(/\?[^#]*/, "");
  }
  absolute.search = "";
  return absolute.href;
}
/**
 * Decodes the HTML character references most likely to appear in an
 * attribute value: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;` and numeric
 * references (`&#39;`, `&#x27;`). Unknown or out-of-range references are
 * left untouched.
 */
function decodeHtmlEntities(text: string): string {
  const named: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
  };
  return text.replace(
    /&(#x[0-9a-f]+|#\d+|[a-z]+);/gi,
    (whole: string, body: string): string => {
      if (body.startsWith("#")) {
        const isHex = body.charAt(1).toLowerCase() === "x";
        const code = Number.parseInt(
          body.slice(isHex ? 2 : 1),
          isHex ? 16 : 10,
        );
        // String.fromCodePoint throws a RangeError outside this range.
        if (!Number.isInteger(code) || code <= 0 || code > 0x10ffff) {
          return whole;
        }
        return String.fromCodePoint(code);
      }
      return named[body.toLowerCase()] ?? whole;
    },
  );
}

/**
 * Converts a URL slug into a display title: `+` becomes a space and percent
 * escapes are decoded. A slug with a malformed escape is returned undecoded
 * instead of throwing.
 */
function titleFromSlug(slug: string): string {
  const spaced = slug.replace(/\+/g, " ");
  try {
    return decodeURIComponent(spaced);
  } catch {
    // decodeURIComponent throws URIError on sequences such as "%E0%A4%A".
    return spaced;
  }
}

/**
 * Scrapes job links out of an iCIMS search results page.
 *
 * Two passes run over the markup:
 * 1. anchors whose `href` looks like `https://.../jobs/{id}/{slug}/job` and
 *    that also carry a `title="{id} - {title}"` attribute — the title text is
 *    taken from that attribute (with HTML character references decoded);
 * 2. any remaining anchors with such an `href` — the title is derived from
 *    the URL slug.
 *
 * URLs are canonicalized (query string removed) and each URL is reported once,
 * in first-seen order with pass-1 results first.
 *
 * @param html - Raw HTML of a search results page.
 * @returns The distinct job rows found; empty when the page has none.
 */
function extractRows(html: string): ParsedJobRow[] {
  const out: ParsedJobRow[] = [];
  const seen = new Set<string>();

  const primary =
    /<a[^>]*href="(https:\/\/[^"]+\/jobs\/\d+\/[^"]+\/job)(?:\?[^"]*)?"[^>]*title="\d+\s*-\s*([^"]+)"/gi;
  for (;;) {
    const match = primary.exec(html);
    if (match === null) break;
    const url = canonicalJobUrl(match[1] ?? "");
    // Attribute values are entity-encoded in HTML ("R&amp;D"); decode before use.
    const title = decodeHtmlEntities(match[2] ?? "").trim();
    if (!url || !title || seen.has(url)) continue;
    seen.add(url);
    out.push({ url, title });
  }

  const fallback =
    /<a[^>]*href="(https:\/\/[^"]+\/jobs\/\d+\/([^"/]+)\/job)(?:\?[^"]*)?"[^>]*>/gi;
  for (;;) {
    const match = fallback.exec(html);
    if (match === null) break;
    const url = canonicalJobUrl(match[1] ?? "");
    const slug = match[2];
    if (!url || seen.has(url)) continue;
    seen.add(url);
    const title = slug ? titleFromSlug(slug) : "Unknown Title";
    out.push({ url, title });
  }

  return out;
}
/**
 * Case-insensitive substring test of a search term against a row's title.
 *
 * @param row - Parsed listing to test.
 * @param term - Search term; an empty term matches every row.
 * @returns `true` when the lower-cased title contains the lower-cased term.
 */
function matchesTerm(row: ParsedJobRow, term: string): boolean {
  const needle = term.toLowerCase();
  const haystack = row.title.toLowerCase();
  return haystack.indexOf(needle) !== -1;
}
/**
 * Derives a human-readable employer label from a tenant host.
 *
 * Strips an optional `:port`, a leading `careers-` / `careers.` prefix and a
 * trailing `.icims.com` (all case-insensitively — host names are not
 * case-sensitive), then turns the remaining `-`, `_` and `.` separators into
 * spaces.
 *
 * @param host - Tenant host, e.g. `careers-acme.icims.com`.
 * @returns The derived label, or `host` unchanged when nothing is left.
 */
function employerFromHost(host: string): string {
  const base = host
    .replace(/:\d+$/, "")
    .replace(/^careers-/i, "")
    .replace(/^careers\./i, "")
    .replace(/\.icims\.com$/i, "");
  return base.replace(/[-_.]/g, " ").trim() || host;
}
/**
 * Parses an integer setting and clamps it to `[min, max]`.
 * Returns `fallback` when the setting is missing, empty or not a number.
 */
function parseBoundedInt(
  raw: string | null | undefined,
  fallback: number,
  min: number,
  max: number,
): number {
  const value = raw ? Number.parseInt(raw, 10) : fallback;
  return Number.isFinite(value)
    ? Math.min(Math.max(value, min), max)
    : fallback;
}

/**
 * Extractor for iCIMS tenant career portals.
 *
 * For every configured tenant host and every search term, `run` pages through
 * `https://{host}/jobs/search`, scrapes job links from the returned HTML and
 * emits one job per distinct URL.
 *
 * Settings read from `context.settings`:
 * - `icimsTenants` — tenant hosts/URLs (JSON array or comma/newline list); required.
 * - `icimsMaxPagesPerSearch` — pages per tenant+term, clamped to 1..50, default 10.
 * - `icimsMaxJobsPerTenant` — jobs per tenant, clamped to 1..2000, default 250.
 *
 * Result contract:
 * - no tenants configured → `success: false` with an error message;
 * - any non-OK HTTP response or thrown error → `success: false`, carrying the
 *   jobs collected so far and the error message;
 * - cancellation → `success: true` with the jobs collected so far.
 */
export const manifest: ExtractorManifest = {
  id: "icims",
  displayName: "iCIMS tenants (HTML)",
  providesSources: ["icims"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // Normalize once and drop duplicates so the same tenant is not scraped twice.
    const hosts = Array.from(
      new Set(
        parseHosts(context.settings.icimsTenants)
          .map(normalizeHost)
          .filter(Boolean),
      ),
    );
    if (hosts.length === 0) {
      return {
        success: false,
        jobs: [],
        error: "No icimsTenants configured",
      };
    }

    const pages = parseBoundedInt(
      context.settings.icimsMaxPagesPerSearch,
      10,
      1,
      50,
    );
    const tenantCap = parseBoundedInt(
      context.settings.icimsMaxJobsPerTenant,
      250,
      1,
      2000,
    );
    // With no search terms, run a single unfiltered search per tenant.
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];
    const jobs: CreateJobInput[] = [];
    // Job URLs already emitted, across all tenants and terms.
    const seenGlobal = new Set<string>();

    try {
      let tenantIndex = 0;
      tenants: for (const host of hosts) {
        if (context.shouldCancel?.()) break;
        tenantIndex += 1;
        let tenantCount = 0;
        context.onProgress?.({
          phase: "list",
          termsProcessed: tenantIndex - 1,
          termsTotal: hosts.length,
          currentUrl: host,
          detail: `iCIMS tenant ${tenantIndex}/${hosts.length}: ${host}`,
        });

        for (const term of terms) {
          if (tenantCount >= tenantCap) break;

          // NOTE(review): paging starts at pr=1 — confirm tenants do not treat
          // `pr` as zero-based, in which case the first page would be skipped.
          for (let page = 1; page <= pages; page += 1) {
            if (tenantCount >= tenantCap) break;
            // Honor cancellation between requests, not only between tenants.
            if (context.shouldCancel?.()) break tenants;

            const query = new URLSearchParams({
              ss: "1",
              in_iframe: "1",
              searchKeyword: term,
              pr: String(page),
            });
            const searchUrl = `https://${host}/jobs/search?${query.toString()}`;
            const response = await fetch(searchUrl, {
              headers: {
                Accept: "text/html",
                "User-Agent":
                  "Mozilla/5.0 (compatible; JobOps/1.0) iCIMS portal reader",
              },
            });
            if (!response.ok) {
              throw new Error(
                `iCIMS fetch failed (${host}): ${response.status}`,
              );
            }

            const html = await response.text();
            const parsed = extractRows(html);
            // Stop paging only when the page itself has no listings. A page
            // whose listings merely fail the title filter must not end the
            // search, because later pages may still contain matches.
            if (parsed.length === 0) break;
            const rows = term
              ? parsed.filter((row) => matchesTerm(row, term))
              : parsed;

            for (const row of rows) {
              if (tenantCount >= tenantCap) break;
              if (seenGlobal.has(row.url)) continue;
              seenGlobal.add(row.url);
              tenantCount += 1;
              jobs.push({
                source: "icims",
                sourceJobId: row.url,
                title: row.title,
                employer: employerFromHost(host),
                jobUrl: row.url,
                applicationLink: row.url,
              });
            }

            context.onProgress?.({
              phase: "list",
              termsProcessed: tenantIndex - 1,
              termsTotal: hosts.length,
              currentUrl: host,
              jobPagesProcessed: jobs.length,
              detail: `iCIMS ${host}: page ${page}, +${rows.length} rows`,
            });
          }
        }
      }
      return { success: true, jobs };
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      // Keep whatever was collected before the failure.
      return { success: false, jobs, error: message };
    }
  },
};
export default manifest;