Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m8s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m11s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m8s
CI / Type Check (orchestrator) (push) Successful in 1m23s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m6s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m7s
CI / Documentation (push) Successful in 1m54s
Adds Arc.dev, BC T-Net, Eluta, iCIMS tenants, QAJobsBoard, and SmartRecruiters manifests with registry/settings/UI wiring; registers full extractor list in smoke-extractors and documents supplementary board access paths. Aligns Careerjet v4 with the url query parameter and fixes strict typing in QAJobsBoard. Co-authored-by: Cursor <cursoragent@cursor.com>
234 lines
6.5 KiB
TypeScript
234 lines
6.5 KiB
TypeScript
/**
 * iCIMS tenant portal — anonymous HTML search (`/jobs/search`) pattern.
 *
 * Many tenants expose listings suitable for HTML extraction when loaded with
 * `ss=1` + `in_iframe=1`. Job links typically follow `/jobs/{id}/{slug}/job`.
 */

import type {
  ExtractorManifest,
  ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";

/** A single job listing scraped from an iCIMS search results page. */
interface ParsedJobRow {
  /** Absolute job URL with its query string removed. */
  url: string;
  /** Job title, taken from the anchor `title` attribute or derived from the URL slug. */
  title: string;
}
/**
 * Splits a raw tenant-list setting into individual, trimmed, non-empty entries.
 *
 * Accepted formats:
 * - a JSON array (non-string elements are dropped),
 * - a JSON-encoded string whose content is a newline/comma separated list,
 * - plain newline/comma separated text.
 *
 * @param raw - Raw setting value; `undefined` or empty yields `[]`.
 * @returns The entries in their original order (no de-duplication).
 */
function parseHosts(raw: string | undefined): string[] {
  if (!raw) return [];

  let text = raw;
  try {
    const parsed: unknown = JSON.parse(raw);
    if (Array.isArray(parsed)) {
      return parsed
        .map((entry: unknown) =>
          typeof entry === "string" ? entry.trim() : "",
        )
        .filter(Boolean);
    }
    // A JSON-encoded string: split its decoded content rather than the raw
    // text, otherwise the surrounding quotes end up inside the host names.
    if (typeof parsed === "string") text = parsed;
  } catch {
    // Not JSON — treat the raw value as a delimited list.
  }

  return text
    .split(/[\n,]+/)
    .map((entry) => entry.trim())
    .filter(Boolean);
}
/**
 * Reduces a tenant entry to the bare `host[:port]` used to build search URLs.
 *
 * Entries may be full URLs (`https://host/path`), protocol-relative
 * (`//host`), or bare hosts that optionally carry a path, query or trailing
 * slash. Everything except the host (and a non-default port) is discarded.
 *
 * @param hostOrUrl - Tenant entry as configured by the user.
 * @returns The host, or the entry with a leading `//` removed when it cannot
 *   be parsed as a URL. Blank input yields `""`.
 */
function normalizeHost(hostOrUrl: string): string {
  const trimmed = hostOrUrl.trim();
  if (!trimmed) return "";

  const bare = trimmed.replace(/^\/\//, "");
  // Scheme-less entries are parsed as https so that a stray path, query or
  // trailing slash is dropped instead of leaking into the search URL.
  const candidate = trimmed.includes("://") ? trimmed : `https://${bare}`;
  try {
    return new URL(candidate).host;
  } catch {
    return bare;
  }
}
/**
 * Returns `url` with its query string removed so that the same posting
 * reached through different search parameters compares equal.
 *
 * Any `#fragment` is preserved. Input that is not an absolute URL is handled
 * textually instead of throwing.
 *
 * @param url - Job link as found in the page.
 * @returns The URL without its query component.
 */
function canonicalJobUrl(url: string): string {
  try {
    const parsed = new URL(url);
    parsed.search = "";
    return parsed.toString();
  } catch {
    // Not parseable as an absolute URL: drop everything from `?` up to (but
    // not including) a `#`.
    return url.replace(/\?[^#]*/, "");
  }
}
/**
 * Derives a display title from a job URL slug.
 *
 * `+` is treated as a space and percent-escapes are decoded. A slug with a
 * malformed escape (for example `100%-remote`) is returned undecoded rather
 * than letting `decodeURIComponent` throw.
 */
function slugToTitle(slug: string | undefined): string {
  if (!slug) return "Unknown Title";
  const spaced = slug.replace(/\+/g, " ");
  try {
    return decodeURIComponent(spaced);
  } catch {
    return spaced;
  }
}

/**
 * Extracts job rows from an iCIMS search results HTML page.
 *
 * Two passes run over the markup:
 * 1. anchors whose `title` attribute looks like `"<digits> - <title>"`; the
 *    part after the dash becomes the title;
 * 2. any remaining anchors pointing at `/jobs/{id}/{slug}/job`, titled from
 *    the slug.
 *
 * URLs are canonicalised with `canonicalJobUrl` and each URL is emitted at
 * most once; a first-pass hit takes precedence over a second-pass hit.
 *
 * @param html - Raw HTML of a search results page.
 * @returns Rows in document order, first-pass rows before second-pass rows.
 */
function extractRows(html: string): ParsedJobRow[] {
  const out: ParsedJobRow[] = [];
  const seen = new Set<string>();

  const primary =
    /<a[^>]*href="(https:\/\/[^"]+\/jobs\/\d+\/[^"]+\/job)(?:\?[^"]*)?"[^>]*title="\d+\s*-\s*([^"]+)"/gi;
  for (const match of html.matchAll(primary)) {
    const href = match[1];
    const title = match[2]?.trim();
    if (!href || !title) continue;
    const url = canonicalJobUrl(href);
    if (!url || seen.has(url)) continue;
    seen.add(url);
    out.push({ url, title });
  }

  const fallback =
    /<a[^>]*href="(https:\/\/[^"]+\/jobs\/\d+\/([^"/]+)\/job)(?:\?[^"]*)?"[^>]*>/gi;
  for (const match of html.matchAll(fallback)) {
    const href = match[1];
    if (!href) continue;
    const url = canonicalJobUrl(href);
    if (!url || seen.has(url)) continue;
    seen.add(url);
    out.push({ url, title: slugToTitle(match[2]) });
  }

  return out;
}
/**
 * Reports whether a row's title contains `term`, ignoring case.
 *
 * The term is matched as one contiguous substring; an empty term matches
 * every row.
 *
 * @param row - Parsed listing to test.
 * @param term - Search term.
 */
function matchesTerm(row: ParsedJobRow, term: string): boolean {
  return row.title.toLowerCase().includes(term.toLowerCase());
}
/**
 * Guesses an employer name from an iCIMS tenant host.
 *
 * Removes a `:port` suffix, a leading `careers-` / `careers.` and a trailing
 * `.icims.com` (all case-insensitively), then turns `-`, `_` and `.` into
 * spaces. The original letter case of the remaining text is kept.
 *
 * @param host - Tenant host, e.g. `careers-acme.icims.com`.
 * @returns The derived name, or `host` unchanged when nothing is left.
 */
function employerFromHost(host: string): string {
  const base = host
    .replace(/:\d+$/, "")
    .replace(/^careers-/i, "")
    .replace(/^careers\./i, "")
    .replace(/\.icims\.com$/i, "");
  return base.replace(/[-_.]/g, " ").trim() || host;
}
/**
 * Reads an integer setting and clamps it to `[min, max]`.
 *
 * Returns `fallback` when the setting is unset, empty or not numeric.
 */
function boundedIntSetting(
  raw: string | undefined,
  fallback: number,
  min: number,
  max: number,
): number {
  const parsed = raw ? Number.parseInt(raw, 10) : fallback;
  return Number.isFinite(parsed)
    ? Math.min(Math.max(parsed, min), max)
    : fallback;
}

/**
 * Extractor for iCIMS tenant portals, scraped from their HTML search pages.
 *
 * Settings read from `context.settings`:
 * - `icimsTenants` — tenant hosts or URLs (JSON array or newline/comma list);
 *   the run fails when none are configured.
 * - `icimsMaxPagesPerSearch` — pages fetched per tenant and term
 *   (default 10, clamped to 1–50).
 * - `icimsMaxJobsPerTenant` — jobs kept per tenant
 *   (default 250, clamped to 1–2000).
 *
 * For every tenant and search term the search pages are fetched in order
 * until a page has no listings, repeats listings already seen for that
 * search, the page limit is hit, or the tenant cap is reached. When a term is
 * set, only rows whose title contains it are kept. Jobs are de-duplicated by
 * URL across all tenants and terms.
 *
 * A non-OK HTTP response or thrown error stops the run and returns
 * `success: false` together with the jobs collected so far. Cancellation
 * stops the run and returns the jobs collected so far with `success: true`.
 */
export const manifest: ExtractorManifest = {
  id: "icims",
  displayName: "iCIMS tenants (HTML)",
  providesSources: ["icims"],

  async run(context): Promise<ExtractorRunResult> {
    const isCancelled = (): boolean => Boolean(context.shouldCancel?.());
    if (isCancelled()) return { success: true, jobs: [] };

    const hosts = parseHosts(context.settings.icimsTenants)
      .map(normalizeHost)
      .filter(Boolean);

    if (hosts.length === 0) {
      return {
        success: false,
        jobs: [],
        error: "No icimsTenants configured",
      };
    }

    const pages = boundedIntSetting(
      context.settings.icimsMaxPagesPerSearch,
      10,
      1,
      50,
    );
    const tenantCap = boundedIntSetting(
      context.settings.icimsMaxJobsPerTenant,
      250,
      1,
      2000,
    );

    // With no search terms, run one unfiltered search per tenant.
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];

    const jobs: CreateJobInput[] = [];
    const seenGlobal = new Set<string>();

    try {
      let tenantIndex = 0;
      for (const host of hosts) {
        if (isCancelled()) break;

        tenantIndex += 1;
        let tenantCount = 0;
        const employer = employerFromHost(host);

        context.onProgress?.({
          phase: "list",
          termsProcessed: tenantIndex - 1,
          termsTotal: hosts.length,
          currentUrl: host,
          detail: `iCIMS tenant ${tenantIndex}/${hosts.length}: ${host}`,
        });

        for (const term of terms) {
          if (tenantCount >= tenantCap || isCancelled()) break;

          // URLs seen while paging this one search; used to detect a portal
          // that keeps serving the same page past the end of the results.
          const seenThisSearch = new Set<string>();

          for (let page = 1; page <= pages; page += 1) {
            if (tenantCount >= tenantCap || isCancelled()) break;

            const query = new URLSearchParams({
              ss: "1",
              in_iframe: "1",
              searchKeyword: term,
              pr: String(page),
            });

            const searchUrl = `https://${host}/jobs/search?${query.toString()}`;
            const response = await fetch(searchUrl, {
              headers: {
                Accept: "text/html",
                "User-Agent":
                  "Mozilla/5.0 (compatible; JobOps/1.0) iCIMS portal reader",
              },
            });

            if (!response.ok) {
              throw new Error(
                `iCIMS fetch failed (${host}): ${response.status}`,
              );
            }

            const html = await response.text();
            const pageRows = extractRows(html);

            // End of results is decided on the unfiltered page: a page whose
            // titles all miss the term must not hide matches on later pages.
            if (pageRows.length === 0) break;

            let unseenOnPage = 0;
            for (const row of pageRows) {
              if (seenThisSearch.has(row.url)) continue;
              seenThisSearch.add(row.url);
              unseenOnPage += 1;
            }
            if (unseenOnPage === 0) break;

            const rows = term
              ? pageRows.filter((row) => matchesTerm(row, term))
              : pageRows;

            for (const row of rows) {
              if (tenantCount >= tenantCap) break;
              if (seenGlobal.has(row.url)) continue;

              seenGlobal.add(row.url);
              tenantCount += 1;

              jobs.push({
                source: "icims",
                sourceJobId: row.url,
                title: row.title,
                employer,
                jobUrl: row.url,
                applicationLink: row.url,
              });
            }

            context.onProgress?.({
              phase: "list",
              termsProcessed: tenantIndex - 1,
              termsTotal: hosts.length,
              currentUrl: host,
              jobPagesProcessed: jobs.length,
              detail: `iCIMS ${host}: page ${page}, +${rows.length} rows`,
            });
          }
        }
      }

      return { success: true, jobs };
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs, error: message };
    }
  },
};

export default manifest;