Jobber/extractors/arcdev/manifest.ts
ilia c840f289e1
Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m8s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m11s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m8s
CI / Type Check (orchestrator) (push) Successful in 1m23s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m6s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m7s
CI / Documentation (push) Successful in 1m54s
feat(extractors): expand catalog, smoke coverage, and sourcing docs
Adds Arc.dev, BC T-Net, Eluta, iCIMS tenants, QAJobsBoard, and SmartRecruiters
manifests with registry/settings/UI wiring; registers full extractor list in
smoke-extractors and documents supplementary board access paths. Aligns Careerjet
v4 with the url query parameter and fixes strict typing in QAJobsBoard.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-15 22:36:23 -04:00

330 lines
9.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Arc.dev remote jobs — parse embedded Next.js __NEXT_DATA__ from SSR HTML.
*
* Listing URLs look like https://arc.dev/remote-jobs/playwright
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/** Site origin that listing paths and job-detail URLs in this file are appended to. */
const ORIGIN = "https://arc.dev";
/**
* Category entry attached to a job row.
* Both fields are searched when matching search terms; `name` is also used
* to build the comma-separated disciplines string of a mapped job.
*/
interface ArcCategory {
name?: string;
urlString?: string;
}
/**
* Company object nested inside a job row.
* Only `name` is read in this file (as the employer of external jobs).
*/
interface ArcCompanyJson {
randomKey?: string | null;
urlString?: string;
name?: string;
}
/**
* Loosely typed job row as read from the page's `__NEXT_DATA__` payload
* (`pageProps.arcJobs` / `pageProps.externalJobs`).
* Every field is optional because the payload is cast, not validated, at runtime.
*/
interface ArcJobJson {
// Combined with `urlString` as `${urlString}-${randomKey}` to build the job URL.
randomKey?: string;
title?: string;
jobType?: string;
jobRole?: string;
// URL slug of the job; a row without it (or without `randomKey`) is skipped.
urlString?: string;
// Multiplied by 1000 before being turned into a Date, i.e. treated as epoch seconds.
postedAt?: number;
company?: ArcCompanyJson;
categories?: ArcCategory[];
// When non-empty, joined with ", " and used as the job location.
requiredCountries?: string[];
minAnnualSalary?: number | null;
maxAnnualSalary?: number | null;
minHourlyRate?: number | null;
maxHourlyRate?: number | null;
// Location fallback when `requiredCountries` is missing or empty.
timeZone?: string | null;
positionType?: string;
experienceLevel?: string;
experienceLevels?: string[];
}
/**
 * Turns a raw path-list setting into an array of trimmed, non-empty strings.
 *
 * Two input shapes are accepted:
 * - a JSON array, from which only string entries are kept;
 * - any other text, which is split on newlines, commas, semicolons and pipes.
 *
 * @param raw Raw setting value; `undefined` or an empty string yields `[]`.
 * @returns The cleaned entries in their original order.
 */
function readPaths(raw: string | undefined): string[] {
  if (!raw) return [];

  // Try JSON first; anything that fails to parse is handled as delimited text.
  let decoded: unknown;
  try {
    decoded = JSON.parse(raw);
  } catch {
    decoded = undefined;
  }

  if (Array.isArray(decoded)) {
    const fromJson: string[] = [];
    for (const item of decoded) {
      if (typeof item !== "string") continue;
      const cleaned = item.trim();
      if (cleaned) fromJson.push(cleaned);
    }
    return fromJson;
  }

  const fromText: string[] = [];
  for (const piece of raw.split(/[\n,;|]+/)) {
    const cleaned = piece.trim();
    if (cleaned) fromText.push(cleaned);
  }
  return fromText;
}
/**
 * Resolves the listing paths to use when none are configured in settings.
 *
 * Reads `ARC_REMOTE_JOBS_PATHS` from the environment (when `process` exists)
 * and falls back to two built-in listing paths if it yields nothing.
 *
 * @returns A non-empty list of listing paths.
 */
function defaultArcPaths(): string[] {
  let envValue: string | undefined = "";
  if (typeof process !== "undefined") {
    envValue = process.env.ARC_REMOTE_JOBS_PATHS;
  }

  const fromEnv = readPaths(envValue);
  if (fromEnv.length === 0) {
    return ["/remote-jobs/playwright", "/remote-jobs/cypress"];
  }
  return fromEnv;
}
/**
 * Narrows an unknown value to a trimmed, non-empty string.
 *
 * @param value Any value taken from untyped input.
 * @returns The trimmed string, or `undefined` when the value is not a string
 *   or is blank after trimming.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const trimmed = value.trim();
    if (trimmed.length > 0) return trimmed;
  }
  return undefined;
}
/**
 * Builds one lower-cased search string from a job's categories.
 *
 * Each category contributes "<name> <urlString>" (missing parts become empty
 * strings) and the contributions are joined with single spaces.
 *
 * @param job Job row whose `categories` are inspected.
 * @returns The lower-cased haystack, or `""` when `categories` is not an array.
 */
function categoryHaystack(job: ArcJobJson): string {
  const { categories } = job;
  if (!Array.isArray(categories)) return "";

  const fragments: string[] = [];
  for (const category of categories) {
    fragments.push(`${category.name ?? ""} ${category.urlString ?? ""}`);
  }
  return fragments.join(" ").toLowerCase();
}
/**
 * Case-insensitive substring match of a search term against a job row.
 *
 * Fields are checked in this order: title, category names/slugs, job role,
 * position type, each entry of `experienceLevels`, then `experienceLevel`.
 *
 * @param job Job row to test.
 * @param term Search term; compared lower-cased.
 * @returns `true` as soon as any field contains the term, otherwise `false`.
 */
function matchesTerm(job: ArcJobJson, term: string): boolean {
  const needle = term.toLowerCase();
  // Missing fields simply do not match.
  const contains = (text: string | undefined): boolean =>
    text?.toLowerCase().includes(needle) ?? false;

  return (
    contains(job.title) ||
    categoryHaystack(job).includes(needle) ||
    contains(job.jobRole) ||
    contains(job.positionType) ||
    (Array.isArray(job.experienceLevels) &&
      job.experienceLevels.some((level) =>
        level.toLowerCase().includes(needle),
      )) ||
    contains(job.experienceLevel)
  );
}
/**
 * Formats the salary fields of a job row as a human-readable string.
 *
 * Annual salary is rendered as "USD <min>-<max> / yr", "USD <min>+ / yr"
 * (minimum only) or "USD up to <max> / yr" (maximum only). The hourly rate is
 * rendered as "$<min>-<max> / hr", with "?" standing in for a missing bound.
 * When both kinds are present they are joined with "; ".
 *
 * @param job Job row carrying the optional salary fields.
 * @returns The formatted salary, or `undefined` when no numeric salary field is set.
 */
function salaryParts(job: ArcJobJson): string | undefined {
  const bits: string[] = [];
  const minYear = job.minAnnualSalary;
  const maxYear = job.maxAnnualSalary;
  const minHour = job.minHourlyRate;
  const maxHour = job.maxHourlyRate;

  if (typeof minYear === "number" && typeof maxYear === "number") {
    // An explicit separator is required; without it the two bounds run
    // together into one meaningless number.
    bits.push(`USD ${minYear}-${maxYear} / yr`);
  } else if (typeof minYear === "number") {
    bits.push(`USD ${minYear}+ / yr`);
  } else if (typeof maxYear === "number") {
    // A maximum on its own is still useful information; do not drop it.
    bits.push(`USD up to ${maxYear} / yr`);
  }

  if (typeof minHour === "number" || typeof maxHour === "number") {
    const low = typeof minHour === "number" ? minHour : "?";
    const high = typeof maxHour === "number" ? maxHour : "?";
    bits.push(`$${low}-${high} / hr`);
  }

  return bits.length > 0 ? bits.join("; ") : undefined;
}
/**
 * Picks the location text for a job row.
 *
 * Preference order: the required countries joined with ", ", then the time
 * zone, then the literal "Remote".
 *
 * @param job Job row to describe.
 * @returns A non-empty location string.
 */
function locationLine(job: ArcJobJson): string {
  const countries = job.requiredCountries;
  if (Array.isArray(countries) && countries.length > 0) {
    return countries.join(", ");
  }
  return job.timeZone || "Remote";
}
/**
 * Converts a posting timestamp in epoch seconds to an ISO-8601 string.
 *
 * @param postedAt Timestamp in seconds since the Unix epoch.
 * @returns The ISO string, or `undefined` when the input is missing, not a
 *   finite number, or outside the range a `Date` can represent.
 */
function postedIso(postedAt: number | undefined): string | undefined {
  if (typeof postedAt !== "number" || !Number.isFinite(postedAt)) {
    return undefined;
  }
  const date = new Date(postedAt * 1000);
  // A finite but out-of-range value produces an invalid Date, and calling
  // toISOString() on it throws RangeError. Report "unknown" instead so one
  // bad row cannot abort the whole extraction.
  return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
}
/**
 * Extracts the job arrays from the `__NEXT_DATA__` script embedded in a page.
 *
 * @param html Full HTML of a listing page.
 * @returns The `arcJobs` and `externalJobs` arrays from `props.pageProps`
 *   (each defaulting to `[]` when absent or not an array), or `null` when the
 *   script tag is missing or empty, its JSON is invalid, or `pageProps` is absent.
 */
function parseNextPageProps(html: string): {
  arcJobs: ArcJobJson[];
  externalJobs: ArcJobJson[];
} | null {
  const found = /<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/.exec(
    html,
  );
  const jsonText = found?.[1];
  if (!jsonText) return null;

  let pageProps:
    | { arcJobs?: ArcJobJson[]; externalJobs?: ArcJobJson[] }
    | undefined;
  try {
    const root = JSON.parse(jsonText) as {
      props?: { pageProps?: unknown };
    };
    pageProps = root.props?.pageProps as typeof pageProps;
  } catch {
    return null;
  }
  if (!pageProps) return null;

  // The payload is untyped at runtime, so anything that is not an array is
  // treated as "no rows".
  const rowsOf = (candidate: unknown): ArcJobJson[] =>
    Array.isArray(candidate) ? candidate : [];

  return {
    arcJobs: rowsOf(pageProps.arcJobs),
    externalJobs: rowsOf(pageProps.externalJobs),
  };
}
/**
 * Maps an external job row (from `externalJobs`) to a `CreateJobInput`.
 *
 * The job URL is `<ORIGIN>/remote-jobs/j/<urlString>-<randomKey>` and doubles
 * as the application link. The employer comes from the row's company name.
 *
 * @param job Raw job row from the page payload.
 * @returns The mapped job, or `null` when `randomKey` or `urlString` is missing or blank.
 */
function mapExternalJob(job: ArcJobJson): CreateJobInput | null {
  const rk = asString(job.randomKey);
  const slug = asString(job.urlString);
  if (!rk || !slug) return null;

  const jobUrl = `${ORIGIN}/remote-jobs/j/${slug}-${rk}`;
  // asString already trims and rejects blank values.
  const employer = asString(job.company?.name) ?? "Unknown employer";
  const disciplines = Array.isArray(job.categories)
    ? job.categories
        .map((c) => c.name?.trim())
        .filter((v): v is string => Boolean(v))
        .join(", ")
    : undefined;

  // Joining an empty array yields "", which is not nullish, so a plain `??`
  // would never reach the single `experienceLevel` fallback. Build the list
  // defensively (the payload is untyped) and fall back when it is empty.
  const levels = Array.isArray(job.experienceLevels)
    ? job.experienceLevels
        .map((level) => asString(level))
        .filter((v): v is string => Boolean(v))
        .join(", ")
    : "";

  return {
    source: "arcdev",
    sourceJobId: slug,
    title: asString(job.title) ?? "Unknown Title",
    employer,
    jobUrl,
    applicationLink: jobUrl,
    location: locationLine(job),
    datePosted: postedIso(job.postedAt),
    jobType: asString(job.jobType),
    salary: salaryParts(job),
    disciplines,
    jobLevel: levels || asString(job.experienceLevel),
    isRemote: true,
  };
}
/**
 * Maps an Arc-managed job row (from `arcJobs`) to a `CreateJobInput`.
 *
 * The job URL is `<ORIGIN>/remote-jobs/details/<urlString>-<randomKey>`; the
 * same `<urlString>-<randomKey>` pair is the source job id. The employer is
 * always the fixed label "Arc talent network".
 *
 * @param job Raw job row from the page payload.
 * @returns The mapped job, or `null` when `randomKey` or `urlString` is missing or blank.
 */
function mapArcManagedJob(job: ArcJobJson): CreateJobInput | null {
  const key = asString(job.randomKey);
  const urlSlug = asString(job.urlString);
  if (key === undefined || urlSlug === undefined) return null;

  const reference = `${urlSlug}-${key}`;
  const detailsUrl = `${ORIGIN}/remote-jobs/details/${reference}`;

  // Comma-separated category names; stays undefined when there is no array.
  let disciplines: string | undefined;
  if (Array.isArray(job.categories)) {
    const names: string[] = [];
    for (const category of job.categories) {
      const label = category.name?.trim();
      if (label) names.push(label);
    }
    disciplines = names.join(", ");
  }

  return {
    source: "arcdev",
    sourceJobId: reference,
    title: asString(job.title) ?? "Unknown Title",
    employer: "Arc talent network",
    jobUrl: detailsUrl,
    applicationLink: detailsUrl,
    location: locationLine(job),
    datePosted: postedIso(job.postedAt),
    jobType: asString(job.jobType),
    salary: salaryParts(job),
    disciplines,
    jobLevel: asString(job.experienceLevel),
    jobFunction: asString(job.jobRole),
    isRemote: true,
  };
}
/**
 * Extractor manifest for Arc.dev remote jobs.
 *
 * `run` fetches each configured listing page, reads the jobs embedded in its
 * `__NEXT_DATA__` payload, optionally filters them by the search terms, and
 * returns the de-duplicated result.
 *
 * Settings read from `context.settings`:
 * - `arcRemoteJobsPaths`: listing paths (JSON array or delimited text). Falls
 *   back to `ARC_REMOTE_JOBS_PATHS` and then to built-in defaults.
 * - `arcMaxJobsPerPath`: per-path cap on kept jobs; default 120, clamped to 1..300.
 *
 * On any fetch or parse failure the run stops and returns `success: false`
 * together with the jobs collected so far.
 */
export const manifest: ExtractorManifest = {
  id: "arcdev",
  displayName: "Arc.dev (remote)",
  providesSources: ["arcdev"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    let paths = readPaths(context.settings.arcRemoteJobsPaths);
    if (paths.length === 0) paths = defaultArcPaths();
    paths = paths.map((p) => {
      // Accept full listing URLs on this origin as well as bare paths, so a
      // pasted "https://arc.dev/remote-jobs/x" is not turned into "/https://...".
      const relative = p.startsWith(`${ORIGIN}/`) ? p.slice(ORIGIN.length) : p;
      return relative.startsWith("/") ? relative : `/${relative}`;
    });

    const maxPerPath = context.settings.arcMaxJobsPerPath
      ? Number.parseInt(context.settings.arcMaxJobsPerPath, 10)
      : 120;
    const cap = Number.isFinite(maxPerPath)
      ? Math.min(Math.max(maxPerPath, 1), 300)
      : 120;

    const terms = context.searchTerms;
    // Job URLs already emitted, so a job listed on several paths is kept once.
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (const [i, path] of paths.entries()) {
        if (context.shouldCancel?.()) break;
        const pageUrl = `${ORIGIN}${path}`;
        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: paths.length,
          currentUrl: pageUrl,
          detail: `Arc.dev: fetching (${i + 1}/${paths.length}) ${path}`,
        });

        const response = await fetch(pageUrl, {
          headers: {
            Accept: "text/html",
            "User-Agent":
              "Mozilla/5.0 (compatible; JobOps/1.0; +https://github.com)",
          },
        });
        if (!response.ok) {
          throw new Error(
            `Arc.dev "${path}" failed with status ${response.status}`,
          );
        }

        const html = await response.text();
        const payload = parseNextPageProps(html);
        if (!payload) {
          throw new Error(`Arc.dev "${path}": missing __NEXT_DATA__ payload`);
        }

        let pathAdded = 0;
        // Arc-managed rows come first; each row carries its kind so the
        // matching mapper can be chosen below.
        const labeled = [
          ...payload.arcJobs.map((job) => ({ job, kind: "arc" as const })),
          ...payload.externalJobs.map((job) => ({ job, kind: "ext" as const })),
        ];
        for (const { job: raw, kind } of labeled) {
          if (pathAdded >= cap) break;
          // With no search terms every row is eligible.
          if (terms.length > 0 && !terms.some((t) => matchesTerm(raw, t))) {
            continue;
          }
          const mapped =
            kind === "arc" ? mapArcManagedJob(raw) : mapExternalJob(raw);
          if (!mapped) continue;
          if (seen.has(mapped.jobUrl)) continue;
          seen.add(mapped.jobUrl);
          out.push(mapped);
          pathAdded += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: paths.length,
          currentUrl: pageUrl,
          jobPagesProcessed: out.length,
          // Path and count are separated explicitly so they do not run together.
          detail: `Arc.dev: ${path} - ${pathAdded} kept (${payload.arcJobs.length} arc + ${payload.externalJobs.length} external rows)`,
        });
      }
      return { success: true, jobs: out };
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: out, error: message };
    }
  },
};
export default manifest;