Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m8s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m11s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m8s
CI / Type Check (orchestrator) (push) Successful in 1m23s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m6s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m7s
CI / Documentation (push) Successful in 1m54s
Adds Arc.dev, BC T-Net, Eluta, iCIMS tenants, QAJobsBoard, and SmartRecruiters manifests with registry/settings/UI wiring; registers full extractor list in smoke-extractors and documents supplementary board access paths. Aligns Careerjet v4 with the url query parameter and fixes strict typing in QAJobsBoard. Co-authored-by: Cursor <cursoragent@cursor.com>
330 lines
9.3 KiB
TypeScript
330 lines
9.3 KiB
TypeScript
/**
 * Arc.dev remote jobs — parse embedded Next.js __NEXT_DATA__ from SSR HTML.
 *
 * Listing URLs look like https://arc.dev/remote-jobs/playwright
 */
|
||
|
||
import type {
|
||
ExtractorManifest,
|
||
ExtractorRunResult,
|
||
} from "@shared/types/extractors";
|
||
import type { CreateJobInput } from "@shared/types/jobs";
|
||
|
||
/** Site origin that listing-page URLs and job-detail URLs are built on. */
const ORIGIN = "https://arc.dev";
|
||
|
||
/** A category tag attached to a job row in the Arc.dev page payload. */
interface ArcCategory {
  /** Display label; joined into the `disciplines` field of mapped jobs. */
  name?: string;
  /** Slug form of the category; searched together with `name` when matching terms. */
  urlString?: string;
}
|
||
|
||
/** Company object nested inside a job row of the Arc.dev page payload. */
interface ArcCompanyJson {
  /** Opaque key from the payload; not read by this module. */
  randomKey?: string | null;
  /** Company slug from the payload; not read by this module. */
  urlString?: string;
  /** Company display name; used as the employer of externally-listed jobs. */
  name?: string;
}
|
||
|
||
/**
 * Shape of a single job row as read from the Arc.dev `__NEXT_DATA__` payload.
 *
 * Every field is optional because the payload is untrusted JSON; consumers
 * must validate before use.
 */
interface ArcJobJson {
  /** Opaque key; combined with `urlString` to build the job URL. */
  randomKey?: string;
  title?: string;
  jobType?: string;
  jobRole?: string;
  /** Job slug; combined with `randomKey` to build the job URL. */
  urlString?: string;
  /** Posting timestamp; the mapper multiplies it by 1000, i.e. treats it as epoch seconds. */
  postedAt?: number;
  company?: ArcCompanyJson;
  categories?: ArcCategory[];
  /** When non-empty, used as the job's location line. */
  requiredCountries?: string[];
  minAnnualSalary?: number | null;
  maxAnnualSalary?: number | null;
  minHourlyRate?: number | null;
  maxHourlyRate?: number | null;
  /** Location fallback when no required countries are listed. */
  timeZone?: string | null;
  positionType?: string;
  experienceLevel?: string;
  experienceLevels?: string[];
}
|
||
|
||
/**
 * Parse a configured list of listing paths.
 *
 * Two input formats are accepted:
 * - a JSON array, from which only string entries are kept;
 * - a plain string delimited by newlines, commas, semicolons, or pipes.
 *
 * Entries are trimmed and empty ones are dropped.
 *
 * @param raw - Raw setting / environment value; may be undefined or empty.
 * @returns The cleaned entries, or an empty array when nothing usable is present.
 */
function readPaths(raw: string | undefined): string[] {
  if (!raw) return [];
  try {
    // Keep the parse result as `unknown` so it must be narrowed before use.
    const parsed: unknown = JSON.parse(raw);
    if (Array.isArray(parsed)) {
      return parsed
        .map((entry: unknown) =>
          typeof entry === "string" ? entry.trim() : "",
        )
        .filter(Boolean);
    }
  } catch {
    // Not valid JSON: fall through and treat the value as a delimited list.
  }
  return raw
    .split(/[\n,;|]+/)
    .map((entry) => entry.trim())
    .filter(Boolean);
}
|
||
|
||
/**
 * Listing paths to use when none are configured in settings.
 *
 * Reads `ARC_REMOTE_JOBS_PATHS` from the process environment (when a
 * `process` global exists) and parses it with `readPaths`; if that yields
 * nothing, two built-in listing paths are returned.
 *
 * @returns A non-empty list of listing paths.
 */
function defaultArcPaths(): string[] {
  // Guard the `process` lookup so the module also loads where it is absent.
  const raw =
    typeof process !== "undefined" ? process.env.ARC_REMOTE_JOBS_PATHS : "";
  const parsed = readPaths(raw);
  return parsed.length > 0
    ? parsed
    : ["/remote-jobs/playwright", "/remote-jobs/cypress"];
}
|
||
|
||
/**
 * Narrow an untrusted value to a trimmed, non-empty string.
 *
 * @param value - Any value, typically a field from parsed JSON.
 * @returns The trimmed string, or `undefined` when the value is not a string
 *   or is empty / whitespace-only.
 */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}
|
||
|
||
/**
 * Build one lowercase search string from a job's categories.
 *
 * Each category contributes `"<name> <urlString>"`; the pieces are joined
 * with spaces. Null / undefined entries (possible in untrusted JSON) are
 * skipped instead of throwing.
 *
 * @param job - Job row from the page payload.
 * @returns The lowercase haystack, or `""` when `categories` is not an array.
 */
function categoryHaystack(job: ArcJobJson): string {
  if (!Array.isArray(job.categories)) return "";
  return job.categories
    .filter((c) => c != null)
    .map((c) => `${c.name ?? ""} ${c.urlString ?? ""}`)
    .join(" ")
    .toLowerCase();
}
|
||
|
||
/**
 * Case-insensitive substring match of a search term against a job row.
 *
 * Fields checked, in order: title, categories (via `categoryHaystack`),
 * jobRole, positionType, each entry of experienceLevels, experienceLevel.
 * Non-string field values are ignored rather than throwing, since the row
 * comes from untrusted JSON.
 *
 * @param job - Job row from the page payload.
 * @param term - Search term; surrounding whitespace is ignored.
 * @returns `true` when any checked field contains the term. A blank term
 *   matches every job (an empty string is a substring of everything).
 */
function matchesTerm(job: ArcJobJson, term: string): boolean {
  const needle = term.trim().toLowerCase();
  if (!needle) return true;

  const contains = (value: unknown): boolean =>
    typeof value === "string" && value.toLowerCase().includes(needle);

  if (contains(job.title)) return true;
  if (categoryHaystack(job).includes(needle)) return true;
  if (contains(job.jobRole)) return true;
  if (contains(job.positionType)) return true;
  if (
    Array.isArray(job.experienceLevels) &&
    job.experienceLevels.some((level) => contains(level))
  ) {
    return true;
  }
  return contains(job.experienceLevel);
}
|
||
|
||
/**
 * Render the salary fields of a job row as a human-readable string.
 *
 * Annual salary:
 * - min and max present: `USD <min>–<max> / yr`
 * - only min present:    `USD <min>+ / yr`
 * - only max present:    `USD up to <max> / yr` (previously dropped)
 *
 * Hourly rate: `$<min>–<max> / hr`, with `?` standing in for a missing bound.
 *
 * @param job - Job row from the page payload.
 * @returns The parts joined with `"; "`, or `undefined` when the row has no
 *   numeric salary information.
 */
function salaryParts(job: ArcJobJson): string | undefined {
  const bits: string[] = [];
  const hasMinAnnual = typeof job.minAnnualSalary === "number";
  const hasMaxAnnual = typeof job.maxAnnualSalary === "number";

  if (hasMinAnnual && hasMaxAnnual) {
    bits.push(`USD ${job.minAnnualSalary}–${job.maxAnnualSalary} / yr`);
  } else if (hasMinAnnual) {
    bits.push(`USD ${job.minAnnualSalary}+ / yr`);
  } else if (hasMaxAnnual) {
    // An upper bound on its own is still useful information.
    bits.push(`USD up to ${job.maxAnnualSalary} / yr`);
  }

  if (
    typeof job.minHourlyRate === "number" ||
    typeof job.maxHourlyRate === "number"
  ) {
    bits.push(`$${job.minHourlyRate ?? "?"}–${job.maxHourlyRate ?? "?"} / hr`);
  }

  return bits.length > 0 ? bits.join("; ") : undefined;
}
|
||
|
||
/**
 * Pick a location label for a job row.
 *
 * Preference order: the required countries joined with `", "`, then the
 * row's time zone, then the literal `"Remote"`. Country entries that are not
 * strings or are blank are ignored, so a list holding only such entries
 * falls through to the next option.
 *
 * @param job - Job row from the page payload.
 * @returns A non-empty location string.
 */
function locationLine(job: ArcJobJson): string {
  if (Array.isArray(job.requiredCountries)) {
    const countries = job.requiredCountries
      .map((country: unknown) =>
        typeof country === "string" ? country.trim() : "",
      )
      .filter(Boolean);
    if (countries.length > 0) return countries.join(", ");
  }
  if (job.timeZone) return job.timeZone;
  return "Remote";
}
|
||
|
||
/**
 * Convert a posting timestamp (multiplied by 1000 here, i.e. treated as
 * epoch seconds) to an ISO-8601 string.
 *
 * @param postedAt - Timestamp from the payload.
 * @returns The ISO string, or `undefined` when the value is missing, not a
 *   finite number, or outside the range a `Date` can represent.
 */
function postedIso(postedAt: number | undefined): string | undefined {
  if (typeof postedAt !== "number" || !Number.isFinite(postedAt)) {
    return undefined;
  }
  const date = new Date(postedAt * 1000);
  // `toISOString()` throws RangeError on an invalid Date; one bad row must
  // not abort the whole extraction run.
  if (Number.isNaN(date.getTime())) return undefined;
  return date.toISOString();
}
|
||
|
||
/**
 * Extract the job lists from the `__NEXT_DATA__` script embedded in a
 * listing page's HTML.
 *
 * The script tag is located by its `id` attribute regardless of attribute
 * order or quote style. Only `props.pageProps.arcJobs` and
 * `props.pageProps.externalJobs` are read; entries that are not objects
 * (null, primitives) are dropped so later mappers can read fields safely.
 * Field types inside each row are NOT validated here.
 *
 * @param html - Full HTML of a listing page.
 * @returns Both job lists (each possibly empty), or `null` when the script
 *   tag is missing, its content is not valid JSON, or `pageProps` is absent
 *   or not an object.
 */
function parseNextPageProps(html: string): {
  arcJobs: ArcJobJson[];
  externalJobs: ArcJobJson[];
} | null {
  const match = html.match(
    /<script\b[^>]*\sid=(["'])__NEXT_DATA__\1[^>]*>([\s\S]*?)<\/script>/i,
  );
  const json = match?.[2];
  if (!json) return null;

  const isObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null;
  // Shape check only: confirms the row is an object, not that fields are typed correctly.
  const isJobRow = (value: unknown): value is ArcJobJson =>
    isObject(value) && !Array.isArray(value);
  const jobRows = (value: unknown): ArcJobJson[] =>
    Array.isArray(value) ? value.filter(isJobRow) : [];

  try {
    const root: unknown = JSON.parse(json);
    const props = isObject(root) ? root.props : undefined;
    const pageProps = isObject(props) ? props.pageProps : undefined;
    if (!isObject(pageProps)) return null;
    return {
      arcJobs: jobRows(pageProps.arcJobs),
      externalJobs: jobRows(pageProps.externalJobs),
    };
  } catch {
    // Malformed JSON is reported the same way as a missing payload.
    return null;
  }
}
|
||
|
||
/**
 * Map an externally-listed job row to a `CreateJobInput`.
 *
 * The job URL is `<ORIGIN>/remote-jobs/j/<urlString>-<randomKey>`, so rows
 * missing either part are skipped.
 *
 * @param job - Row from `pageProps.externalJobs`.
 * @returns The mapped job, or `null` when the row lacks `randomKey` or `urlString`.
 */
function mapExternalJob(job: ArcJobJson): CreateJobInput | null {
  const rk = asString(job.randomKey);
  const slug = asString(job.urlString);
  if (!rk || !slug) return null;

  // The slug alone does not identify a posting (the URL needs the key too),
  // so use the same `<slug>-<key>` id as Arc-managed jobs.
  const jobKey = `${slug}-${rk}`;
  const jobUrl = `${ORIGIN}/remote-jobs/j/${jobKey}`;
  const employer = asString(job.company?.name) ?? "Unknown employer";

  const categoryNames = Array.isArray(job.categories)
    ? job.categories
        .map((c) => asString(c?.name))
        .filter((v): v is string => v !== undefined)
    : [];

  // Prefer the list form; an empty list must not hide the single
  // `experienceLevel` value ("" is not nullish, so `??` would stop there).
  const levels = Array.isArray(job.experienceLevels)
    ? job.experienceLevels
        .map((level) => asString(level))
        .filter((v): v is string => v !== undefined)
        .join(", ")
    : "";

  return {
    source: "arcdev",
    sourceJobId: jobKey,
    title: asString(job.title) ?? "Unknown Title",
    employer,
    jobUrl,
    applicationLink: jobUrl,
    location: locationLine(job),
    datePosted: postedIso(job.postedAt),
    jobType: asString(job.jobType),
    salary: salaryParts(job),
    // Report "no disciplines" as undefined rather than an empty string.
    disciplines: categoryNames.length > 0 ? categoryNames.join(", ") : undefined,
    jobLevel: levels || asString(job.experienceLevel),
    isRemote: true,
  };
}
|
||
|
||
/**
 * Map an Arc-managed job row to a `CreateJobInput`.
 *
 * The job URL is `<ORIGIN>/remote-jobs/details/<urlString>-<randomKey>`, so
 * rows missing either part are skipped. The employer is always the fixed
 * label "Arc talent network"; the row's `company` field is not read.
 *
 * @param job - Row from `pageProps.arcJobs`.
 * @returns The mapped job, or `null` when the row lacks `randomKey` or `urlString`.
 */
function mapArcManagedJob(job: ArcJobJson): CreateJobInput | null {
  const rk = asString(job.randomKey);
  const slug = asString(job.urlString);
  if (!rk || !slug) return null;

  const jobKey = `${slug}-${rk}`;
  const jobUrl = `${ORIGIN}/remote-jobs/details/${jobKey}`;

  const categoryNames = Array.isArray(job.categories)
    ? job.categories
        .map((c) => asString(c?.name))
        .filter((v): v is string => v !== undefined)
    : [];

  // Fallback only: used when the single `experienceLevel` field is absent.
  const levels = Array.isArray(job.experienceLevels)
    ? job.experienceLevels
        .map((level) => asString(level))
        .filter((v): v is string => v !== undefined)
        .join(", ")
    : "";

  const employer = "Arc talent network";

  return {
    source: "arcdev",
    sourceJobId: jobKey,
    title: asString(job.title) ?? "Unknown Title",
    employer,
    jobUrl,
    applicationLink: jobUrl,
    location: locationLine(job),
    datePosted: postedIso(job.postedAt),
    jobType: asString(job.jobType),
    salary: salaryParts(job),
    // Report "no disciplines" as undefined rather than an empty string.
    disciplines: categoryNames.length > 0 ? categoryNames.join(", ") : undefined,
    jobLevel: asString(job.experienceLevel) ?? (levels || undefined),
    jobFunction: asString(job.jobRole),
    isRemote: true,
  };
}
|
||
|
||
/**
 * Reduce a configured entry to a path on `ORIGIN`.
 *
 * Accepts a bare path with or without a leading slash, or a full http(s)
 * URL, from which only the path and query are kept so every request stays on
 * `ORIGIN`.
 */
function normalizeArcPath(entry: string): string {
  if (/^https?:\/\//i.test(entry)) {
    try {
      const url = new URL(entry);
      return `${url.pathname}${url.search}`;
    } catch {
      // Malformed URL: fall through and treat the entry as a plain path.
    }
  }
  return entry.startsWith("/") ? entry : `/${entry}`;
}

/**
 * Arc.dev extractor manifest.
 *
 * `run` fetches each configured listing page in sequence, reads the job rows
 * from its `__NEXT_DATA__` payload, optionally filters them by the search
 * terms, and maps them to `CreateJobInput`.
 *
 * Settings read from `context.settings`:
 * - `arcRemoteJobsPaths`: listing paths (JSON array or delimited string);
 *   falls back to `defaultArcPaths()` when empty.
 * - `arcMaxJobsPerPath`: per-page cap on kept jobs; default 120, clamped to 1..300.
 *
 * Failure handling: a non-OK response or a page without a parsable payload
 * stops the run, which then returns `success: false` together with the jobs
 * collected so far.
 */
export const manifest: ExtractorManifest = {
  id: "arcdev",
  displayName: "Arc.dev (remote)",
  providesSources: ["arcdev"],
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    let paths = readPaths(context.settings.arcRemoteJobsPaths);
    if (paths.length === 0) paths = defaultArcPaths();

    // Normalise first, then dedupe, so "x" and "/x" are fetched only once.
    paths = [...new Set(paths.map(normalizeArcPath))];

    const maxPerPath = context.settings.arcMaxJobsPerPath
      ? Number.parseInt(context.settings.arcMaxJobsPerPath, 10)
      : 120;
    const cap = Number.isFinite(maxPerPath)
      ? Math.min(Math.max(maxPerPath, 1), 300)
      : 120;

    const terms = context.searchTerms;
    // Jobs are de-duplicated across pages by their URL.
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    try {
      for (const [i, path] of paths.entries()) {
        if (context.shouldCancel?.()) break;
        const pageUrl = `${ORIGIN}${path}`;

        context.onProgress?.({
          phase: "list",
          termsProcessed: i,
          termsTotal: paths.length,
          currentUrl: pageUrl,
          detail: `Arc.dev: fetching (${i + 1}/${paths.length}) ${path}`,
        });

        const response = await fetch(pageUrl, {
          headers: {
            Accept: "text/html",
            "User-Agent":
              "Mozilla/5.0 (compatible; JobOps/1.0; +https://github.com)",
          },
        });
        if (!response.ok) {
          throw new Error(
            `Arc.dev "${path}" failed with status ${response.status}`,
          );
        }
        const html = await response.text();
        const payload = parseNextPageProps(html);
        if (!payload) {
          throw new Error(`Arc.dev "${path}": missing __NEXT_DATA__ payload`);
        }

        let pathAdded = 0;

        // Arc-managed rows come first, so they win when the per-path cap is hit.
        const labeled = [
          ...payload.arcJobs.map((job) => ({ job, kind: "arc" as const })),
          ...payload.externalJobs.map((job) => ({ job, kind: "ext" as const })),
        ];

        for (const { job: raw, kind } of labeled) {
          if (pathAdded >= cap) break;
          if (terms.length > 0 && !terms.some((t) => matchesTerm(raw, t))) {
            continue;
          }
          const mapped =
            kind === "arc" ? mapArcManagedJob(raw) : mapExternalJob(raw);
          if (!mapped) continue;
          if (seen.has(mapped.jobUrl)) continue;
          seen.add(mapped.jobUrl);
          out.push(mapped);
          pathAdded += 1;
        }

        context.onProgress?.({
          phase: "list",
          termsProcessed: i + 1,
          termsTotal: paths.length,
          currentUrl: pageUrl,
          jobPagesProcessed: out.length,
          detail: `Arc.dev: ${path} → ${pathAdded} kept (${payload.arcJobs.length} arc + ${payload.externalJobs.length} external rows)`,
        });
      }

      return { success: true, jobs: out };
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      // Report the failure but still hand back whatever was collected.
      return { success: false, jobs: out, error: message };
    }
  },
};
|
||
|
||
// Default export is the same object as the named `manifest` export.
export default manifest;
|