ilia 03d293699a
Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m13s
CI / Type Check (adzuna-extractor) (push) Successful in 1m10s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m14s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m11s
CI / Type Check (orchestrator) (push) Successful in 1m30s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m12s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m13s
CI / Documentation (push) Successful in 1m57s
feat(qajobsboard): fetch job detail pages for concrete location text
The jobs.json feed often labels roles Remote/Worldwide while the public
job page JSON-LD and description include the real city (e.g. Mumbai/Nagpur).
Enrich vague rows by reading each QAJobsBoard detail URL before import.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-16 17:20:53 -04:00

274 lines
7.7 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* QAJobsBoard (JobBoardly) — public jobs listing JSON.
*
* https://qajobsboard.jobboardly.com/jobs.json
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
import {
extractJobLocationFromText,
fetchQaJobDetailEnrichment,
stripHtml,
} from "./src/detail-page.js";
/** Public JSON listing endpoint that `manifest.run` fetches in a single request. */
const JOBS_URL = "https://qajobsboard.jobboardly.com/jobs.json";
/** One category entry on a feed row; `name` is searched by term matching and joined into `disciplines`. */
interface JobCategory {
name?: string;
}
/** Salary information on a feed row, rendered to a display string by `salaryLabel`. */
interface SalaryBand {
schedule?: string;
minimum?: number | null;
maximum?: number | null;
}
/** Wrapper around the job description; `html` is converted to plain text with `stripHtml` before use. */
interface DescriptionBlock {
html?: string;
}
/**
* Shape of one row in the jobs.json feed as consumed by this module.
*
* Every field is optional because the response body is cast from `unknown`
* without runtime validation, so consumers must tolerate missing values.
*/
interface QaJobBoardlyJob {
title?: string;
arrangement?: string;
location?: string;
location_limits?: string[];
published_at?: string;
application_link?: string;
description?: DescriptionBlock;
company?: { name?: string; logo?: string };
salary?: SalaryBand;
categories?: JobCategory[];
// `self` is the job's public URL; rows without it are dropped by `mapJob`.
links?: { self?: string };
}
/**
 * Normalizes an arbitrary feed value to a trimmed, non-empty string.
 *
 * @param value - Any value taken from the untyped feed payload.
 * @returns The trimmed text, or `undefined` when the value is not a string
 *   or contains only whitespace.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const text = value.trim();
    if (text.length > 0) return text;
  }
  return undefined;
}
/**
 * Renders a feed salary band as a short human-readable label.
 *
 * Output forms (each optionally prefixed with `"<schedule>: "`):
 * - both bounds present: `"min-max"` (or just `"min"` when the bounds are equal)
 * - only a minimum: `"min+"`
 * - only a maximum: `"up to max"`
 * - no usable bounds: the schedule prefix alone, trimmed, or `undefined`
 *
 * @param raw - Salary band from the feed; may be missing.
 * @returns The label, or `undefined` when there is nothing to show.
 */
function salaryLabel(raw: SalaryBand | undefined): string | undefined {
  if (!raw) return undefined;

  const prefix = raw.schedule ? `${raw.schedule}: ` : "";
  // Only finite numbers count as bounds; null, NaN and Infinity are ignored.
  const min =
    typeof raw.minimum === "number" && Number.isFinite(raw.minimum)
      ? raw.minimum
      : undefined;
  const max =
    typeof raw.maximum === "number" && Number.isFinite(raw.maximum)
      ? raw.maximum
      : undefined;

  if (min !== undefined && max !== undefined) {
    // A separator is required: without one, 50000 and 70000 fuse into
    // "5000070000". A plain ASCII hyphen avoids look-alike dash characters.
    return min === max ? `${prefix}${min}` : `${prefix}${min}-${max}`;
  }
  if (min !== undefined) {
    return `${prefix}${min}+`;
  }
  if (max !== undefined) {
    // A bare number would be indistinguishable from an exact salary.
    return `${prefix}up to ${max}`;
  }
  return prefix.trim() || undefined;
}
/** Lower-cased location labels that carry no concrete place information. */
const VAGUE_LOCATION_LABELS = new Set<string>([
  "worldwide",
  "global",
  "anywhere",
  "remote",
  "unknown",
]);

/**
 * Tells whether a location label is one of the known placeholder values.
 *
 * The comparison ignores surrounding whitespace and letter case; the whole
 * label must match, so "Remote, UK" is not considered vague.
 *
 * @param value - Location text to classify.
 * @returns `true` when the normalized label is in `VAGUE_LOCATION_LABELS`.
 */
function isVagueLocationLabel(value: string): boolean {
  const normalized = value.trim().toLowerCase();
  return VAGUE_LOCATION_LABELS.has(normalized);
}
/**
 * Picks the most specific location text available for a feed row.
 *
 * Preference order:
 * 1. non-vague entries of `location_limits`, joined with ", "
 * 2. a non-vague `location`
 * 3. a location extracted from the plain-text description
 * 4. the (vague) `location` as given
 * 5. the literal "Unknown"
 *
 * @param job - Feed row to inspect.
 * @returns A non-empty location label.
 */
function locationLabel(job: QaJobBoardlyJob): string {
  const concreteLimits: string[] = [];
  if (Array.isArray(job.location_limits)) {
    for (const entry of job.location_limits) {
      // Non-string entries are ignored rather than stringified.
      if (typeof entry !== "string") continue;
      const cleaned = entry.trim();
      if (cleaned.length > 0 && !isVagueLocationLabel(cleaned)) {
        concreteLimits.push(cleaned);
      }
    }
  }
  if (concreteLimits.length > 0) {
    return concreteLimits.join(", ");
  }

  const feedLocation = asString(job.location);
  if (feedLocation && !isVagueLocationLabel(feedLocation)) {
    return feedLocation;
  }

  const html = job.description?.html;
  if (html) {
    const parsed = extractJobLocationFromText(stripHtml(html));
    if (parsed) return parsed;
  }

  // A vague feed label still beats the generic placeholder.
  return feedLocation ?? "Unknown";
}
/**
 * Case-insensitive substring match of a search term against a feed row.
 *
 * Checks are made in order of cost: title, then category names (joined with
 * spaces), then the plain-text description. The first hit wins.
 *
 * The feed payload is not validated, so null category entries and
 * non-string titles or names are treated as empty text instead of throwing.
 *
 * @param job - Feed row to test.
 * @param term - Search term; compared in lower case.
 * @returns `true` when the term occurs in any of the searched fields.
 */
function matchesTerm(job: QaJobBoardlyJob, term: string): boolean {
  const needle = term.toLowerCase();

  if (typeof job.title === "string" && job.title.toLowerCase().includes(needle)) {
    return true;
  }

  const categoryText = Array.isArray(job.categories)
    ? job.categories
        // `c?.` guards against null entries, matching how mapJob reads categories.
        .map((c) => (typeof c?.name === "string" ? c.name.toLowerCase() : ""))
        .join(" ")
    : "";
  if (categoryText.includes(needle)) return true;

  const html = job.description?.html ?? "";
  return stripHtml(html).toLowerCase().includes(needle);
}
/**
 * Converts one feed row into the importer's job input shape.
 *
 * @param raw - Feed row to convert.
 * @returns The mapped job, or `null` when the row has no usable `links.self`
 *   URL (the URL supplies both `jobUrl` and `sourceJobId`).
 */
function mapJob(raw: QaJobBoardlyJob): CreateJobInput | null {
  const jobUrl = asString(raw.links?.self);
  if (jobUrl === undefined) return null;

  // Drop leading whitespace/hyphens from the company name before falling back.
  const companyName = asString(raw.company?.name);
  const cleanedName =
    companyName === undefined ? "" : companyName.replace(/^[\s-]+/, "").trim();
  const employer = cleanedName || "Unknown Employer";

  const html = raw.description?.html;

  // Stays undefined when the feed has no category array at all; an empty
  // array yields an empty string.
  let disciplines: string | undefined;
  if (Array.isArray(raw.categories)) {
    const names: string[] = [];
    for (const category of raw.categories) {
      const name = category?.name?.trim();
      if (name) names.push(name);
    }
    disciplines = names.join(", ");
  }

  const feedLocation = asString(raw.location);

  return {
    source: "qajobsboard",
    // Everything after the last "/" of the job URL.
    sourceJobId: jobUrl.slice(jobUrl.lastIndexOf("/") + 1),
    title: asString(raw.title) ?? "Unknown Title",
    employer,
    jobUrl,
    applicationLink: asString(raw.application_link) ?? jobUrl,
    location: locationLabel(raw),
    isRemote: feedLocation?.toLowerCase() === "remote",
    datePosted: asString(raw.published_at),
    jobDescription: html ? stripHtml(html) : undefined,
    jobType: asString(raw.arrangement),
    salary: salaryLabel(raw.salary),
    disciplines,
    companyLogo: asString(raw.company?.logo),
  };
}
/** Number of detail pages requested in parallel per batch. */
const DETAIL_ENRICH_CONCURRENCY = 4;

/**
 * Decides whether a mapped job's location is too weak to keep as-is.
 *
 * @param location - Location already assigned to the job, if any.
 * @returns `true` when the location is missing, blank, or a vague label.
 */
function needsDetailEnrichment(location: string | undefined): boolean {
  if (location == null || location.trim() === "") {
    return true;
  }
  return isVagueLocationLabel(location);
}
/**
 * Fills in location and description from each job's detail page for jobs
 * whose location is missing or vague.
 *
 * Works on shallow copies, so the input array and its objects are not
 * modified. Detail pages are requested in batches of
 * `DETAIL_ENRICH_CONCURRENCY`; cancellation is checked before each batch, and
 * jobs not yet processed keep their feed values.
 *
 * @param jobs - Mapped jobs from the feed.
 * @param shouldCancel - Optional callback; a truthy result stops further batches.
 * @returns A new array in the same order as `jobs`.
 */
async function enrichJobsFromDetailPages(
  jobs: CreateJobInput[],
  shouldCancel?: () => boolean,
): Promise<CreateJobInput[]> {
  const enriched = jobs.map((job) => ({ ...job }));

  // Remember each candidate together with its slot in the result array.
  const pending: Array<{ job: CreateJobInput; index: number }> = [];
  enriched.forEach((job, index) => {
    if (needsDetailEnrichment(job.location)) pending.push({ job, index });
  });

  const enrichOne = async (entry: {
    job: CreateJobInput;
    index: number;
  }): Promise<void> => {
    const { job, index } = entry;
    try {
      const detail = await fetchQaJobDetailEnrichment(job.jobUrl);
      if (!detail) return;
      // Only truthy detail fields override what the feed provided.
      enriched[index] = {
        ...job,
        ...(detail.location ? { location: detail.location } : {}),
        ...(detail.jobDescription
          ? { jobDescription: detail.jobDescription }
          : {}),
      };
    } catch {
      // keep feed row when detail page fetch fails
    }
  };

  let start = 0;
  while (start < pending.length) {
    if (shouldCancel?.()) break;
    const batch = pending.slice(start, start + DETAIL_ENRICH_CONCURRENCY);
    await Promise.all(batch.map((entry) => enrichOne(entry)));
    start += DETAIL_ENRICH_CONCURRENCY;
  }

  return enriched;
}
/**
 * Extractor manifest for QAJobsBoard.
 *
 * `run` downloads the whole jobs.json feed once, keeps rows matching any
 * search term (or all rows when no terms are given), maps and de-duplicates
 * them up to a configurable cap, then enriches vague locations from the
 * detail pages. Failures inside the fetch/map/enrich phase are reported as
 * `{ success: false, error }` rather than thrown.
 */
export const manifest: ExtractorManifest = {
  id: "qajobsboard",
  displayName: "QAJobsBoard",
  providesSources: ["qajobsboard"],

  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) {
      return { success: true, jobs: [] };
    }

    // Cap defaults to 100 and is clamped to 1..500 when the setting parses.
    const rawLimit = context.settings.qajobsboardMaxJobsPerTerm;
    let limit = 100;
    if (rawLimit) {
      const parsed = Number.parseInt(rawLimit, 10);
      if (Number.isFinite(parsed)) {
        limit = Math.min(Math.max(parsed, 1), 500);
      }
    }

    const searchTerms = context.searchTerms;
    const filterByTerm = searchTerms.length > 0;

    context.onProgress?.({
      phase: "list",
      termsProcessed: 0,
      termsTotal: 1,
      currentUrl: JOBS_URL,
      detail: "QAJobsBoard: fetching jobs.json",
    });

    try {
      const response = await fetch(JOBS_URL, {
        headers: { Accept: "application/json", "User-Agent": "JobOps/1.0" },
      });
      if (!response.ok) {
        throw new Error(
          `QAJobsBoard request failed with status ${response.status}`,
        );
      }

      // Anything other than a top-level array is treated as an empty feed.
      const body = (await response.json()) as unknown;
      const listings: QaJobBoardlyJob[] = Array.isArray(body) ? body : [];

      const seenKeys = new Set<string>();
      const collected: CreateJobInput[] = [];
      for (const listing of listings) {
        if (collected.length >= limit) break;

        const termHit =
          !filterByTerm ||
          searchTerms.some((term) => matchesTerm(listing, term));
        if (!termHit) continue;

        const candidate = mapJob(listing);
        if (!candidate) continue;

        // Fall back to the URL when the derived id is empty.
        const dedupeKey = candidate.sourceJobId || candidate.jobUrl;
        if (seenKeys.has(dedupeKey)) continue;
        seenKeys.add(dedupeKey);
        collected.push(candidate);
      }

      const withDetails = await enrichJobsFromDetailPages(
        collected,
        context.shouldCancel,
      );

      context.onProgress?.({
        phase: "list",
        termsProcessed: 1,
        termsTotal: 1,
        currentUrl: JOBS_URL,
        jobPagesProcessed: withDetails.length,
        detail: `QAJobsBoard: ${withDetails.length} matched (${listings.length} total listings, detail pages for vague locations)`,
      });

      return { success: true, jobs: withDetails };
    } catch (error) {
      let message = "Unknown error";
      if (error instanceof Error) {
        message = error.message;
      }
      return { success: false, jobs: [], error: message };
    }
  },
};

export default manifest;