feat(qajobsboard): fetch job detail pages for concrete location text
Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m13s
CI / Type Check (adzuna-extractor) (push) Successful in 1m10s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m14s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m11s
CI / Type Check (orchestrator) (push) Successful in 1m30s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m12s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m13s
CI / Documentation (push) Successful in 1m57s

The jobs.json feed often labels roles Remote/Worldwide while the public
job page JSON-LD and description include the real city (e.g. Mumbai/Nagpur).
Enrich vague rows by reading each QAJobsBoard detail URL before import.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ilia 2026-05-16 17:20:53 -04:00
parent 0a63316100
commit 03d293699a
3 changed files with 263 additions and 34 deletions

View File

@ -9,6 +9,11 @@ import type {
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
import {
extractJobLocationFromText,
fetchQaJobDetailEnrichment,
stripHtml,
} from "./src/detail-page.js";
const JOBS_URL = "https://qajobsboard.jobboardly.com/jobs.json";
@ -46,21 +51,6 @@ function asString(value: unknown): string | undefined {
return trimmed ? trimmed : undefined;
}
/**
 * Decode the HTML character references this extractor cares about.
 *
 * All references are decoded in a single pass so the output of one
 * replacement is never re-scanned: `&amp;lt;` becomes the literal text
 * `&lt;`, not `<`. Besides the named entities `amp`, `lt`, `gt`, `quot` and
 * `nbsp` (mapped to a plain space), decimal (`&#39;`) and hexadecimal
 * (`&#x27;`) numeric references are decoded. Unknown or out-of-range
 * references are left untouched.
 *
 * @param value - Text that may contain HTML character references.
 * @returns The text with recognised references replaced by their characters.
 */
function decodeHtmlEntities(value: string): string {
  const named: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    nbsp: " ",
  };

  return value.replace(
    /&(amp|lt|gt|quot|nbsp|#\d{1,7}|#[xX][0-9a-fA-F]{1,6});/g,
    (entity: string, body: string): string => {
      if (!body.startsWith("#")) return named[body] ?? entity;

      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = isHex
        ? Number.parseInt(body.slice(2), 16)
        : Number.parseInt(body.slice(1), 10);

      // NUL, lone surrogates and values past U+10FFFF are not valid
      // characters; keep the original text instead of throwing.
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) {
        return entity;
      }
      return String.fromCodePoint(codePoint);
    },
  );
}
/**
 * Reduce an HTML fragment to single-spaced plain text.
 *
 * Every tag is replaced by a space, entities are decoded, and runs of
 * whitespace are collapsed before trimming the ends.
 */
function stripHtml(html: string): string {
  const withoutTags = html.replace(/<[^>]+>/g, " ");
  const decoded = decodeHtmlEntities(withoutTags);
  const singleSpaced = decoded.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
function salaryLabel(raw: SalaryBand | undefined): string | undefined {
if (!raw) return undefined;
const schedule = raw.schedule ? `${raw.schedule}: ` : "";
@ -93,19 +83,6 @@ function isVagueLocationLabel(value: string): boolean {
return VAGUE_LOCATION_LABELS.has(value.trim().toLowerCase());
}
/**
 * Pull the value of a "Job Location: ..." label out of a description.
 *
 * The HTML is flattened with `stripHtml` first; the captured value stops at
 * the first period or newline and is limited to 2-120 characters.
 *
 * @param html - Description markup, if the feed row has any.
 * @returns The trimmed label value, or `undefined` when there is no label.
 */
function extractJobLocationFromDescription(
  html: string | undefined,
): string | undefined {
  if (!html) return undefined;

  const labelPattern = /\bjob\s+location\s*:\s*([^\n.]{2,120})/i;
  const captured = labelPattern.exec(stripHtml(html))?.[1];
  if (!captured) return undefined;

  const location = captured.trim();
  return location === "" ? undefined : location;
}
function locationLabel(job: QaJobBoardlyJob): string {
const limits = Array.isArray(job.location_limits)
? job.location_limits
@ -117,9 +94,9 @@ function locationLabel(job: QaJobBoardlyJob): string {
const loc = asString(job.location);
if (loc && !isVagueLocationLabel(loc)) return loc;
const fromDescription = extractJobLocationFromDescription(
job.description?.html,
);
const fromDescription = job.description?.html
? extractJobLocationFromText(stripHtml(job.description.html))
: undefined;
if (fromDescription) return fromDescription;
if (loc) return loc;
@ -179,6 +156,47 @@ function mapJob(raw: QaJobBoardlyJob): CreateJobInput | null {
};
}
// Number of detail pages requested in parallel per batch while enriching.
const DETAIL_ENRICH_CONCURRENCY = 4;
/**
 * Decide whether a mapped job should have its detail page fetched.
 *
 * @param location - Location text taken from the feed row.
 * @returns `true` when the location is missing, blank, or a vague label as
 *   judged by `isVagueLocationLabel`.
 */
function needsDetailEnrichment(location: string | undefined): boolean {
  if (location == null) return true;
  if (location.trim() === "") return true;
  return isVagueLocationLabel(location);
}
/**
 * Fill in location and description from each job's detail page.
 *
 * Works on shallow copies, so the input array and its rows are not modified.
 * Only rows selected by `needsDetailEnrichment` are fetched, in batches of
 * `DETAIL_ENRICH_CONCURRENCY`; `shouldCancel` is consulted before each batch
 * and stops further fetching when it returns `true`. A row whose detail page
 * fails to load or parse keeps its feed values.
 *
 * @param jobs - Jobs mapped from the feed.
 * @param shouldCancel - Optional cancellation probe.
 * @returns A new array with the same length and order as `jobs`.
 */
async function enrichJobsFromDetailPages(
  jobs: CreateJobInput[],
  shouldCancel?: () => boolean,
): Promise<CreateJobInput[]> {
  const copies = jobs.map((job) => ({ ...job }));
  const pending = copies.flatMap((job, index) =>
    needsDetailEnrichment(job.location) ? [{ job, index }] : [],
  );

  const applyDetail = async ({
    job,
    index,
  }: (typeof pending)[number]): Promise<void> => {
    try {
      const detail = await fetchQaJobDetailEnrichment(job.jobUrl);
      if (!detail) return;

      const locationPatch = detail.location
        ? { location: detail.location }
        : {};
      const descriptionPatch = detail.jobDescription
        ? { jobDescription: detail.jobDescription }
        : {};
      copies[index] = { ...job, ...locationPatch, ...descriptionPatch };
    } catch {
      // keep feed row when detail page fetch fails
    }
  };

  let start = 0;
  while (start < pending.length) {
    if (shouldCancel?.()) break;
    const end = start + DETAIL_ENRICH_CONCURRENCY;
    await Promise.all(pending.slice(start, end).map(applyDetail));
    start = end;
  }

  return copies;
}
export const manifest: ExtractorManifest = {
id: "qajobsboard",
displayName: "QAJobsBoard",
@ -230,16 +248,21 @@ export const manifest: ExtractorManifest = {
out.push(mapped);
}
const withDetails = await enrichJobsFromDetailPages(
out,
context.shouldCancel,
);
context.onProgress?.({
phase: "list",
termsProcessed: 1,
termsTotal: 1,
currentUrl: JOBS_URL,
jobPagesProcessed: out.length,
detail: `QAJobsBoard: ${out.length} matched (${rows.length} total listings)`,
jobPagesProcessed: withDetails.length,
detail: `QAJobsBoard: ${withDetails.length} matched (${rows.length} total listings, detail pages for vague locations)`,
});
return { success: true, jobs: out };
return { success: true, jobs: withDetails };
} catch (error) {
const message = error instanceof Error ? error.message : "Unknown error";
return { success: false, jobs: [], error: message };

View File

@ -0,0 +1,173 @@
/**
* Fetch a QAJobsBoard job detail page for richer JSON-LD and HTML than jobs.json alone.
*/
/**
 * Fields recovered from a job detail page. Each key is present only when the
 * page yielded a non-empty value for it.
 */
export interface QaJobDetailEnrichment {
/** Location text taken from the JSON-LD posting or the page body. */
location?: string;
/** Plain-text description taken from the JSON-LD posting or the page body. */
jobDescription?: string;
}
// Matches every <script type="application/ld+json"> element (single- or
// double-quoted type attribute, case-insensitive) and captures its raw body
// in group 1. Global flag: intended for use with String.prototype.matchAll.
const JSON_LD_RE =
/<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
/**
 * Decode the HTML character references this extractor cares about.
 *
 * All references are decoded in a single pass so the output of one
 * replacement is never re-scanned: `&amp;lt;` becomes the literal text
 * `&lt;`, not `<`. Besides the named entities `amp`, `lt`, `gt`, `quot` and
 * `nbsp` (mapped to a plain space), decimal (`&#39;`) and hexadecimal
 * (`&#x27;`) numeric references are decoded. Unknown or out-of-range
 * references are left untouched.
 *
 * @param value - Text that may contain HTML character references.
 * @returns The text with recognised references replaced by their characters.
 */
function decodeHtmlEntities(value: string): string {
  const named: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    nbsp: " ",
  };

  return value.replace(
    /&(amp|lt|gt|quot|nbsp|#\d{1,7}|#[xX][0-9a-fA-F]{1,6});/g,
    (entity: string, body: string): string => {
      if (!body.startsWith("#")) return named[body] ?? entity;

      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = isHex
        ? Number.parseInt(body.slice(2), 16)
        : Number.parseInt(body.slice(1), 10);

      // NUL, lone surrogates and values past U+10FFFF are not valid
      // characters; keep the original text instead of throwing.
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) {
        return entity;
      }
      return String.fromCodePoint(codePoint);
    },
  );
}
/**
 * Reduce an HTML fragment to single-spaced plain text.
 *
 * Every tag is replaced by a space, entities are decoded, and runs of
 * whitespace are collapsed before trimming the ends.
 *
 * @param html - Markup to flatten.
 * @returns The text content with no tags and no repeated whitespace.
 */
export function stripHtml(html: string): string {
  const withoutTags = html.replace(/<[^>]+>/g, " ");
  const decoded = decodeHtmlEntities(withoutTags);
  const singleSpaced = decoded.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
/**
 * Extract the value of a "Job Location: ..." label from plain text.
 *
 * The value runs to the end of the line and is then cut at the first
 * following section heading (experience, shift timings, ...) and at office
 * footers such as "India | UK | USA". Separators left dangling by those cuts
 * (for example the comma in "Pune, India | UK") are removed, and the result
 * is capped at 120 characters.
 *
 * @param text - Plain text, typically the output of `stripHtml`.
 * @returns The cleaned location, or `undefined` when no label is found or
 *   nothing usable remains.
 */
export function extractJobLocationFromText(text: string): string | undefined {
  const match = text.match(/\bjob\s+location\s*:\s*([^\n]+)/i);
  if (!match?.[1]) return undefined;

  let extracted = match[1].trim();

  // Flattened descriptions have no line breaks, so cut where the next
  // section starts. Keywords match as prefixes ("experience" also cuts at
  // "Experienced"), which is what the original pattern did in practice.
  extracted =
    extracted
      .split(
        /\s+(?=experience|shift timings|company profile|responsibilities|technical requirements)/i,
      )[0]
      ?.trim() ?? extracted;

  // Drop office footers like "India | UK | USA" glued onto the city line.
  extracted =
    extracted.split(/\s+(?=(?:india|uk|usa|us|canada)\s*\|)/i)[0]?.trim() ??
    extracted;

  if (extracted.length > 120) {
    extracted = extracted.slice(0, 120).trim();
  }

  // The cuts above can strand a separator at the end ("Pune,", "Pune -").
  extracted = extracted.replace(/[\s,;:|\/\u2013\u2014-]+$/, "");

  return extracted || undefined;
}
/**
 * Narrow an unknown value to a plain key/value record.
 *
 * @returns The same reference when `value` is a non-null, non-array object;
 *   otherwise `null`.
 */
function asRecord(value: unknown): Record<string, unknown> | null {
  if (value === null || typeof value !== "object") return null;
  if (Array.isArray(value)) return null;
  return value as Record<string, unknown>;
}
/**
 * Parse every JSON-LD script element found in a page.
 *
 * @param html - Full page markup.
 * @returns The parsed payloads in document order; empty or malformed script
 *   bodies are skipped.
 */
function parseJsonLdBlocks(html: string): unknown[] {
  const parsed: unknown[] = [];

  for (const [, payload] of html.matchAll(JSON_LD_RE)) {
    const json = payload?.trim();
    if (!json) continue;

    try {
      parsed.push(JSON.parse(json));
    } catch {
      // ignore malformed blocks
    }
  }

  return parsed;
}
/**
 * Locate the first JobPosting object inside a parsed JSON-LD payload.
 *
 * Arrays are searched element by element. An object matches when its `@type`
 * (a single value or a list) contains "JobPosting", compared
 * case-insensitively; otherwise the search continues into `@graph` if the
 * object has one.
 *
 * @param node - Parsed JSON-LD value.
 * @returns The matching object, or `null` when none is found.
 */
function findJobPostingNode(node: unknown): Record<string, unknown> | null {
  if (Array.isArray(node)) {
    for (const child of node) {
      const hit = findJobPostingNode(child);
      if (hit) return hit;
    }
    return null;
  }

  const record = asRecord(node);
  if (!record) return null;

  const declared = record["@type"];
  let typeNames: unknown[] = [];
  if (Array.isArray(declared)) {
    typeNames = declared;
  } else if (declared) {
    typeNames = [declared];
  }

  for (const typeName of typeNames) {
    if (String(typeName).toLowerCase() === "jobposting") return record;
  }

  const graph = record["@graph"];
  return graph ? findJobPostingNode(graph) : null;
}
/**
 * Join the locality, region and country of a postal address into one label.
 *
 * Each part may be a string or, as schema.org permits for `addressCountry`,
 * a nested object carrying a string `name`. Blank and unusable parts are
 * dropped.
 *
 * @param address - A JSON-LD postal address object.
 * @returns The parts joined with ", ", or `undefined` when none are usable.
 */
function formatPostalAddress(
  address: Record<string, unknown>,
): string | undefined {
  const toText = (value: unknown): string => {
    if (typeof value === "string") return value.trim();
    // e.g. { "@type": "Country", "name": "India" }
    if (value && typeof value === "object" && !Array.isArray(value)) {
      const name = (value as Record<string, unknown>).name;
      return typeof name === "string" ? name.trim() : "";
    }
    return "";
  };

  const parts = [
    address.addressLocality,
    address.addressRegion,
    address.addressCountry,
  ]
    .map(toText)
    .filter(Boolean);

  return parts.length > 0 ? parts.join(", ") : undefined;
}
/**
 * Resolve a location for a JSON-LD JobPosting.
 *
 * A "Job Location:" label inside the posting's description takes priority;
 * otherwise the first `jobLocation` entry (single value or list) that yields
 * a location is used.
 *
 * @param posting - The JobPosting object.
 * @returns The location text, or `undefined` when none can be derived.
 */
function locationFromJobPosting(
  posting: Record<string, unknown>,
): string | undefined {
  const rawDescription =
    typeof posting.description === "string" ? posting.description : "";
  const labelled = extractJobLocationFromText(stripHtml(rawDescription));
  if (labelled) return labelled;

  const { jobLocation } = posting;
  if (!jobLocation) return undefined;

  const candidates: unknown[] = Array.isArray(jobLocation)
    ? jobLocation
    : [jobLocation];
  for (const candidate of candidates) {
    const resolved = locationFromJobLocationNode(candidate);
    if (resolved) return resolved;
  }
  return undefined;
}
/**
 * Turn one `jobLocation` entry into display text.
 *
 * Preference order: a plain-text `address`, then a structured `address`
 * formatted by `formatPostalAddress`, then the entry's own `name`. The name
 * is used as a fallback even when an address object is present but has no
 * usable parts, so a named place is not discarded because of an empty
 * address.
 *
 * @param node - A single `jobLocation` value from a JobPosting.
 * @returns The location text, or `undefined` when nothing usable is present.
 */
function locationFromJobLocationNode(node: unknown): string | undefined {
  const record = asRecord(node);
  if (!record) return undefined;

  const { address, name } = record;

  // schema.org allows `address` to be plain text as well as an object.
  if (typeof address === "string" && address.trim()) {
    return address.trim();
  }

  const structured = asRecord(address);
  const formatted = structured ? formatPostalAddress(structured) : undefined;
  if (formatted) return formatted;

  if (typeof name === "string" && name.trim()) {
    return name.trim();
  }
  return undefined;
}
/**
 * Extract location and description from a job detail page.
 *
 * JSON-LD is tried first: the first JobPosting that yields a description or
 * a location wins. If no posting qualifies, the whole page is flattened to
 * text, searched for a "Job Location:" label, and used (capped at 12000
 * characters) as the description.
 *
 * @param html - Full markup of the detail page.
 * @returns The recovered fields, or `null` when the fallback text is shorter
 *   than 40 characters and contains no location label.
 */
export function parseQaJobDetailPage(
  html: string,
): QaJobDetailEnrichment | null {
  const build = (
    location: string | undefined,
    jobDescription: string | undefined,
  ): QaJobDetailEnrichment => {
    const result: QaJobDetailEnrichment = {};
    if (location) result.location = location;
    if (jobDescription) result.jobDescription = jobDescription;
    return result;
  };

  for (const block of parseJsonLdBlocks(html)) {
    const posting = findJobPostingNode(block);
    if (!posting) continue;

    const postingText =
      typeof posting.description === "string"
        ? stripHtml(posting.description)
        : undefined;
    const postingLocation = locationFromJobPosting(posting);
    if (postingText || postingLocation) {
      return build(postingLocation, postingText);
    }
  }

  const pageText = stripHtml(html);
  const pageLocation = extractJobLocationFromText(pageText);
  if (!pageLocation && pageText.length < 40) return null;

  return build(pageLocation, pageText ? pageText.slice(0, 12000) : undefined);
}
/**
 * Download a job detail page and parse it with `parseQaJobDetailPage`.
 *
 * The request, including reading the body, is aborted after `timeoutMs` so a
 * single unresponsive page cannot stall the caller indefinitely.
 *
 * @param jobUrl - Absolute URL of the detail page.
 * @param fetchImpl - Fetch implementation to use; injectable for tests.
 * @param timeoutMs - Milliseconds to wait before aborting the request.
 * @returns The parsed enrichment, or `null` when the page yields nothing.
 * @throws Error when the response status is not OK. Failures raised by
 *   `fetchImpl` itself, including the rejection caused by the abort signal,
 *   propagate unchanged.
 */
export async function fetchQaJobDetailEnrichment(
  jobUrl: string,
  fetchImpl: typeof fetch = fetch,
  timeoutMs = 15000,
): Promise<QaJobDetailEnrichment | null> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const response = await fetchImpl(jobUrl, {
      headers: {
        Accept: "text/html,application/xhtml+xml",
        "User-Agent": "JobOps/1.0",
      },
      signal: controller.signal,
    });
    if (!response.ok) {
      throw new Error(
        `QAJobsBoard detail page ${response.status} for ${jobUrl}`,
      );
    }
    const html = await response.text();
    return parseQaJobDetailPage(html);
  } finally {
    // Always release the timer so it cannot keep the process alive.
    clearTimeout(timer);
  }
}

View File

@ -0,0 +1,33 @@
import { describe, expect, it } from "vitest";
import {
extractJobLocationFromText,
parseQaJobDetailPage,
} from "../src/detail-page.js";
// Minimal detail page: one JSON-LD JobPosting whose description carries a
// "Job Location:" label followed by an office footer ("India | UK | USA"),
// and no structured jobLocation.
const SAMPLE_JSON_LD = `<!DOCTYPE html><html><body>
<script type="application/ld+json">
{
"@context": "https://schema.org/",
"@type": "JobPosting",
"title": "Sr. QA Automation Engineer",
"description": "<p><strong>Job Location: Mumbai/Nagpur</strong></p><p>India | UK | USA</p>",
"jobLocationType": "TELECOMMUTE"
}
</script>
</body></html>`;
// Each exported function gets its own suite so a failure report names the
// function that actually regressed.
describe("parseQaJobDetailPage", () => {
  it("extracts job location from JSON-LD description on the detail page", () => {
    const result = parseQaJobDetailPage(SAMPLE_JSON_LD);
    expect(result?.location).toBe("Mumbai/Nagpur");
    expect(result?.jobDescription).toContain("Mumbai/Nagpur");
  });
});

describe("extractJobLocationFromText", () => {
  it("extracts job location lines from plain text", () => {
    expect(
      extractJobLocationFromText(
        "Job Location: Vancouver, British Columbia, Canada",
      ),
    ).toBe("Vancouver, British Columbia, Canada");
  });
});