From 03d293699acd3fa6b98e1d8339331fc9915e4201 Mon Sep 17 00:00:00 2001
From: ilia
Date: Sat, 16 May 2026 17:20:53 -0400
Subject: [PATCH] feat(qajobsboard): fetch job detail pages for concrete
 location text

The jobs.json feed often labels roles Remote/Worldwide while the public job
page JSON-LD and description include the real city (e.g. Mumbai/Nagpur).
Enrich vague rows by reading each QAJobsBoard detail URL before import.

Co-authored-by: Cursor
---
 extractors/qajobsboard/manifest.ts            |  98 +++++---
 extractors/qajobsboard/src/detail-page.ts     | 231 ++++++++++++++++++
 .../qajobsboard/tests/detail-page.test.ts     |  54 ++++
 3 files changed, 349 insertions(+), 34 deletions(-)
 create mode 100644 extractors/qajobsboard/src/detail-page.ts
 create mode 100644 extractors/qajobsboard/tests/detail-page.test.ts

diff --git a/extractors/qajobsboard/manifest.ts b/extractors/qajobsboard/manifest.ts
index ff19205..287217c 100644
--- a/extractors/qajobsboard/manifest.ts
+++ b/extractors/qajobsboard/manifest.ts
@@ -9,6 +9,11 @@ import type {
   ExtractorRunResult,
 } from "@shared/types/extractors";
 import type { CreateJobInput } from "@shared/types/jobs";
+import {
+  extractJobLocationFromText,
+  fetchQaJobDetailEnrichment,
+  stripHtml,
+} from "./src/detail-page.js";
 
 const JOBS_URL = "https://qajobsboard.jobboardly.com/jobs.json";
 
@@ -46,21 +51,6 @@ function asString(value: unknown): string | undefined {
   return trimmed ? trimmed : undefined;
 }
 
-function decodeHtmlEntities(value: string): string {
-  return value
-    .replace(/&amp;/g, "&")
-    .replace(/&lt;/g, "<")
-    .replace(/&gt;/g, ">")
-    .replace(/&quot;/g, '"')
-    .replace(/&#39;/g, "'")
-    .replace(/&nbsp;/g, " ");
-}
-
-function stripHtml(html: string): string {
-  const noTags = html.replace(/<[^>]+>/g, " ");
-  return decodeHtmlEntities(noTags).replace(/\s+/g, " ").trim();
-}
-
 function salaryLabel(raw: SalaryBand | undefined): string | undefined {
   if (!raw) return undefined;
   const schedule = raw.schedule ? `${raw.schedule}: ` : "";
@@ -93,19 +83,6 @@ function isVagueLocationLabel(value: string): boolean {
   return VAGUE_LOCATION_LABELS.has(value.trim().toLowerCase());
 }
 
-function extractJobLocationFromDescription(
-  html: string | undefined,
-): string | undefined {
-  if (!html) return undefined;
-  const text = stripHtml(html);
-  const match = text.match(
-    /\bjob\s+location\s*:\s*([^\n.]{2,120})/i,
-  );
-  if (!match?.[1]) return undefined;
-  const extracted = match[1].trim();
-  return extracted || undefined;
-}
-
 function locationLabel(job: QaJobBoardlyJob): string {
   const limits = Array.isArray(job.location_limits)
     ? job.location_limits
@@ -117,9 +94,9 @@ function locationLabel(job: QaJobBoardlyJob): string {
   const loc = asString(job.location);
   if (loc && !isVagueLocationLabel(loc)) return loc;
 
-  const fromDescription = extractJobLocationFromDescription(
-    job.description?.html,
-  );
+  const fromDescription = job.description?.html
+    ? extractJobLocationFromText(stripHtml(job.description.html))
+    : undefined;
   if (fromDescription) return fromDescription;
 
   if (loc) return loc;
@@ -179,6 +156,54 @@ function mapJob(raw: QaJobBoardlyJob): CreateJobInput | null {
   };
 }
 
+/** Number of detail pages fetched concurrently in one batch. */
+const DETAIL_ENRICH_CONCURRENCY = 4;
+
+/** True when the feed location is missing, blank, or a vague label. */
+function needsDetailEnrichment(location: string | undefined): boolean {
+  if (!location?.trim()) return true;
+  return isVagueLocationLabel(location);
+}
+
+/**
+ * Return a copy of `jobs` in which rows with a vague location are overlaid
+ * with the location/description read from their detail page. A failed fetch
+ * keeps the feed row; cancellation stops before the next batch starts.
+ */
+async function enrichJobsFromDetailPages(
+  jobs: CreateJobInput[],
+  shouldCancel?: () => boolean,
+): Promise<CreateJobInput[]> {
+  const enriched = jobs.map((job) => ({ ...job }));
+  const targets = enriched
+    .map((job, index) => ({ job, index }))
+    .filter(({ job }) => needsDetailEnrichment(job.location));
+
+  for (let offset = 0; offset < targets.length; offset += DETAIL_ENRICH_CONCURRENCY) {
+    if (shouldCancel?.()) break;
+    const batch = targets.slice(offset, offset + DETAIL_ENRICH_CONCURRENCY);
+    await Promise.all(
+      batch.map(async ({ job, index }) => {
+        try {
+          const detail = await fetchQaJobDetailEnrichment(job.jobUrl);
+          if (!detail) return;
+          enriched[index] = {
+            ...job,
+            ...(detail.location ? { location: detail.location } : {}),
+            ...(detail.jobDescription
+              ? { jobDescription: detail.jobDescription }
+              : {}),
+          };
+        } catch {
+          // keep feed row when detail page fetch fails
+        }
+      }),
+    );
+  }
+
+  return enriched;
+}
+
 export const manifest: ExtractorManifest = {
   id: "qajobsboard",
   displayName: "QAJobsBoard",
@@ -230,16 +255,21 @@ export const manifest: ExtractorManifest = {
         out.push(mapped);
       }
 
+      const withDetails = await enrichJobsFromDetailPages(
+        out,
+        context.shouldCancel,
+      );
+
       context.onProgress?.({
         phase: "list",
         termsProcessed: 1,
         termsTotal: 1,
         currentUrl: JOBS_URL,
-        jobPagesProcessed: out.length,
-        detail: `QAJobsBoard: ${out.length} matched (${rows.length} total listings)`,
+        jobPagesProcessed: withDetails.length,
+        detail: `QAJobsBoard: ${withDetails.length} matched (${rows.length} total listings, detail pages for vague locations)`,
       });
 
-      return { success: true, jobs: out };
+      return { success: true, jobs: withDetails };
     } catch (error) {
       const message = error instanceof Error ? error.message : "Unknown error";
       return { success: false, jobs: [], error: message };
diff --git a/extractors/qajobsboard/src/detail-page.ts b/extractors/qajobsboard/src/detail-page.ts
new file mode 100644
index 0000000..f1030be
--- /dev/null
+++ b/extractors/qajobsboard/src/detail-page.ts
@@ -0,0 +1,231 @@
+/**
+ * Fetch a QAJobsBoard job detail page for richer JSON-LD and HTML than jobs.json alone.
+ */
+
+/** Fields recovered from a detail page; a key is omitted when nothing was found. */
+export interface QaJobDetailEnrichment {
+  location?: string;
+  jobDescription?: string;
+}
+
+/** Upper bound on one detail request so a stalled server cannot hang the run. */
+const DETAIL_FETCH_TIMEOUT_MS = 15000;
+
+/** Captures the body of every `<script type="application/ld+json">` element. */
+const JSON_LD_RE =
+  /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
+
+/** Whole `<script>`/`<style>` elements; their contents are never visible text. */
+const NON_TEXT_ELEMENT_RE = /<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi;
+
+/**
+ * Decode the few entities the feed and detail pages use.
+ * `&amp;` goes last so `&amp;lt;` yields the literal text `&lt;`, not `<`.
+ */
+function decodeHtmlEntities(value: string): string {
+  return value
+    .replace(/&lt;/g, "<")
+    .replace(/&gt;/g, ">")
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&nbsp;/g, " ")
+    .replace(/&amp;/g, "&");
+}
+
+/**
+ * Reduce an HTML fragment or page to single-spaced plain text.
+ * Script and style bodies are dropped first so code never leaks into the text.
+ */
+export function stripHtml(html: string): string {
+  const noTags = html
+    .replace(NON_TEXT_ELEMENT_RE, " ")
+    .replace(/<[^>]+>/g, " ");
+  return decodeHtmlEntities(noTags).replace(/\s+/g, " ").trim();
+}
+
+/**
+ * Pull the value of a "Job Location: ..." label out of plain text.
+ *
+ * Callers pass whitespace-collapsed text, so the capture can run to the end
+ * of the input; it is cut at the next known section heading, at a trailing
+ * office footer, and finally at 120 characters.
+ *
+ * @returns the trimmed location, or `undefined` when no label is present.
+ */
+export function extractJobLocationFromText(text: string): string | undefined {
+  const match = text.match(/\bjob\s+location\s*:\s*([^\n]+)/i);
+  if (!match?.[1]) return undefined;
+  let extracted = match[1].trim();
+  // `\b` sits inside the lookahead so only whole headings end the location.
+  extracted =
+    extracted.split(
+      /\s+(?=(?:experience|shift timings|company profile|responsibilities|technical requirements)\b)/i,
+    )[0]?.trim() ?? extracted;
+  // Drop office footers like "India | UK | USA" glued onto the city line.
+  extracted =
+    extracted.split(/\s+(?=(?:india|uk|usa|us|canada)\s*\|)/i)[0]?.trim() ??
+    extracted;
+  if (extracted.length > 120) {
+    extracted = extracted.slice(0, 120).trim();
+  }
+  return extracted || undefined;
+}
+
+/** Narrow an unknown JSON value to a plain (non-array) object, else `null`. */
+function asRecord(value: unknown): Record<string, unknown> | null {
+  return value && typeof value === "object" && !Array.isArray(value)
+    ? (value as Record<string, unknown>)
+    : null;
+}
+
+/** Parse every JSON-LD script block in the page, skipping malformed ones. */
+function parseJsonLdBlocks(html: string): unknown[] {
+  const out: unknown[] = [];
+  for (const match of html.matchAll(JSON_LD_RE)) {
+    const raw = match[1]?.trim();
+    if (!raw) continue;
+    try {
+      out.push(JSON.parse(raw));
+    } catch {
+      // ignore malformed blocks
+    }
+  }
+  return out;
+}
+
+/** Find the first `JobPosting` node in a JSON-LD value, its arrays or `@graph`. */
+function findJobPostingNode(node: unknown): Record<string, unknown> | null {
+  if (Array.isArray(node)) {
+    for (const item of node) {
+      const found = findJobPostingNode(item);
+      if (found) return found;
+    }
+    return null;
+  }
+  const record = asRecord(node);
+  if (!record) return null;
+  const type = record["@type"];
+  const types = Array.isArray(type) ? type : type ? [type] : [];
+  if (types.some((t) => String(t).toLowerCase() === "jobposting")) {
+    return record;
+  }
+  if (record["@graph"]) {
+    return findJobPostingNode(record["@graph"]);
+  }
+  return null;
+}
+
+/** Read a schema.org text value, or the `name` of a nested node like `Country`. */
+function textOrName(value: unknown): string {
+  if (typeof value === "string") return value.trim();
+  const name = asRecord(value)?.name;
+  return typeof name === "string" ? name.trim() : "";
+}
+
+/** Join locality, region and country of a `PostalAddress`, skipping blanks. */
+function formatPostalAddress(address: Record<string, unknown>): string | undefined {
+  const parts = [
+    address.addressLocality,
+    address.addressRegion,
+    address.addressCountry,
+  ]
+    .map(textOrName)
+    .filter(Boolean);
+  return parts.length > 0 ? parts.join(", ") : undefined;
+}
+
+/** Prefer a "Job Location:" label in the description, then `jobLocation`. */
+function locationFromJobPosting(posting: Record<string, unknown>): string | undefined {
+  const description =
+    typeof posting.description === "string" ? posting.description : "";
+  const fromDescription = extractJobLocationFromText(stripHtml(description));
+  if (fromDescription) return fromDescription;
+
+  const jobLocation = posting.jobLocation;
+  if (Array.isArray(jobLocation)) {
+    for (const entry of jobLocation) {
+      const loc = locationFromJobLocationNode(entry);
+      if (loc) return loc;
+    }
+  } else if (jobLocation) {
+    const loc = locationFromJobLocationNode(jobLocation);
+    if (loc) return loc;
+  }
+
+  return undefined;
+}
+
+/** Resolve one `jobLocation` entry: its postal address first, then its name. */
+function locationFromJobLocationNode(node: unknown): string | undefined {
+  const record = asRecord(node);
+  if (!record) return undefined;
+  const address = asRecord(record.address);
+  const formatted = address ? formatPostalAddress(address) : undefined;
+  // An address with no usable parts must not hide a usable place name.
+  if (formatted) return formatted;
+  if (typeof record.name === "string" && record.name.trim()) {
+    return record.name.trim();
+  }
+  return undefined;
+}
+
+/**
+ * Extract location and description from a detail page's HTML.
+ *
+ * The first JSON-LD `JobPosting` that yields either field wins; otherwise the
+ * visible page text is scanned, with the description capped at 12000 chars.
+ *
+ * @returns `null` when there is no location and under 40 chars of page text.
+ */
+export function parseQaJobDetailPage(html: string): QaJobDetailEnrichment | null {
+  for (const block of parseJsonLdBlocks(html)) {
+    const posting = findJobPostingNode(block);
+    if (!posting) continue;
+
+    const description =
+      typeof posting.description === "string"
+        ? stripHtml(posting.description)
+        : undefined;
+    const location = locationFromJobPosting(posting);
+
+    if (!description && !location) continue;
+    return {
+      ...(location ? { location } : {}),
+      ...(description ? { jobDescription: description } : {}),
+    };
+  }
+
+  const bodyText = stripHtml(html);
+  const location = extractJobLocationFromText(bodyText);
+  if (!location && bodyText.length < 40) return null;
+  return {
+    ...(location ? { location } : {}),
+    ...(bodyText ? { jobDescription: bodyText.slice(0, 12000) } : {}),
+  };
+}
+
+/**
+ * Download one detail page and parse it.
+ *
+ * @param jobUrl absolute URL of the public job page.
+ * @param fetchImpl injectable `fetch`, used by tests.
+ * @returns the parsed enrichment, or `null` when the page held nothing usable.
+ * @throws when the request fails, times out, or answers with a non-2xx status.
+ */
+export async function fetchQaJobDetailEnrichment(
+  jobUrl: string,
+  fetchImpl: typeof fetch = fetch,
+): Promise<QaJobDetailEnrichment | null> {
+  const response = await fetchImpl(jobUrl, {
+    headers: {
+      Accept: "text/html,application/xhtml+xml",
+      "User-Agent": "JobOps/1.0",
+    },
+    signal: AbortSignal.timeout(DETAIL_FETCH_TIMEOUT_MS),
+  });
+  if (!response.ok) {
+    throw new Error(`QAJobsBoard detail page ${response.status} for ${jobUrl}`);
+  }
+  const html = await response.text();
+  return parseQaJobDetailPage(html);
+}
diff --git a/extractors/qajobsboard/tests/detail-page.test.ts b/extractors/qajobsboard/tests/detail-page.test.ts
new file mode 100644
index 0000000..4d28af0
--- /dev/null
+++ b/extractors/qajobsboard/tests/detail-page.test.ts
@@ -0,0 +1,54 @@
+import { describe, expect, it } from "vitest";
+import {
+  extractJobLocationFromText,
+  parseQaJobDetailPage,
+  stripHtml,
+} from "../src/detail-page.js";
+
+const SAMPLE_JSON_LD = `
+<script type="application/ld+json">
+{
+  "@context": "https://schema.org",
+  "@type": "JobPosting",
+  "title": "QA Engineer",
+  "description": "<p>Job Location: Mumbai/Nagpur</p><p>Experience: 3+ years</p>",
+  "jobLocationType": "TELECOMMUTE"
+}
+</script>
+`;
+
+describe("parseQaJobDetailPage", () => {
+  it("extracts job location from JSON-LD description on the detail page", () => {
+    const result = parseQaJobDetailPage(SAMPLE_JSON_LD);
+    expect(result?.location).toBe("Mumbai/Nagpur");
+    expect(result?.jobDescription).toContain("Mumbai/Nagpur");
+  });
+
+  it("extracts job location lines from plain text", () => {
+    expect(
+      extractJobLocationFromText(
+        "Job Location: Vancouver, British Columbia, Canada",
+      ),
+    ).toBe("Vancouver, British Columbia, Canada");
+  });
+
+  it("falls back to the place name when the postal address is empty", () => {
+    const html = `<script type="application/ld+json">${JSON.stringify({
+      "@type": "JobPosting",
+      jobLocation: { "@type": "Place", name: "Pune", address: {} },
+    })}</script>`;
+    expect(parseQaJobDetailPage(html)?.location).toBe("Pune");
+  });
+});
+
+describe("stripHtml", () => {
+  it("decodes &amp; last so escaped entities are not decoded twice", () => {
+    expect(stripHtml("<p>R&amp;D &amp;lt;b&amp;gt;</p>")).toBe("R&D &lt;b&gt;");
+  });
+
+  it("drops script and style contents", () => {
+    expect(
+      stripHtml("<style>p{}</style><p>Hi</p><script>var a=1;</script>"),
+    ).toBe("Hi");
+  });
+});