feat(qajobsboard): fetch job detail pages for concrete location text
Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m13s
CI / Type Check (adzuna-extractor) (push) Successful in 1m10s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m14s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m11s
CI / Type Check (orchestrator) (push) Successful in 1m30s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m12s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m13s
CI / Documentation (push) Successful in 1m57s
Some checks failed
CI / Linting (Biome) (push) Failing after 40s
CI / Tests (push) Successful in 5m13s
CI / Type Check (adzuna-extractor) (push) Successful in 1m10s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m14s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m11s
CI / Type Check (orchestrator) (push) Successful in 1m30s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m12s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m13s
CI / Documentation (push) Successful in 1m57s
The jobs.json feed often labels roles Remote/Worldwide even though the public job page's JSON-LD and description include the real city (e.g. Mumbai/Nagpur). Enrich rows with vague locations by fetching each QAJobsBoard detail URL before import.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
0a63316100
commit
03d293699a
@ -9,6 +9,11 @@ import type {
|
||||
ExtractorRunResult,
|
||||
} from "@shared/types/extractors";
|
||||
import type { CreateJobInput } from "@shared/types/jobs";
|
||||
import {
|
||||
extractJobLocationFromText,
|
||||
fetchQaJobDetailEnrichment,
|
||||
stripHtml,
|
||||
} from "./src/detail-page.js";
|
||||
|
||||
const JOBS_URL = "https://qajobsboard.jobboardly.com/jobs.json";
|
||||
|
||||
@ -46,21 +51,6 @@ function asString(value: unknown): string | undefined {
|
||||
return trimmed ? trimmed : undefined;
|
||||
}
|
||||
|
||||
/**
 * Decode the HTML character references that commonly appear in job text.
 *
 * Supports the named references `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`
 * and `&nbsp;`, plus decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric
 * references. Everything is decoded in one pass, so output is never scanned
 * again: `&amp;lt;` becomes the literal text `&lt;`, not `<`.
 *
 * @param value - Text that may contain HTML character references.
 * @returns The text with recognised references replaced. Unknown names and
 *   out-of-range code points are left exactly as written.
 */
function decodeHtmlEntities(value: string): string {
  // A Map (not an object literal) so names like "constructor" cannot hit
  // inherited properties.
  const named = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
  ]);
  return value.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
    (entity: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) return named.get(name) ?? entity;
      const codePoint =
        dec !== undefined
          ? Number.parseInt(dec, 10)
          : Number.parseInt(hex ?? "", 16);
      // String.fromCodePoint throws outside the Unicode range.
      return codePoint >= 0 && codePoint <= 0x10ffff
        ? String.fromCodePoint(codePoint)
        : entity;
    },
  );
}
|
||||
|
||||
/**
 * Reduce an HTML fragment or page to single-line plain text.
 *
 * HTML comments and `<script>`/`<style>` elements are removed together with
 * their contents, every remaining tag is replaced by a space, character
 * references are decoded via `decodeHtmlEntities`, and runs of whitespace are
 * collapsed to one space.
 *
 * @param html - Markup to convert.
 * @returns Trimmed plain text without line breaks.
 */
function stripHtml(html: string): string {
  const withoutHiddenContent = html
    .replace(/<!--[\s\S]*?-->/g, " ")
    // Script and style bodies are code, not readable text.
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, " ");
  const withoutTags = withoutHiddenContent.replace(/<[^>]+>/g, " ");
  return decodeHtmlEntities(withoutTags).replace(/\s+/g, " ").trim();
}
|
||||
|
||||
function salaryLabel(raw: SalaryBand | undefined): string | undefined {
|
||||
if (!raw) return undefined;
|
||||
const schedule = raw.schedule ? `${raw.schedule}: ` : "";
|
||||
@ -93,19 +83,6 @@ function isVagueLocationLabel(value: string): boolean {
|
||||
return VAGUE_LOCATION_LABELS.has(value.trim().toLowerCase());
|
||||
}
|
||||
|
||||
/**
 * Pull the value of a "Job Location: …" label out of an HTML description.
 *
 * The markup is flattened with `stripHtml` first. The captured value stops at
 * the first period or line break and is limited to 120 characters by the
 * pattern itself.
 *
 * @param html - Description markup; `undefined` or empty yields `undefined`.
 * @returns The trimmed location text, or `undefined` when no label is found.
 */
function extractJobLocationFromDescription(
  html: string | undefined,
): string | undefined {
  if (!html) return undefined;
  const match = /\bjob\s+location\s*:\s*([^\n.]{2,120})/i.exec(
    stripHtml(html),
  );
  const extracted = match?.[1]?.trim();
  return extracted || undefined;
}
|
||||
|
||||
function locationLabel(job: QaJobBoardlyJob): string {
|
||||
const limits = Array.isArray(job.location_limits)
|
||||
? job.location_limits
|
||||
@ -117,9 +94,9 @@ function locationLabel(job: QaJobBoardlyJob): string {
|
||||
const loc = asString(job.location);
|
||||
if (loc && !isVagueLocationLabel(loc)) return loc;
|
||||
|
||||
const fromDescription = extractJobLocationFromDescription(
|
||||
job.description?.html,
|
||||
);
|
||||
const fromDescription = job.description?.html
|
||||
? extractJobLocationFromText(stripHtml(job.description.html))
|
||||
: undefined;
|
||||
if (fromDescription) return fromDescription;
|
||||
|
||||
if (loc) return loc;
|
||||
@ -179,6 +156,47 @@ function mapJob(raw: QaJobBoardlyJob): CreateJobInput | null {
|
||||
};
|
||||
}
|
||||
|
||||
/** Number of detail pages `enrichJobsFromDetailPages` fetches in parallel per batch. */
const DETAIL_ENRICH_CONCURRENCY = 4;
|
||||
|
||||
/**
 * Decide whether a mapped job should have its detail page fetched.
 *
 * @param location - Location taken from the feed row, if any.
 * @returns `true` when the location is missing, blank, or one of the vague
 *   labels recognised by `isVagueLocationLabel`.
 */
function needsDetailEnrichment(location: string | undefined): boolean {
  if (location === undefined || location.trim() === "") return true;
  return isVagueLocationLabel(location);
}
|
||||
|
||||
/**
 * Replace vague feed locations with data read from each job's detail page.
 *
 * Only rows for which `needsDetailEnrichment` returns `true` are fetched, in
 * batches of `DETAIL_ENRICH_CONCURRENCY` parallel requests. A row keeps its
 * feed values whenever the detail fetch throws or yields nothing.
 *
 * @param jobs - Mapped feed rows; neither the array nor its objects are mutated.
 * @param shouldCancel - Optional probe checked before each batch; once it
 *   returns `true` the remaining batches are skipped.
 * @returns A new array, in input order, with enriched copies where available.
 */
async function enrichJobsFromDetailPages(
  jobs: CreateJobInput[],
  shouldCancel?: () => boolean,
): Promise<CreateJobInput[]> {
  const enriched = jobs.map((job) => ({ ...job }));
  const targets = enriched.flatMap((job, index) =>
    needsDetailEnrichment(job.location) ? [{ job, index }] : [],
  );

  for (
    let offset = 0;
    offset < targets.length;
    offset += DETAIL_ENRICH_CONCURRENCY
  ) {
    if (shouldCancel?.()) break;

    const batch = targets.slice(offset, offset + DETAIL_ENRICH_CONCURRENCY);
    await Promise.all(
      batch.map(async ({ job, index }) => {
        try {
          const detail = await fetchQaJobDetailEnrichment(job.jobUrl);
          if (!detail) return;
          const { location, jobDescription } = detail;
          enriched[index] = {
            ...job,
            ...(location ? { location } : {}),
            ...(jobDescription ? { jobDescription } : {}),
          };
        } catch {
          // Best effort: keep the feed row when the detail page fetch fails.
        }
      }),
    );
  }

  return enriched;
}
|
||||
|
||||
export const manifest: ExtractorManifest = {
|
||||
id: "qajobsboard",
|
||||
displayName: "QAJobsBoard",
|
||||
@ -230,16 +248,21 @@ export const manifest: ExtractorManifest = {
|
||||
out.push(mapped);
|
||||
}
|
||||
|
||||
const withDetails = await enrichJobsFromDetailPages(
|
||||
out,
|
||||
context.shouldCancel,
|
||||
);
|
||||
|
||||
context.onProgress?.({
|
||||
phase: "list",
|
||||
termsProcessed: 1,
|
||||
termsTotal: 1,
|
||||
currentUrl: JOBS_URL,
|
||||
jobPagesProcessed: out.length,
|
||||
detail: `QAJobsBoard: ${out.length} matched (${rows.length} total listings)`,
|
||||
jobPagesProcessed: withDetails.length,
|
||||
detail: `QAJobsBoard: ${withDetails.length} matched (${rows.length} total listings, detail pages for vague locations)`,
|
||||
});
|
||||
|
||||
return { success: true, jobs: out };
|
||||
return { success: true, jobs: withDetails };
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : "Unknown error";
|
||||
return { success: false, jobs: [], error: message };
|
||||
|
||||
173
extractors/qajobsboard/src/detail-page.ts
Normal file
173
extractors/qajobsboard/src/detail-page.ts
Normal file
@ -0,0 +1,173 @@
|
||||
/**
|
||||
* Fetch a QAJobsBoard job detail page for richer JSON-LD and HTML than jobs.json alone.
|
||||
*/
|
||||
|
||||
/** Data recovered from a job detail page; a field is set only when found. */
export interface QaJobDetailEnrichment {
  /** Location text taken from the page's JSON-LD or "Job Location:" label. */
  location?: string;
  /** Plain-text job description with markup stripped. */
  jobDescription?: string;
}
|
||||
|
||||
/**
 * Matches `<script type="application/ld+json">` elements.
 * Capture group 1 is the raw script body.
 */
const JSON_LD_RE =
  /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
|
||||
|
||||
/**
 * Decode the HTML character references that commonly appear in job text.
 *
 * Supports the named references `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`
 * and `&nbsp;`, plus decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric
 * references. Everything is decoded in one pass, so output is never scanned
 * again: `&amp;lt;` becomes the literal text `&lt;`, not `<`.
 *
 * @param value - Text that may contain HTML character references.
 * @returns The text with recognised references replaced. Unknown names and
 *   out-of-range code points are left exactly as written.
 */
function decodeHtmlEntities(value: string): string {
  // A Map (not an object literal) so names like "constructor" cannot hit
  // inherited properties.
  const named = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
  ]);
  return value.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
    (entity: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) return named.get(name) ?? entity;
      const codePoint =
        dec !== undefined
          ? Number.parseInt(dec, 10)
          : Number.parseInt(hex ?? "", 16);
      // String.fromCodePoint throws outside the Unicode range.
      return codePoint >= 0 && codePoint <= 0x10ffff
        ? String.fromCodePoint(codePoint)
        : entity;
    },
  );
}
|
||||
|
||||
/**
 * Reduce an HTML fragment or page to single-line plain text.
 *
 * HTML comments and `<script>`/`<style>` elements are removed together with
 * their contents, every remaining tag is replaced by a space, character
 * references are decoded via `decodeHtmlEntities`, and runs of whitespace are
 * collapsed to one space.
 *
 * @param html - Markup to convert.
 * @returns Trimmed plain text without line breaks.
 */
export function stripHtml(html: string): string {
  const withoutHiddenContent = html
    .replace(/<!--[\s\S]*?-->/g, " ")
    // Script and style bodies are code, not readable text; without this the
    // whole-page fallback would mix JSON-LD and CSS into the description.
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, " ");
  const withoutTags = withoutHiddenContent.replace(/<[^>]+>/g, " ");
  return decodeHtmlEntities(withoutTags).replace(/\s+/g, " ").trim();
}
|
||||
|
||||
/**
 * Extract the value that follows a "Job Location:" label in plain text.
 *
 * Input is usually whitespace-collapsed text with no line breaks, so the raw
 * capture can run into whatever follows the location. The capture is therefore
 * cut at the first of:
 *  - a sentence end (`.`, `!` or `?` after at least three letters/digits, so
 *    abbreviations such as "St." or "U.S." survive),
 *  - a following section heading (prefix match, e.g. "Experience"),
 *  - an office footer such as "India | UK | USA".
 * Trailing separators are removed and the result is capped at 120 characters.
 *
 * @param text - Plain text to search.
 * @returns The location text, or `undefined` when the label is absent or its
 *   value is empty.
 */
export function extractJobLocationFromText(text: string): string | undefined {
  const captured = /\bjob\s+location\s*:\s*([^\n]+)/i.exec(text)?.[1];
  if (!captured) return undefined;

  const boundaries: RegExp[] = [
    /(?<=[\p{L}\p{N})]{3})[.!?](?=\s|$)/u,
    /\s+(?=experience|shift timings|company profile|responsibilities|technical requirements)/i,
    // Drop office footers like "India | UK | USA" glued onto the city line.
    /\s+(?=(?:india|uk|usa|us|canada)\s*\|)/i,
  ];

  let extracted = captured.trim();
  for (const boundary of boundaries) {
    extracted = (extracted.split(boundary)[0] ?? extracted).trim();
  }
  // A cut can leave a dangling separator, e.g. "Pune," before "India | UK".
  extracted = extracted.replace(/[\s,;:|/-]+$/, "");
  if (extracted.length > 120) {
    extracted = extracted.slice(0, 120).trim();
  }
  return extracted || undefined;
}
|
||||
|
||||
/**
 * Narrow an unknown value to a plain key/value record.
 *
 * @param value - Any parsed JSON value.
 * @returns The same object typed as a record, or `null` for primitives,
 *   `null` and arrays.
 */
function asRecord(value: unknown): Record<string, unknown> | null {
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
    return null;
  }
  return value as Record<string, unknown>;
}
|
||||
|
||||
/**
 * Parse every JSON-LD `<script>` body found by `JSON_LD_RE` in a page.
 *
 * @param html - Full page markup.
 * @returns The parsed values in document order; empty and malformed blocks
 *   are skipped.
 */
function parseJsonLdBlocks(html: string): unknown[] {
  const blocks: unknown[] = [];
  for (const [, body] of html.matchAll(JSON_LD_RE)) {
    const raw = body?.trim();
    if (!raw) continue;
    try {
      blocks.push(JSON.parse(raw));
    } catch {
      // One malformed block must not hide valid blocks elsewhere on the page.
    }
  }
  return blocks;
}
|
||||
|
||||
/**
 * Locate the first schema.org `JobPosting` node inside a parsed JSON-LD value.
 *
 * Arrays are searched in order and `@graph` containers are followed
 * recursively. `@type` may be a string or an array of strings. Besides the
 * bare name, IRI and prefixed forms such as `https://schema.org/JobPosting`
 * and `schema:JobPosting` are accepted (case-insensitively).
 *
 * @param node - Parsed JSON-LD value of any shape.
 * @returns The matching node object, or `null` when none exists.
 */
function findJobPostingNode(node: unknown): Record<string, unknown> | null {
  if (Array.isArray(node)) {
    for (const item of node) {
      const found = findJobPostingNode(item);
      if (found) return found;
    }
    return null;
  }
  if (typeof node !== "object" || node === null) return null;
  const record = node as Record<string, unknown>;

  const rawType = record["@type"];
  const types: unknown[] = Array.isArray(rawType) ? rawType : [rawType];
  const isJobPosting = types.some(
    (t) => typeof t === "string" && /(?:^|[/#:])jobposting$/i.test(t.trim()),
  );
  if (isJobPosting) return record;

  const graph = record["@graph"];
  return graph ? findJobPostingNode(graph) : null;
}
|
||||
|
||||
/**
 * Render a schema.org `PostalAddress` node as "locality, region, country".
 *
 * Each part may be a string or a nested node carrying a string `name`
 * (schema.org allows `addressCountry` to be a `Country` object). Missing,
 * blank and otherwise-typed parts are omitted.
 *
 * @param address - The `address` object of a job location.
 * @returns The comma-separated parts, or `undefined` when none are usable.
 */
function formatPostalAddress(
  address: Record<string, unknown>,
): string | undefined {
  const textOf = (part: unknown): string => {
    if (typeof part === "string") return part.trim();
    if (typeof part === "object" && part !== null && !Array.isArray(part)) {
      const name = (part as Record<string, unknown>).name;
      return typeof name === "string" ? name.trim() : "";
    }
    return "";
  };

  const parts = [
    address.addressLocality,
    address.addressRegion,
    address.addressCountry,
  ]
    .map(textOf)
    .filter(Boolean);
  return parts.length > 0 ? parts.join(", ") : undefined;
}
|
||||
|
||||
/**
 * Derive a location string from a JSON-LD `JobPosting` node.
 *
 * A "Job Location:" label inside the posting's description takes precedence.
 * Otherwise the first `jobLocation` entry (single node or array) that yields
 * text via `locationFromJobLocationNode` is used.
 *
 * @param posting - The JobPosting node.
 * @returns The location text, or `undefined` when neither source has one.
 */
function locationFromJobPosting(
  posting: Record<string, unknown>,
): string | undefined {
  if (typeof posting.description === "string") {
    const fromDescription = extractJobLocationFromText(
      stripHtml(posting.description),
    );
    if (fromDescription) return fromDescription;
  }

  const { jobLocation } = posting;
  if (!jobLocation) return undefined;

  const candidates: unknown[] = Array.isArray(jobLocation)
    ? jobLocation
    : [jobLocation];
  for (const candidate of candidates) {
    const loc = locationFromJobLocationNode(candidate);
    if (loc) return loc;
  }
  return undefined;
}
|
||||
|
||||
/**
 * Turn one `jobLocation` entry into display text.
 *
 * The entry's `address` is preferred: an object is formatted with
 * `formatPostalAddress`, a plain string is used as-is. When the address
 * produces no text, the entry's `name` is used instead.
 *
 * @param node - A single `jobLocation` value from JSON-LD.
 * @returns The location text, or `undefined` when the entry has none.
 */
function locationFromJobLocationNode(node: unknown): string | undefined {
  const record = asRecord(node);
  if (!record) return undefined;

  const address = asRecord(record.address);
  let fromAddress: string | undefined;
  if (address) {
    fromAddress = formatPostalAddress(address);
  } else if (typeof record.address === "string") {
    fromAddress = record.address.trim();
  }
  // Fall through on an empty address so a usable `name` is not lost.
  if (fromAddress) return fromAddress;

  const name = typeof record.name === "string" ? record.name.trim() : "";
  return name || undefined;
}
|
||||
|
||||
/**
 * Extract location and description from a job detail page.
 *
 * The first JSON-LD `JobPosting` node that provides a description or a
 * location wins. If none does, the whole page is flattened to text and
 * searched for a "Job Location:" label; in that fallback the description is
 * the page text, truncated to 12000 characters.
 *
 * @param html - Full markup of the detail page.
 * @returns The enrichment data, or `null` when the page has no JSON-LD data,
 *   no location label and fewer than 40 characters of text.
 */
export function parseQaJobDetailPage(
  html: string,
): QaJobDetailEnrichment | null {
  const minFallbackTextLength = 40;
  const maxFallbackDescriptionLength = 12000;

  for (const block of parseJsonLdBlocks(html)) {
    const posting = findJobPostingNode(block);
    if (!posting) continue;

    const description =
      typeof posting.description === "string"
        ? stripHtml(posting.description)
        : undefined;
    const location = locationFromJobPosting(posting);
    if (!description && !location) continue;

    return {
      ...(location ? { location } : {}),
      ...(description ? { jobDescription: description } : {}),
    };
  }

  const bodyText = stripHtml(html);
  const location = extractJobLocationFromText(bodyText);
  if (!location && bodyText.length < minFallbackTextLength) return null;

  return {
    ...(location ? { location } : {}),
    ...(bodyText
      ? { jobDescription: bodyText.slice(0, maxFallbackDescriptionLength) }
      : {}),
  };
}
|
||||
|
||||
/**
 * Download a job detail page and parse it with `parseQaJobDetailPage`.
 *
 * The request, including reading the body, is aborted after `timeoutMs` so a
 * stalled page cannot block the caller indefinitely.
 *
 * @param jobUrl - Absolute URL of the detail page.
 * @param fetchImpl - Fetch implementation; injectable for tests.
 * @param timeoutMs - Abort deadline in milliseconds (default 15000).
 * @returns Parsed enrichment data, or `null` when the page yields nothing.
 * @throws Error when the response status is not OK; also rejects when the
 *   request fails or is aborted by the timeout.
 */
export async function fetchQaJobDetailEnrichment(
  jobUrl: string,
  fetchImpl: typeof fetch = fetch,
  timeoutMs = 15000,
): Promise<QaJobDetailEnrichment | null> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetchImpl(jobUrl, {
      headers: {
        Accept: "text/html,application/xhtml+xml",
        "User-Agent": "JobOps/1.0",
      },
      signal: controller.signal,
    });
    if (!response.ok) {
      throw new Error(
        `QAJobsBoard detail page ${response.status} for ${jobUrl}`,
      );
    }
    const html = await response.text();
    return parseQaJobDetailPage(html);
  } finally {
    clearTimeout(timer);
  }
}
|
||||
33
extractors/qajobsboard/tests/detail-page.test.ts
Normal file
33
extractors/qajobsboard/tests/detail-page.test.ts
Normal file
@ -0,0 +1,33 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
extractJobLocationFromText,
|
||||
parseQaJobDetailPage,
|
||||
} from "../src/detail-page.js";
|
||||
|
||||
// Minimal detail page: the JSON-LD description carries the real city while
// jobLocationType only says TELECOMMUTE.
const SAMPLE_JSON_LD = `<!DOCTYPE html><html><body>
<script type="application/ld+json">
{
  "@context": "https://schema.org/",
  "@type": "JobPosting",
  "title": "Sr. QA Automation Engineer",
  "description": "<p><strong>Job Location: Mumbai/Nagpur</strong></p><p>India | UK | USA</p>",
  "jobLocationType": "TELECOMMUTE"
}
</script>
</body></html>`;
|
||||
|
||||
// Covers JSON-LD parsing of a detail page and the plain-text label extractor.
describe("parseQaJobDetailPage", () => {
  it("extracts job location from JSON-LD description on the detail page", () => {
    const result = parseQaJobDetailPage(SAMPLE_JSON_LD);
    expect(result?.location).toBe("Mumbai/Nagpur");
    expect(result?.jobDescription).toContain("Mumbai/Nagpur");
  });

  it("returns null for a page with no JSON-LD and no text", () => {
    expect(parseQaJobDetailPage("<html><body></body></html>")).toBeNull();
  });

  it("extracts job location lines from plain text", () => {
    expect(
      extractJobLocationFromText(
        "Job Location: Vancouver, British Columbia, Canada",
      ),
    ).toBe("Vancouver, British Columbia, Canada");
  });

  it("returns undefined when the text has no job location label", () => {
    expect(extractJobLocationFromText("We are hiring")).toBeUndefined();
  });
});
|
||||
Loading…
x
Reference in New Issue
Block a user