From d28a6221e4b85cc26dcba1e54e1cfcd595b70f2c Mon Sep 17 00:00:00 2001 From: ilia Date: Sat, 16 May 2026 17:42:19 -0400 Subject: [PATCH] fix(qajobsboard): drop expired LinkedIn reposts and resolve hiring location Probe application links for closed listings and feed expires_at; enrich vague Remote/Worldwide rows with real country before blocked-countries filtering. Co-authored-by: Cursor --- extractors/qajobsboard/manifest.ts | 65 ++++++----- .../qajobsboard/src/application-link.ts | 102 ++++++++++++++++++ extractors/qajobsboard/src/detail-page.ts | 33 ++++-- .../tests/application-link.test.ts | 75 +++++++++++++ .../qajobsboard/tests/detail-page.test.ts | 7 ++ 5 files changed, 252 insertions(+), 30 deletions(-) create mode 100644 extractors/qajobsboard/src/application-link.ts create mode 100644 extractors/qajobsboard/tests/application-link.test.ts diff --git a/extractors/qajobsboard/manifest.ts b/extractors/qajobsboard/manifest.ts index 287217c..250ae85 100644 --- a/extractors/qajobsboard/manifest.ts +++ b/extractors/qajobsboard/manifest.ts @@ -9,6 +9,11 @@ import type { ExtractorRunResult, } from "@shared/types/extractors"; import type { CreateJobInput } from "@shared/types/jobs"; +import { + isLinkedInJobUrl, + isPostingExpiredByDate, + probeApplicationLink, +} from "./src/application-link.js"; import { extractJobLocationFromText, fetchQaJobDetailEnrichment, @@ -37,6 +42,7 @@ interface QaJobBoardlyJob { location?: string; location_limits?: string[]; published_at?: string; + expires_at?: string; application_link?: string; description?: DescriptionBlock; company?: { name?: string; logo?: string }; @@ -156,8 +162,6 @@ function mapJob(raw: QaJobBoardlyJob): CreateJobInput | null { }; } -const DETAIL_ENRICH_CONCURRENCY = 4; - function needsDetailEnrichment(location: string | undefined): boolean { if (!location?.trim()) return true; return isVagueLocationLabel(location); @@ -167,31 +171,43 @@ async function enrichJobsFromDetailPages( jobs: CreateJobInput[], shouldCancel?: () => boolean, ): Promise { - const enriched = jobs.map((job) => ({ ...job })); - const targets = enriched - .map((job, index) => ({ job, index })) - .filter(({ job }) => needsDetailEnrichment(job.location)); + const enriched: CreateJobInput[] = []; - for (let offset = 0; offset < targets.length; offset += DETAIL_ENRICH_CONCURRENCY) { + for (const job of jobs) { if (shouldCancel?.()) break; - const batch = targets.slice(offset, offset + DETAIL_ENRICH_CONCURRENCY); - await Promise.all( - batch.map(async ({ job, index }) => { - try { - const detail = await fetchQaJobDetailEnrichment(job.jobUrl); - if (!detail) return; - enriched[index] = { - ...job, - ...(detail.location ? { location: detail.location } : {}), - ...(detail.jobDescription - ? { jobDescription: detail.jobDescription } - : {}), - }; - } catch { - // keep feed row when detail page fetch fails + + let current = { ...job }; + + const applicationLink = job.applicationLink?.trim(); + if (applicationLink && isLinkedInJobUrl(applicationLink)) { + try { + const probe = await probeApplicationLink(applicationLink); + if (probe?.expired) continue; + if (probe?.location && needsDetailEnrichment(current.location)) { + current = { ...current, location: probe.location }; } - }), - ); + } catch { + // keep row when LinkedIn probe fails + } + } + + if (needsDetailEnrichment(current.location)) { + try { + const detail = await fetchQaJobDetailEnrichment(current.jobUrl); + if (detail?.expired) continue; + current = { + ...current, + ...(detail?.location ? { location: detail.location } : {}), + ...(detail?.jobDescription + ? { jobDescription: detail.jobDescription } + : {}), + }; + } catch { + // keep feed row when detail page fetch fails + } + } + + enriched.push(current); } return enriched; @@ -238,6 +254,7 @@ export const manifest: ExtractorManifest = { for (const row of rows as QaJobBoardlyJob[]) { if (out.length >= cap) break; + if (isPostingExpiredByDate(asString(row.expires_at))) continue; if (terms.length > 0 && !terms.some((t) => matchesTerm(row, t))) continue; const mapped = mapJob(row); diff --git a/extractors/qajobsboard/src/application-link.ts b/extractors/qajobsboard/src/application-link.ts new file mode 100644 index 0000000..56a8a53 --- /dev/null +++ b/extractors/qajobsboard/src/application-link.ts @@ -0,0 +1,102 @@ +/** + * Probe external application URLs (mainly LinkedIn) for expiry and hiring location. + */ + +export interface ApplicationLinkProbe { + expired: boolean; + location?: string; +} + +const LINKEDIN_JOB_RE = + /^https?:\/\/(?:[a-z]+\.)?linkedin\.com\/jobs\/view\/\d+/i; + +const EXPIRED_URL_RE = /expired_jd_redirect|unavailable|no longer available/i; +const EXPIRED_BODY_RE = + /\bno longer accepting applications\b|\bjob you were looking for is no longer available\b|\bthis job is no longer available\b/i; + +export function isLinkedInJobUrl(url: string | undefined): boolean { + return Boolean(url?.trim() && LINKEDIN_JOB_RE.test(url.trim())); +} + +export function parseIsoDate(value: string | undefined): Date | null { + if (!value?.trim()) return null; + const parsed = Date.parse(value); + if (Number.isNaN(parsed)) return null; + return new Date(parsed); +} + +export function isPostingExpiredByDate( + expiresAt: string | undefined, + now: Date = new Date(), +): boolean { + const expiry = parseIsoDate(expiresAt); + if (!expiry) return false; + return expiry.getTime() < now.getTime(); +} + +function extractLinkedInLocationFromHtml(html: string): string | undefined { + const patterns = [ + /"addressLocality"\s*:\s*"([^"]+)"[^}]*"addressCountry"\s*:\s*"([^"]+)"/i, + /"addressCountry"\s*:\s*"([^"]+)"[^}]*"addressLocality"\s*:\s*"([^"]+)"/i, + /"jobLocation"\s*:\s*\{[^}]*"name"\s*:\s*"([^"]+)"/i, + ]; + + for (const pattern of patterns) { + const match = html.match(pattern); + if (!match) continue; + if (match.length === 3) { + const a = match[1]?.trim(); + const b = match[2]?.trim(); + if (a && b) { + const countryLike = + /\b(india|canada|united states|uk|united kingdom)\b/i; + if (countryLike.test(b)) return `${a}, ${b}`; + if (countryLike.test(a)) return `${b}, ${a}`; + return `${a}, ${b}`; + } + } + if (match[1]?.trim()) return match[1].trim(); + } + + const indiaCity = html.match( + /\b(Bengaluru|Bangalore|Mumbai|Hyderabad|Pune|Chennai|Delhi|Gurgaon|Noida)[^<]{0,40}\bIndia\b/i, + ); + if (indiaCity?.[0]) { + return indiaCity[0].replace(/\s+/g, " ").trim(); + } + + const countryOnly = html.match( + /\b(?:job\s+)?location[^<]{0,40}\b(India|Canada|United States|United Kingdom)\b/i, + ); + if (countryOnly?.[1]) return countryOnly[1].trim(); + + return undefined; +} + +export async function probeApplicationLink( + url: string, + fetchImpl: typeof fetch = fetch, +): Promise { + if (!url?.trim()) return null; + if (!isLinkedInJobUrl(url)) return null; + + const response = await fetchImpl(url, { + redirect: "follow", + headers: { + Accept: "text/html,application/xhtml+xml", + "User-Agent": "JobOps/1.0", + }, + }); + + const finalUrl = response.url ?? url; + const html = await response.text(); + + const expired = + EXPIRED_URL_RE.test(finalUrl) || + EXPIRED_BODY_RE.test(html) || + response.status === 404; + + const location = expired ? undefined : extractLinkedInLocationFromHtml(html); + + return { expired, ...(location ? { location } : {}) }; +} diff --git a/extractors/qajobsboard/src/detail-page.ts b/extractors/qajobsboard/src/detail-page.ts index f1030be..de5809c 100644 --- a/extractors/qajobsboard/src/detail-page.ts +++ b/extractors/qajobsboard/src/detail-page.ts @@ -2,9 +2,12 @@ * Fetch a QAJobsBoard job detail page for richer JSON-LD and HTML than jobs.json alone. */ +import { isPostingExpiredByDate } from "./application-link.js"; + export interface QaJobDetailEnrichment { location?: string; jobDescription?: string; + expired?: boolean; } const JSON_LD_RE = @@ -30,9 +33,11 @@ export function extractJobLocationFromText(text: string): string | undefined { if (!match?.[1]) return undefined; let extracted = match[1].trim(); extracted = - extracted.split( - /\s+(?=experience|shift timings|company profile|responsibilities|technical requirements)\b/i, - )[0]?.trim() ?? extracted; + extracted + .split( + /\s+(?=experience|shift timings|company profile|responsibilities|technical requirements)\b/i, + )[0] + ?.trim() ?? extracted; // Drop office footers like "India | UK | USA" glued onto the city line. extracted = extracted.split(/\s+(?=(?:india|uk|usa|us|canada)\s*\|)/i)[0]?.trim() ?? @@ -84,7 +89,9 @@ function findJobPostingNode(node: unknown): Record | null { return null; } -function formatPostalAddress(address: Record): string | undefined { +function formatPostalAddress( + address: Record, +): string | undefined { const parts = [ address.addressLocality, address.addressRegion, @@ -95,7 +102,9 @@ function formatPostalAddress(address: Record): string | undefin return parts.length > 0 ? parts.join(", ") : undefined; } -function locationFromJobPosting(posting: Record): string | undefined { +function locationFromJobPosting( + posting: Record, +): string | undefined { const description = typeof posting.description === "string" ? posting.description : ""; const fromDescription = extractJobLocationFromText(stripHtml(description)); @@ -128,11 +137,23 @@ function locationFromJobLocationNode(node: unknown): string | undefined { return undefined; } -export function parseQaJobDetailPage(html: string): QaJobDetailEnrichment | null { +function postingIsExpired(posting: Record): boolean { + const validThrough = + typeof posting.validThrough === "string" ? posting.validThrough : undefined; + return isPostingExpiredByDate(validThrough); +} + +export function parseQaJobDetailPage( + html: string, +): QaJobDetailEnrichment | null { for (const block of parseJsonLdBlocks(html)) { const posting = findJobPostingNode(block); if (!posting) continue; + if (postingIsExpired(posting)) { + return { expired: true }; + } + const description = typeof posting.description === "string" ? stripHtml(posting.description) diff --git a/extractors/qajobsboard/tests/application-link.test.ts b/extractors/qajobsboard/tests/application-link.test.ts new file mode 100644 index 0000000..40dee44 --- /dev/null +++ b/extractors/qajobsboard/tests/application-link.test.ts @@ -0,0 +1,75 @@ +import { describe, expect, it } from "vitest"; +import { + isLinkedInJobUrl, + isPostingExpiredByDate, + probeApplicationLink, +} from "../src/application-link.js"; + +describe("isLinkedInJobUrl", () => { + it("matches LinkedIn job view URLs", () => { + expect( + isLinkedInJobUrl("https://www.linkedin.com/jobs/view/4362000000"), + ).toBe(true); + }); + + it("rejects non-LinkedIn URLs", () => { + expect(isLinkedInJobUrl("https://example.com/jobs/1")).toBe(false); + }); +}); + +describe("isPostingExpiredByDate", () => { + it("returns true when expires_at is in the past", () => { + expect( + isPostingExpiredByDate( + "2020-01-01T00:00:00.000Z", + new Date("2025-01-01"), + ), + ).toBe(true); + }); + + it("returns false when expires_at is in the future", () => { + expect( + isPostingExpiredByDate( + "2099-01-01T00:00:00.000Z", + new Date("2025-01-01"), + ), + ).toBe(false); + }); +}); + +describe("probeApplicationLink", () => { + it("marks expired when final URL is LinkedIn expired redirect", async () => { + const fetchImpl = async () => + ({ + url: "https://www.linkedin.com/jobs/view/expired_jd_redirect/", + status: 200, + text: async () => "", + }) as Response; + + const result = await probeApplicationLink( + "https://www.linkedin.com/jobs/view/123", + fetchImpl, + ); + expect(result?.expired).toBe(true); + }); + + it("extracts India location from HTML when not expired", async () => { + const html = ` + `; + const fetchImpl = async () => + ({ + url: "https://www.linkedin.com/jobs/view/123", + status: 200, + text: async () => html, + }) as Response; + + const result = await probeApplicationLink( + "https://www.linkedin.com/jobs/view/123", + fetchImpl, + ); + expect(result?.expired).toBe(false); + expect(result?.location).toMatch(/India/i); + }); +}); diff --git a/extractors/qajobsboard/tests/detail-page.test.ts b/extractors/qajobsboard/tests/detail-page.test.ts index 4d28af0..a1d798c 100644 --- a/extractors/qajobsboard/tests/detail-page.test.ts +++ b/extractors/qajobsboard/tests/detail-page.test.ts @@ -23,6 +23,13 @@ describe("parseQaJobDetailPage", () => { expect(result?.jobDescription).toContain("Mumbai/Nagpur"); }); + it("marks posting expired when validThrough is in the past", () => { + const html = ``; + expect(parseQaJobDetailPage(html)).toEqual({ expired: true }); + }); + it("extracts job location lines from plain text", () => { expect( extractJobLocationFromText(