fix(qajobsboard): drop expired LinkedIn reposts and resolve hiring location
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 6m10s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m13s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m9s
CI / Type Check (orchestrator) (push) Failing after 1m16s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m9s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m10s
CI / Documentation (push) Successful in 1m56s

Probe application links to detect closed listings and honor the feed's expires_at; resolve vague Remote/Worldwide rows to their actual hiring country before blocked-countries filtering.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ilia 2026-05-16 17:42:19 -04:00
parent 0de7f90278
commit d28a6221e4
5 changed files with 252 additions and 30 deletions

View File

@ -9,6 +9,11 @@ import type {
ExtractorRunResult, ExtractorRunResult,
} from "@shared/types/extractors"; } from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs"; import type { CreateJobInput } from "@shared/types/jobs";
import {
isLinkedInJobUrl,
isPostingExpiredByDate,
probeApplicationLink,
} from "./src/application-link.js";
import { import {
extractJobLocationFromText, extractJobLocationFromText,
fetchQaJobDetailEnrichment, fetchQaJobDetailEnrichment,
@ -37,6 +42,7 @@ interface QaJobBoardlyJob {
location?: string; location?: string;
location_limits?: string[]; location_limits?: string[];
published_at?: string; published_at?: string;
expires_at?: string;
application_link?: string; application_link?: string;
description?: DescriptionBlock; description?: DescriptionBlock;
company?: { name?: string; logo?: string }; company?: { name?: string; logo?: string };
@ -156,8 +162,6 @@ function mapJob(raw: QaJobBoardlyJob): CreateJobInput | null {
}; };
} }
const DETAIL_ENRICH_CONCURRENCY = 4;
function needsDetailEnrichment(location: string | undefined): boolean { function needsDetailEnrichment(location: string | undefined): boolean {
if (!location?.trim()) return true; if (!location?.trim()) return true;
return isVagueLocationLabel(location); return isVagueLocationLabel(location);
@ -167,31 +171,43 @@ async function enrichJobsFromDetailPages(
jobs: CreateJobInput[], jobs: CreateJobInput[],
shouldCancel?: () => boolean, shouldCancel?: () => boolean,
): Promise<CreateJobInput[]> { ): Promise<CreateJobInput[]> {
const enriched = jobs.map((job) => ({ ...job })); const enriched: CreateJobInput[] = [];
const targets = enriched
.map((job, index) => ({ job, index }))
.filter(({ job }) => needsDetailEnrichment(job.location));
for (let offset = 0; offset < targets.length; offset += DETAIL_ENRICH_CONCURRENCY) { for (const job of jobs) {
if (shouldCancel?.()) break; if (shouldCancel?.()) break;
const batch = targets.slice(offset, offset + DETAIL_ENRICH_CONCURRENCY);
await Promise.all( let current = { ...job };
batch.map(async ({ job, index }) => {
const applicationLink = job.applicationLink?.trim();
if (applicationLink && isLinkedInJobUrl(applicationLink)) {
try { try {
const detail = await fetchQaJobDetailEnrichment(job.jobUrl); const probe = await probeApplicationLink(applicationLink);
if (!detail) return; if (probe?.expired) continue;
enriched[index] = { if (probe?.location && needsDetailEnrichment(current.location)) {
...job, current = { ...current, location: probe.location };
...(detail.location ? { location: detail.location } : {}), }
...(detail.jobDescription } catch {
// keep row when LinkedIn probe fails
}
}
if (needsDetailEnrichment(current.location)) {
try {
const detail = await fetchQaJobDetailEnrichment(current.jobUrl);
if (detail?.expired) continue;
current = {
...current,
...(detail?.location ? { location: detail.location } : {}),
...(detail?.jobDescription
? { jobDescription: detail.jobDescription } ? { jobDescription: detail.jobDescription }
: {}), : {}),
}; };
} catch { } catch {
// keep feed row when detail page fetch fails // keep feed row when detail page fetch fails
} }
}), }
);
enriched.push(current);
} }
return enriched; return enriched;
@ -238,6 +254,7 @@ export const manifest: ExtractorManifest = {
for (const row of rows as QaJobBoardlyJob[]) { for (const row of rows as QaJobBoardlyJob[]) {
if (out.length >= cap) break; if (out.length >= cap) break;
if (isPostingExpiredByDate(asString(row.expires_at))) continue;
if (terms.length > 0 && !terms.some((t) => matchesTerm(row, t))) if (terms.length > 0 && !terms.some((t) => matchesTerm(row, t)))
continue; continue;
const mapped = mapJob(row); const mapped = mapJob(row);

View File

@ -0,0 +1,102 @@
/**
* Probe external application URLs (mainly LinkedIn) for expiry and hiring location.
*/
/**
 * Outcome of probing an external application link.
 */
export interface ApplicationLinkProbe {
  /** True when the probe concluded that the listing is closed. */
  expired: boolean;

  /** Hiring location scraped from the page; absent when none was found. */
  location?: string;
}
/**
 * Matches LinkedIn job-view URLs on the apex domain or a single alphabetic
 * subdomain (`www.`, `in.`, `uk.`, ...).
 *
 * Both URL shapes are accepted:
 *   - bare id:  `/jobs/view/4362000000`
 *   - slug + id: `/jobs/view/qa-engineer-at-acme-4362000000`
 *
 * The slug prefix is optional, so every URL the id-only pattern accepted
 * still matches.
 */
const LINKEDIN_JOB_RE =
  /^https?:\/\/(?:[a-z]+\.)?linkedin\.com\/jobs\/view\/(?:[^\/?#]*-)?\d+/i;

/** Markers in the final (post-redirect) URL that indicate a closed listing. */
const EXPIRED_URL_RE = /expired_jd_redirect|unavailable|no longer available/i;

/** Phrases in the fetched page body that indicate a closed listing. */
const EXPIRED_BODY_RE =
  /\bno longer accepting applications\b|\bjob you were looking for is no longer available\b|\bthis job is no longer available\b/i;

/**
 * Tells whether `url` points at a LinkedIn job-view page.
 *
 * @param url - Candidate application link; surrounding whitespace is ignored.
 * @returns `true` for a LinkedIn `/jobs/view/...` URL, `false` for anything
 *   else, including `undefined` and blank strings.
 */
export function isLinkedInJobUrl(url: string | undefined): boolean {
  const candidate = url?.trim();
  return candidate ? LINKEDIN_JOB_RE.test(candidate) : false;
}
/** Matches a calendar date with no time component (`YYYY-MM-DD`). */
const DATE_ONLY_RE = /^\d{4}-\d{2}-\d{2}$/;

const MS_PER_DAY = 24 * 60 * 60 * 1000;

/**
 * Parses a date string with `Date.parse`.
 *
 * @param value - Date text; surrounding whitespace is ignored.
 * @returns The parsed `Date`, or `null` when `value` is missing, blank, or
 *   not parseable.
 */
export function parseIsoDate(value: string | undefined): Date | null {
  const text = value?.trim();
  if (!text) return null;
  // Parse the trimmed text so padding cannot affect the result.
  const timestamp = Date.parse(text);
  return Number.isNaN(timestamp) ? null : new Date(timestamp);
}

/**
 * Tells whether a posting's expiry date lies in the past.
 *
 * A date-only expiry (`YYYY-MM-DD`) names the last day the posting is valid,
 * so it counts as expired only once that whole UTC day has passed. A full
 * timestamp is compared as-is.
 *
 * @param expiresAt - Expiry date text from the feed or page metadata.
 * @param now - Reference instant; defaults to the current time.
 * @returns `true` when the expiry is before `now`; `false` when it is not,
 *   or when `expiresAt` is missing or unparseable.
 */
export function isPostingExpiredByDate(
  expiresAt: string | undefined,
  now: Date = new Date(),
): boolean {
  const expiry = parseIsoDate(expiresAt);
  if (!expiry) return false;

  // Date-only strings parse to 00:00 UTC; push the deadline to the last
  // millisecond of that day so the posting stays live on its final day.
  const isDateOnly = DATE_ONLY_RE.test(expiresAt?.trim() ?? "");
  const deadline = isDateOnly
    ? expiry.getTime() + MS_PER_DAY - 1
    : expiry.getTime();

  return deadline < now.getTime();
}
/**
 * Pulls a human-readable hiring location out of a job page's HTML.
 *
 * Sources are tried in this order:
 *   1. `addressLocality` + `addressCountry` pairs (either order in the JSON),
 *      always emitted as `"locality, country"`;
 *   2. a `jobLocation` object's `name`;
 *   3. a known Indian city followed shortly by "India" in the page text;
 *   4. a "location ... <country>" phrase for a small set of countries.
 *
 * @param html - Raw page HTML.
 * @returns The location text, or `undefined` when nothing matched.
 */
function extractLinkedInLocationFromHtml(html: string): string | undefined {
  // Named groups record which capture is the locality and which the country,
  // so the output order does not depend on recognising the country by name.
  // This matters for ISO codes such as "IN" or "DE".
  const addressPatterns = [
    /"addressLocality"\s*:\s*"(?<locality>[^"]+)"[^}]*"addressCountry"\s*:\s*"(?<country>[^"]+)"/i,
    /"addressCountry"\s*:\s*"(?<country>[^"]+)"[^}]*"addressLocality"\s*:\s*"(?<locality>[^"]+)"/i,
  ];

  for (const pattern of addressPatterns) {
    const groups = pattern.exec(html)?.groups;
    if (!groups) continue;

    const locality = groups.locality?.trim();
    const country = groups.country?.trim();
    if (locality && country) return `${locality}, ${country}`;

    // One side was whitespace-only: return whichever part carries text.
    const single = locality || country;
    if (single) return single;
  }

  const placeName = /"jobLocation"\s*:\s*\{[^}]*"name"\s*:\s*"([^"]+)"/i
    .exec(html)?.[1]
    ?.trim();
  if (placeName) return placeName;

  // Free-text fallback: "<city> ... India" within a single text node.
  const indiaCity = html.match(
    /\b(Bengaluru|Bangalore|Mumbai|Hyderabad|Pune|Chennai|Delhi|Gurgaon|Noida)[^<]{0,40}\bIndia\b/i,
  );
  if (indiaCity?.[0]) {
    return indiaCity[0].replace(/\s+/g, " ").trim();
  }

  // Last resort: a bare country name shortly after the word "location".
  const countryOnly = html.match(
    /\b(?:job\s+)?location[^<]{0,40}\b(India|Canada|United States|United Kingdom)\b/i,
  );
  if (countryOnly?.[1]) return countryOnly[1].trim();

  return undefined;
}
/**
 * Fetches a LinkedIn job link and reports whether the listing is closed and,
 * if it is still open, the hiring location found on the page.
 *
 * A listing counts as expired when any of these hold:
 *   - the response status is 404 (Not Found) or 410 (Gone);
 *   - the final URL after redirects matches `EXPIRED_URL_RE`;
 *   - the page body matches `EXPIRED_BODY_RE`.
 *
 * Fetch and body-read failures are not caught here; they reject the returned
 * promise so the caller can decide how to treat an inconclusive probe.
 *
 * @param url - Application link to probe.
 * @param fetchImpl - `fetch` implementation; injectable for tests.
 * @returns `null` when `url` is blank or not a LinkedIn job URL, otherwise
 *   the probe result. `location` is only set for non-expired listings.
 */
export async function probeApplicationLink(
  url: string,
  fetchImpl: typeof fetch = fetch,
): Promise<ApplicationLinkProbe | null> {
  const target = url?.trim();
  if (!target || !isLinkedInJobUrl(target)) return null;

  const response = await fetchImpl(target, {
    redirect: "follow",
    headers: {
      Accept: "text/html,application/xhtml+xml",
      "User-Agent": "JobOps/1.0",
    },
  });

  // `Response.url` is an empty string (never nullish) when unset, so `||`
  // is required for the fallback to take effect.
  const finalUrl = response.url || target;
  const html = await response.text();

  const expired =
    response.status === 404 ||
    response.status === 410 ||
    EXPIRED_URL_RE.test(finalUrl) ||
    EXPIRED_BODY_RE.test(html);
  if (expired) return { expired: true };

  const location = extractLinkedInLocationFromHtml(html);
  return location ? { expired: false, location } : { expired: false };
}

View File

@ -2,9 +2,12 @@
* Fetch a QAJobsBoard job detail page for richer JSON-LD and HTML than jobs.json alone. * Fetch a QAJobsBoard job detail page for richer JSON-LD and HTML than jobs.json alone.
*/ */
import { isPostingExpiredByDate } from "./application-link.js";
export interface QaJobDetailEnrichment { export interface QaJobDetailEnrichment {
location?: string; location?: string;
jobDescription?: string; jobDescription?: string;
expired?: boolean;
} }
const JSON_LD_RE = const JSON_LD_RE =
@ -30,9 +33,11 @@ export function extractJobLocationFromText(text: string): string | undefined {
if (!match?.[1]) return undefined; if (!match?.[1]) return undefined;
let extracted = match[1].trim(); let extracted = match[1].trim();
extracted = extracted =
extracted.split( extracted
.split(
/\s+(?=experience|shift timings|company profile|responsibilities|technical requirements)\b/i, /\s+(?=experience|shift timings|company profile|responsibilities|technical requirements)\b/i,
)[0]?.trim() ?? extracted; )[0]
?.trim() ?? extracted;
// Drop office footers like "India | UK | USA" glued onto the city line. // Drop office footers like "India | UK | USA" glued onto the city line.
extracted = extracted =
extracted.split(/\s+(?=(?:india|uk|usa|us|canada)\s*\|)/i)[0]?.trim() ?? extracted.split(/\s+(?=(?:india|uk|usa|us|canada)\s*\|)/i)[0]?.trim() ??
@ -84,7 +89,9 @@ function findJobPostingNode(node: unknown): Record<string, unknown> | null {
return null; return null;
} }
function formatPostalAddress(address: Record<string, unknown>): string | undefined { function formatPostalAddress(
address: Record<string, unknown>,
): string | undefined {
const parts = [ const parts = [
address.addressLocality, address.addressLocality,
address.addressRegion, address.addressRegion,
@ -95,7 +102,9 @@ function formatPostalAddress(address: Record<string, unknown>): string | undefin
return parts.length > 0 ? parts.join(", ") : undefined; return parts.length > 0 ? parts.join(", ") : undefined;
} }
function locationFromJobPosting(posting: Record<string, unknown>): string | undefined { function locationFromJobPosting(
posting: Record<string, unknown>,
): string | undefined {
const description = const description =
typeof posting.description === "string" ? posting.description : ""; typeof posting.description === "string" ? posting.description : "";
const fromDescription = extractJobLocationFromText(stripHtml(description)); const fromDescription = extractJobLocationFromText(stripHtml(description));
@ -128,11 +137,23 @@ function locationFromJobLocationNode(node: unknown): string | undefined {
return undefined; return undefined;
} }
export function parseQaJobDetailPage(html: string): QaJobDetailEnrichment | null { function postingIsExpired(posting: Record<string, unknown>): boolean {
const validThrough =
typeof posting.validThrough === "string" ? posting.validThrough : undefined;
return isPostingExpiredByDate(validThrough);
}
export function parseQaJobDetailPage(
html: string,
): QaJobDetailEnrichment | null {
for (const block of parseJsonLdBlocks(html)) { for (const block of parseJsonLdBlocks(html)) {
const posting = findJobPostingNode(block); const posting = findJobPostingNode(block);
if (!posting) continue; if (!posting) continue;
if (postingIsExpired(posting)) {
return { expired: true };
}
const description = const description =
typeof posting.description === "string" typeof posting.description === "string"
? stripHtml(posting.description) ? stripHtml(posting.description)

View File

@ -0,0 +1,75 @@
import { describe, expect, it } from "vitest";
import {
isLinkedInJobUrl,
isPostingExpiredByDate,
probeApplicationLink,
} from "../src/application-link.js";
// URL classification: only LinkedIn job-view links qualify for probing.
describe("isLinkedInJobUrl", () => {
  it("matches LinkedIn job view URLs", () => {
    const jobUrl = "https://www.linkedin.com/jobs/view/4362000000";
    expect(isLinkedInJobUrl(jobUrl)).toBe(true);
  });

  it("rejects non-LinkedIn URLs", () => {
    const otherUrl = "https://example.com/jobs/1";
    expect(isLinkedInJobUrl(otherUrl)).toBe(false);
  });
});
// Expiry comparison against a pinned reference instant.
describe("isPostingExpiredByDate", () => {
  // Fixed "now" so the outcome never depends on the wall clock.
  const referenceNow = new Date("2025-01-01");

  it("returns true when expires_at is in the past", () => {
    const verdict = isPostingExpiredByDate(
      "2020-01-01T00:00:00.000Z",
      referenceNow,
    );
    expect(verdict).toBe(true);
  });

  it("returns false when expires_at is in the future", () => {
    const verdict = isPostingExpiredByDate(
      "2099-01-01T00:00:00.000Z",
      referenceNow,
    );
    expect(verdict).toBe(false);
  });
});
// Probe behavior with a stubbed fetch; no network access is involved.
describe("probeApplicationLink", () => {
  const probedUrl = "https://www.linkedin.com/jobs/view/123";

  // Builds a fetch stand-in that answers 200 with the given final URL and body.
  const respondWith =
    (finalUrl: string, body: string): typeof fetch =>
    async () =>
      ({
        url: finalUrl,
        status: 200,
        text: async () => body,
      }) as Response;

  it("marks expired when final URL is LinkedIn expired redirect", async () => {
    const outcome = await probeApplicationLink(
      probedUrl,
      respondWith(
        "https://www.linkedin.com/jobs/view/expired_jd_redirect/",
        "<html></html>",
      ),
    );

    expect(outcome?.expired).toBe(true);
  });

  it("extracts India location from HTML when not expired", async () => {
    const html = `
<script type="application/ld+json">
{"@type":"JobPosting","jobLocation":{"address":{"addressLocality":"Bengaluru","addressCountry":"India"}}}
</script>`;

    const outcome = await probeApplicationLink(
      probedUrl,
      respondWith("https://www.linkedin.com/jobs/view/123", html),
    );

    expect(outcome?.expired).toBe(false);
    expect(outcome?.location).toMatch(/India/i);
  });
});

View File

@ -23,6 +23,13 @@ describe("parseQaJobDetailPage", () => {
expect(result?.jobDescription).toContain("Mumbai/Nagpur"); expect(result?.jobDescription).toContain("Mumbai/Nagpur");
}); });
it("marks posting expired when validThrough is in the past", () => {
const html = `<script type="application/ld+json">
{"@type":"JobPosting","validThrough":"2020-01-01","title":"Old role"}
</script>`;
expect(parseQaJobDetailPage(html)).toEqual({ expired: true });
});
it("extracts job location lines from plain text", () => { it("extracts job location lines from plain text", () => {
expect( expect(
extractJobLocationFromText( extractJobLocationFromText(