fix(qajobsboard): drop expired LinkedIn reposts and resolve hiring location
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 6m10s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m13s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m9s
CI / Type Check (orchestrator) (push) Failing after 1m16s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m9s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m10s
CI / Documentation (push) Successful in 1m56s
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 6m10s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m13s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m9s
CI / Type Check (orchestrator) (push) Failing after 1m16s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m9s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m10s
CI / Documentation (push) Successful in 1m56s
Probe LinkedIn application links to detect closed listings and honor the feed's expires_at field; enrich vague Remote/Worldwide rows with the real country before blocked-countries filtering.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
0de7f90278
commit
d28a6221e4
@ -9,6 +9,11 @@ import type {
|
|||||||
ExtractorRunResult,
|
ExtractorRunResult,
|
||||||
} from "@shared/types/extractors";
|
} from "@shared/types/extractors";
|
||||||
import type { CreateJobInput } from "@shared/types/jobs";
|
import type { CreateJobInput } from "@shared/types/jobs";
|
||||||
|
import {
|
||||||
|
isLinkedInJobUrl,
|
||||||
|
isPostingExpiredByDate,
|
||||||
|
probeApplicationLink,
|
||||||
|
} from "./src/application-link.js";
|
||||||
import {
|
import {
|
||||||
extractJobLocationFromText,
|
extractJobLocationFromText,
|
||||||
fetchQaJobDetailEnrichment,
|
fetchQaJobDetailEnrichment,
|
||||||
@ -37,6 +42,7 @@ interface QaJobBoardlyJob {
|
|||||||
location?: string;
|
location?: string;
|
||||||
location_limits?: string[];
|
location_limits?: string[];
|
||||||
published_at?: string;
|
published_at?: string;
|
||||||
|
expires_at?: string;
|
||||||
application_link?: string;
|
application_link?: string;
|
||||||
description?: DescriptionBlock;
|
description?: DescriptionBlock;
|
||||||
company?: { name?: string; logo?: string };
|
company?: { name?: string; logo?: string };
|
||||||
@ -156,8 +162,6 @@ function mapJob(raw: QaJobBoardlyJob): CreateJobInput | null {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const DETAIL_ENRICH_CONCURRENCY = 4;
|
|
||||||
|
|
||||||
function needsDetailEnrichment(location: string | undefined): boolean {
|
function needsDetailEnrichment(location: string | undefined): boolean {
|
||||||
if (!location?.trim()) return true;
|
if (!location?.trim()) return true;
|
||||||
return isVagueLocationLabel(location);
|
return isVagueLocationLabel(location);
|
||||||
@ -167,31 +171,43 @@ async function enrichJobsFromDetailPages(
|
|||||||
jobs: CreateJobInput[],
|
jobs: CreateJobInput[],
|
||||||
shouldCancel?: () => boolean,
|
shouldCancel?: () => boolean,
|
||||||
): Promise<CreateJobInput[]> {
|
): Promise<CreateJobInput[]> {
|
||||||
const enriched = jobs.map((job) => ({ ...job }));
|
const enriched: CreateJobInput[] = [];
|
||||||
const targets = enriched
|
|
||||||
.map((job, index) => ({ job, index }))
|
|
||||||
.filter(({ job }) => needsDetailEnrichment(job.location));
|
|
||||||
|
|
||||||
for (let offset = 0; offset < targets.length; offset += DETAIL_ENRICH_CONCURRENCY) {
|
for (const job of jobs) {
|
||||||
if (shouldCancel?.()) break;
|
if (shouldCancel?.()) break;
|
||||||
const batch = targets.slice(offset, offset + DETAIL_ENRICH_CONCURRENCY);
|
|
||||||
await Promise.all(
|
let current = { ...job };
|
||||||
batch.map(async ({ job, index }) => {
|
|
||||||
try {
|
const applicationLink = job.applicationLink?.trim();
|
||||||
const detail = await fetchQaJobDetailEnrichment(job.jobUrl);
|
if (applicationLink && isLinkedInJobUrl(applicationLink)) {
|
||||||
if (!detail) return;
|
try {
|
||||||
enriched[index] = {
|
const probe = await probeApplicationLink(applicationLink);
|
||||||
...job,
|
if (probe?.expired) continue;
|
||||||
...(detail.location ? { location: detail.location } : {}),
|
if (probe?.location && needsDetailEnrichment(current.location)) {
|
||||||
...(detail.jobDescription
|
current = { ...current, location: probe.location };
|
||||||
? { jobDescription: detail.jobDescription }
|
|
||||||
: {}),
|
|
||||||
};
|
|
||||||
} catch {
|
|
||||||
// keep feed row when detail page fetch fails
|
|
||||||
}
|
}
|
||||||
}),
|
} catch {
|
||||||
);
|
// keep row when LinkedIn probe fails
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (needsDetailEnrichment(current.location)) {
|
||||||
|
try {
|
||||||
|
const detail = await fetchQaJobDetailEnrichment(current.jobUrl);
|
||||||
|
if (detail?.expired) continue;
|
||||||
|
current = {
|
||||||
|
...current,
|
||||||
|
...(detail?.location ? { location: detail.location } : {}),
|
||||||
|
...(detail?.jobDescription
|
||||||
|
? { jobDescription: detail.jobDescription }
|
||||||
|
: {}),
|
||||||
|
};
|
||||||
|
} catch {
|
||||||
|
// keep feed row when detail page fetch fails
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
enriched.push(current);
|
||||||
}
|
}
|
||||||
|
|
||||||
return enriched;
|
return enriched;
|
||||||
@ -238,6 +254,7 @@ export const manifest: ExtractorManifest = {
|
|||||||
|
|
||||||
for (const row of rows as QaJobBoardlyJob[]) {
|
for (const row of rows as QaJobBoardlyJob[]) {
|
||||||
if (out.length >= cap) break;
|
if (out.length >= cap) break;
|
||||||
|
if (isPostingExpiredByDate(asString(row.expires_at))) continue;
|
||||||
if (terms.length > 0 && !terms.some((t) => matchesTerm(row, t)))
|
if (terms.length > 0 && !terms.some((t) => matchesTerm(row, t)))
|
||||||
continue;
|
continue;
|
||||||
const mapped = mapJob(row);
|
const mapped = mapJob(row);
|
||||||
|
|||||||
102
extractors/qajobsboard/src/application-link.ts
Normal file
102
extractors/qajobsboard/src/application-link.ts
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
/**
|
||||||
|
* Probe external application URLs (mainly LinkedIn) for expiry and hiring location.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Outcome of probing an external application URL.
 */
export interface ApplicationLinkProbe {
  /** True when the probe judged the linked posting to be closed. */
  expired: boolean;

  /** Hiring location read from the linked page; omitted when none was found. */
  location?: string;
}
|
||||||
|
|
||||||
|
/**
 * Matches LinkedIn job-view URLs on linkedin.com or any of its subdomains.
 *
 * Accepts both the bare numeric form (`/jobs/view/4362000000`) and the slugged
 * form (`/jobs/view/qa-engineer-at-acme-4362000000`), where the job id is the
 * run of digits following the last hyphen of the slug. Only the prefix is
 * matched, so query strings and trailing path segments are allowed.
 */
const LINKEDIN_JOB_RE =
  /^https?:\/\/(?:[a-z0-9-]+\.)*linkedin\.com\/jobs\/view\/(?:[^/?#]*-)?\d+/i;

/** Markers in the final (post-redirect) URL that indicate a closed posting. */
const EXPIRED_URL_RE = /expired_jd_redirect|unavailable|no longer available/i;

/** Phrases in the response body that indicate a closed posting. */
const EXPIRED_BODY_RE =
  /\bno longer accepting applications\b|\bjob you were looking for is no longer available\b|\bthis job is no longer available\b/i;

/**
 * Reports whether `url` points at a LinkedIn job-view page.
 *
 * @param url - Candidate URL; surrounding whitespace is ignored.
 * @returns `true` for LinkedIn job-view URLs, `false` for anything else,
 *   including `undefined` and blank strings.
 */
export function isLinkedInJobUrl(url: string | undefined): boolean {
  const candidate = url?.trim();
  return candidate ? LINKEDIN_JOB_RE.test(candidate) : false;
}
|
||||||
|
|
||||||
|
/**
 * Parses a date string into a `Date`.
 *
 * The trimmed text is what gets parsed, so padded feed values take the same
 * parsing path as clean ones. Parsing is delegated to `Date.parse`, which also
 * accepts some non-ISO formats in an implementation-defined way.
 *
 * @param value - Date string, typically ISO 8601; may be missing or blank.
 * @returns The parsed date, or `null` when the input is missing, blank, or
 *   not parseable.
 */
export function parseIsoDate(value: string | undefined): Date | null {
  const text = value?.trim();
  if (!text) return null;

  const timestamp = Date.parse(text);
  return Number.isNaN(timestamp) ? null : new Date(timestamp);
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a posting's expiry timestamp lies strictly before `now`.
 *
 * @param expiresAt - Expiry date string; missing or unparseable values are
 *   treated as "not expired".
 * @param now - Reference instant; defaults to the current time.
 * @returns `true` only when a valid expiry date is earlier than `now`.
 */
export function isPostingExpiredByDate(
  expiresAt: string | undefined,
  now: Date = new Date(),
): boolean {
  const deadline = parseIsoDate(expiresAt)?.getTime();
  return deadline !== undefined && deadline < now.getTime();
}
|
||||||
|
|
||||||
|
/**
 * Structured address patterns, each tagged with which capture group holds the
 * locality and which holds the country, so the output order never depends on
 * recognising the country by name.
 */
const LINKEDIN_ADDRESS_PATTERNS: ReadonlyArray<{
  pattern: RegExp;
  locality: number;
  country: number;
}> = [
  {
    pattern:
      /"addressLocality"\s*:\s*"([^"]+)"[^}]*"addressCountry"\s*:\s*"([^"]+)"/i,
    locality: 1,
    country: 2,
  },
  {
    pattern:
      /"addressCountry"\s*:\s*"([^"]+)"[^}]*"addressLocality"\s*:\s*"([^"]+)"/i,
    locality: 2,
    country: 1,
  },
];

/** `jobLocation` object carrying a plain `name` instead of address parts. */
const LINKEDIN_JOB_LOCATION_NAME_RE =
  /"jobLocation"\s*:\s*\{[^}]*"name"\s*:\s*"([^"]+)"/i;

/** Free-text fallback: a known Indian city followed shortly by "India". */
const LINKEDIN_INDIA_CITY_RE =
  /\b(Bengaluru|Bangalore|Mumbai|Hyderabad|Pune|Chennai|Delhi|Gurgaon|Noida)[^<]{0,40}\bIndia\b/i;

/** Free-text fallback: a "location" label followed shortly by a country. */
const LINKEDIN_COUNTRY_ONLY_RE =
  /\b(?:job\s+)?location[^<]{0,40}\b(India|Canada|United States|United Kingdom)\b/i;

/**
 * Pulls a human-readable hiring location out of a job page's HTML.
 *
 * Sources are tried in order: an address with locality and country (either
 * field order), a `jobLocation` name, then two free-text heuristics. Address
 * matches are always rendered as "locality, country"; if one part is blank
 * the other is returned on its own.
 *
 * @param html - Raw page markup.
 * @returns The first location found, or `undefined` when nothing matches.
 */
function extractLinkedInLocationFromHtml(html: string): string | undefined {
  for (const { pattern, locality, country } of LINKEDIN_ADDRESS_PATTERNS) {
    const match = pattern.exec(html);
    if (!match) continue;

    const parts: string[] = [];
    for (const raw of [match[locality], match[country]]) {
      const part = raw?.trim();
      if (part) parts.push(part);
    }
    if (parts.length > 0) return parts.join(", ");
  }

  const placeName = LINKEDIN_JOB_LOCATION_NAME_RE.exec(html)?.[1]?.trim();
  if (placeName) return placeName;

  const indiaCity = LINKEDIN_INDIA_CITY_RE.exec(html)?.[0];
  if (indiaCity) {
    // The match may span line breaks in the markup; collapse them.
    return indiaCity.replace(/\s+/g, " ").trim();
  }

  const countryOnly = LINKEDIN_COUNTRY_ONLY_RE.exec(html)?.[1];
  if (countryOnly) return countryOnly.trim();

  return undefined;
}
|
||||||
|
|
||||||
|
/** Upper bound for one probe (request plus body read), in milliseconds. */
const PROBE_TIMEOUT_MS = 10000;

/**
 * Fetches a LinkedIn job URL and reports whether the posting is closed and,
 * if still open, which hiring location the page advertises.
 *
 * A posting counts as closed when the response status is 404 or 410, when the
 * final URL after redirects matches `EXPIRED_URL_RE`, or when the body matches
 * `EXPIRED_BODY_RE`.
 *
 * @param url - Application link to probe.
 * @param fetchImpl - Fetch implementation; injectable for tests.
 * @returns `null` when `url` is blank or not a LinkedIn job URL; otherwise the
 *   probe result. `location` is only present for postings that are not expired.
 * @throws Whatever `fetchImpl` or the body read rejects with, including the
 *   abort error raised once `PROBE_TIMEOUT_MS` elapses.
 */
export async function probeApplicationLink(
  url: string,
  fetchImpl: typeof fetch = fetch,
): Promise<ApplicationLinkProbe | null> {
  const target = url?.trim();
  if (!target || !isLinkedInJobUrl(target)) return null;

  // Bound the request so one unresponsive host cannot stall a caller that
  // awaits probes one after another.
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), PROBE_TIMEOUT_MS);

  try {
    const response = await fetchImpl(target, {
      redirect: "follow",
      headers: {
        Accept: "text/html,application/xhtml+xml",
        "User-Agent": "JobOps/1.0",
      },
      signal: controller.signal,
    });

    // `Response.url` is a string and may be empty (e.g. synthetic responses),
    // so fall back with `||` rather than `??`.
    const finalUrl = response.url || target;
    const html = await response.text();

    const expired =
      response.status === 404 ||
      response.status === 410 ||
      EXPIRED_URL_RE.test(finalUrl) ||
      EXPIRED_BODY_RE.test(html);
    if (expired) return { expired: true };

    const location = extractLinkedInLocationFromHtml(html);
    return location ? { expired: false, location } : { expired: false };
  } finally {
    clearTimeout(timer);
  }
}
|
||||||
@ -2,9 +2,12 @@
|
|||||||
* Fetch a QAJobsBoard job detail page for richer JSON-LD and HTML than jobs.json alone.
|
* Fetch a QAJobsBoard job detail page for richer JSON-LD and HTML than jobs.json alone.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import { isPostingExpiredByDate } from "./application-link.js";
|
||||||
|
|
||||||
export interface QaJobDetailEnrichment {
|
export interface QaJobDetailEnrichment {
|
||||||
location?: string;
|
location?: string;
|
||||||
jobDescription?: string;
|
jobDescription?: string;
|
||||||
|
expired?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
const JSON_LD_RE =
|
const JSON_LD_RE =
|
||||||
@ -30,9 +33,11 @@ export function extractJobLocationFromText(text: string): string | undefined {
|
|||||||
if (!match?.[1]) return undefined;
|
if (!match?.[1]) return undefined;
|
||||||
let extracted = match[1].trim();
|
let extracted = match[1].trim();
|
||||||
extracted =
|
extracted =
|
||||||
extracted.split(
|
extracted
|
||||||
/\s+(?=experience|shift timings|company profile|responsibilities|technical requirements)\b/i,
|
.split(
|
||||||
)[0]?.trim() ?? extracted;
|
/\s+(?=experience|shift timings|company profile|responsibilities|technical requirements)\b/i,
|
||||||
|
)[0]
|
||||||
|
?.trim() ?? extracted;
|
||||||
// Drop office footers like "India | UK | USA" glued onto the city line.
|
// Drop office footers like "India | UK | USA" glued onto the city line.
|
||||||
extracted =
|
extracted =
|
||||||
extracted.split(/\s+(?=(?:india|uk|usa|us|canada)\s*\|)/i)[0]?.trim() ??
|
extracted.split(/\s+(?=(?:india|uk|usa|us|canada)\s*\|)/i)[0]?.trim() ??
|
||||||
@ -84,7 +89,9 @@ function findJobPostingNode(node: unknown): Record<string, unknown> | null {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function formatPostalAddress(address: Record<string, unknown>): string | undefined {
|
function formatPostalAddress(
|
||||||
|
address: Record<string, unknown>,
|
||||||
|
): string | undefined {
|
||||||
const parts = [
|
const parts = [
|
||||||
address.addressLocality,
|
address.addressLocality,
|
||||||
address.addressRegion,
|
address.addressRegion,
|
||||||
@ -95,7 +102,9 @@ function formatPostalAddress(address: Record<string, unknown>): string | undefin
|
|||||||
return parts.length > 0 ? parts.join(", ") : undefined;
|
return parts.length > 0 ? parts.join(", ") : undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
function locationFromJobPosting(posting: Record<string, unknown>): string | undefined {
|
function locationFromJobPosting(
|
||||||
|
posting: Record<string, unknown>,
|
||||||
|
): string | undefined {
|
||||||
const description =
|
const description =
|
||||||
typeof posting.description === "string" ? posting.description : "";
|
typeof posting.description === "string" ? posting.description : "";
|
||||||
const fromDescription = extractJobLocationFromText(stripHtml(description));
|
const fromDescription = extractJobLocationFromText(stripHtml(description));
|
||||||
@ -128,11 +137,23 @@ function locationFromJobLocationNode(node: unknown): string | undefined {
|
|||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function parseQaJobDetailPage(html: string): QaJobDetailEnrichment | null {
|
/**
 * Checks a JobPosting node's `validThrough` value against the current time.
 * A missing or non-string `validThrough` is passed on as `undefined`.
 */
function postingIsExpired(posting: Record<string, unknown>): boolean {
  const { validThrough } = posting;
  return isPostingExpiredByDate(
    typeof validThrough === "string" ? validThrough : undefined,
  );
}
|
||||||
|
|
||||||
|
export function parseQaJobDetailPage(
|
||||||
|
html: string,
|
||||||
|
): QaJobDetailEnrichment | null {
|
||||||
for (const block of parseJsonLdBlocks(html)) {
|
for (const block of parseJsonLdBlocks(html)) {
|
||||||
const posting = findJobPostingNode(block);
|
const posting = findJobPostingNode(block);
|
||||||
if (!posting) continue;
|
if (!posting) continue;
|
||||||
|
|
||||||
|
if (postingIsExpired(posting)) {
|
||||||
|
return { expired: true };
|
||||||
|
}
|
||||||
|
|
||||||
const description =
|
const description =
|
||||||
typeof posting.description === "string"
|
typeof posting.description === "string"
|
||||||
? stripHtml(posting.description)
|
? stripHtml(posting.description)
|
||||||
|
|||||||
75
extractors/qajobsboard/tests/application-link.test.ts
Normal file
75
extractors/qajobsboard/tests/application-link.test.ts
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
import {
|
||||||
|
isLinkedInJobUrl,
|
||||||
|
isPostingExpiredByDate,
|
||||||
|
probeApplicationLink,
|
||||||
|
} from "../src/application-link.js";
|
||||||
|
|
||||||
|
// URL classification: only LinkedIn job-view links should be probed.
describe("isLinkedInJobUrl", () => {
  it("matches LinkedIn job view URLs", () => {
    const linkedInUrl = "https://www.linkedin.com/jobs/view/4362000000";
    expect(isLinkedInJobUrl(linkedInUrl)).toBe(true);
  });

  it("rejects non-LinkedIn URLs", () => {
    const otherUrl = "https://example.com/jobs/1";
    expect(isLinkedInJobUrl(otherUrl)).toBe(false);
  });
});
|
||||||
|
|
||||||
|
// Date-based expiry, evaluated against a fixed reference instant.
describe("isPostingExpiredByDate", () => {
  const referenceNow = new Date("2025-01-01");

  it("returns true when expires_at is in the past", () => {
    const outcome = isPostingExpiredByDate(
      "2020-01-01T00:00:00.000Z",
      referenceNow,
    );
    expect(outcome).toBe(true);
  });

  it("returns false when expires_at is in the future", () => {
    const outcome = isPostingExpiredByDate(
      "2099-01-01T00:00:00.000Z",
      referenceNow,
    );
    expect(outcome).toBe(false);
  });
});
|
||||||
|
|
||||||
|
// Probe behaviour against stubbed fetch responses (no network access).
describe("probeApplicationLink", () => {
  const jobUrl = "https://www.linkedin.com/jobs/view/123";

  // Builds a fetch stub that resolves to a minimal 200 response.
  const stubFetch = (finalUrl: string, body: string) => async () =>
    ({
      url: finalUrl,
      status: 200,
      text: async () => body,
    }) as Response;

  it("marks expired when final URL is LinkedIn expired redirect", async () => {
    const fetchImpl = stubFetch(
      "https://www.linkedin.com/jobs/view/expired_jd_redirect/",
      "<html></html>",
    );

    const result = await probeApplicationLink(jobUrl, fetchImpl);

    expect(result?.expired).toBe(true);
  });

  it("extracts India location from HTML when not expired", async () => {
    const html = `
      <script type="application/ld+json">
      {"@type":"JobPosting","jobLocation":{"address":{"addressLocality":"Bengaluru","addressCountry":"India"}}}
      </script>`;
    const fetchImpl = stubFetch(jobUrl, html);

    const result = await probeApplicationLink(jobUrl, fetchImpl);

    expect(result?.expired).toBe(false);
    expect(result?.location).toMatch(/India/i);
  });
});
|
||||||
@ -23,6 +23,13 @@ describe("parseQaJobDetailPage", () => {
|
|||||||
expect(result?.jobDescription).toContain("Mumbai/Nagpur");
|
expect(result?.jobDescription).toContain("Mumbai/Nagpur");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("marks posting expired when validThrough is in the past", () => {
|
||||||
|
const html = `<script type="application/ld+json">
|
||||||
|
{"@type":"JobPosting","validThrough":"2020-01-01","title":"Old role"}
|
||||||
|
</script>`;
|
||||||
|
expect(parseQaJobDetailPage(html)).toEqual({ expired: true });
|
||||||
|
});
|
||||||
|
|
||||||
it("extracts job location lines from plain text", () => {
|
it("extracts job location lines from plain text", () => {
|
||||||
expect(
|
expect(
|
||||||
extractJobLocationFromText(
|
extractJobLocationFromText(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user