Jobber/extractors/qajobsboard/src/application-link.ts
ilia d28a6221e4
Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 6m10s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m13s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m9s
CI / Type Check (orchestrator) (push) Failing after 1m16s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m9s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m10s
CI / Documentation (push) Successful in 1m56s
fix(qajobsboard): drop expired LinkedIn reposts and resolve hiring location
Probe application links for closed listings and feed expires_at; enrich vague Remote/Worldwide rows with real country before blocked-countries filtering.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-16 17:42:19 -04:00

103 lines
3.0 KiB
TypeScript

/**
 * Probe external application URLs for expiry and hiring location.
 *
 * Only LinkedIn job-view URLs are actually fetched by `probeApplicationLink`;
 * any other URL yields `null`.
 */

/**
 * Outcome of probing a single application link.
 */
export interface ApplicationLinkProbe {
/**
 * True when the posting looks closed: `probeApplicationLink` sets this when
 * the final (post-redirect) URL or the response body matches an expiry
 * marker, or when the response status is 404.
 */
expired: boolean;
/**
 * Hiring location text extracted from the page. The property is omitted
 * when the posting is expired or when no location could be extracted.
 */
location?: string;
}
/**
 * Matches (case-insensitively) http/https URLs on `linkedin.com`, optionally
 * with one alphabetic subdomain label, whose path starts with `/jobs/view/`
 * immediately followed by at least one digit.
 *
 * NOTE(review): slugged links of the form `/jobs/view/<title>-<id>` do not
 * match because digits must directly follow `/view/` — confirm this is intended.
 */
const LINKEDIN_JOB_RE =
/^https?:\/\/(?:[a-z]+\.)?linkedin\.com\/jobs\/view\/\d+/i;
/**
 * Markers that `probeApplicationLink` looks for in the final response URL to
 * decide that a posting is closed.
 */
const EXPIRED_URL_RE = /expired_jd_redirect|unavailable|no longer available/i;
/**
 * Phrases that `probeApplicationLink` looks for in the response body to decide
 * that a posting is closed. Each alternative is anchored on word boundaries.
 */
const EXPIRED_BODY_RE =
/\bno longer accepting applications\b|\bjob you were looking for is no longer available\b|\bthis job is no longer available\b/i;
/**
 * Tell whether a link points at a LinkedIn job-view page.
 *
 * @param url Candidate link; surrounding whitespace is ignored.
 * @returns `true` only when the trimmed value is non-empty and matches
 *   `LINKEDIN_JOB_RE`; `false` for missing or blank input.
 */
export function isLinkedInJobUrl(url: string | undefined): boolean {
  const candidate = url?.trim();
  if (!candidate) return false;
  return LINKEDIN_JOB_RE.test(candidate);
}
/**
 * Convert a date string into a `Date`, tolerating missing or malformed input.
 *
 * Parsing is delegated to `Date.parse`, applied to the value as given.
 *
 * @param value Date text, typically an ISO 8601 timestamp.
 * @returns The parsed `Date`, or `null` when the value is missing, blank, or
 *   not understood by `Date.parse`.
 */
export function parseIsoDate(value: string | undefined): Date | null {
  if (!value || value.trim() === "") return null;
  const epochMs = Date.parse(value);
  return Number.isNaN(epochMs) ? null : new Date(epochMs);
}
/**
 * Decide whether a posting's expiry timestamp lies in the past.
 *
 * @param expiresAt Expiry timestamp text, parsed with `parseIsoDate`.
 * @param now Reference instant; defaults to the current time.
 * @returns `true` only when `expiresAt` parses and is strictly earlier than
 *   `now`; `false` when it is missing or cannot be parsed.
 */
export function isPostingExpiredByDate(
  expiresAt: string | undefined,
  now: Date = new Date(),
): boolean {
  const deadline = parseIsoDate(expiresAt);
  return deadline !== null && now.getTime() > deadline.getTime();
}
/**
 * Best-effort extraction of a hiring location from a job page's HTML.
 *
 * Sources are tried in order and the first hit wins:
 *  1. `"addressLocality"` and `"addressCountry"` string values appearing in
 *     either order with no `}` between them, reported as
 *     `"<locality>, <country>"` regardless of the order found in the markup.
 *  2. A `"name"` value after `"jobLocation": {` and before the next `}`.
 *  3. One of a few Indian city names followed, within 40 non-`<` characters,
 *     by the word "India"; the whole matched span is returned with runs of
 *     whitespace collapsed.
 *  4. The word "location" followed, within 40 non-`<` characters, by one of
 *     a few country names; only the country is returned.
 *
 * Captured values are returned verbatim apart from trimming; JSON escape
 * sequences are not decoded.
 *
 * @param html Raw page markup to scan.
 * @returns The location text, or `undefined` when nothing matched.
 */
function extractLinkedInLocationFromHtml(html: string): string | undefined {
  // Each entry records whether capture group 1 is the country, so the output
  // order never depends on recognising the country by name.
  const structuredPatterns: ReadonlyArray<{
    regex: RegExp;
    countryFirst: boolean;
  }> = [
    {
      regex:
        /"addressLocality"\s*:\s*"([^"]+)"[^}]*"addressCountry"\s*:\s*"([^"]+)"/i,
      countryFirst: false,
    },
    {
      regex:
        /"addressCountry"\s*:\s*"([^"]+)"[^}]*"addressLocality"\s*:\s*"([^"]+)"/i,
      countryFirst: true,
    },
    {
      regex: /"jobLocation"\s*:\s*\{[^}]*"name"\s*:\s*"([^"]+)"/i,
      countryFirst: false,
    },
  ];

  for (const { regex, countryFirst } of structuredPatterns) {
    const match = html.match(regex);
    if (!match) continue;

    const first = match[1]?.trim();
    // Undefined for the single-group `jobLocation` pattern.
    const second = match[2]?.trim();

    if (first && second) {
      return countryFirst ? `${second}, ${first}` : `${first}, ${second}`;
    }
    // Single-group pattern, or one of the two address parts was blank.
    if (first) return first;
  }

  // Free-text fallback: a known Indian city closely followed by "India".
  const indiaCity = html.match(
    /\b(Bengaluru|Bangalore|Mumbai|Hyderabad|Pune|Chennai|Delhi|Gurgaon|Noida)[^<]{0,40}\bIndia\b/i,
  );
  if (indiaCity?.[0]) {
    return indiaCity[0].replace(/\s+/g, " ").trim();
  }

  // Last resort: a bare country name shortly after the word "location".
  const countryOnly = html.match(
    /\b(?:job\s+)?location[^<]{0,40}\b(India|Canada|United States|United Kingdom)\b/i,
  );
  if (countryOnly?.[1]) return countryOnly[1].trim();

  return undefined;
}
/**
 * Fetch a LinkedIn job link and report whether the posting looks closed and,
 * when it does not, any hiring location found in the returned page.
 *
 * A posting is treated as expired when the final (post-redirect) URL matches
 * `EXPIRED_URL_RE`, the body matches `EXPIRED_BODY_RE`, or the status is 404.
 * Errors thrown by `fetchImpl` or by reading the body are not caught here.
 *
 * @param url Application link to probe.
 * @param fetchImpl Fetch implementation to use; defaults to the global `fetch`.
 * @returns `null` when `url` is blank or is not a LinkedIn job URL; otherwise
 *   the probe result, with `location` present only for non-expired postings
 *   where a location was extracted.
 */
export async function probeApplicationLink(
  url: string,
  fetchImpl: typeof fetch = fetch,
): Promise<ApplicationLinkProbe | null> {
  if (!url?.trim() || !isLinkedInJobUrl(url)) return null;

  const response = await fetchImpl(url, {
    redirect: "follow",
    headers: {
      Accept: "text/html,application/xhtml+xml",
      "User-Agent": "JobOps/1.0",
    },
  });
  const landedOn = response.url ?? url;
  const body = await response.text();

  const closedMarkerFound =
    EXPIRED_URL_RE.test(landedOn) || EXPIRED_BODY_RE.test(body);
  if (closedMarkerFound || response.status === 404) {
    return { expired: true };
  }

  const location = extractLinkedInLocationFromHtml(body);
  if (!location) return { expired: false };
  return { expired: false, location };
}