Some checks failed
CI / Linting (Biome) (push) Failing after 41s
CI / Tests (push) Successful in 6m10s
CI / Type Check (adzuna-extractor) (push) Successful in 1m9s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m13s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m9s
CI / Type Check (orchestrator) (push) Failing after 1m16s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m9s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m10s
CI / Documentation (push) Successful in 1m56s
Probe application links for closed listings and feed expires_at; enrich vague Remote/Worldwide rows with real country before blocked-countries filtering. Co-authored-by: Cursor <cursoragent@cursor.com>
103 lines
3.0 KiB
TypeScript
103 lines
3.0 KiB
TypeScript
/**
 * Probe external application URLs (currently only LinkedIn job pages) for
 * expiry and hiring location.
 */
|
|
|
|
/** Result of probing a single external application link. */
export interface ApplicationLinkProbe {
  /** True when the probed page indicates the posting is closed or gone. */
  expired: boolean;
  /** Hiring location scraped from the page; omitted when none could be extracted. */
  location?: string;
}
|
|
|
|
/**
 * Matches http(s) LinkedIn job-view URLs: an optional single alphabetic
 * subdomain label (e.g. `www.`), then `/jobs/view/` immediately followed by
 * digits. Anchored at the start only, so any trailing path or query is allowed.
 */
const LINKEDIN_JOB_RE =
  /^https?:\/\/(?:[a-z]+\.)?linkedin\.com\/jobs\/view\/\d+/i;

/** Markers in a (post-redirect) URL that indicate the posting has been taken down. */
const EXPIRED_URL_RE = /expired_jd_redirect|unavailable|no longer available/i;

/** Phrases in the page body that indicate the posting is closed. */
const EXPIRED_BODY_RE =
  /\bno longer accepting applications\b|\bjob you were looking for is no longer available\b|\bthis job is no longer available\b/i;
|
|
|
|
/**
 * Report whether `url` looks like a LinkedIn job-view link.
 *
 * @param url - Candidate URL; surrounding whitespace is ignored.
 * @returns `true` when the trimmed URL matches `LINKEDIN_JOB_RE`; `false` for
 *   `undefined`, blank, or non-matching input.
 */
export function isLinkedInJobUrl(url: string | undefined): boolean {
  const candidate = url?.trim();
  if (!candidate) return false;
  return LINKEDIN_JOB_RE.test(candidate);
}
|
|
|
|
/**
 * Parse a date string into a `Date`.
 *
 * Despite the name, any format understood by `Date.parse` is accepted; ISO 8601
 * is simply the expected input.
 *
 * @param value - Date string; may be `undefined` or blank.
 * @returns The parsed `Date`, or `null` when the input is missing, blank, or
 *   not parseable.
 */
export function parseIsoDate(value: string | undefined): Date | null {
  const text = value?.trim();
  if (!text) return null;

  // Parse the trimmed text: how padded strings are handled by Date.parse is
  // implementation-defined, so stripping whitespace keeps results consistent.
  const timestamp = Date.parse(text);
  return Number.isNaN(timestamp) ? null : new Date(timestamp);
}
|
|
|
|
/**
 * Decide whether a posting's expiry timestamp lies in the past.
 *
 * @param expiresAt - Expiry date string, parsed with `parseIsoDate`.
 * @param now - Reference instant; defaults to the current time.
 * @returns `true` only when `expiresAt` parses and is strictly earlier than
 *   `now`. A missing or unparseable expiry is treated as "not expired".
 */
export function isPostingExpiredByDate(
  expiresAt: string | undefined,
  now: Date = new Date(),
): boolean {
  const expiry = parseIsoDate(expiresAt);
  if (expiry === null) return false;
  return expiry.getTime() < now.getTime();
}
|
|
|
|
/**
 * Best-effort extraction of a hiring location from a job page's HTML.
 *
 * Strategies are tried in order, and the first hit wins:
 *  1. `addressLocality` / `addressCountry` pairs (either key order) inside one
 *     JSON object, rendered as `"locality, country"`.
 *  2. The `name` of a `jobLocation` JSON object.
 *  3. A known Indian city followed within 40 characters by "India".
 *  4. A "location" label followed within 40 characters by a known country name.
 *
 * Captured text is returned as found in the markup; HTML entities and JSON
 * escapes are not decoded.
 *
 * @param html - Raw page markup.
 * @returns The extracted location, or `undefined` when nothing matched.
 */
function extractLinkedInLocationFromHtml(html: string): string | undefined {
  // Each pattern records which capture group holds the locality and which the
  // country, so the output order never depends on guessing from the text
  // (country values are often ISO codes such as "IN" or "DE").
  const addressPatterns: ReadonlyArray<{
    re: RegExp;
    localityGroup: number;
    countryGroup: number;
  }> = [
    {
      re: /"addressLocality"\s*:\s*"([^"]+)"[^}]*"addressCountry"\s*:\s*"([^"]+)"/i,
      localityGroup: 1,
      countryGroup: 2,
    },
    {
      re: /"addressCountry"\s*:\s*"([^"]+)"[^}]*"addressLocality"\s*:\s*"([^"]+)"/i,
      localityGroup: 2,
      countryGroup: 1,
    },
  ];

  for (const { re, localityGroup, countryGroup } of addressPatterns) {
    const match = html.match(re);
    if (!match) continue;

    const locality = match[localityGroup]?.trim();
    const country = match[countryGroup]?.trim();
    if (locality && country) return `${locality}, ${country}`;

    // One side was whitespace-only: return whichever part is usable.
    const partial = locality || country;
    if (partial) return partial;
  }

  const jobLocationName = html
    .match(/"jobLocation"\s*:\s*\{[^}]*"name"\s*:\s*"([^"]+)"/i)?.[1]
    ?.trim();
  if (jobLocationName) return jobLocationName;

  // Plain-text fallback: "<city> ... India" within a single text node.
  const indiaCity = html.match(
    /\b(Bengaluru|Bangalore|Mumbai|Hyderabad|Pune|Chennai|Delhi|Gurgaon|Noida)[^<]{0,40}\bIndia\b/i,
  );
  if (indiaCity?.[0]) {
    return indiaCity[0].replace(/\s+/g, " ").trim();
  }

  // Last resort: a "location" label followed shortly by a known country name.
  const countryOnly = html.match(
    /\b(?:job\s+)?location[^<]{0,40}\b(India|Canada|United States|United Kingdom)\b/i,
  );
  if (countryOnly?.[1]) return countryOnly[1].trim();

  return undefined;
}
|
|
|
|
/**
 * Fetch a LinkedIn job posting and report whether it has closed and, when it
 * is still open, the hiring location found on the page.
 *
 * @param url - Application URL to probe. Only URLs accepted by
 *   `isLinkedInJobUrl` are fetched.
 * @param fetchImpl - Fetch implementation to use (injectable for tests);
 *   defaults to the global `fetch`.
 * @returns `null` when the URL is blank or not a LinkedIn job link; otherwise
 *   the probe result. `location` is present only when the posting is not
 *   expired and a location could be extracted.
 * @throws Whatever `fetchImpl` or `response.text()` rejects with; errors are
 *   not caught here.
 */
export async function probeApplicationLink(
  url: string,
  fetchImpl: typeof fetch = fetch,
): Promise<ApplicationLinkProbe | null> {
  // Validate and fetch the same trimmed string.
  const target = url?.trim();
  if (!target || !isLinkedInJobUrl(target)) return null;

  const response = await fetchImpl(target, {
    redirect: "follow",
    headers: {
      Accept: "text/html,application/xhtml+xml",
      "User-Agent": "JobOps/1.0",
    },
  });

  // `response.url` is an empty string (not nullish) for synthetic responses,
  // so fall back with `||` rather than `??`.
  const finalUrl = response.url || target;
  const html = await response.text();

  // 404 Not Found and 410 Gone both mean the posting no longer exists.
  const gone = response.status === 404 || response.status === 410;
  const expired =
    gone || EXPIRED_URL_RE.test(finalUrl) || EXPIRED_BODY_RE.test(html);

  // A closed posting's page is not a reliable source for location data.
  const location = expired ? undefined : extractLinkedInLocationFromHtml(html);

  return location ? { expired, location } : { expired };
}
|