From 43d307bea0b9b8d39bbfaf4e16cb58b487b6d919 Mon Sep 17 00:00:00 2001
From: DaKheera47
Date: Tue, 6 Jan 2026 22:16:18 +0000
Subject: [PATCH] get job description with the job url for the ukvisajobs extractor

---
 .../src/server/services/ukvisajobs.ts         | 120 +++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/orchestrator/src/server/services/ukvisajobs.ts b/orchestrator/src/server/services/ukvisajobs.ts
index 0b3fc7c..802a52c 100644
--- a/orchestrator/src/server/services/ukvisajobs.ts
+++ b/orchestrator/src/server/services/ukvisajobs.ts
@@ -29,6 +29,111 @@ export interface UkVisaJobsResult {
   error?: string;
 }
 
+/** Maximum number of description characters kept, to avoid blowing up AI context. */
+const MAX_DESCRIPTION_LENGTH = 8000;
+
+/**
+ * Basic HTML to text conversion to extract job description.
+ *
+ * Drops <script>/<style> blocks, narrows to the <main> element (or <body> as
+ * a fallback), strips the remaining tags, decodes a few common entities,
+ * collapses whitespace and truncates to MAX_DESCRIPTION_LENGTH characters
+ * followed by '...'.
+ *
+ * @param html - Raw HTML of the job page.
+ * @returns The extracted plain text.
+ */
+function cleanHtml(html: string): string {
+  // Remove script, style tags and their content
+  let text = html.replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/gi, '');
+
+  // Narrow to <main> if present, otherwise <body>. Match against the
+  // already-stripped text rather than the raw html, so scripts and styles
+  // inside the container do not leak back into the result.
+  const mainMatch = text.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
+  const bodyMatch = text.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
+  if (mainMatch) {
+    text = mainMatch[1];
+  } else if (bodyMatch) {
+    text = bodyMatch[1];
+  }
+
+  // Remove remaining HTML tags
+  text = text.replace(/<[^>]+>/g, ' ');
+
+  // Unescape common entities. '&amp;' goes last so that an escaped entity
+  // such as '&amp;lt;' ends up as the literal text '&lt;' instead of '<'.
+  text = text
+    .replace(/&nbsp;/g, ' ')
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&amp;/g, '&');
+
+  // Normalize whitespace
+  text = text.replace(/\s+/g, ' ').trim();
+
+  // Limit length to avoid blowing up AI context
+  if (text.length > MAX_DESCRIPTION_LENGTH) {
+    text = text.substring(0, MAX_DESCRIPTION_LENGTH) + '...';
+  }
+
+  return text;
+}
+
+/**
+ * Fetch job description from the job URL.
+ *
+ * Best-effort: resolves to null on a network error, timeout, non-OK HTTP
+ * status, or when the page yields 100 characters of text or fewer. Errors and
+ * non-OK statuses are reported via console.warn.
+ *
+ * @param url - Job page URL to download.
+ * @returns The cleaned description text, or null if none could be obtained.
+ */
+async function fetchJobDescription(url: string): Promise<string | null> {
+  try {
+    console.log(` Fetching description from ${url}...`);
+
+    // Build cookies if present in env (similar to extractor)
+    // NOTE(review): these credentials are sent to whatever host `url` points
+    // at - confirm job URLs can never leave the site the cookies belong to.
+    const cookieParts: string[] = [];
+    if (process.env.UKVISAJOBS_CSRF_TOKEN) cookieParts.push(`csrf_token=${process.env.UKVISAJOBS_CSRF_TOKEN}`);
+    if (process.env.UKVISAJOBS_CI_SESSION) cookieParts.push(`ci_session=${process.env.UKVISAJOBS_CI_SESSION}`);
+    const token = process.env.UKVISAJOBS_AUTH_TOKEN || process.env.UKVISAJOBS_TOKEN;
+    if (token) cookieParts.push(`authToken=${token}`);
+
+    const headers: Record<string, string> = {
+      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+    };
+
+    if (cookieParts.length > 0) {
+      headers['Cookie'] = cookieParts.join('; ');
+    }
+
+    const response = await fetch(url, {
+      headers,
+      signal: AbortSignal.timeout(10000) // 10s timeout
+    });
+
+    if (!response.ok) {
+      console.warn(` ⚠️ Failed to fetch description: HTTP ${response.status}`);
+      return null;
+    }
+
+    const html = await response.text();
+    const cleaned = cleanHtml(html);
+
+    // If we only got a tiny bit of text, it might have failed
+    return cleaned.length > 100 ? cleaned : null;
+  } catch (error) {
+    console.warn(` ⚠️ Failed to fetch description: ${error instanceof Error ? error.message : 'Unknown error'}`);
+    return null;
+  }
+}
+
 /**
  * Clear previous extraction results.
  */
@@ -95,6 +200,21 @@ export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise
         const id = job.sourceJobId || job.jobUrl;
         if (!seenIds.has(id)) {
           seenIds.add(id);
+
+          // Enrich description if missing or poor
+          const isPoorDescription = !job.jobDescription ||
+            job.jobDescription.length < 100 ||
+            job.jobDescription.startsWith('Visa sponsorship info:');
+
+          if (isPoorDescription && job.jobUrl) {
+            const enriched = await fetchJobDescription(job.jobUrl);
+            if (enriched) {
+              job.jobDescription = enriched;
+            }
+            // Small delay to avoid hammering the server
+            await new Promise((resolve) => setTimeout(resolve, 500));
+          }
+
           allJobs.push(job);
           newCount++;
         }