Fetch the job description from the job URL in the ukvisajobs extractor
This commit is contained in:
parent
e6c37998b9
commit
43d307bea0
@ -29,6 +29,83 @@ export interface UkVisaJobsResult {
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
 * Basic HTML to text conversion to extract job description.
 *
 * Processing order:
 *  1. Drop `<script>` / `<style>` elements together with their content.
 *  2. Narrow to the first `<main>` element if present, otherwise `<body>`,
 *     otherwise keep the whole (already script-free) document.
 *  3. Replace every remaining tag with a space.
 *  4. Decode a small set of common HTML entities.
 *  5. Collapse whitespace, trim, and cap the result at 8000 characters
 *     (a trailing '...' is appended when truncated).
 *
 * This is a regex-based best effort, not a real HTML parser.
 *
 * @param html - Raw HTML markup.
 * @returns Plain text extracted from the markup.
 */
function cleanHtml(html: string): string {
  const maxLength = 8000;
  const entities: Record<string, string> = {
    nbsp: ' ',
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    '#39': "'",
  };

  // Remove script, style tags and their content
  const stripped = html.replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/gi, '');

  // Try to extract content between <main> tags if present, or fallback to body.
  // Match against the stripped markup so script/style content stays removed.
  const mainMatch = stripped.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
  const bodyMatch = stripped.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  let text = mainMatch?.[1] ?? bodyMatch?.[1] ?? stripped;

  // Remove remaining HTML tags
  text = text.replace(/<[^>]+>/g, ' ');

  // Unescape common entities in a single pass, so that an escaped entity
  // such as "&amp;lt;" decodes to the literal text "&lt;" and not to "<".
  text = text.replace(
    /&(nbsp|amp|lt|gt|quot|apos|#39);/g,
    (match: string, name: string) => entities[name] ?? match,
  );

  // Normalize whitespace
  text = text.replace(/\s+/g, ' ').trim();

  // Limit length to avoid blowing up AI context
  if (text.length > maxLength) {
    text = text.substring(0, maxLength) + '...';
  }

  return text;
}
|
||||
|
||||
/**
 * Fetch job description from the job URL.
 *
 * Downloads the page at `url` with a browser-like User-Agent, attaching a
 * Cookie header assembled from whichever of the UKVISAJOBS_* environment
 * variables are set, then reduces the HTML to plain text via `cleanHtml`.
 *
 * @param url - Address of the job page to download.
 * @returns The cleaned page text, or `null` when the response status is not
 *          OK, the cleaned text is 100 characters or shorter, or the request
 *          throws (including the 10 second timeout); thrown errors are
 *          logged as a warning rather than propagated.
 */
async function fetchJobDescription(url: string): Promise<string | null> {
  try {
    console.log(` Fetching description from ${url}...`);

    // Assemble cookies from the environment (similar to extractor); unset or
    // empty variables contribute nothing.
    const env = process.env;
    const cookiePairs: Array<[string, string | undefined]> = [
      ['csrf_token', env.UKVISAJOBS_CSRF_TOKEN],
      ['ci_session', env.UKVISAJOBS_CI_SESSION],
      ['authToken', env.UKVISAJOBS_AUTH_TOKEN || env.UKVISAJOBS_TOKEN],
    ];
    const cookieHeader = cookiePairs
      .filter(([, value]) => Boolean(value))
      .map(([name, value]) => `${name}=${value}`)
      .join('; ');

    const requestHeaders: Record<string, string> = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    };
    if (cookieHeader !== '') {
      requestHeaders['Cookie'] = cookieHeader;
    }

    // Abort the request if the server has not answered within 10 seconds.
    const response = await fetch(url, {
      headers: requestHeaders,
      signal: AbortSignal.timeout(10000)
    });

    if (!response.ok) {
      return null;
    }

    const description = cleanHtml(await response.text());

    // A very short result most likely means the page did not contain the
    // description, so treat it as a failure.
    if (description.length > 100) {
      return description;
    }
    return null;
  } catch (error) {
    const reason = error instanceof Error ? error.message : 'Unknown error';
    console.warn(` ⚠️ Failed to fetch description: ${reason}`);
    return null;
  }
}
|
||||
|
||||
/**
|
||||
* Clear previous extraction results.
|
||||
*/
|
||||
@ -95,6 +172,21 @@ export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise
|
||||
const id = job.sourceJobId || job.jobUrl;
|
||||
if (!seenIds.has(id)) {
|
||||
seenIds.add(id);
|
||||
|
||||
// Enrich description if missing or poor
|
||||
const isPoorDescription = !job.jobDescription ||
|
||||
job.jobDescription.length < 100 ||
|
||||
job.jobDescription.startsWith('Visa sponsorship info:');
|
||||
|
||||
if (isPoorDescription && job.jobUrl) {
|
||||
const enriched = await fetchJobDescription(job.jobUrl);
|
||||
if (enriched) {
|
||||
job.jobDescription = enriched;
|
||||
}
|
||||
// Small delay to avoid hammering the server
|
||||
await new Promise((resolve) => setTimeout(resolve, 500));
|
||||
}
|
||||
|
||||
allJobs.push(job);
|
||||
newCount++;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user