549 lines
19 KiB
TypeScript
549 lines
19 KiB
TypeScript
/**
 * Service for running the UK Visa Jobs extractor (extractors/ukvisajobs).
 *
 * Spawns the extractor as a child process and reads its output dataset.
 */
|
|
|
|
import { spawn } from 'child_process';
|
|
import { readdir, readFile, rm, mkdir } from 'fs/promises';
|
|
import { join, dirname } from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import type { CreateJobInput } from '../../shared/types.js';
|
|
|
|
// Directory containing this module; ES modules have no built-in __dirname, so derive it from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));

// Remote jobs endpoint and the page size reported back to callers of fetchUkVisaJobsPage.
const UKVISAJOBS_API_URL = 'https://my.ukvisajobs.com/ukvisa-api/api/fetch-jobs-data';
const UKVISAJOBS_PAGE_SIZE = 15;

// Extractor project root (resolved relative to this module) and the files read from it.
const UKVISAJOBS_DIR = join(__dirname, '../../../../extractors/ukvisajobs');
const STORAGE_DIR = join(UKVISAJOBS_DIR, 'storage/datasets/default');
const AUTH_CACHE_PATH = join(UKVISAJOBS_DIR, 'storage/ukvisajobs-auth.json');

// Guard flag: set while runUkVisaJobs is executing so overlapping runs are rejected.
let isUkVisaJobsRunning = false;
|
|
|
|
/**
 * Shape of the JSON auth cache read from disk.
 *
 * Every field is optional; consumers in this module treat a session as usable
 * only when `authToken` or `token` is non-empty.
 */
interface UkVisaJobsAuthSession {
  /** Preferred auth token; wins over `token` wherever both are consulted. */
  authToken?: string;
  /** Fallback auth token, used only when `authToken` is missing or empty. */
  token?: string;
  /** Value sent as the `csrf_token` cookie when present. */
  csrfToken?: string;
  /** Value sent as the `ci_session` cookie when present. */
  ciSession?: string;
}
|
|
|
|
/** Options accepted by `runUkVisaJobs`. */
export interface RunUkVisaJobsOptions {
  /**
   * Maximum number of jobs to fetch per search term. Defaults to 50, max 200.
   * NOTE(review): the cap of 200 is not enforced in this module; presumably the
   * extractor applies it - confirm.
   */
  maxJobs?: number;
  /** Search keyword filter (single) - legacy support. Ignored when `searchTerms` is non-empty. */
  searchKeyword?: string;
  /** List of search terms to run sequentially. Takes precedence over `searchKeyword`. */
  searchTerms?: string[];
}
|
|
|
|
/** Outcome returned by `runUkVisaJobs`. */
export interface UkVisaJobsResult {
  /** False when the run was rejected or failed; `error` then carries the reason. */
  success: boolean;
  /** Jobs collected across all search terms (empty on failure). */
  jobs: CreateJobInput[];
  /** Human-readable failure reason; absent on success. */
  error?: string;
}
|
|
|
|
/**
 * Normalise a loosely-typed value to a trimmed, non-empty string.
 *
 * @param value - Any value, typically a field of an API payload.
 * @returns The trimmed string, the `String()` form of a number or boolean,
 *          or `null` for blank strings and every other type.
 */
function toStringOrNull(value: unknown): string | null {
  switch (typeof value) {
    case 'string': {
      const text = value.trim();
      return text === '' ? null : text;
    }
    case 'number':
    case 'boolean':
      return String(value);
    default:
      // Covers null, undefined, objects, arrays, symbols, bigints and functions.
      return null;
  }
}
|
|
|
|
/**
 * Normalise a loosely-typed value to a finite number.
 *
 * @param value - Any value, typically a field of an API payload.
 * @returns The number itself, or the `Number()` conversion of a non-blank
 *          string, when the result is finite; otherwise `null`.
 */
function toNumberOrNull(value: unknown): number | null {
  let candidate: number;

  if (typeof value === 'number') {
    candidate = value;
  } else if (typeof value === 'string') {
    const text = value.trim();
    // Number('') is 0, so blank strings must be rejected before converting.
    if (text === '') return null;
    candidate = Number(text);
  } else {
    return null;
  }

  return Number.isFinite(candidate) ? candidate : null;
}
|
|
|
|
/**
 * Build the `Cookie` header value for a session.
 *
 * @param session - Auth session whose tokens should be sent as cookies.
 * @returns `name=value` pairs joined by `'; '` in the order csrf_token,
 *          ci_session, authToken; empty values are omitted, so the result
 *          may be an empty string.
 */
function buildCookieHeader(session: UkVisaJobsAuthSession): string {
  // authToken is preferred; token is the fallback.
  const authCookie = session.authToken || session.token;

  const candidates: Array<[string, string | undefined]> = [
    ['csrf_token', session.csrfToken],
    ['ci_session', session.ciSession],
    ['authToken', authCookie],
  ];

  return candidates
    .filter(([, value]) => Boolean(value))
    .map(([name, value]) => `${name}=${value}`)
    .join('; ');
}
|
|
|
|
/**
 * Summarise the sponsorship flags of an API job as a one-line description.
 *
 * @param raw - Job record as returned by the API.
 * @returns `Visa sponsorship info: ...` listing every flag whose value is
 *          "yes" (case-insensitive), or `undefined` when no flag is set.
 */
function buildVisaInfoDescription(raw: UkVisaJobsApiJob): string | undefined {
  // Order here is the order the labels appear in the output.
  const flags: Array<[string | undefined, string]> = [
    [raw.visa_acceptance, 'Visa acceptance: Yes'],
    [raw.applicants_outside_uk, 'Accepts applicants outside UK'],
    [raw.likely_to_sponsor, 'Likely to sponsor'],
    [raw.definitely_sponsored, 'Definitely sponsored'],
    [raw.new_entrant, 'New entrant friendly'],
    [raw.student_graduate, 'Student/Graduate friendly'],
  ];

  const labels = flags
    .filter(([flag]) => flag?.toLowerCase() === 'yes')
    .map(([, label]) => label);

  return labels.length === 0 ? undefined : `Visa sponsorship info: ${labels.join(', ')}`;
}
|
|
|
|
/**
 * Render the salary fields of an API job as display text.
 *
 * Numbers are always formatted with the `en-GB` locale so the stored text does
 * not depend on the locale of the host running this service.
 *
 * @param raw - Job record as returned by the API.
 * @returns `GBP <min>-<max>[ / <interval>]` when both bounds are usable,
 *          `GBP <max>[ / <interval>]` when only a positive maximum is known,
 *          otherwise `undefined`.
 */
function formatSalary(raw: UkVisaJobsApiJob): string | undefined {
  const minSalary = toNumberOrNull(raw.min_salary);
  const maxSalary = toNumberOrNull(raw.max_salary);

  // Without a positive maximum there is nothing meaningful to show.
  if (maxSalary === null || maxSalary <= 0) return undefined;

  const interval = toStringOrNull(raw.salary_interval);
  const suffix = interval ? ` / ${interval}` : '';
  const formatAmount = (amount: number): string => amount.toLocaleString('en-GB');

  // A missing or zero minimum collapses the range to the maximum alone.
  const amountText = minSalary
    ? `${formatAmount(minSalary)}-${formatAmount(maxSalary)}`
    : formatAmount(maxSalary);

  return `GBP ${amountText}${suffix}`;
}
|
|
|
|
/**
 * Convert a raw API job record into the `CreateJobInput` shape.
 *
 * Every text field is normalised through `toStringOrNull`; blank values become
 * `undefined`, except `title` and `employer` (placeholder text) and `jobUrl`
 * (empty string). When the API supplies no description, the sponsorship-flag
 * summary is used instead.
 *
 * @param raw - Job record as returned by the API.
 * @returns The mapped job.
 */
function mapApiJob(raw: UkVisaJobsApiJob): CreateJobInput {
  const text = (value: unknown): string | undefined => toStringOrNull(value) ?? undefined;

  // The same link serves as both the canonical URL and the application link.
  const link = text(raw.job_link);

  return {
    source: 'ukvisajobs',
    sourceJobId: text(raw.id),
    title: text(raw.title) ?? 'Unknown Title',
    employer: text(raw.company_name) ?? 'Unknown Employer',
    employerUrl: text(raw.company_link),
    jobUrl: link ?? '',
    applicationLink: link,
    location: text(raw.city),
    deadline: text(raw.job_expire),
    salary: formatSalary(raw),
    jobDescription: text(raw.description) ?? buildVisaInfoDescription(raw),
    datePosted: text(raw.created_date),
    degreeRequired: text(raw.degree_requirement),
    jobType: text(raw.job_type),
    jobLevel: text(raw.job_level),
  };
}
|
|
|
|
/**
 * One job record as declared for the jobs API response.
 *
 * All values are declared as strings; this module still normalises them
 * defensively before use.
 */
interface UkVisaJobsApiJob {
  // Identity and links
  id: string;
  title: string;
  company_name: string;
  company_link?: string;
  job_link: string;

  // Location, dates and free text
  city?: string;
  created_date?: string;
  job_expire?: string;
  description?: string;

  // Salary: min/max are parsed as numbers when the salary text is built
  min_salary?: string;
  max_salary?: string;
  salary_interval?: string;
  /** Not read by this module. */
  salary_method?: string;

  // Classification
  degree_requirement?: string;
  job_type?: string;
  job_level?: string;
  /** Not read by this module. */
  job_industry?: string;

  // Sponsorship flags: the value "yes" (any casing) counts as set
  visa_acceptance?: string;
  applicants_outside_uk?: string;
  likely_to_sponsor?: string;
  definitely_sponsored?: string;
  new_entrant?: string;
  student_graduate?: string;
}
|
|
|
|
/** Envelope declared for the jobs API response body. */
interface UkVisaJobsApiResponse {
  /** Application-level status; this module treats any value other than 1 as a failure. */
  status: number;
  /** Total number of matching jobs reported by the API. */
  totalJobs: number;
  /** Not read by this module. */
  query?: string;
  /** Job records for the requested page. */
  jobs: UkVisaJobsApiJob[];
}
|
|
|
|
/**
 * Basic HTML to text conversion to extract job description.
 *
 * Steps, in order: drop script/style elements, narrow to the first main
 * element (else body, else the whole input), strip remaining tags, decode a
 * small set of entities, collapse whitespace, and cap the length.
 *
 * @param html - Raw HTML document or fragment.
 * @returns Plain text, at most 8000 characters followed by `...` when truncated.
 */
function cleanHtml(html: string): string {
  const maxLength = 8000;

  // Entities handled: nbsp, amp, lt, gt, quot, apos and the numeric apostrophe.
  const entityText: Record<string, string> = {
    nbsp: ' ',
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    '#39': "'",
    '#039': "'",
  };

  // Remove script and style elements together with their content.
  const stripped = html.replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/gi, '');

  // Narrow to <main>, falling back to <body>. Matching against the stripped
  // markup (not the raw input) keeps script/style content out of the result.
  const region =
    stripped.match(/<main[^>]*>([\s\S]*?)<\/main>/i)?.[1] ??
    stripped.match(/<body[^>]*>([\s\S]*?)<\/body>/i)?.[1] ??
    stripped;

  // Remove remaining HTML tags.
  let text = region.replace(/<[^>]+>/g, ' ');

  // Decode entities in a single pass so an escaped ampersand followed by
  // "lt;" stays an entity reference instead of being unescaped twice.
  text = text.replace(
    /&(nbsp|amp|lt|gt|quot|apos|#0?39);/g,
    (match: string, name: string) => entityText[name] ?? match,
  );

  // Normalize whitespace.
  text = text.replace(/\s+/g, ' ').trim();

  // Limit length to avoid blowing up AI context.
  if (text.length > maxLength) {
    text = text.substring(0, maxLength) + '...';
  }

  return text;
}
|
|
|
|
/**
 * Fetch job description from the job URL.
 *
 * Sends the cached auth cookies (when a session is cached) together with a
 * browser-like User-Agent, converts the returned HTML to text, and rejects
 * results that are too short to be a real description. Never throws: any
 * failure is logged as a warning and reported as `null`.
 *
 * @param url - Job page URL.
 * @returns Cleaned description text, or `null` on a non-OK response, a result
 *          of 100 characters or fewer, or any error (including the 10s timeout).
 */
async function fetchJobDescription(url: string): Promise<string | null> {
  try {
    console.log(` Fetching description from ${url}...`);

    const headers: Record<string, string> = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    };

    // Reuse the shared cookie builder so page and API requests authenticate identically.
    const authSession = await loadCachedAuthSession();
    const cookieHeader = authSession ? buildCookieHeader(authSession) : '';
    if (cookieHeader) {
      headers['Cookie'] = cookieHeader;
    }

    const response = await fetch(url, {
      headers,
      signal: AbortSignal.timeout(10000), // 10s timeout
    });

    if (!response.ok) return null;

    const cleaned = cleanHtml(await response.text());

    // If we only got a tiny bit of text, it might have failed
    return cleaned.length > 100 ? cleaned : null;
  } catch (error) {
    console.warn(` ⚠️ Failed to fetch description: ${error instanceof Error ? error.message : 'Unknown error'}`);
    return null;
  }
}
|
|
|
|
/**
 * Read the cached auth session from disk.
 *
 * @returns The parsed cache file, or `null` when the file cannot be read or
 *          does not contain valid JSON. The parsed value is not validated.
 */
async function loadCachedAuthSession(): Promise<UkVisaJobsAuthSession | null> {
  let serialized: string;
  try {
    serialized = await readFile(AUTH_CACHE_PATH, 'utf-8');
  } catch {
    // Missing or unreadable cache is treated as "no session".
    return null;
  }

  try {
    return JSON.parse(serialized) as UkVisaJobsAuthSession;
  } catch {
    // Corrupt cache is treated as "no session".
    return null;
  }
}
|
|
|
|
/**
 * Pick the auth token from a session.
 *
 * @param session - Cached session, or `null` when none is available.
 * @returns `authToken` when non-empty, else `token` when non-empty, else `null`.
 */
function getAuthToken(session: UkVisaJobsAuthSession | null): string | null {
  return session?.authToken || session?.token || null;
}
|
|
|
|
/**
 * Type guard: true when the session exists and carries a non-empty
 * `authToken` or `token`.
 *
 * @param session - Cached session, or `null` when none is available.
 */
function hasAuthToken(session: UkVisaJobsAuthSession | null): session is UkVisaJobsAuthSession {
  if (!session) return false;
  return Boolean(session.authToken) || Boolean(session.token);
}
|
|
|
|
/**
 * Decide whether a failed API response means the auth session is no longer valid.
 *
 * @param status - HTTP status code of the response.
 * @param bodyText - Raw response body.
 * @returns `true` for 401/403, and for 400 when the body (JSON `errorType` or
 *          `message`, or the raw text) mentions "expired"; `false` otherwise.
 */
function isAuthErrorResponse(status: number, bodyText: string): boolean {
  switch (status) {
    case 401:
    case 403:
      return true;
    case 400:
      break; // Needs a closer look at the body below.
    default:
      return false;
  }

  try {
    const payload = JSON.parse(bodyText) as { errorType?: string; message?: string };
    const message = payload?.message;
    if (payload?.errorType === 'expired' || (message && message.toLowerCase().includes('expired'))) {
      return true;
    }
  } catch {
    // Body is not usable JSON; fall through to the plain-text check.
  }

  return bodyText.toLowerCase().includes('expired');
}
|
|
|
|
/**
 * Refresh the cached auth session by running the extractor in refresh-only mode.
 *
 * Spawns `npx tsx src/main.ts` in the extractor directory with
 * `UKVISAJOBS_REFRESH_ONLY=1` and waits for it to exit.
 *
 * @throws Error when `UKVISAJOBS_EMAIL` or `UKVISAJOBS_PASSWORD` is not set,
 *         when the process cannot be spawned, or when it exits with a non-zero code.
 */
async function refreshUkVisaJobsAuthSession(): Promise<void> {
  const { UKVISAJOBS_EMAIL: email, UKVISAJOBS_PASSWORD: password } = process.env;
  if (!(email && password)) {
    throw new Error('UK Visa Jobs auth expired. Set UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD to refresh.');
  }

  const refreshEnv = { ...process.env, UKVISAJOBS_REFRESH_ONLY: '1' };

  await new Promise<void>((resolve, reject) => {
    const extractor = spawn('npx', ['tsx', 'src/main.ts'], {
      cwd: UKVISAJOBS_DIR,
      stdio: 'inherit',
      env: refreshEnv,
    });

    extractor.on('error', reject);
    extractor.on('close', (exitCode) => {
      if (exitCode !== 0) {
        reject(new Error(`UK Visa Jobs auth refresh exited with code ${exitCode}`));
        return;
      }
      resolve();
    });
  });
}
|
|
|
|
/**
 * Return a session that carries an auth token, refreshing once if needed.
 *
 * @returns The cached session when it already has a token; otherwise the
 *          session re-read from disk after a refresh.
 * @throws Error when the refresh fails or still leaves no token in the cache.
 */
async function loadAuthSessionOrRefresh(): Promise<UkVisaJobsAuthSession> {
  const cached = await loadCachedAuthSession();
  if (hasAuthToken(cached)) return cached;

  await refreshUkVisaJobsAuthSession();

  const refreshed = await loadCachedAuthSession();
  if (hasAuthToken(refreshed)) return refreshed;

  throw new Error('UK Visa Jobs auth session missing. Set UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD to refresh.');
}
|
|
|
|
/**
 * Clear previous extraction results.
 *
 * A missing directory is not an error (`force: true`). Any other failure is
 * propagated: if stale files could not be removed, a later dataset read would
 * return them as if they belonged to the current run.
 *
 * @throws The underlying filesystem error when the directory exists but
 *         cannot be removed, even after the retries.
 */
async function clearStorageDataset(): Promise<void> {
  // maxRetries lets Node retry transient EBUSY/ENOTEMPTY/EPERM failures during recursive removal.
  await rm(STORAGE_DIR, { recursive: true, force: true, maxRetries: 3 });
}
|
|
|
|
/**
 * Fetch one page of jobs directly from the UK Visa Jobs API.
 *
 * Uses the cached auth session (refreshing it first when no token is cached).
 * If the API rejects the request with an auth-style error, the session is
 * refreshed once and the request is retried once.
 *
 * @param options.searchKeyword - Keyword filter; when empty the literal string `null` is sent.
 * @param options.page - 1-based page number; values that are not finite numbers >= 1 fall back
 *                       to 1, and fractional values are floored.
 * @returns The mapped jobs that have a URL, the total job count (falls back to the number of
 *          mapped jobs when the API value is not numeric), the page requested, and the page size.
 * @throws Error when no auth token can be obtained, when the API responds with a non-OK HTTP
 *         status, when the body is not a JSON object, or when its `status` field is not 1.
 */
export async function fetchUkVisaJobsPage(options: { searchKeyword?: string; page?: number } = {}): Promise<{
  jobs: CreateJobInput[];
  totalJobs: number;
  page: number;
  pageSize: number;
}> {
  const missingSessionMessage =
    'UK Visa Jobs auth session missing. Set UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD to refresh.';
  const invalidResponseMessage = 'UK Visa Jobs API returned an invalid response.';

  const requestedPage = options.page;
  const page =
    typeof requestedPage === 'number' && Number.isFinite(requestedPage) && requestedPage >= 1
      ? Math.floor(requestedPage)
      : 1;

  const fetchWithSession = async (session: UkVisaJobsAuthSession) => {
    const token = getAuthToken(session);
    if (!token) {
      throw new Error(missingSessionMessage);
    }

    // The API expects the token both in the form body and in the cookies.
    const formData = new FormData();
    formData.append('is_global', '0');
    formData.append('sortBy', 'desc');
    formData.append('pageNo', String(page));
    formData.append('visaAcceptance', 'false');
    formData.append('applicants_outside_uk', 'false');
    formData.append('searchKeyword', options.searchKeyword ? options.searchKeyword : 'null');
    formData.append('token', token);

    const response = await fetch(UKVISAJOBS_API_URL, {
      method: 'POST',
      headers: {
        'accept': 'application/json, text/plain, */*',
        'cookie': buildCookieHeader(session),
        'origin': 'https://my.ukvisajobs.com',
        'referer': `https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&pageNo=${page}&visaAcceptance=false&applicants_outside_uk=false`,
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      },
      body: formData,
    });

    // Read the body eagerly: it is needed both for error detection and for parsing.
    const text = await response.text();
    return { response, text };
  };

  let { response, text } = await fetchWithSession(await loadAuthSessionOrRefresh());

  // One refresh-and-retry when the failure looks like an expired or rejected session.
  if (!response.ok && isAuthErrorResponse(response.status, text)) {
    await refreshUkVisaJobsAuthSession();
    const refreshedSession = await loadCachedAuthSession();
    if (!hasAuthToken(refreshedSession)) {
      throw new Error(missingSessionMessage);
    }
    ({ response, text } = await fetchWithSession(refreshedSession));
  }

  if (!response.ok) {
    throw new Error(`UK Visa Jobs API returned ${response.status}: ${text}`);
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    throw new Error(invalidResponseMessage);
  }
  // Valid JSON that is not an object (e.g. `null`) is just as unusable as invalid JSON.
  if (typeof parsed !== 'object' || parsed === null) {
    throw new Error(invalidResponseMessage);
  }

  const data = parsed as Partial<UkVisaJobsApiResponse>;
  if (data.status !== 1) {
    throw new Error(`UK Visa Jobs API returned status ${data.status}`);
  }

  // Tolerate a missing or malformed job list and skip entries that are not objects.
  const rawJobs: unknown[] = Array.isArray(data.jobs) ? data.jobs : [];
  const jobs = rawJobs
    .filter((entry): entry is UkVisaJobsApiJob => typeof entry === 'object' && entry !== null)
    .map(mapApiJob)
    .filter((job) => Boolean(job.jobUrl));

  // Accept numeric strings as well as numbers for the total.
  const totalJobs = toNumberOrNull(data.totalJobs) ?? jobs.length;

  return {
    jobs,
    totalJobs,
    page,
    pageSize: UKVISAJOBS_PAGE_SIZE,
  };
}
|
|
|
|
/** Pick the search terms for a run: `searchTerms`, else `searchKeyword`, else one keyword-less run. */
function resolveUkVisaJobsSearchTerms(options: RunUkVisaJobsOptions): string[] {
  if (options.searchTerms && options.searchTerms.length > 0) return [...options.searchTerms];
  if (options.searchKeyword) return [options.searchKeyword];
  // No search terms = run once without keyword
  return [''];
}

/** Spawn the extractor for one search term; settles when the process exits or fails to start. */
function runUkVisaJobsExtractorProcess(term: string, maxJobs: number): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const child = spawn('npx', ['tsx', 'src/main.ts'], {
      cwd: UKVISAJOBS_DIR,
      stdio: 'inherit',
      env: {
        ...process.env,
        UKVISAJOBS_MAX_JOBS: String(maxJobs),
        UKVISAJOBS_SEARCH_KEYWORD: term,
      },
    });

    child.on('close', (code) => {
      if (code === 0) resolve();
      else reject(new Error(`UK Visa Jobs extractor exited with code ${code}`));
    });
    child.on('error', reject);
  });
}

/** True when a job's description is missing, short, or only the synthetic sponsorship summary. */
function needsUkVisaJobsDescriptionEnrichment(job: CreateJobInput): boolean {
  const description = job.jobDescription;
  return !description || description.length < 100 || description.startsWith('Visa sponsorship info:');
}

/**
 * Run the UK Visa Jobs extractor once per search term and collect the results.
 *
 * For each term the dataset directory is cleared, the extractor is spawned,
 * and its output is read back. Jobs are de-duplicated across terms by
 * `sourceJobId` (or `jobUrl`), and poor descriptions are enriched by fetching
 * the job page. A failing term is logged and skipped so the remaining terms
 * still run. Only one run may be active at a time.
 *
 * @param options - See `RunUkVisaJobsOptions`; `maxJobs` defaults to 50.
 * @returns `success: true` with the unique jobs when at least one term
 *          completed (or produced jobs); `success: false` with an `error` when
 *          a run is already in progress or when every term failed.
 */
export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise<UkVisaJobsResult> {
  if (isUkVisaJobsRunning) {
    return { success: false, jobs: [], error: 'UK Visa Jobs extractor is already running' };
  }

  isUkVisaJobsRunning = true;
  try {
    console.log('🇬🇧 Running UK Visa Jobs extractor...');

    const terms = resolveUkVisaJobsSearchTerms(options);
    const maxJobs = options.maxJobs ?? 50;

    const allJobs: CreateJobInput[] = [];
    const seenIds = new Set<string>();
    const failures: string[] = [];

    for (let i = 0; i < terms.length; i++) {
      const term = terms[i];
      const termLabel = term ? `"${term}"` : 'all jobs';
      console.log(` Running for ${termLabel}...`);

      try {
        // Clear previous results for this run
        await clearStorageDataset();
        await mkdir(STORAGE_DIR, { recursive: true });

        // Run the extractor
        await runUkVisaJobsExtractorProcess(term, maxJobs);

        // Read the output dataset and accumulate
        const runJobs = await readDataset();
        let newCount = 0;

        for (const job of runJobs) {
          // Deduplicate by sourceJobId or jobUrl
          const id = job.sourceJobId || job.jobUrl;
          if (seenIds.has(id)) continue;
          seenIds.add(id);

          // Enrich description if missing or poor
          if (needsUkVisaJobsDescriptionEnrichment(job) && job.jobUrl) {
            const enriched = await fetchJobDescription(job.jobUrl);
            if (enriched) {
              job.jobDescription = enriched;
            }
            // Small delay to avoid hammering the server
            await new Promise((resolve) => setTimeout(resolve, 500));
          }

          allJobs.push(job);
          newCount++;
        }

        console.log(` ✅ Fetched ${runJobs.length} jobs for ${termLabel} (${newCount} new unique)`);
      } catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown error';
        console.error(`❌ UK Visa Jobs failed for ${termLabel}: ${message}`);
        failures.push(`${termLabel}: ${message}`);
        // Continue to next term instead of failing completely
      }

      // Delay between terms
      if (i < terms.length - 1) {
        console.log(' Waiting 5s before next search term...');
        await new Promise((resolve) => setTimeout(resolve, 5000));
      }
    }

    // A run in which every term failed must not be reported as a success.
    if (failures.length === terms.length && allJobs.length === 0) {
      return {
        success: false,
        jobs: [],
        error: `UK Visa Jobs extractor failed for every search term (${failures.join('; ')})`,
      };
    }

    console.log(`✅ UK Visa Jobs: imported total ${allJobs.length} unique jobs`);
    return { success: true, jobs: allJobs };
  } finally {
    isUkVisaJobsRunning = false;
  }
}
|
|
|
|
/**
 * Read jobs from the extractor's output dataset.
 *
 * Every `*.json` file in the dataset directory except `jobs.json` is read in
 * file-name order and mapped to `CreateJobInput`. Files that cannot be read or
 * parsed, that do not contain a JSON object, or that lack a job URL are
 * skipped with a warning, so one bad file does not discard the whole run.
 *
 * @returns The mapped jobs; an empty array when the dataset directory does not exist.
 * @throws The underlying filesystem error when the directory exists but cannot be listed.
 */
async function readDataset(): Promise<CreateJobInput[]> {
  let files: string[];
  try {
    files = await readdir(STORAGE_DIR);
  } catch (error) {
    // Dataset directory doesn't exist yet
    if ((error as { code?: unknown } | null)?.code === 'ENOENT') return [];
    throw error;
  }

  const jsonFiles = files.filter((f) => f.endsWith('.json') && f !== 'jobs.json').sort();
  const jobs: CreateJobInput[] = [];

  for (const file of jsonFiles) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(await readFile(join(STORAGE_DIR, file), 'utf-8'));
    } catch (error) {
      const reason = error instanceof Error ? error.message : 'Unknown error';
      console.warn(` Skipping unreadable dataset file ${file}: ${reason}`);
      continue;
    }

    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      console.warn(` Skipping dataset file ${file}: not a JSON object`);
      continue;
    }

    const job = parsed as Partial<CreateJobInput>;

    // A job without a URL cannot be opened, enriched or de-duplicated reliably.
    if (typeof job.jobUrl !== 'string' || job.jobUrl.length === 0) {
      console.warn(` Skipping dataset file ${file}: missing jobUrl`);
      continue;
    }

    // Map to CreateJobInput format
    jobs.push({
      source: 'ukvisajobs',
      sourceJobId: job.sourceJobId,
      title: job.title || 'Unknown Title',
      employer: job.employer || 'Unknown Employer',
      employerUrl: job.employerUrl,
      jobUrl: job.jobUrl,
      applicationLink: job.applicationLink || job.jobUrl,
      location: job.location,
      deadline: job.deadline,
      salary: job.salary,
      jobDescription: job.jobDescription,
      datePosted: job.datePosted,
      degreeRequired: job.degreeRequired,
      jobType: job.jobType,
      jobLevel: job.jobLevel,
    });
  }

  return jobs;
}
|
|
|