autologin for ukvisajobs
This commit is contained in:
parent
4726c463c8
commit
2b2af06bb8
@ -40,9 +40,8 @@ JOBSPY_LINKEDIN_FETCH_DESCRIPTION=1
|
||||
# =============================================================================
|
||||
# UKVisaJobs (UK visa sponsorship jobs) - optional
|
||||
# =============================================================================
|
||||
# Get these tokens from browser dev tools after logging into my.ukvisajobs.com
|
||||
# Provide email/password for automatic login and token refresh.
|
||||
# See extractors/ukvisajobs/README.md for detailed instructions.
|
||||
UKVISAJOBS_TOKEN=
|
||||
UKVISAJOBS_AUTH_TOKEN=
|
||||
UKVISAJOBS_CSRF_TOKEN=
|
||||
UKVISAJOBS_CI_SESSION=
|
||||
UKVISAJOBS_EMAIL=
|
||||
UKVISAJOBS_PASSWORD=
|
||||
UKVISAJOBS_HEADLESS=true
|
||||
|
||||
5
.gitignore
vendored
5
.gitignore
vendored
@ -5,6 +5,9 @@
|
||||
# Data directory (bind mount in Docker)
|
||||
data/
|
||||
|
||||
# Extractor storage outputs and cached auth
|
||||
extractors/ukvisajobs/storage/
|
||||
|
||||
# OS files
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
Thumbs.db
|
||||
|
||||
@ -51,10 +51,9 @@ services:
|
||||
- WEBHOOK_SECRET=${WEBHOOK_SECRET:-}
|
||||
|
||||
# UKVisaJobs (UK visa sponsorship jobs) - optional
|
||||
- UKVISAJOBS_TOKEN=${UKVISAJOBS_TOKEN:-}
|
||||
- UKVISAJOBS_AUTH_TOKEN=${UKVISAJOBS_AUTH_TOKEN:-}
|
||||
- UKVISAJOBS_CSRF_TOKEN=${UKVISAJOBS_CSRF_TOKEN:-}
|
||||
- UKVISAJOBS_CI_SESSION=${UKVISAJOBS_CI_SESSION:-}
|
||||
- UKVISAJOBS_EMAIL=${UKVISAJOBS_EMAIL:-}
|
||||
- UKVISAJOBS_PASSWORD=${UKVISAJOBS_PASSWORD:-}
|
||||
- UKVISAJOBS_HEADLESS=${UKVISAJOBS_HEADLESS:-true}
|
||||
- UKVISAJOBS_SEARCH_KEYWORD=${UKVISAJOBS_SEARCH_KEYWORD:-}
|
||||
|
||||
# Python path (uses system python in container)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
# UK Visa Jobs Extractor
|
||||
# UK Visa Jobs Extractor
|
||||
|
||||
Fetches job listings from [my.ukvisajobs.com](https://my.ukvisajobs.com) that may sponsor work visas.
|
||||
|
||||
@ -8,28 +8,38 @@ Fetches job listings from [my.ukvisajobs.com](https://my.ukvisajobs.com) that ma
|
||||
npm install
|
||||
```
|
||||
|
||||
If Playwright browsers are skipped in your environment, install Firefox:
|
||||
|
||||
```bash
|
||||
npx playwright install firefox
|
||||
```
|
||||
|
||||
If Camoufox assets are missing, fetch them:
|
||||
|
||||
```bash
|
||||
npx camoufox-js fetch
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Set the following environment variables (you can get these from your browser's dev tools after logging in):
|
||||
Set the following environment variables:
|
||||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `UKVISAJOBS_TOKEN` | JWT token from the request body (required) |
|
||||
| `UKVISAJOBS_AUTH_TOKEN` | Auth cookie token (defaults to UKVISAJOBS_TOKEN) |
|
||||
| `UKVISAJOBS_CSRF_TOKEN` | CSRF token from cookies |
|
||||
| `UKVISAJOBS_CI_SESSION` | CI session ID from cookies |
|
||||
| `UKVISAJOBS_EMAIL` | Login email for automatic token refresh |
|
||||
| `UKVISAJOBS_PASSWORD` | Login password for automatic token refresh |
|
||||
| `UKVISAJOBS_HEADLESS` | Set to `false` to show the browser (default: true) |
|
||||
| `UKVISAJOBS_MAX_JOBS` | Maximum jobs to fetch (default: 50, max: 200) |
|
||||
| `UKVISAJOBS_SEARCH_KEYWORD` | Optional search filter |
|
||||
|
||||
## How to get tokens
|
||||
## Automatic login & cache
|
||||
|
||||
1. Log into `my.ukvisajobs.com` in your browser
|
||||
2. Open Developer Tools → Network tab
|
||||
3. Navigate to the jobs page
|
||||
4. Find the `fetch-jobs-data` POST request
|
||||
5. Copy values:
|
||||
- From **Request Body**: copy the `token` field → `UKVISAJOBS_TOKEN`
|
||||
- From **Cookies**: copy `authToken`, `csrf_token`, `ci_session`
|
||||
The extractor will:
|
||||
|
||||
1. Launch a Camoufox (Playwright Firefox) browser and sign in
|
||||
2. Navigate to the open jobs page and capture the token/cookies
|
||||
3. Cache the session to `storage/ukvisajobs-auth.json`
|
||||
4. Reuse the cached values until the API reports an expired token, then refresh
|
||||
|
||||
## Running
|
||||
|
||||
@ -38,3 +48,4 @@ npm start
|
||||
```
|
||||
|
||||
Output is written to `storage/datasets/default/` as JSON files.
|
||||
|
||||
|
||||
1110
extractors/ukvisajobs/package-lock.json
generated
1110
extractors/ukvisajobs/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -4,7 +4,10 @@
|
||||
"type": "module",
|
||||
"description": "UK Visa Jobs extractor - fetches job listings that may sponsor work visas",
|
||||
"main": "dist/main.js",
|
||||
"dependencies": {},
|
||||
"dependencies": {
|
||||
"camoufox-js": "^0.8.0",
|
||||
"playwright": "^1.57.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@apify/tsconfig": "^0.1.0",
|
||||
"@types/node": "^24.0.0",
|
||||
@ -15,8 +18,10 @@
|
||||
"start": "npm run start:dev",
|
||||
"start:prod": "node dist/main.js",
|
||||
"start:dev": "tsx src/main.ts",
|
||||
"build": "tsc"
|
||||
"build": "tsc",
|
||||
"get-binaries": "camoufox-js fetch",
|
||||
"postinstall": "npm run get-binaries"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC"
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,25 +1,28 @@
|
||||
/**
|
||||
/**
|
||||
* UK Visa Jobs Extractor
|
||||
*
|
||||
* Fetches job listings from my.ukvisajobs.com that may sponsor work visas.
|
||||
* Writes JSON files to storage/datasets/default for the orchestrator to consume.
|
||||
*
|
||||
* Environment variables:
|
||||
* UKVISAJOBS_TOKEN - JWT token (required)
|
||||
* UKVISAJOBS_AUTH_TOKEN - Auth cookie token (defaults to UKVISAJOBS_TOKEN)
|
||||
* UKVISAJOBS_CSRF_TOKEN - CSRF token cookie
|
||||
* UKVISAJOBS_CI_SESSION - CI session cookie
|
||||
* UKVISAJOBS_EMAIL - Login email for auto-refresh
|
||||
* UKVISAJOBS_PASSWORD - Login password for auto-refresh
|
||||
* UKVISAJOBS_HEADLESS - Set to "false" to show the browser (default: true)
|
||||
* UKVISAJOBS_MAX_JOBS - Maximum jobs to fetch (default: 50, max: 200) - Set via UI Settings
|
||||
* UKVISAJOBS_SEARCH_KEYWORD - Optional search filter
|
||||
*/
|
||||
|
||||
import { mkdir, writeFile } from 'fs/promises';
|
||||
import { mkdir, writeFile, readFile } from 'fs/promises';
|
||||
import { join, dirname } from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
import type { Request } from 'playwright';
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
const API_URL = 'https://my.ukvisajobs.com/ukvisa-api/api/fetch-jobs-data';
|
||||
const SIGNIN_URL = 'https://my.ukvisajobs.com/signin';
|
||||
const OPEN_JOBS_URL = 'https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&visaAcceptance=false&applicants_outside_uk=false&pageNo=1';
|
||||
const AUTH_CACHE_PATH = join(__dirname, '../storage/ukvisajobs-auth.json');
|
||||
const JOBS_PER_PAGE = 15;
|
||||
const DEFAULT_MAX_JOBS = 50;
|
||||
const MAX_ALLOWED_JOBS = 200;
|
||||
@ -77,6 +80,27 @@ interface ExtractedJob {
|
||||
jobLevel?: string;
|
||||
}
|
||||
|
||||
/**
 * Credentials required to call the UKVisaJobs API on behalf of a signed-in user.
 */
interface UkVisaJobsAuthSession {
  /** Value sent as the `token` form field of each API request. */
  token: string;
  /** Value sent as the `authToken` cookie. */
  authToken: string;
  /** Value sent as the `csrf_token` cookie; empty string when unavailable. */
  csrfToken: string;
  /** Value sent as the `ci_session` cookie; empty string when unavailable. */
  ciSession: string;
  /** ISO-8601 timestamp recording when the session was obtained. */
  fetchedAt: string;
  /** `'browser'` for a fresh login, `'cache'` when read back from the auth cache file. */
  source: 'cache' | 'browser';
}
|
||||
|
||||
/**
 * Error raised when the UKVisaJobs API rejects a request for authentication
 * reasons (for example an expired token), as opposed to any other failure.
 * Callers can test for it with `instanceof` to decide whether to re-login.
 */
class UkVisaJobsAuthError extends Error {
  /** HTTP status code of the rejected response. */
  status: number;
  /** Raw response body returned alongside the status. */
  responseText: string;

  /**
   * @param message - Human-readable description of the failure.
   * @param status - HTTP status code of the response.
   * @param responseText - Raw body text of the response.
   */
  constructor(message: string, status: number, responseText: string) {
    super(message);
    this.name = 'UkVisaJobsAuthError';
    this.status = status;
    this.responseText = responseText;
  }
}
|
||||
|
||||
function toStringOrNull(value: unknown): string | null {
|
||||
if (value === null || value === undefined) return null;
|
||||
if (typeof value === 'string') {
|
||||
@ -101,8 +125,7 @@ function toNumberOrNull(value: unknown): number | null {
|
||||
|
||||
async function fetchPage(
|
||||
pageNo: number,
|
||||
token: string,
|
||||
cookies: string,
|
||||
session: UkVisaJobsAuthSession,
|
||||
options: { searchKeyword?: string } = {}
|
||||
): Promise<UkVisaJobsApiResponse> {
|
||||
// Use native FormData API (Node.js 18+)
|
||||
@ -113,7 +136,9 @@ async function fetchPage(
|
||||
formData.append('visaAcceptance', 'false');
|
||||
formData.append('applicants_outside_uk', 'false');
|
||||
formData.append('searchKeyword', options.searchKeyword || 'null');
|
||||
formData.append('token', token);
|
||||
formData.append('token', session.token);
|
||||
|
||||
const cookies = buildCookieHeader(session);
|
||||
|
||||
const response = await fetch(API_URL, {
|
||||
method: 'POST',
|
||||
@ -130,6 +155,13 @@ async function fetchPage(
|
||||
|
||||
if (!response.ok) {
|
||||
const text = await response.text();
|
||||
if (isAuthErrorResponse(response.status, text)) {
|
||||
throw new UkVisaJobsAuthError(
|
||||
`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`,
|
||||
response.status,
|
||||
text
|
||||
);
|
||||
}
|
||||
throw new Error(`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`);
|
||||
}
|
||||
|
||||
@ -143,12 +175,12 @@ function mapJob(raw: UkVisaJobsApiJob): ExtractedJob {
|
||||
const maxSalary = toNumberOrNull(raw.max_salary);
|
||||
|
||||
if (minSalary !== null && minSalary > 0 && maxSalary !== null && maxSalary > 0) {
|
||||
salary = `£${minSalary.toLocaleString()}-${maxSalary.toLocaleString()}`;
|
||||
salary = `£${minSalary.toLocaleString()}-${maxSalary.toLocaleString()}`;
|
||||
if (raw.salary_interval) {
|
||||
salary += ` / ${raw.salary_interval}`;
|
||||
}
|
||||
} else if (maxSalary !== null && maxSalary > 0) {
|
||||
salary = `£${maxSalary.toLocaleString()}`;
|
||||
salary = `£${maxSalary.toLocaleString()}`;
|
||||
if (raw.salary_interval) {
|
||||
salary += ` / ${raw.salary_interval}`;
|
||||
}
|
||||
@ -188,30 +220,181 @@ function mapJob(raw: UkVisaJobsApiJob): ExtractedJob {
|
||||
};
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
console.log('🇬🇧 UK Visa Jobs Extractor starting...');
|
||||
/**
 * Builds the `Cookie` header value for API requests from an auth session.
 *
 * Only non-empty values are included, in the fixed order
 * `csrf_token`, `ci_session`, `authToken`, joined with `"; "`.
 *
 * @param session - Session whose cookie values should be serialized.
 * @returns The header value, or an empty string when no cookie value is set.
 */
function buildCookieHeader(session: UkVisaJobsAuthSession): string {
  const cookieParts: string[] = [];
  if (session.csrfToken) cookieParts.push(`csrf_token=${session.csrfToken}`);
  if (session.ciSession) cookieParts.push(`ci_session=${session.ciSession}`);
  if (session.authToken) cookieParts.push(`authToken=${session.authToken}`);
  return cookieParts.join('; ');
}
|
||||
|
||||
// Get credentials from environment
|
||||
const token = process.env.UKVISAJOBS_TOKEN;
|
||||
const authToken = process.env.UKVISAJOBS_AUTH_TOKEN || token;
|
||||
const csrfToken = process.env.UKVISAJOBS_CSRF_TOKEN || '';
|
||||
const ciSession = process.env.UKVISAJOBS_CI_SESSION || '';
|
||||
/**
 * Reads the login credentials from `UKVISAJOBS_EMAIL` and `UKVISAJOBS_PASSWORD`.
 *
 * @returns The email/password pair, or `null` when either variable is unset or empty.
 */
function getLoginCredentials(): { email: string; password: string } | null {
  const email = process.env.UKVISAJOBS_EMAIL;
  const password = process.env.UKVISAJOBS_PASSWORD;
  // Both values are required; a partial configuration is treated as "no credentials".
  if (!email || !password) return null;
  return { email, password };
}
|
||||
|
||||
/**
 * Loads a previously saved auth session from `AUTH_CACHE_PATH`.
 *
 * The cache is best-effort: a missing file, unreadable file, invalid JSON, or a
 * payload without a usable string `token` all yield `null` so the caller can
 * fall back to a fresh login.
 *
 * @returns The cached session tagged with `source: 'cache'`, or `null` when no
 *   usable cache exists.
 */
async function loadCachedAuthSession(): Promise<UkVisaJobsAuthSession | null> {
  try {
    const data = await readFile(AUTH_CACHE_PATH, 'utf8');
    const parsed: unknown = JSON.parse(data);
    if (typeof parsed !== 'object' || parsed === null) return null;

    // The file is external input: accept only string fields so a corrupted
    // cache cannot inject non-string values into request bodies or cookies.
    const cached = parsed as Partial<Record<keyof UkVisaJobsAuthSession, unknown>>;
    const asString = (value: unknown): string => (typeof value === 'string' ? value : '');

    const token = asString(cached.token);
    if (!token) return null;

    return {
      token,
      // The auth cookie falls back to the request token when it was not stored.
      authToken: asString(cached.authToken) || token,
      csrfToken: asString(cached.csrfToken),
      ciSession: asString(cached.ciSession),
      fetchedAt: asString(cached.fetchedAt) || new Date().toISOString(),
      source: 'cache',
    };
  } catch {
    // Missing/unreadable file or invalid JSON: behave as if there is no cache.
    return null;
  }
}
|
||||
|
||||
/**
 * Persists an auth session to `AUTH_CACHE_PATH` as pretty-printed JSON,
 * creating the parent directory first if it does not exist.
 *
 * @param session - Session to store; only its known fields are written.
 */
async function saveCachedAuthSession(session: UkVisaJobsAuthSession): Promise<void> {
  // Copy the fields explicitly so that any extra properties on the object are not written.
  const payload = {
    token: session.token,
    authToken: session.authToken,
    csrfToken: session.csrfToken,
    ciSession: session.ciSession,
    fetchedAt: session.fetchedAt,
    source: session.source,
  };
  await mkdir(dirname(AUTH_CACHE_PATH), { recursive: true });
  await writeFile(AUTH_CACHE_PATH, JSON.stringify(payload, null, 2));
}
|
||||
|
||||
/**
 * Extracts the value of a single-line field from a raw `multipart/form-data` body.
 *
 * The function locates `name="<field>"`, skips to the blank line that ends the
 * part headers, and returns the first line of the part's content, trimmed.
 * Both CRLF and bare-LF line endings are accepted for the header separator
 * and for the end of the value.
 *
 * @param body - Raw multipart request body.
 * @param field - Form field name to look up.
 * @returns The trimmed field value, or `null` when the field or its header
 *   separator cannot be found.
 */
function extractMultipartField(body: string, field: string): string | null {
  const nameToken = `name="${field}"`;
  const index = body.indexOf(nameToken);
  if (index === -1) return null;

  const afterName = body.slice(index + nameToken.length);
  // Part headers end at the first blank line; prefer CRLF, fall back to LF.
  let separatorIndex = afterName.indexOf('\r\n\r\n');
  let separatorLength = 4;
  if (separatorIndex === -1) {
    separatorIndex = afterName.indexOf('\n\n');
    separatorLength = 2;
  }
  if (separatorIndex === -1) return null;

  const remainder = afterName.slice(separatorIndex + separatorLength);
  // The value ends at the next line break of either style; previously only
  // CRLF was recognized, so LF-only bodies returned everything after the value.
  const lineBreak = /\r?\n/.exec(remainder);
  const value = lineBreak ? remainder.slice(0, lineBreak.index) : remainder;
  return value.trim();
}
|
||||
|
||||
/**
 * Pulls the `token` form field out of an intercepted browser request.
 *
 * The body is first read as `multipart/form-data`; if that yields nothing it
 * is read as URL-encoded form data.
 *
 * @param request - Intercepted Playwright request.
 * @returns The token, or `null` when the request has no body or no non-empty
 *   `token` field.
 */
function extractTokenFromRequest(request: Request): string | null {
  const postData = request.postData();
  if (!postData) return null;

  const multipartToken = extractMultipartField(postData, 'token');
  if (multipartToken) return multipartToken;

  try {
    const params = new URLSearchParams(postData);
    const token = params.get('token');
    // Treat an empty value the same as a missing field.
    return token || null;
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Decides whether a failed API response indicates an authentication problem.
 *
 * - `401` and `403` always count as auth errors.
 * - `400` counts only when the body signals expiry: a JSON `errorType` of
 *   `"expired"`, a JSON `message` containing "expired", or the raw text
 *   containing "expired" (case-insensitive).
 * - Every other status is not an auth error.
 *
 * @param status - HTTP status code of the response.
 * @param bodyText - Raw response body.
 * @returns `true` when the caller should refresh credentials and retry.
 */
function isAuthErrorResponse(status: number, bodyText: string): boolean {
  if (status === 401 || status === 403) return true;
  if (status !== 400) return false;

  try {
    const parsed = JSON.parse(bodyText) as { errorType?: unknown; message?: unknown } | null;
    if (parsed?.errorType === 'expired') return true;
    if (typeof parsed?.message === 'string' && parsed.message.toLowerCase().includes('expired')) {
      return true;
    }
  } catch {
    // Body is not JSON; fall through to the plain-text check below.
  }
  return bodyText.toLowerCase().includes('expired');
}
|
||||
|
||||
/**
 * Signs in to my.ukvisajobs.com in a Camoufox-configured Playwright Firefox
 * browser and captures the credentials needed for direct API calls.
 *
 * After submitting the sign-in form, the open-jobs page is loaded while
 * watching for the page's own `fetch-jobs-data` POST. The `token` form field
 * of that request and the `csrf_token`, `ci_session` and `authToken` cookies
 * make up the returned session.
 *
 * @param email - Account email typed into the `#email` field.
 * @param password - Account password typed into the `#password` field.
 * @returns A session with `source: 'browser'` and a fresh `fetchedAt`.
 * @throws Error when neither the intercepted request nor the `authToken`
 *   cookie yields a token. Playwright navigation/selector failures propagate.
 */
async function loginWithBrowser(email: string, password: string): Promise<UkVisaJobsAuthSession> {
  const [{ launchOptions }, { firefox }] = await Promise.all([
    import('camoufox-js'),
    import('playwright'),
  ]);
  // Headless unless UKVISAJOBS_HEADLESS is exactly "false".
  const headless = process.env.UKVISAJOBS_HEADLESS !== 'false';
  const browser = await firefox.launch(await launchOptions({
    headless,
    humanize: true,
    geoip: true,
  }));

  try {
    // Created inside the try so the browser is closed even if this setup fails.
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(SIGNIN_URL, { waitUntil: 'domcontentloaded' });
    await page.waitForSelector('#email', { timeout: 15000 });
    await page.fill('#email', email);
    await page.fill('#password', password);
    await page.keyboard.press('Enter');
    // Fixed pause after submitting the form; presumably lets the sign-in
    // complete before navigating away — TODO confirm an explicit wait is not available.
    await page.waitForTimeout(7000);

    // Attach the rejection handler immediately: the wait can time out while
    // the navigation below is still pending, and a rejection with no handler
    // attached would surface as an unhandled rejection.
    const requestPromise: Promise<Request | null> = page
      .waitForRequest(
        (request) => request.url().includes('/ukvisa-api/api/fetch-jobs-data') && request.method() === 'POST',
        { timeout: 30000 }
      )
      .catch(() => null);

    await page.goto(OPEN_JOBS_URL, { waitUntil: 'networkidle' });
    await page.waitForTimeout(5000);

    const fetchRequest = await requestPromise;

    const cookies = await context.cookies('https://my.ukvisajobs.com');
    const csrfToken = cookies.find((cookie) => cookie.name === 'csrf_token')?.value || '';
    const ciSession = cookies.find((cookie) => cookie.name === 'ci_session')?.value || '';
    const authToken = cookies.find((cookie) => cookie.name === 'authToken')?.value || '';

    // Prefer the token the page itself sent; fall back to the auth cookie when
    // no request was captured or its body did not contain a token.
    const token = (fetchRequest ? extractTokenFromRequest(fetchRequest) : null) || authToken;

    if (!token) {
      throw new Error('Failed to locate auth token from browser session.');
    }

    return {
      token,
      authToken: authToken || token,
      csrfToken,
      ciSession,
      fetchedAt: new Date().toISOString(),
      source: 'browser',
    };
  } finally {
    await browser.close();
  }
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
console.log('🇬🇧 UK Visa Jobs Extractor starting...');
|
||||
const credentials = getLoginCredentials();
|
||||
const searchKeyword = process.env.UKVISAJOBS_SEARCH_KEYWORD || undefined;
|
||||
|
||||
if (!token) {
|
||||
console.error('❌ UKVISAJOBS_TOKEN environment variable is not set');
|
||||
process.exit(1);
|
||||
let authSession = await loadCachedAuthSession();
|
||||
|
||||
if (!authSession) {
|
||||
if (!credentials) {
|
||||
console.error('ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set');
|
||||
process.exit(1);
|
||||
}
|
||||
console.log(' No cached session found. Logging in to refresh tokens...');
|
||||
authSession = await loginWithBrowser(credentials.email, credentials.password);
|
||||
await saveCachedAuthSession(authSession);
|
||||
}
|
||||
|
||||
// Build cookies string
|
||||
const cookieParts: string[] = [];
|
||||
if (csrfToken) cookieParts.push(`csrf_token=${csrfToken}`);
|
||||
if (ciSession) cookieParts.push(`ci_session=${ciSession}`);
|
||||
if (authToken) cookieParts.push(`authToken=${authToken}`);
|
||||
const cookies = cookieParts.join('; ');
|
||||
|
||||
console.log(` Cookies configured: ${cookieParts.length > 0 ? 'Yes' : 'No'}`);
|
||||
console.log(` Token length: ${token.length}`);
|
||||
const cookies = buildCookieHeader(authSession);
|
||||
console.log(` Auth source: ${authSession.source}`);
|
||||
console.log(` Cookies configured: ${cookies ? 'Yes' : 'No'}`);
|
||||
console.log(` Token length: ${authSession.token.length}`);
|
||||
|
||||
// Get max jobs from environment
|
||||
const maxJobsEnv = toNumberOrNull(process.env.UKVISAJOBS_MAX_JOBS);
|
||||
@ -232,10 +415,25 @@ async function main(): Promise<void> {
|
||||
while (pageNo <= maxPages && allJobs.length < maxJobs) {
|
||||
console.log(` Fetching page ${pageNo}/${maxPages}...`);
|
||||
|
||||
const response = await fetchPage(pageNo, token, cookies, { searchKeyword });
|
||||
let response: UkVisaJobsApiResponse;
|
||||
try {
|
||||
response = await fetchPage(pageNo, authSession, { searchKeyword });
|
||||
} catch (error) {
|
||||
if (error instanceof UkVisaJobsAuthError) {
|
||||
if (!credentials) {
|
||||
throw new Error('UKVisaJobs auth expired. Set UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD to refresh.');
|
||||
}
|
||||
console.log(' Auth expired. Refreshing tokens...');
|
||||
authSession = await loginWithBrowser(credentials.email, credentials.password);
|
||||
await saveCachedAuthSession(authSession);
|
||||
response = await fetchPage(pageNo, authSession, { searchKeyword });
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
if (response.status !== 1) {
|
||||
console.warn(` ⚠️ API returned status ${response.status} on page ${pageNo}`);
|
||||
console.warn(` âš ï¸ API returned status ${response.status} on page ${pageNo}`);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -271,7 +469,7 @@ async function main(): Promise<void> {
|
||||
await new Promise((resolve) => setTimeout(resolve, 500));
|
||||
}
|
||||
|
||||
console.log(`✅ Scraped ${allJobs.length} jobs`);
|
||||
console.log(`✅ Scraped ${allJobs.length} jobs`);
|
||||
|
||||
// Write output to storage directory (similar to Crawlee dataset structure)
|
||||
const storageDir = join(__dirname, '../storage/datasets/default');
|
||||
@ -292,7 +490,7 @@ async function main(): Promise<void> {
|
||||
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : 'Unknown error';
|
||||
console.error(`❌ Error: ${message}`);
|
||||
console.error(`⌠Error: ${message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
@ -301,3 +499,6 @@ main().catch((error) => {
|
||||
console.error('Fatal error:', error);
|
||||
process.exit(1);
|
||||
});
|
||||
|
||||
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/**
|
||||
/**
|
||||
* Service for running the UK Visa Jobs extractor (extractors/ukvisajobs).
|
||||
*
|
||||
* Spawns the extractor as a child process and reads its output dataset.
|
||||
@ -13,6 +13,14 @@ import type { CreateJobInput } from '../../shared/types.js';
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const UKVISAJOBS_DIR = join(__dirname, '../../../../extractors/ukvisajobs');
|
||||
const STORAGE_DIR = join(UKVISAJOBS_DIR, 'storage/datasets/default');
|
||||
const AUTH_CACHE_PATH = join(UKVISAJOBS_DIR, 'storage/ukvisajobs-auth.json');
|
||||
|
||||
/**
 * Shape of the auth cache file at `AUTH_CACHE_PATH` as read by this service.
 * Every field is optional because the file content is not validated on load.
 */
interface UkVisaJobsAuthSession {
  /** Request token; used for the `authToken` cookie when `authToken` is absent. */
  token?: string;
  /** Value for the `authToken` cookie. */
  authToken?: string;
  /** Value for the `csrf_token` cookie. */
  csrfToken?: string;
  /** Value for the `ci_session` cookie. */
  ciSession?: string;
}
|
||||
|
||||
export interface RunUkVisaJobsOptions {
|
||||
/** Maximum number of jobs to fetch per search term. Defaults to 50, max 200. */
|
||||
@ -73,11 +81,11 @@ async function fetchJobDescription(url: string): Promise<string | null> {
|
||||
try {
|
||||
console.log(` Fetching description from ${url}...`);
|
||||
|
||||
// Build cookies from the cached auth session, if one exists (similar to extractor)
|
||||
const authSession = await loadCachedAuthSession();
|
||||
const cookieParts: string[] = [];
|
||||
if (process.env.UKVISAJOBS_CSRF_TOKEN) cookieParts.push(`csrf_token=${process.env.UKVISAJOBS_CSRF_TOKEN}`);
|
||||
if (process.env.UKVISAJOBS_CI_SESSION) cookieParts.push(`ci_session=${process.env.UKVISAJOBS_CI_SESSION}`);
|
||||
const token = process.env.UKVISAJOBS_AUTH_TOKEN || process.env.UKVISAJOBS_TOKEN;
|
||||
if (authSession?.csrfToken) cookieParts.push(`csrf_token=${authSession.csrfToken}`);
|
||||
if (authSession?.ciSession) cookieParts.push(`ci_session=${authSession.ciSession}`);
|
||||
const token = authSession?.authToken || authSession?.token;
|
||||
if (token) cookieParts.push(`authToken=${token}`);
|
||||
|
||||
const headers: Record<string, string> = {
|
||||
@ -101,7 +109,16 @@ async function fetchJobDescription(url: string): Promise<string | null> {
|
||||
// If we only got a tiny bit of text, it might have failed
|
||||
return cleaned.length > 100 ? cleaned : null;
|
||||
} catch (error) {
|
||||
console.warn(` ⚠️ Failed to fetch description: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
||||
console.warn(` âš ï¸ Failed to fetch description: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reads the auth cache file at `AUTH_CACHE_PATH`.
 *
 * Best-effort: a missing file, unreadable file, invalid JSON, or a JSON value
 * that is not an object all yield `null`, in which case requests are sent
 * without auth cookies.
 *
 * @returns The parsed session, or `null` when no usable cache exists.
 */
async function loadCachedAuthSession(): Promise<UkVisaJobsAuthSession | null> {
  try {
    const data = await readFile(AUTH_CACHE_PATH, 'utf-8');
    const parsed: unknown = JSON.parse(data);
    // Reject primitives and arrays so callers only ever see an object or null.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) return null;
    return parsed as UkVisaJobsAuthSession;
  } catch {
    return null;
  }
}
|
||||
@ -118,7 +135,7 @@ async function clearStorageDataset(): Promise<void> {
|
||||
}
|
||||
|
||||
export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise<UkVisaJobsResult> {
|
||||
console.log('🇬🇧 Running UK Visa Jobs extractor...');
|
||||
console.log('🇬🇧 Running UK Visa Jobs extractor...');
|
||||
|
||||
// Determine terms to run
|
||||
const terms: string[] = [];
|
||||
@ -192,11 +209,11 @@ export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise
|
||||
}
|
||||
}
|
||||
|
||||
console.log(` ✅ Fetched ${runJobs.length} jobs for ${termLabel} (${newCount} new unique)`);
|
||||
console.log(` ✅ Fetched ${runJobs.length} jobs for ${termLabel} (${newCount} new unique)`);
|
||||
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : 'Unknown error';
|
||||
console.error(`❌ UK Visa Jobs failed for ${termLabel}: ${message}`);
|
||||
console.error(`⌠UK Visa Jobs failed for ${termLabel}: ${message}`);
|
||||
// Continue to next term instead of failing completely
|
||||
}
|
||||
|
||||
@ -207,7 +224,7 @@ export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`✅ UK Visa Jobs: imported total ${allJobs.length} unique jobs`);
|
||||
console.log(`✅ UK Visa Jobs: imported total ${allJobs.length} unique jobs`);
|
||||
return { success: true, jobs: allJobs };
|
||||
}
|
||||
|
||||
@ -254,3 +271,4 @@ async function readDataset(): Promise<CreateJobInput[]> {
|
||||
|
||||
return jobs;
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user