autologin for ukvisajobs
This commit is contained in:
parent
4726c463c8
commit
2b2af06bb8
@ -40,9 +40,8 @@ JOBSPY_LINKEDIN_FETCH_DESCRIPTION=1
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
# UKVisaJobs (UK visa sponsorship jobs) - optional
|
# UKVisaJobs (UK visa sponsorship jobs) - optional
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Get these tokens from browser dev tools after logging into my.ukvisajobs.com
|
# Provide email/password for automatic login and token refresh.
|
||||||
# See extractors/ukvisajobs/README.md for detailed instructions.
|
# See extractors/ukvisajobs/README.md for detailed instructions.
|
||||||
UKVISAJOBS_TOKEN=
|
UKVISAJOBS_EMAIL=
|
||||||
UKVISAJOBS_AUTH_TOKEN=
|
UKVISAJOBS_PASSWORD=
|
||||||
UKVISAJOBS_CSRF_TOKEN=
|
UKVISAJOBS_HEADLESS=true
|
||||||
UKVISAJOBS_CI_SESSION=
|
|
||||||
|
|||||||
3
.gitignore
vendored
3
.gitignore
vendored
@ -5,6 +5,9 @@
|
|||||||
# Data directory (bind mount in Docker)
|
# Data directory (bind mount in Docker)
|
||||||
data/
|
data/
|
||||||
|
|
||||||
|
# Extractor storage outputs and cached auth
|
||||||
|
extractors/ukvisajobs/storage/
|
||||||
|
|
||||||
# OS files
|
# OS files
|
||||||
.DS_Store
|
.DS_Store
|
||||||
Thumbs.db
|
Thumbs.db
|
||||||
@ -51,10 +51,9 @@ services:
|
|||||||
- WEBHOOK_SECRET=${WEBHOOK_SECRET:-}
|
- WEBHOOK_SECRET=${WEBHOOK_SECRET:-}
|
||||||
|
|
||||||
# UKVisaJobs (UK visa sponsorship jobs) - optional
|
# UKVisaJobs (UK visa sponsorship jobs) - optional
|
||||||
- UKVISAJOBS_TOKEN=${UKVISAJOBS_TOKEN:-}
|
- UKVISAJOBS_EMAIL=${UKVISAJOBS_EMAIL:-}
|
||||||
- UKVISAJOBS_AUTH_TOKEN=${UKVISAJOBS_AUTH_TOKEN:-}
|
- UKVISAJOBS_PASSWORD=${UKVISAJOBS_PASSWORD:-}
|
||||||
- UKVISAJOBS_CSRF_TOKEN=${UKVISAJOBS_CSRF_TOKEN:-}
|
- UKVISAJOBS_HEADLESS=${UKVISAJOBS_HEADLESS:-true}
|
||||||
- UKVISAJOBS_CI_SESSION=${UKVISAJOBS_CI_SESSION:-}
|
|
||||||
- UKVISAJOBS_SEARCH_KEYWORD=${UKVISAJOBS_SEARCH_KEYWORD:-}
|
- UKVISAJOBS_SEARCH_KEYWORD=${UKVISAJOBS_SEARCH_KEYWORD:-}
|
||||||
|
|
||||||
# Python path (uses system python in container)
|
# Python path (uses system python in container)
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
# UK Visa Jobs Extractor
|
# UK Visa Jobs Extractor
|
||||||
|
|
||||||
Fetches job listings from [my.ukvisajobs.com](https://my.ukvisajobs.com) that may sponsor work visas.
|
Fetches job listings from [my.ukvisajobs.com](https://my.ukvisajobs.com) that may sponsor work visas.
|
||||||
|
|
||||||
@ -8,28 +8,38 @@ Fetches job listings from [my.ukvisajobs.com](https://my.ukvisajobs.com) that ma
|
|||||||
npm install
|
npm install
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If Playwright browsers are skipped in your environment, install Firefox:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npx playwright install firefox
|
||||||
|
```
|
||||||
|
|
||||||
|
If Camoufox assets are missing, fetch them:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npx camoufox-js fetch
|
||||||
|
```
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
Set the following environment variables (you can get these from your browser's dev tools after logging in):
|
Set the following environment variables:
|
||||||
|
|
||||||
| Variable | Description |
|
| Variable | Description |
|
||||||
|----------|-------------|
|
|----------|-------------|
|
||||||
| `UKVISAJOBS_TOKEN` | JWT token from the request body (required) |
|
| `UKVISAJOBS_EMAIL` | Login email for automatic token refresh |
|
||||||
| `UKVISAJOBS_AUTH_TOKEN` | Auth cookie token (defaults to UKVISAJOBS_TOKEN) |
|
| `UKVISAJOBS_PASSWORD` | Login password for automatic token refresh |
|
||||||
| `UKVISAJOBS_CSRF_TOKEN` | CSRF token from cookies |
|
| `UKVISAJOBS_HEADLESS` | Set to `false` to show the browser (default: true) |
|
||||||
| `UKVISAJOBS_CI_SESSION` | CI session ID from cookies |
|
|
||||||
| `UKVISAJOBS_MAX_JOBS` | Maximum jobs to fetch (default: 50, max: 200) |
|
| `UKVISAJOBS_MAX_JOBS` | Maximum jobs to fetch (default: 50, max: 200) |
|
||||||
| `UKVISAJOBS_SEARCH_KEYWORD` | Optional search filter |
|
| `UKVISAJOBS_SEARCH_KEYWORD` | Optional search filter |
|
||||||
|
|
||||||
## How to get tokens
|
## Automatic login & cache
|
||||||
|
|
||||||
1. Log into `my.ukvisajobs.com` in your browser
|
The extractor will:
|
||||||
2. Open Developer Tools → Network tab
|
|
||||||
3. Navigate to the jobs page
|
1. Launch a Camoufox (Playwright Firefox) browser and sign in
|
||||||
4. Find the `fetch-jobs-data` POST request
|
2. Navigate to the open jobs page and capture the token/cookies
|
||||||
5. Copy values:
|
3. Cache the session to `storage/ukvisajobs-auth.json`
|
||||||
- From **Request Body**: copy the `token` field → `UKVISAJOBS_TOKEN`
|
4. Reuse the cached values until the API reports an expired token, then refresh
|
||||||
- From **Cookies**: copy `authToken`, `csrf_token`, `ci_session`
|
|
||||||
|
|
||||||
## Running
|
## Running
|
||||||
|
|
||||||
@ -38,3 +48,4 @@ npm start
|
|||||||
```
|
```
|
||||||
|
|
||||||
Output is written to `storage/datasets/default/` as JSON files.
|
Output is written to `storage/datasets/default/` as JSON files.
|
||||||
|
|
||||||
|
|||||||
1110
extractors/ukvisajobs/package-lock.json
generated
1110
extractors/ukvisajobs/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -4,7 +4,10 @@
|
|||||||
"type": "module",
|
"type": "module",
|
||||||
"description": "UK Visa Jobs extractor - fetches job listings that may sponsor work visas",
|
"description": "UK Visa Jobs extractor - fetches job listings that may sponsor work visas",
|
||||||
"main": "dist/main.js",
|
"main": "dist/main.js",
|
||||||
"dependencies": {},
|
"dependencies": {
|
||||||
|
"camoufox-js": "^0.8.0",
|
||||||
|
"playwright": "^1.57.0"
|
||||||
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@apify/tsconfig": "^0.1.0",
|
"@apify/tsconfig": "^0.1.0",
|
||||||
"@types/node": "^24.0.0",
|
"@types/node": "^24.0.0",
|
||||||
@ -15,7 +18,9 @@
|
|||||||
"start": "npm run start:dev",
|
"start": "npm run start:dev",
|
||||||
"start:prod": "node dist/main.js",
|
"start:prod": "node dist/main.js",
|
||||||
"start:dev": "tsx src/main.ts",
|
"start:dev": "tsx src/main.ts",
|
||||||
"build": "tsc"
|
"build": "tsc",
|
||||||
|
"get-binaries": "camoufox-js fetch",
|
||||||
|
"postinstall": "npm run get-binaries"
|
||||||
},
|
},
|
||||||
"author": "",
|
"author": "",
|
||||||
"license": "ISC"
|
"license": "ISC"
|
||||||
|
|||||||
@ -1,25 +1,28 @@
|
|||||||
/**
|
/**
|
||||||
* UK Visa Jobs Extractor
|
* UK Visa Jobs Extractor
|
||||||
*
|
*
|
||||||
* Fetches job listings from my.ukvisajobs.com that may sponsor work visas.
|
* Fetches job listings from my.ukvisajobs.com that may sponsor work visas.
|
||||||
* Outputs JSON to stdout for the orchestrator to consume.
|
* Outputs JSON to stdout for the orchestrator to consume.
|
||||||
*
|
*
|
||||||
* Environment variables:
|
* Environment variables:
|
||||||
* UKVISAJOBS_TOKEN - JWT token (required)
|
* UKVISAJOBS_EMAIL - Login email for auto-refresh
|
||||||
* UKVISAJOBS_AUTH_TOKEN - Auth cookie token (defaults to UKVISAJOBS_TOKEN)
|
* UKVISAJOBS_PASSWORD - Login password for auto-refresh
|
||||||
* UKVISAJOBS_CSRF_TOKEN - CSRF token cookie
|
* UKVISAJOBS_HEADLESS - Set to "false" to show the browser (default: true)
|
||||||
* UKVISAJOBS_CI_SESSION - CI session cookie
|
|
||||||
* UKVISAJOBS_MAX_JOBS - Maximum jobs to fetch (default: 50, max: 200) - Set via UI Settings
|
* UKVISAJOBS_MAX_JOBS - Maximum jobs to fetch (default: 50, max: 200) - Set via UI Settings
|
||||||
* UKVISAJOBS_SEARCH_KEYWORD - Optional search filter
|
* UKVISAJOBS_SEARCH_KEYWORD - Optional search filter
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { mkdir, writeFile } from 'fs/promises';
|
import { mkdir, writeFile, readFile } from 'fs/promises';
|
||||||
import { join, dirname } from 'path';
|
import { join, dirname } from 'path';
|
||||||
import { fileURLToPath } from 'url';
|
import { fileURLToPath } from 'url';
|
||||||
|
import type { Request } from 'playwright';
|
||||||
|
|
||||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
const API_URL = 'https://my.ukvisajobs.com/ukvisa-api/api/fetch-jobs-data';
|
const API_URL = 'https://my.ukvisajobs.com/ukvisa-api/api/fetch-jobs-data';
|
||||||
|
const SIGNIN_URL = 'https://my.ukvisajobs.com/signin';
|
||||||
|
const OPEN_JOBS_URL = 'https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&visaAcceptance=false&applicants_outside_uk=false&pageNo=1';
|
||||||
|
const AUTH_CACHE_PATH = join(__dirname, '../storage/ukvisajobs-auth.json');
|
||||||
const JOBS_PER_PAGE = 15;
|
const JOBS_PER_PAGE = 15;
|
||||||
const DEFAULT_MAX_JOBS = 50;
|
const DEFAULT_MAX_JOBS = 50;
|
||||||
const MAX_ALLOWED_JOBS = 200;
|
const MAX_ALLOWED_JOBS = 200;
|
||||||
@ -77,6 +80,27 @@ interface ExtractedJob {
|
|||||||
jobLevel?: string;
|
jobLevel?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Authentication material needed to call the UKVisaJobs API.
 *
 * Produced either by a browser login or by reading the on-disk cache.
 */
interface UkVisaJobsAuthSession {
  /** Token sent as the `token` form field of the fetch-jobs-data request. */
  token: string;
  /** Value sent as the `authToken` cookie. */
  authToken: string;
  /** Value sent as the `csrf_token` cookie; empty string when unavailable. */
  csrfToken: string;
  /** Value sent as the `ci_session` cookie; empty string when unavailable. */
  ciSession: string;
  /** ISO-8601 timestamp recording when the session was obtained. */
  fetchedAt: string;
  /** Where this session object came from: the cache file or a fresh browser login. */
  source: 'cache' | 'browser';
}
|
||||||
|
|
||||||
|
/**
 * Error thrown when the UKVisaJobs API response indicates an authentication
 * problem, so callers can tell it apart from other failures via `instanceof`.
 */
class UkVisaJobsAuthError extends Error {
  /** HTTP status code of the failing response. */
  status: number;
  /** Raw response body text of the failing response. */
  responseText: string;

  /**
   * @param message - Human-readable error message.
   * @param status - HTTP status code returned by the API.
   * @param responseText - Raw body text returned by the API.
   */
  constructor(message: string, status: number, responseText: string) {
    super(message);
    this.name = 'UkVisaJobsAuthError';
    this.status = status;
    this.responseText = responseText;
  }
}
|
||||||
|
|
||||||
function toStringOrNull(value: unknown): string | null {
|
function toStringOrNull(value: unknown): string | null {
|
||||||
if (value === null || value === undefined) return null;
|
if (value === null || value === undefined) return null;
|
||||||
if (typeof value === 'string') {
|
if (typeof value === 'string') {
|
||||||
@ -101,8 +125,7 @@ function toNumberOrNull(value: unknown): number | null {
|
|||||||
|
|
||||||
async function fetchPage(
|
async function fetchPage(
|
||||||
pageNo: number,
|
pageNo: number,
|
||||||
token: string,
|
session: UkVisaJobsAuthSession,
|
||||||
cookies: string,
|
|
||||||
options: { searchKeyword?: string } = {}
|
options: { searchKeyword?: string } = {}
|
||||||
): Promise<UkVisaJobsApiResponse> {
|
): Promise<UkVisaJobsApiResponse> {
|
||||||
// Use native FormData API (Node.js 18+)
|
// Use native FormData API (Node.js 18+)
|
||||||
@ -113,7 +136,9 @@ async function fetchPage(
|
|||||||
formData.append('visaAcceptance', 'false');
|
formData.append('visaAcceptance', 'false');
|
||||||
formData.append('applicants_outside_uk', 'false');
|
formData.append('applicants_outside_uk', 'false');
|
||||||
formData.append('searchKeyword', options.searchKeyword || 'null');
|
formData.append('searchKeyword', options.searchKeyword || 'null');
|
||||||
formData.append('token', token);
|
formData.append('token', session.token);
|
||||||
|
|
||||||
|
const cookies = buildCookieHeader(session);
|
||||||
|
|
||||||
const response = await fetch(API_URL, {
|
const response = await fetch(API_URL, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
@ -130,6 +155,13 @@ async function fetchPage(
|
|||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
const text = await response.text();
|
const text = await response.text();
|
||||||
|
if (isAuthErrorResponse(response.status, text)) {
|
||||||
|
throw new UkVisaJobsAuthError(
|
||||||
|
`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`,
|
||||||
|
response.status,
|
||||||
|
text
|
||||||
|
);
|
||||||
|
}
|
||||||
throw new Error(`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`);
|
throw new Error(`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -143,12 +175,12 @@ function mapJob(raw: UkVisaJobsApiJob): ExtractedJob {
|
|||||||
const maxSalary = toNumberOrNull(raw.max_salary);
|
const maxSalary = toNumberOrNull(raw.max_salary);
|
||||||
|
|
||||||
if (minSalary !== null && minSalary > 0 && maxSalary !== null && maxSalary > 0) {
|
if (minSalary !== null && minSalary > 0 && maxSalary !== null && maxSalary > 0) {
|
||||||
salary = `£${minSalary.toLocaleString()}-${maxSalary.toLocaleString()}`;
|
salary = `£${minSalary.toLocaleString()}-${maxSalary.toLocaleString()}`;
|
||||||
if (raw.salary_interval) {
|
if (raw.salary_interval) {
|
||||||
salary += ` / ${raw.salary_interval}`;
|
salary += ` / ${raw.salary_interval}`;
|
||||||
}
|
}
|
||||||
} else if (maxSalary !== null && maxSalary > 0) {
|
} else if (maxSalary !== null && maxSalary > 0) {
|
||||||
salary = `£${maxSalary.toLocaleString()}`;
|
salary = `£${maxSalary.toLocaleString()}`;
|
||||||
if (raw.salary_interval) {
|
if (raw.salary_interval) {
|
||||||
salary += ` / ${raw.salary_interval}`;
|
salary += ` / ${raw.salary_interval}`;
|
||||||
}
|
}
|
||||||
@ -188,30 +220,181 @@ function mapJob(raw: UkVisaJobsApiJob): ExtractedJob {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
async function main(): Promise<void> {
|
function buildCookieHeader(session: UkVisaJobsAuthSession): string {
|
||||||
console.log('🇬🇧 UK Visa Jobs Extractor starting...');
|
const cookieParts: string[] = [];
|
||||||
|
if (session.csrfToken) cookieParts.push(`csrf_token=${session.csrfToken}`);
|
||||||
|
if (session.ciSession) cookieParts.push(`ci_session=${session.ciSession}`);
|
||||||
|
if (session.authToken) cookieParts.push(`authToken=${session.authToken}`);
|
||||||
|
return cookieParts.join('; ');
|
||||||
|
}
|
||||||
|
|
||||||
// Get credentials from environment
|
function getLoginCredentials(): { email: string; password: string } | null {
|
||||||
const token = process.env.UKVISAJOBS_TOKEN;
|
const email = process.env.UKVISAJOBS_EMAIL;
|
||||||
const authToken = process.env.UKVISAJOBS_AUTH_TOKEN || token;
|
const password = process.env.UKVISAJOBS_PASSWORD;
|
||||||
const csrfToken = process.env.UKVISAJOBS_CSRF_TOKEN || '';
|
if (!email || !password) return null;
|
||||||
const ciSession = process.env.UKVISAJOBS_CI_SESSION || '';
|
return { email, password };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Reads a previously saved auth session from AUTH_CACHE_PATH.
 *
 * The file content is untrusted JSON, so every field is checked to be a string
 * before use; non-string values are treated as missing.
 *
 * @returns The cached session (with `source` set to 'cache'), or null when the
 *   file cannot be read, cannot be parsed, or contains no usable `token`.
 */
async function loadCachedAuthSession(): Promise<UkVisaJobsAuthSession | null> {
  try {
    const raw = await readFile(AUTH_CACHE_PATH, 'utf8');
    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed !== 'object' || parsed === null) return null;

    const cached = parsed as Record<string, unknown>;
    const readString = (key: string): string => {
      const value = cached[key];
      return typeof value === 'string' ? value : '';
    };

    const token = readString('token');
    if (!token) return null;

    return {
      token,
      // Fall back to the main token when no separate auth cookie was stored.
      authToken: readString('authToken') || token,
      csrfToken: readString('csrfToken'),
      ciSession: readString('ciSession'),
      fetchedAt: readString('fetchedAt') || new Date().toISOString(),
      source: 'cache',
    };
  } catch {
    // Any read or parse failure is treated as "no cache".
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Writes the given auth session to AUTH_CACHE_PATH as pretty-printed JSON,
 * creating the parent directory first if it does not exist.
 *
 * @param session - Session whose fields are persisted.
 */
async function saveCachedAuthSession(session: UkVisaJobsAuthSession): Promise<void> {
  // Copy only the known fields so nothing else attached to the object is written.
  const payload = {
    token: session.token,
    authToken: session.authToken,
    csrfToken: session.csrfToken,
    ciSession: session.ciSession,
    fetchedAt: session.fetchedAt,
    source: session.source,
  };
  await mkdir(dirname(AUTH_CACHE_PATH), { recursive: true });
  await writeFile(AUTH_CACHE_PATH, JSON.stringify(payload, null, 2));
}
|
||||||
|
|
||||||
|
/**
 * Extracts the value of one field from a raw multipart/form-data body.
 *
 * Finds `name="<field>"`, skips to the blank line that ends that part's
 * headers, and returns the first line of the part's content, trimmed.
 * Both CRLF and bare LF line endings are accepted, for the header separator
 * as well as for the end of the value.
 *
 * @param body - Raw multipart request body.
 * @param field - Form field name to look up.
 * @returns The trimmed field value (possibly an empty string), or null when
 *   the field or its header/content separator is not present.
 */
function extractMultipartField(body: string, field: string): string | null {
  const nameToken = `name="${field}"`;
  const nameIndex = body.indexOf(nameToken);
  if (nameIndex === -1) return null;

  const afterName = body.slice(nameIndex + nameToken.length);

  // A blank line separates the part headers from the part content.
  const headerEnd = /\r?\n\r?\n/.exec(afterName);
  if (!headerEnd) return null;

  const remainder = afterName.slice(headerEnd.index + headerEnd[0].length);

  // The value ends at the next line break, whichever style the body uses.
  const lineEnd = remainder.search(/\r?\n/);
  const value = lineEnd === -1 ? remainder : remainder.slice(0, lineEnd);
  return value.trim();
}
|
||||||
|
|
||||||
|
/**
 * Pulls the `token` form field out of a captured request's POST body.
 *
 * The body is first treated as multipart/form-data; if that yields nothing it
 * is parsed as URL-encoded form data instead.
 *
 * @param request - Captured Playwright request.
 * @returns The token, or null when the request has no body or no token field.
 */
function extractTokenFromRequest(request: Request): string | null {
  const postData = request.postData();
  if (!postData) return null;

  const multipartToken = extractMultipartField(postData, 'token');
  if (multipartToken) return multipartToken;

  // URLSearchParams parses string input leniently and does not throw,
  // so no try/catch is needed around it.
  return new URLSearchParams(postData).get('token') || null;
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a failed API response should be treated as an auth failure.
 *
 * - 401 and 403 always count as auth failures.
 * - 400 counts only when the body signals expiry: a JSON `errorType` of
 *   'expired', a JSON `message` containing "expired", or the raw body text
 *   containing "expired" (case-insensitive).
 * - Every other status is not an auth failure.
 *
 * @param status - HTTP status code of the response.
 * @param bodyText - Raw response body.
 * @returns True when the response indicates the session must be refreshed.
 */
function isAuthErrorResponse(status: number, bodyText: string): boolean {
  if (status === 401 || status === 403) return true;
  if (status !== 400) return false;

  try {
    const parsed: unknown = JSON.parse(bodyText);
    if (typeof parsed === 'object' && parsed !== null) {
      const { errorType, message } = parsed as { errorType?: unknown; message?: unknown };
      if (errorType === 'expired') return true;
      if (typeof message === 'string' && message.toLowerCase().includes('expired')) return true;
    }
  } catch {
    // Body is not JSON; fall through to the plain-text check.
  }

  return bodyText.toLowerCase().includes('expired');
}
|
||||||
|
|
||||||
|
/**
 * Signs in to my.ukvisajobs.com with a Camoufox-configured Playwright Firefox
 * browser and collects the token and cookies needed for API calls.
 *
 * After submitting the sign-in form it opens the open-jobs page, watches for
 * the POST to the fetch-jobs-data endpoint, and reads the `token` field from
 * that request body. If no token can be obtained from a captured request, the
 * `authToken` cookie is used instead.
 *
 * The browser runs headless unless UKVISAJOBS_HEADLESS is exactly "false".
 *
 * @param email - Account email typed into the `#email` field.
 * @param password - Account password typed into the `#password` field.
 * @returns A session with `source` set to 'browser'.
 * @throws Error when neither a request token nor an `authToken` cookie is found,
 *   or when navigation / selector waits fail.
 */
async function loginWithBrowser(email: string, password: string): Promise<UkVisaJobsAuthSession> {
  const [{ launchOptions }, { firefox }] = await Promise.all([
    import('camoufox-js'),
    import('playwright'),
  ]);

  const headless = process.env.UKVISAJOBS_HEADLESS !== 'false';
  const browser = await firefox.launch(await launchOptions({
    headless,
    humanize: true,
    geoip: true,
  }));

  try {
    // Created inside the try so the browser is closed even if these fail.
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(SIGNIN_URL, { waitUntil: 'domcontentloaded' });
    await page.waitForSelector('#email', { timeout: 15000 });
    await page.fill('#email', email);
    await page.fill('#password', password);
    await page.keyboard.press('Enter');
    // Fixed pause after submitting — presumably to let the sign-in complete.
    await page.waitForTimeout(7000);

    // Start listening before navigating so the request is not missed. The
    // rejection handler is attached immediately: a timeout that fires while we
    // are still navigating must not surface as an unhandled rejection.
    const requestPromise: Promise<Request | null> = page
      .waitForRequest(
        (request) => request.url().includes('/ukvisa-api/api/fetch-jobs-data') && request.method() === 'POST',
        { timeout: 30000 }
      )
      .catch(() => null);

    await page.goto(OPEN_JOBS_URL, { waitUntil: 'networkidle' });
    await page.waitForTimeout(5000);

    const fetchRequest = await requestPromise;

    const cookies = await context.cookies('https://my.ukvisajobs.com');
    const cookieValue = (name: string): string =>
      cookies.find((cookie) => cookie.name === name)?.value || '';
    const csrfToken = cookieValue('csrf_token');
    const ciSession = cookieValue('ci_session');
    const authToken = cookieValue('authToken');

    // Prefer the token from the captured request; fall back to the cookie when
    // no request was seen OR the request body carried no parsable token.
    const requestToken = fetchRequest ? extractTokenFromRequest(fetchRequest) : null;
    const token = requestToken || authToken;

    if (!token) {
      throw new Error('Failed to locate auth token from browser session.');
    }

    return {
      token,
      authToken: authToken || token,
      csrfToken,
      ciSession,
      fetchedAt: new Date().toISOString(),
      source: 'browser',
    };
  } finally {
    await browser.close();
  }
}
|
||||||
|
|
||||||
|
async function main(): Promise<void> {
|
||||||
|
console.log('🇬🇧 UK Visa Jobs Extractor starting...');
|
||||||
|
const credentials = getLoginCredentials();
|
||||||
const searchKeyword = process.env.UKVISAJOBS_SEARCH_KEYWORD || undefined;
|
const searchKeyword = process.env.UKVISAJOBS_SEARCH_KEYWORD || undefined;
|
||||||
|
|
||||||
if (!token) {
|
let authSession = await loadCachedAuthSession();
|
||||||
console.error('❌ UKVISAJOBS_TOKEN environment variable is not set');
|
|
||||||
process.exit(1);
|
if (!authSession) {
|
||||||
|
if (!credentials) {
|
||||||
|
console.error('ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
console.log(' No cached session found. Logging in to refresh tokens...');
|
||||||
|
authSession = await loginWithBrowser(credentials.email, credentials.password);
|
||||||
|
await saveCachedAuthSession(authSession);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build cookies string
|
const cookies = buildCookieHeader(authSession);
|
||||||
const cookieParts: string[] = [];
|
console.log(` Auth source: ${authSession.source}`);
|
||||||
if (csrfToken) cookieParts.push(`csrf_token=${csrfToken}`);
|
console.log(` Cookies configured: ${cookies ? 'Yes' : 'No'}`);
|
||||||
if (ciSession) cookieParts.push(`ci_session=${ciSession}`);
|
console.log(` Token length: ${authSession.token.length}`);
|
||||||
if (authToken) cookieParts.push(`authToken=${authToken}`);
|
|
||||||
const cookies = cookieParts.join('; ');
|
|
||||||
|
|
||||||
console.log(` Cookies configured: ${cookieParts.length > 0 ? 'Yes' : 'No'}`);
|
|
||||||
console.log(` Token length: ${token.length}`);
|
|
||||||
|
|
||||||
// Get max jobs from environment
|
// Get max jobs from environment
|
||||||
const maxJobsEnv = toNumberOrNull(process.env.UKVISAJOBS_MAX_JOBS);
|
const maxJobsEnv = toNumberOrNull(process.env.UKVISAJOBS_MAX_JOBS);
|
||||||
@ -232,10 +415,25 @@ async function main(): Promise<void> {
|
|||||||
while (pageNo <= maxPages && allJobs.length < maxJobs) {
|
while (pageNo <= maxPages && allJobs.length < maxJobs) {
|
||||||
console.log(` Fetching page ${pageNo}/${maxPages}...`);
|
console.log(` Fetching page ${pageNo}/${maxPages}...`);
|
||||||
|
|
||||||
const response = await fetchPage(pageNo, token, cookies, { searchKeyword });
|
let response: UkVisaJobsApiResponse;
|
||||||
|
try {
|
||||||
|
response = await fetchPage(pageNo, authSession, { searchKeyword });
|
||||||
|
} catch (error) {
|
||||||
|
if (error instanceof UkVisaJobsAuthError) {
|
||||||
|
if (!credentials) {
|
||||||
|
throw new Error('UKVisaJobs auth expired. Set UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD to refresh.');
|
||||||
|
}
|
||||||
|
console.log(' Auth expired. Refreshing tokens...');
|
||||||
|
authSession = await loginWithBrowser(credentials.email, credentials.password);
|
||||||
|
await saveCachedAuthSession(authSession);
|
||||||
|
response = await fetchPage(pageNo, authSession, { searchKeyword });
|
||||||
|
} else {
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (response.status !== 1) {
|
if (response.status !== 1) {
|
||||||
console.warn(` ⚠️ API returned status ${response.status} on page ${pageNo}`);
|
console.warn(` ⚠️ API returned status ${response.status} on page ${pageNo}`);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -271,7 +469,7 @@ async function main(): Promise<void> {
|
|||||||
await new Promise((resolve) => setTimeout(resolve, 500));
|
await new Promise((resolve) => setTimeout(resolve, 500));
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`✅ Scraped ${allJobs.length} jobs`);
|
console.log(`✅ Scraped ${allJobs.length} jobs`);
|
||||||
|
|
||||||
// Write output to storage directory (similar to Crawlee dataset structure)
|
// Write output to storage directory (similar to Crawlee dataset structure)
|
||||||
const storageDir = join(__dirname, '../storage/datasets/default');
|
const storageDir = join(__dirname, '../storage/datasets/default');
|
||||||
@ -292,7 +490,7 @@ async function main(): Promise<void> {
|
|||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const message = error instanceof Error ? error.message : 'Unknown error';
|
const message = error instanceof Error ? error.message : 'Unknown error';
|
||||||
console.error(`❌ Error: ${message}`);
|
console.error(`❌ Error: ${message}`);
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -301,3 +499,6 @@ main().catch((error) => {
|
|||||||
console.error('Fatal error:', error);
|
console.error('Fatal error:', error);
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
/**
|
/**
|
||||||
* Service for running the UK Visa Jobs extractor (extractors/ukvisajobs).
|
* Service for running the UK Visa Jobs extractor (extractors/ukvisajobs).
|
||||||
*
|
*
|
||||||
* Spawns the extractor as a child process and reads its output dataset.
|
* Spawns the extractor as a child process and reads its output dataset.
|
||||||
@ -13,6 +13,14 @@ import type { CreateJobInput } from '../../shared/types.js';
|
|||||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||||
const UKVISAJOBS_DIR = join(__dirname, '../../../../extractors/ukvisajobs');
|
const UKVISAJOBS_DIR = join(__dirname, '../../../../extractors/ukvisajobs');
|
||||||
const STORAGE_DIR = join(UKVISAJOBS_DIR, 'storage/datasets/default');
|
const STORAGE_DIR = join(UKVISAJOBS_DIR, 'storage/datasets/default');
|
||||||
|
const AUTH_CACHE_PATH = join(UKVISAJOBS_DIR, 'storage/ukvisajobs-auth.json');
|
||||||
|
|
||||||
|
/**
 * Shape of the auth cache file as read by this service.
 *
 * Every field is optional because the file content is parsed without
 * validation and any field may be absent.
 */
interface UkVisaJobsAuthSession {
  /** Main API token; used for the `authToken` cookie when `authToken` is absent. */
  token?: string;
  /** Value for the `authToken` cookie. */
  authToken?: string;
  /** Value for the `csrf_token` cookie. */
  csrfToken?: string;
  /** Value for the `ci_session` cookie. */
  ciSession?: string;
}
|
||||||
|
|
||||||
export interface RunUkVisaJobsOptions {
|
export interface RunUkVisaJobsOptions {
|
||||||
/** Maximum number of jobs to fetch per search term. Defaults to 50, max 200. */
|
/** Maximum number of jobs to fetch per search term. Defaults to 50, max 200. */
|
||||||
@ -73,11 +81,11 @@ async function fetchJobDescription(url: string): Promise<string | null> {
|
|||||||
try {
|
try {
|
||||||
console.log(` Fetching description from ${url}...`);
|
console.log(` Fetching description from ${url}...`);
|
||||||
|
|
||||||
// Build cookies if present in env (similar to extractor)
|
const authSession = await loadCachedAuthSession();
|
||||||
const cookieParts: string[] = [];
|
const cookieParts: string[] = [];
|
||||||
if (process.env.UKVISAJOBS_CSRF_TOKEN) cookieParts.push(`csrf_token=${process.env.UKVISAJOBS_CSRF_TOKEN}`);
|
if (authSession?.csrfToken) cookieParts.push(`csrf_token=${authSession.csrfToken}`);
|
||||||
if (process.env.UKVISAJOBS_CI_SESSION) cookieParts.push(`ci_session=${process.env.UKVISAJOBS_CI_SESSION}`);
|
if (authSession?.ciSession) cookieParts.push(`ci_session=${authSession.ciSession}`);
|
||||||
const token = process.env.UKVISAJOBS_AUTH_TOKEN || process.env.UKVISAJOBS_TOKEN;
|
const token = authSession?.authToken || authSession?.token;
|
||||||
if (token) cookieParts.push(`authToken=${token}`);
|
if (token) cookieParts.push(`authToken=${token}`);
|
||||||
|
|
||||||
const headers: Record<string, string> = {
|
const headers: Record<string, string> = {
|
||||||
@ -101,7 +109,16 @@ async function fetchJobDescription(url: string): Promise<string | null> {
|
|||||||
// If we only got a tiny bit of text, it might have failed
|
// If we only got a tiny bit of text, it might have failed
|
||||||
return cleaned.length > 100 ? cleaned : null;
|
return cleaned.length > 100 ? cleaned : null;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.warn(` ⚠️ Failed to fetch description: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
console.warn(` ⚠️ Failed to fetch description: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function loadCachedAuthSession(): Promise<UkVisaJobsAuthSession | null> {
|
||||||
|
try {
|
||||||
|
const data = await readFile(AUTH_CACHE_PATH, 'utf-8');
|
||||||
|
return JSON.parse(data) as UkVisaJobsAuthSession;
|
||||||
|
} catch {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -118,7 +135,7 @@ async function clearStorageDataset(): Promise<void> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise<UkVisaJobsResult> {
|
export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise<UkVisaJobsResult> {
|
||||||
console.log('🇬🇧 Running UK Visa Jobs extractor...');
|
console.log('🇬🇧 Running UK Visa Jobs extractor...');
|
||||||
|
|
||||||
// Determine terms to run
|
// Determine terms to run
|
||||||
const terms: string[] = [];
|
const terms: string[] = [];
|
||||||
@ -192,11 +209,11 @@ export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(` ✅ Fetched ${runJobs.length} jobs for ${termLabel} (${newCount} new unique)`);
|
console.log(` ✅ Fetched ${runJobs.length} jobs for ${termLabel} (${newCount} new unique)`);
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const message = error instanceof Error ? error.message : 'Unknown error';
|
const message = error instanceof Error ? error.message : 'Unknown error';
|
||||||
console.error(`❌ UK Visa Jobs failed for ${termLabel}: ${message}`);
|
console.error(`❌ UK Visa Jobs failed for ${termLabel}: ${message}`);
|
||||||
// Continue to next term instead of failing completely
|
// Continue to next term instead of failing completely
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -207,7 +224,7 @@ export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`✅ UK Visa Jobs: imported total ${allJobs.length} unique jobs`);
|
console.log(`✅ UK Visa Jobs: imported total ${allJobs.length} unique jobs`);
|
||||||
return { success: true, jobs: allJobs };
|
return { success: true, jobs: allJobs };
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -254,3 +271,4 @@ async function readDataset(): Promise<CreateJobInput[]> {
|
|||||||
|
|
||||||
return jobs;
|
return jobs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user