in extractors

This commit is contained in:
DaKheera47 2026-01-25 13:34:16 +00:00
parent aaab9b5124
commit 4ffaf06b1d
9 changed files with 626 additions and 565 deletions

View File

@ -1,34 +1,27 @@
{
"$schema": "https://biomejs.dev/schemas/2.3.12/schema.json",
"formatter": {
"indentStyle": "space",
"indentWidth": 2
},
"files": {
"includes": [
"**",
"!!**/dist"
]
},
"css": {
"parser": {
"tailwindDirectives": true
"$schema": "https://biomejs.dev/schemas/2.3.12/schema.json",
"formatter": {
"indentStyle": "space",
"indentWidth": 2
},
"files": {
"includes": ["**", "!!**/dist"]
},
"css": {
"parser": {
"tailwindDirectives": true
}
},
"overrides": [
{
"includes": ["**/*.test.ts", "**/*.test.tsx", "**/test-utils.ts"],
"linter": {
"rules": {
"suspicious": {
"noExplicitAny": "off"
}
}
},
"overrides": [
{
"includes": [
"**/*.test.ts",
"**/*.test.tsx",
"**/test-utils.ts"
],
"linter": {
"rules": {
"suspicious": {
"noExplicitAny": "off"
}
}
}
}
]
}
}
}
]
}

View File

@ -1,30 +1,30 @@
{
"name": "job-flow",
"version": "0.0.1",
"type": "module",
"description": "This is an example of a Crawlee project.",
"dependencies": {
"camoufox-js": "^0.8.0",
"crawlee": "^3.0.0",
"playwright": "*"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"@types/fs-extra": "^11",
"@types/node": "^24.0.0",
"fs-extra": "^11.3.0",
"tsx": "^4.4.0",
"typescript": "~5.9.0"
},
"scripts": {
"start": "npm run start:dev",
"start:prod": "node dist/main.js",
"start:dev": "tsx src/main.ts",
"build": "tsc",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
"get-binaries": "camoufox-js fetch",
"postinstall": "npm run get-binaries"
},
"author": "It's not you it's me",
"license": "ISC"
"name": "job-flow",
"version": "0.0.1",
"type": "module",
"description": "This is an example of a Crawlee project.",
"dependencies": {
"camoufox-js": "^0.8.0",
"crawlee": "^3.0.0",
"playwright": "*"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"@types/fs-extra": "^11",
"@types/node": "^24.0.0",
"fs-extra": "^11.3.0",
"tsx": "^4.4.0",
"typescript": "~5.9.0"
},
"scripts": {
"start": "npm run start:dev",
"start:prod": "node dist/main.js",
"start:dev": "tsx src/main.ts",
"build": "tsc",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
"get-binaries": "camoufox-js fetch",
"postinstall": "npm run get-binaries"
},
"author": "It's not you it's me",
"license": "ISC"
}

View File

@ -2,9 +2,8 @@
import { launchOptions } from "camoufox-js";
import { PlaywrightCrawler } from "crawlee";
import { firefox } from "playwright";
import { router } from "./routes.js";
import { initJobOpsProgress } from "./progress.js";
import { router } from "./routes.js";
// locations
const locations = [
@ -17,10 +16,7 @@ const locations = [
];
// roles
const defaultRoles = [
"web-development",
"software-systems",
];
const defaultRoles = ["web-development", "software-systems"];
let roles = defaultRoles;
const envRolesRaw = process.env.GRADCRACKER_SEARCH_TERMS;
@ -29,15 +25,16 @@ if (envRolesRaw) {
try {
const parsed = JSON.parse(envRolesRaw) as string[];
if (Array.isArray(parsed) && parsed.length > 0) {
roles = parsed.map(term =>
term.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-+|-+$/g, '')
roles = parsed.map((term) =>
term
.toLowerCase()
.replace(/[^a-z0-9]+/g, "-")
.replace(/^-+|-+$/g, ""),
);
console.log(`Using configured search terms: ${roles.join(', ')}`);
console.log(`Using configured search terms: ${roles.join(", ")}`);
}
} catch (e) {
console.warn('Failed to parse GRADCRACKER_SEARCH_TERMS', e);
console.warn("Failed to parse GRADCRACKER_SEARCH_TERMS", e);
}
}
@ -46,12 +43,12 @@ const gradcrackerUrls = locations.flatMap((location) => {
return roles.map((role) => {
return {
url: `https://www.gradcracker.com/search/computing-technology/${role}-graduate-jobs-in-${location}?order=dateAdded`,
role
role,
};
});
});
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`)
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`);
const startUrls = gradcrackerUrls.map(({ url, role }) => ({
url,

View File

@ -26,7 +26,7 @@ interface JobOpsCrawlProgressState {
const PROGRESS_PREFIX = "JOBOPS_PROGRESS ";
const isEnabled = () => process.env.JOBOPS_EMIT_PROGRESS === "1";
let state: JobOpsCrawlProgressState = {
const state: JobOpsCrawlProgressState = {
listPagesProcessed: 0,
jobCardsFound: 0,
jobPagesEnqueued: 0,
@ -80,4 +80,3 @@ export function markJobPageDone(params: { currentUrl: string }): void {
state.currentUrl = params.currentUrl;
emit();
}

View File

@ -1,5 +1,5 @@
import { createPlaywrightRouter, log } from "crawlee";
import { readFileSync } from "node:fs";
import { createPlaywrightRouter, log } from "crawlee";
import { markJobPageDone, markListPageDone } from "./progress.js";
function normalizeUrl(raw: string | null | undefined): string | null {
@ -17,16 +17,15 @@ function normalizeUrl(raw: string | null | undefined): string | null {
function getExistingJobUrlSet(): Set<string> {
const filePath = process.env.JOBOPS_EXISTING_JOB_URLS_FILE;
const raw =
filePath
? (() => {
try {
return readFileSync(filePath, "utf-8");
} catch {
return null;
}
})()
: process.env.JOBOPS_EXISTING_JOB_URLS;
const raw = filePath
? (() => {
try {
return readFileSync(filePath, "utf-8");
} catch {
return null;
}
})()
: process.env.JOBOPS_EXISTING_JOB_URLS;
if (!raw) return new Set();
try {
@ -41,12 +40,16 @@ function getExistingJobUrlSet(): Set<string> {
}
}
const SKIP_APPLY_FOR_EXISTING = process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1";
const SKIP_APPLY_FOR_EXISTING =
process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1";
const EXISTING_JOB_URLS = getExistingJobUrlSet();
// Global counters for max jobs per search term
const jobCounts = new Map<string, number>();
const MAX_JOBS_PER_TERM = parseInt(process.env.GRADCRACKER_MAX_JOBS_PER_TERM || "0", 10);
const MAX_JOBS_PER_TERM = parseInt(
process.env.GRADCRACKER_MAX_JOBS_PER_TERM || "0",
10,
);
interface Job {
title: string | null;
@ -72,7 +75,9 @@ router.addHandler(
if (MAX_JOBS_PER_TERM > 0) {
const currentCount = jobCounts.get(role) || 0;
if (currentCount >= MAX_JOBS_PER_TERM) {
log.info(`Max jobs (${MAX_JOBS_PER_TERM}) already enqueued for role "${role}". Skipping list page.`);
log.info(
`Max jobs (${MAX_JOBS_PER_TERM}) already enqueued for role "${role}". Skipping list page.`,
);
markListPageDone({
currentUrl: request.url,
jobCardsFound: 0,
@ -120,7 +125,8 @@ router.addHandler(
let disciplines: string | null = null;
try {
const disciplinesEl = article.locator("h3");
disciplines = (await disciplinesEl.textContent({ timeout: 1000 }))?.trim() ?? null;
disciplines =
(await disciplinesEl.textContent({ timeout: 1000 }))?.trim() ?? null;
} catch {
// h3 not found or timed out - that's okay, disciplines is optional
}
@ -195,8 +201,10 @@ router.addHandler(
if (MAX_JOBS_PER_TERM > 0) {
const currentCount = jobCounts.get(role) || 0;
if (currentCount >= MAX_JOBS_PER_TERM) {
log.info(`Reached max jobs limit (${MAX_JOBS_PER_TERM}) for role "${role}" while processing list. Stopping.`);
break;
log.info(
`Reached max jobs limit (${MAX_JOBS_PER_TERM}) for role "${role}" while processing list. Stopping.`,
);
break;
}
jobCounts.set(role, currentCount + 1);
}
@ -205,7 +213,7 @@ router.addHandler(
urls: [jobUrl],
userData: {
...jobs[jobs.length - 1],
label: "gradcracker-single-job-page"
label: "gradcracker-single-job-page",
},
});
enqueuedJobs++;
@ -216,7 +224,7 @@ router.addHandler(
log.info(`Extracted ${jobs.length} jobs`);
if (SKIP_APPLY_FOR_EXISTING && skippedKnownJobs > 0) {
log.info(
`Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`
`Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`,
);
}
@ -226,7 +234,7 @@ router.addHandler(
jobPagesEnqueued: enqueuedJobs,
jobPagesSkipped: skippedKnownJobs,
});
}
},
);
router.addHandler(
@ -261,7 +269,9 @@ router.addHandler(
// Prefer page-scoped popup detection. Using the browser context's "page" event
// can accidentally capture unrelated pages created by other concurrent requests.
const popupPromise = page.waitForEvent("popup", { timeout: 8000 }).catch(() => null);
const popupPromise = page
.waitForEvent("popup", { timeout: 8000 })
.catch(() => null);
const navigationPromise = page
.waitForNavigation({ timeout: 8000, waitUntil: "domcontentloaded" })
.catch(() => null);
@ -271,7 +281,12 @@ router.addHandler(
await applyButton.click();
// Wait for URL to stabilize (same URL for 3 consecutive checks)
const waitForUrlStable = async (targetPage: typeof page, maxWaitMs = 10000, checkIntervalMs = 100, requiredStableChecks = 3) => {
const waitForUrlStable = async (
targetPage: typeof page,
maxWaitMs = 10000,
checkIntervalMs = 100,
requiredStableChecks = 3,
) => {
let lastUrl = targetPage.url();
let stableCount = 0;
const startTime = Date.now();
@ -298,11 +313,15 @@ router.addHandler(
const targetPage = maybePopup ?? page;
if (maybePopup) {
await maybePopup.waitForLoadState("domcontentloaded", { timeout: 15000 }).catch(() => null);
await maybePopup
.waitForLoadState("domcontentloaded", { timeout: 15000 })
.catch(() => null);
// If the popup initially opens as about:blank, give it a moment to redirect.
if (maybePopup.url() === "about:blank") {
await maybePopup
.waitForURL((u) => u.toString() !== "about:blank", { timeout: 15000 })
.waitForURL((u) => u.toString() !== "about:blank", {
timeout: 15000,
})
.catch(() => null);
}
} else {
@ -317,7 +336,7 @@ router.addHandler(
if (applicationLink === originalUrl) {
log.info(
`Apply click did not change URL (still Gradcracker): ${applicationLink}`
`Apply click did not change URL (still Gradcracker): ${applicationLink}`,
);
} else {
log.info(`Captured application URL: ${applicationLink}`);
@ -342,5 +361,5 @@ router.addHandler(
});
markJobPageDone({ currentUrl: request.url });
}
},
);

View File

@ -1,12 +1,12 @@
{
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "NodeNext",
"moduleResolution": "NodeNext",
"target": "ES2022",
"outDir": "dist",
"noUnusedLocals": false,
"lib": ["DOM"]
},
"include": ["./src/**/*"]
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "NodeNext",
"moduleResolution": "NodeNext",
"target": "ES2022",
"outDir": "dist",
"noUnusedLocals": false,
"lib": ["DOM"]
},
"include": ["./src/**/*"]
}

View File

@ -1,27 +1,27 @@
{
"name": "ukvisajobs-extractor",
"version": "0.0.1",
"type": "module",
"description": "UK Visa Jobs extractor - fetches job listings that may sponsor work visas",
"main": "dist/main.js",
"dependencies": {
"camoufox-js": "^0.8.0",
"playwright": "^1.57.0"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"@types/node": "^24.0.0",
"tsx": "^4.4.0",
"typescript": "~5.9.0"
},
"scripts": {
"start": "npm run start:dev",
"start:prod": "node dist/main.js",
"start:dev": "tsx src/main.ts",
"build": "tsc",
"get-binaries": "camoufox-js fetch",
"postinstall": "npm run get-binaries"
},
"author": "",
"license": "ISC"
"name": "ukvisajobs-extractor",
"version": "0.0.1",
"type": "module",
"description": "UK Visa Jobs extractor - fetches job listings that may sponsor work visas",
"main": "dist/main.js",
"dependencies": {
"camoufox-js": "^0.8.0",
"playwright": "^1.57.0"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"@types/node": "^24.0.0",
"tsx": "^4.4.0",
"typescript": "~5.9.0"
},
"scripts": {
"start": "npm run start:dev",
"start:prod": "node dist/main.js",
"start:dev": "tsx src/main.ts",
"build": "tsc",
"get-binaries": "camoufox-js fetch",
"postinstall": "npm run get-binaries"
},
"author": "",
"license": "ISC"
}

View File

@ -1,9 +1,9 @@
/**
* UK Visa Jobs Extractor
*
*
* Fetches job listings from my.ukvisajobs.com that may sponsor work visas.
* Outputs JSON to stdout for the orchestrator to consume.
*
*
* Environment variables:
* UKVISAJOBS_EMAIL - Login email for auto-refresh
* UKVISAJOBS_PASSWORD - Login password for auto-refresh
@ -13,507 +13,560 @@
* UKVISAJOBS_REFRESH_ONLY - Set to "1" to refresh tokens and exit
*/
import { mkdir, writeFile, readFile } from 'fs/promises';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import type { Request } from 'playwright';
import { mkdir, readFile, writeFile } from "node:fs/promises";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import type { Request } from "playwright";
// Directory containing this module, derived from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));

// my.ukvisajobs.com endpoints used by the extractor.
const API_URL = "https://my.ukvisajobs.com/ukvisa-api/api/fetch-jobs-data";
const SIGNIN_URL = "https://my.ukvisajobs.com/signin";
const OPEN_JOBS_URL =
  "https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&visaAcceptance=false&applicants_outside_uk=false&pageNo=1";

// Location of the cached auth session, resolved relative to this module.
const AUTH_CACHE_PATH = join(__dirname, "../storage/ukvisajobs-auth.json");

// Paging and job-count limits.
const JOBS_PER_PAGE = 15;
const DEFAULT_MAX_JOBS = 50;
const MAX_ALLOWED_JOBS = 200;
/**
 * A single raw job record inside the jobs API response.
 *
 * All values are typed as strings; numeric-looking fields such as
 * `min_salary` / `max_salary` are parsed by the mapping code, and the
 * sponsorship flags are compared case-insensitively against "yes".
 */
interface UkVisaJobsApiJob {
  id: string;
  title: string;
  company_name: string;
  company_link?: string;
  job_link: string;
  city: string;
  created_date: string;
  job_expire: string;
  description?: string;
  min_salary?: string;
  max_salary?: string;
  salary_interval?: string;
  salary_method?: string;
  degree_requirement?: string;
  job_type?: string;
  job_level?: string;
  job_industry?: string;
  visa_acceptance?: string;
  applicants_outside_uk?: string;
  likely_to_sponsor?: string;
  definitely_sponsored?: string;
  new_entrant?: string;
  student_graduate?: string;
  image?: string;
  computed_cos_total?: string;
}
/** Shape of the JSON body that `fetchPage` returns for one page of results. */
interface UkVisaJobsApiResponse {
  status: number;
  totalJobs: number;
  query?: string;
  jobs: UkVisaJobsApiJob[];
}
/**
 * Normalised job record produced by `mapJob` from a raw API job.
 * `source` is always the literal "ukvisajobs".
 */
interface ExtractedJob {
  source: "ukvisajobs";
  sourceJobId: string;
  title: string;
  employer: string;
  employerUrl?: string;
  jobUrl: string;
  applicationLink: string;
  location?: string;
  deadline?: string;
  salary?: string;
  jobDescription?: string;
  datePosted?: string;
  degreeRequired?: string;
  jobType?: string;
  jobLevel?: string;
}
/**
 * Credentials needed to call the jobs API.
 *
 * `token` is sent as a form field; `authToken`, `csrfToken` and `ciSession`
 * are sent as cookies. `source` records whether the session was read from
 * the on-disk cache or freshly obtained through a browser login.
 */
interface UkVisaJobsAuthSession {
  token: string;
  authToken: string;
  csrfToken: string;
  ciSession: string;
  /** ISO-8601 timestamp of when the session was obtained. */
  fetchedAt: string;
  source: "cache" | "browser";
}
/**
 * Error thrown when the jobs API rejects a request for authentication
 * reasons. Carries the HTTP status and the raw response body so callers can
 * inspect why the session was refused.
 */
class UkVisaJobsAuthError extends Error {
  status: number;
  responseText: string;

  /**
   * @param message - Human-readable error message.
   * @param status - HTTP status code returned by the API.
   * @param responseText - Raw response body returned by the API.
   */
  constructor(message: string, status: number, responseText: string) {
    super(message);
    this.name = "UkVisaJobsAuthError";
    this.status = status;
    this.responseText = responseText;
  }
}
/**
 * Coerces a loosely-typed value to a trimmed, non-empty string.
 *
 * @param value - Any value, typically a field from an API payload.
 * @returns The trimmed string for non-blank strings, `String(value)` for
 *   numbers and booleans, and `null` for everything else (including
 *   `null`, `undefined`, blank strings and objects).
 */
function toStringOrNull(value: unknown): string | null {
  if (value === null || value === undefined) return null;
  if (typeof value === "string") {
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : null;
  }
  if (typeof value === "number" || typeof value === "boolean") {
    return String(value);
  }
  return null;
}
/**
 * Coerces a loosely-typed value to a finite number.
 *
 * @param value - Any value, typically a field from an API payload.
 * @returns The number itself when it is finite, the parsed value of a
 *   non-blank numeric string, and `null` for everything else (including
 *   `NaN`, infinities, blank or non-numeric strings, and non-number types).
 */
function toNumberOrNull(value: unknown): number | null {
  if (value === null || value === undefined) return null;
  if (typeof value === "number") return Number.isFinite(value) ? value : null;
  if (typeof value === "string") {
    const trimmed = value.trim();
    if (!trimmed) return null;
    const parsed = Number(trimmed);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}
/**
 * Requests one page of job listings from the jobs API.
 *
 * The request is a multipart form POST carrying the paging/filter fields and
 * the session token, with the session cookies attached via the `cookie`
 * header.
 *
 * @param pageNo - Page number to request.
 * @param session - Auth session supplying the form token and cookies.
 * @param options - Optional search keyword; when absent or empty the literal
 *   string "null" is sent.
 * @returns The parsed JSON response body.
 * @throws UkVisaJobsAuthError when the response is not OK and
 *   `isAuthErrorResponse` classifies it as an auth failure.
 * @throws Error for any other non-OK response.
 */
async function fetchPage(
  pageNo: number,
  session: UkVisaJobsAuthSession,
  options: { searchKeyword?: string } = {},
): Promise<UkVisaJobsApiResponse> {
  // Use native FormData API (Node.js 18+)
  const formData = new FormData();
  formData.append("is_global", "0");
  formData.append("sortBy", "desc");
  formData.append("pageNo", String(pageNo));
  formData.append("visaAcceptance", "false");
  formData.append("applicants_outside_uk", "false");
  formData.append("searchKeyword", options.searchKeyword || "null");
  formData.append("token", session.token);

  const cookies = buildCookieHeader(session);

  const response = await fetch(API_URL, {
    method: "POST",
    headers: {
      accept: "application/json, text/plain, */*",
      "accept-language": "en-US,en;q=0.9",
      cookie: cookies,
      origin: "https://my.ukvisajobs.com",
      referer: `https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&pageNo=${pageNo}&visaAcceptance=false&applicants_outside_uk=false`,
      "user-agent":
        "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36",
    },
    body: formData,
  });

  if (!response.ok) {
    const text = await response.text();
    // Same message for both error kinds; only the error class differs.
    const message = `UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`;
    if (isAuthErrorResponse(response.status, text)) {
      throw new UkVisaJobsAuthError(message, response.status, text);
    }
    throw new Error(message);
  }

  return response.json() as Promise<UkVisaJobsApiResponse>;
}
/**
 * Converts a raw API job record into the extractor's normalised job shape.
 *
 * - Salary: "£min-max" when both bounds are positive numbers, "£max" when
 *   only the upper bound is, otherwise undefined; " / <interval>" is
 *   appended when `salary_interval` is set.
 * - Description: the API description when present; otherwise a summary of
 *   the sponsorship flags whose value is "yes" (case-insensitive), or
 *   undefined when none are set.
 * - `jobUrl` and `applicationLink` are both taken from `job_link`.
 *
 * @param raw - Job record as returned by the API.
 * @returns The normalised job.
 */
function mapJob(raw: UkVisaJobsApiJob): ExtractedJob {
  // Build salary string from min/max
  const minSalary = toNumberOrNull(raw.min_salary);
  const maxSalary = toNumberOrNull(raw.max_salary);

  let salary: string | undefined;
  if (maxSalary !== null && maxSalary > 0) {
    salary =
      minSalary !== null && minSalary > 0
        ? `£${minSalary.toLocaleString()}-${maxSalary.toLocaleString()}`
        : `£${maxSalary.toLocaleString()}`;
    if (raw.salary_interval) {
      salary += ` / ${raw.salary_interval}`;
    }
  }

  // Build a description from visa sponsorship fields
  const visaFlags: Array<[string | undefined, string]> = [
    [raw.visa_acceptance, "Visa acceptance: Yes"],
    [raw.applicants_outside_uk, "Accepts applicants outside UK"],
    [raw.likely_to_sponsor, "Likely to sponsor"],
    [raw.definitely_sponsored, "Definitely sponsored"],
    [raw.new_entrant, "New entrant friendly"],
    [raw.student_graduate, "Student/Graduate friendly"],
  ];
  const visaInfo = visaFlags
    .filter(([flag]) => flag?.toLowerCase() === "yes")
    .map(([, label]) => label);

  // Prefer the API's own description; fall back to the flag summary.
  const description =
    raw.description ||
    (visaInfo.length > 0
      ? `Visa sponsorship info: ${visaInfo.join(", ")}`
      : undefined);

  return {
    source: "ukvisajobs",
    sourceJobId: raw.id,
    title: raw.title || "Unknown Title",
    employer: raw.company_name || "Unknown Employer",
    employerUrl: toStringOrNull(raw.company_link) ?? undefined,
    jobUrl: raw.job_link,
    applicationLink: raw.job_link,
    location: raw.city || undefined,
    deadline: raw.job_expire || undefined,
    salary,
    jobDescription: description,
    datePosted: raw.created_date || undefined,
    degreeRequired: toStringOrNull(raw.degree_requirement) ?? undefined,
    jobType: toStringOrNull(raw.job_type) ?? undefined,
    jobLevel: toStringOrNull(raw.job_level) ?? undefined,
  };
}
/**
 * Builds the `cookie` request header value from an auth session.
 *
 * Only non-empty values are included, in the fixed order
 * `csrf_token`, `ci_session`, `authToken`, joined with "; ".
 *
 * @param session - Auth session holding the cookie values.
 * @returns The header value; an empty string when no cookie value is set.
 */
function buildCookieHeader(session: UkVisaJobsAuthSession): string {
  const cookieParts: string[] = [];
  if (session.csrfToken) cookieParts.push(`csrf_token=${session.csrfToken}`);
  if (session.ciSession) cookieParts.push(`ci_session=${session.ciSession}`);
  if (session.authToken) cookieParts.push(`authToken=${session.authToken}`);
  return cookieParts.join("; ");
}
/**
 * Reads login credentials from the environment.
 *
 * @returns The email/password pair from `UKVISAJOBS_EMAIL` and
 *   `UKVISAJOBS_PASSWORD`, or `null` when either is unset or empty.
 */
function getLoginCredentials(): { email: string; password: string } | null {
  const email = process.env.UKVISAJOBS_EMAIL;
  const password = process.env.UKVISAJOBS_PASSWORD;
  if (!email || !password) return null;
  return { email, password };
}
/**
 * Loads a previously saved auth session from the cache file.
 *
 * Missing optional fields are defaulted: `authToken` falls back to `token`,
 * the cookie values to empty strings, and `fetchedAt` to the current time.
 * The returned session is always tagged with `source: "cache"`.
 *
 * @returns The cached session, or `null` when the file cannot be read or
 *   parsed, or contains no `token`.
 */
async function loadCachedAuthSession(): Promise<UkVisaJobsAuthSession | null> {
  try {
    const data = await readFile(AUTH_CACHE_PATH, "utf8");
    const parsed = JSON.parse(data) as UkVisaJobsAuthSession;
    if (!parsed?.token) return null;
    return {
      token: parsed.token,
      authToken: parsed.authToken || parsed.token,
      csrfToken: parsed.csrfToken || "",
      ciSession: parsed.ciSession || "",
      fetchedAt: parsed.fetchedAt || new Date().toISOString(),
      source: "cache",
    };
  } catch {
    // A missing or corrupt cache is treated as "no cached session".
    return null;
  }
}
/**
 * Writes an auth session to the cache file as pretty-printed JSON,
 * creating the parent directory first if necessary.
 *
 * @param session - Session to persist.
 */
async function saveCachedAuthSession(
  session: UkVisaJobsAuthSession,
): Promise<void> {
  // Copy the known fields explicitly so nothing else on the object is written.
  const payload = {
    token: session.token,
    authToken: session.authToken,
    csrfToken: session.csrfToken,
    ciSession: session.ciSession,
    fetchedAt: session.fetchedAt,
    source: session.source,
  };
  await mkdir(dirname(AUTH_CACHE_PATH), { recursive: true });
  await writeFile(AUTH_CACHE_PATH, JSON.stringify(payload, null, 2));
}
/**
 * Extracts the value of a named field from a raw multipart/form-data body.
 *
 * The value is taken to start after the first blank line that follows
 * `name="<field>"` and to end at the next line break. CRLF-delimited bodies
 * are preferred; LF-only bodies are handled as a fallback, and the end of
 * the value is detected with the same line terminator that delimited the
 * part headers.
 *
 * @param body - Raw multipart request body.
 * @param field - Form field name to look up.
 * @returns The trimmed first line of the field's value, or `null` when the
 *   field or its header/value separator is not found.
 */
function extractMultipartField(body: string, field: string): string | null {
  const nameToken = `name="${field}"`;
  const index = body.indexOf(nameToken);
  if (index === -1) return null;

  const afterName = body.slice(index + nameToken.length);

  // Headers and value are separated by a blank line: CRLF first, then LF.
  let lineBreak = "\r\n";
  let separatorIndex = afterName.indexOf("\r\n\r\n");
  if (separatorIndex === -1) {
    lineBreak = "\n";
    separatorIndex = afterName.indexOf("\n\n");
  }
  if (separatorIndex === -1) return null;

  const remainder = afterName.slice(separatorIndex + lineBreak.length * 2);

  // The value runs to the next line break of the same kind (or end of body).
  const endIndex = remainder.indexOf(lineBreak);
  if (endIndex === -1) return remainder.trim();
  return remainder.slice(0, endIndex).trim();
}
/**
 * Pulls the `token` form field out of a captured request's POST body.
 *
 * The body is first treated as multipart/form-data; if no token is found
 * that way it is parsed as URL-encoded form data.
 *
 * @param request - Captured Playwright request.
 * @returns The token, or `null` when the request has no POST data or no
 *   non-empty `token` field.
 */
function extractTokenFromRequest(request: Request): string | null {
  const postData = request.postData();
  if (!postData) return null;

  const multipartToken = extractMultipartField(postData, "token");
  if (multipartToken) return multipartToken;

  try {
    const params = new URLSearchParams(postData);
    return params.get("token") || null;
  } catch {
    return null;
  }
}
/**
 * Decides whether a failed API response indicates an authentication problem.
 *
 * - 401 and 403 are always auth errors.
 * - 400 is an auth error only when the body signals expiry: a JSON
 *   `errorType` of "expired", a JSON `message` containing "expired", or the
 *   raw body text containing "expired" (case-insensitive).
 * - Every other status is not an auth error.
 *
 * @param status - HTTP status code of the response.
 * @param bodyText - Raw response body.
 * @returns `true` when the response should be treated as an auth failure.
 */
function isAuthErrorResponse(status: number, bodyText: string): boolean {
  if (status === 401 || status === 403) return true;
  if (status !== 400) return false;

  try {
    const parsed = JSON.parse(bodyText) as {
      errorType?: string;
      message?: string;
    };
    if (parsed?.errorType === "expired") return true;
    if (parsed?.message?.toLowerCase().includes("expired")) return true;
  } catch {
    // ignore JSON parse failures and fall back to a plain-text check
  }

  return bodyText.toLowerCase().includes("expired");
}
/**
 * Signs in through a real browser and captures a fresh auth session.
 *
 * Flow: open the sign-in page, submit the credentials, then load the open
 * jobs page while watching for the page's own POST to the jobs API. The
 * form `token` is read from that request when one is captured; otherwise
 * (or when no token can be extracted from it) the `authToken` cookie is
 * used. Cookie values are read from the browser context afterwards.
 *
 * The browser runs headless unless `UKVISAJOBS_HEADLESS` is "false", and is
 * always closed before returning or throwing.
 *
 * @param email - Login email.
 * @param password - Login password.
 * @returns A session tagged with `source: "browser"`.
 * @throws Error when neither the captured request nor the cookies yield a
 *   token, or when any browser step fails.
 */
async function loginWithBrowser(
  email: string,
  password: string,
): Promise<UkVisaJobsAuthSession> {
  const [{ launchOptions }, { firefox }] = await Promise.all([
    import("camoufox-js"),
    import("playwright"),
  ]);
  const headless = process.env.UKVISAJOBS_HEADLESS !== "false";
  const browser = await firefox.launch(
    await launchOptions({
      headless,
      humanize: true,
      geoip: true,
    }),
  );

  try {
    // Created inside the try so a failure here still closes the browser.
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(SIGNIN_URL, { waitUntil: "domcontentloaded" });
    await page.waitForSelector("#email", { timeout: 15000 });
    await page.fill("#email", email);
    await page.fill("#password", password);
    await page.keyboard.press("Enter");
    await page.waitForTimeout(7000);

    // Attach the rejection handler immediately: this promise is only awaited
    // after the navigation below, so a timeout or page close in the meantime
    // would otherwise surface as an unhandled rejection.
    const requestPromise = page
      .waitForRequest(
        (request) =>
          request.url().includes("/ukvisa-api/api/fetch-jobs-data") &&
          request.method() === "POST",
        { timeout: 30000 },
      )
      .catch(() => null);

    await page.goto(OPEN_JOBS_URL, { waitUntil: "networkidle" });
    await page.waitForTimeout(5000);

    const fetchRequest = await requestPromise;

    const cookies = await context.cookies("https://my.ukvisajobs.com");
    const cookieValue = (name: string): string =>
      cookies.find((cookie) => cookie.name === name)?.value || "";
    const csrfToken = cookieValue("csrf_token");
    const ciSession = cookieValue("ci_session");
    const authToken = cookieValue("authToken");

    // Prefer the token the page itself sent; fall back to the auth cookie
    // when no request was captured or no token could be extracted from it.
    const requestToken = fetchRequest
      ? extractTokenFromRequest(fetchRequest)
      : null;
    const token = requestToken || authToken;

    if (!token) {
      throw new Error("Failed to locate auth token from browser session.");
    }

    return {
      token,
      authToken: authToken || token,
      csrfToken,
      ciSession,
      fetchedAt: new Date().toISOString(),
      source: "browser",
    };
  } finally {
    await browser.close();
  }
}
/**
 * Entry point of the UK Visa Jobs extractor.
 *
 * Steps:
 *  1. Load the cached auth session. In refresh-only mode
 *     (`UKVISAJOBS_REFRESH_ONLY=1`) log in, save the new session and return.
 *  2. Without a cached session, log in through the browser and cache the result.
 *  3. Page through the jobs API, de-duplicating by job id, until the job limit,
 *     the page limit, a short page, an empty page or a non-1 status is hit.
 *     A failed page fetch triggers one re-login and one retry when credentials
 *     are available.
 *  4. Write one JSON file per job plus a combined `jobs.json` under
 *     `../storage/datasets/default`.
 *
 * Exits the process with code 1 when credentials are required but missing, or
 * when scraping/writing fails.
 */
async function main(): Promise<void> {
  console.log("🇬🇧 UK Visa Jobs Extractor starting...");

  const credentials = getLoginCredentials();
  // An empty keyword is treated the same as "not set".
  const searchKeyword = process.env.UKVISAJOBS_SEARCH_KEYWORD || undefined;
  const refreshOnly = process.env.UKVISAJOBS_REFRESH_ONLY === "1";

  let authSession = await loadCachedAuthSession();

  if (refreshOnly) {
    if (!credentials) {
      console.error(
        "ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set",
      );
      process.exit(1);
    }
    console.log(" Refresh-only mode: logging in to refresh tokens...");
    authSession = await loginWithBrowser(
      credentials.email,
      credentials.password,
    );
    await saveCachedAuthSession(authSession);
    console.log(" Auth session refreshed.");
    return;
  }

  if (!authSession) {
    if (!credentials) {
      console.error(
        "ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set",
      );
      process.exit(1);
    }
    console.log(" No cached session found. Logging in to refresh tokens...");
    authSession = await loginWithBrowser(
      credentials.email,
      credentials.password,
    );
    await saveCachedAuthSession(authSession);
  }

  const cookies = buildCookieHeader(authSession);
  console.log(` Auth source: ${authSession.source}`);
  console.log(` Cookies configured: ${cookies ? "Yes" : "No"}`);
  console.log(` Token length: ${authSession.token.length}`);

  // Get max jobs from environment, capped at the allowed maximum.
  const maxJobsEnv = toNumberOrNull(process.env.UKVISAJOBS_MAX_JOBS);
  const maxJobs = Math.min(maxJobsEnv ?? DEFAULT_MAX_JOBS, MAX_ALLOWED_JOBS);
  const maxPages = Math.ceil(maxJobs / JOBS_PER_PAGE);
  console.log(` Max jobs: ${maxJobs} (${maxPages} pages)`);
  if (searchKeyword) {
    console.log(` Search keyword: ${searchKeyword}`);
  }

  const allJobs: ExtractedJob[] = [];
  const seenIds = new Set<string>();
  let totalAvailable = 0;
  let pageNo = 1;

  try {
    while (pageNo <= maxPages && allJobs.length < maxJobs) {
      console.log(` Fetching page ${pageNo}/${maxPages}...`);

      let response: UkVisaJobsApiResponse;
      try {
        response = await fetchPage(pageNo, authSession, { searchKeyword });
      } catch (error) {
        // Without credentials there is no way to refresh the session.
        if (!credentials) {
          if (error instanceof UkVisaJobsAuthError) {
            throw new Error(
              "UKVisaJobs auth expired. Set UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD to refresh.",
            );
          }
          throw error;
        }

        const reason =
          error instanceof UkVisaJobsAuthError
            ? "Auth expired."
            : "Fetch failed.";
        console.log(` ${reason} Refreshing tokens and retrying...`);
        authSession = await loginWithBrowser(
          credentials.email,
          credentials.password,
        );
        await saveCachedAuthSession(authSession);
        // Single retry; a second failure propagates to the outer catch.
        response = await fetchPage(pageNo, authSession, { searchKeyword });
      }

      if (response.status !== 1) {
        console.warn(
          ` ⚠️ API returned status ${response.status} on page ${pageNo}`,
        );
        break;
      }

      if (pageNo === 1) {
        totalAvailable = response.totalJobs;
        console.log(` Total available: ${totalAvailable} jobs`);
      }

      if (!response.jobs || response.jobs.length === 0) {
        console.log(` No more jobs on page ${pageNo}`);
        break;
      }

      for (const rawJob of response.jobs) {
        if (allJobs.length >= maxJobs) break;

        // Deduplicate by ID
        if (seenIds.has(rawJob.id)) continue;
        seenIds.add(rawJob.id);

        allJobs.push(mapJob(rawJob));
      }

      // If we got fewer jobs than a full page, we're at the end
      if (response.jobs.length < JOBS_PER_PAGE) {
        break;
      }

      pageNo++;

      // Small delay to be nice to the API
      await new Promise((resolve) => setTimeout(resolve, 500));
    }

    console.log(`✅ Scraped ${allJobs.length} jobs`);

    // Write output to storage directory (similar to Crawlee dataset structure)
    const storageDir = join(__dirname, "../storage/datasets/default");
    await mkdir(storageDir, { recursive: true });

    // Write each job as a separate JSON file (Crawlee dataset format),
    // named 000001.json, 000002.json, ...
    for (let i = 0; i < allJobs.length; i++) {
      const filename = join(
        storageDir,
        `${String(i + 1).padStart(6, "0")}.json`,
      );
      await writeFile(filename, JSON.stringify(allJobs[i], null, 2));
    }

    // Also write a combined output file for easier consumption
    const outputFile = join(storageDir, "jobs.json");
    await writeFile(outputFile, JSON.stringify(allJobs, null, 2));

    console.log(` Output written to: ${storageDir}`);
    console.log(` Jobs file: ${outputFile}`);
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    console.error(`❌ Error: ${message}`);
    process.exit(1);
  }
}
// Script entry point: run main() and treat any rejection that escapes it as
// fatal — log it and exit with a non-zero status.
main().catch((error) => {
  console.error("Fatal error:", error);
  process.exit(1);
});

View File

@ -1,12 +1,12 @@
{
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "NodeNext",
"moduleResolution": "NodeNext",
"target": "ES2022",
"outDir": "dist",
"noUnusedLocals": false,
"lib": ["DOM"]
},
"include": ["./src/**/*"]
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "NodeNext",
"moduleResolution": "NodeNext",
"target": "ES2022",
"outDir": "dist",
"noUnusedLocals": false,
"lib": ["DOM"]
},
"include": ["./src/**/*"]
}