in extractors
This commit is contained in:
parent
aaab9b5124
commit
4ffaf06b1d
11
biome.json
11
biome.json
@ -5,10 +5,7 @@
|
||||
"indentWidth": 2
|
||||
},
|
||||
"files": {
|
||||
"includes": [
|
||||
"**",
|
||||
"!!**/dist"
|
||||
]
|
||||
"includes": ["**", "!!**/dist"]
|
||||
},
|
||||
"css": {
|
||||
"parser": {
|
||||
@ -17,11 +14,7 @@
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"includes": [
|
||||
"**/*.test.ts",
|
||||
"**/*.test.tsx",
|
||||
"**/test-utils.ts"
|
||||
],
|
||||
"includes": ["**/*.test.ts", "**/*.test.tsx", "**/test-utils.ts"],
|
||||
"linter": {
|
||||
"rules": {
|
||||
"suspicious": {
|
||||
|
||||
@ -2,9 +2,8 @@
|
||||
import { launchOptions } from "camoufox-js";
|
||||
import { PlaywrightCrawler } from "crawlee";
|
||||
import { firefox } from "playwright";
|
||||
|
||||
import { router } from "./routes.js";
|
||||
import { initJobOpsProgress } from "./progress.js";
|
||||
import { router } from "./routes.js";
|
||||
|
||||
// locations
|
||||
const locations = [
|
||||
@ -17,10 +16,7 @@ const locations = [
|
||||
];
|
||||
|
||||
// roles
|
||||
const defaultRoles = [
|
||||
"web-development",
|
||||
"software-systems",
|
||||
];
|
||||
const defaultRoles = ["web-development", "software-systems"];
|
||||
|
||||
let roles = defaultRoles;
|
||||
const envRolesRaw = process.env.GRADCRACKER_SEARCH_TERMS;
|
||||
@ -29,15 +25,16 @@ if (envRolesRaw) {
|
||||
try {
|
||||
const parsed = JSON.parse(envRolesRaw) as string[];
|
||||
if (Array.isArray(parsed) && parsed.length > 0) {
|
||||
roles = parsed.map(term =>
|
||||
term.toLowerCase()
|
||||
.replace(/[^a-z0-9]+/g, '-')
|
||||
.replace(/^-+|-+$/g, '')
|
||||
roles = parsed.map((term) =>
|
||||
term
|
||||
.toLowerCase()
|
||||
.replace(/[^a-z0-9]+/g, "-")
|
||||
.replace(/^-+|-+$/g, ""),
|
||||
);
|
||||
console.log(`Using configured search terms: ${roles.join(', ')}`);
|
||||
console.log(`Using configured search terms: ${roles.join(", ")}`);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('Failed to parse GRADCRACKER_SEARCH_TERMS', e);
|
||||
console.warn("Failed to parse GRADCRACKER_SEARCH_TERMS", e);
|
||||
}
|
||||
}
|
||||
|
||||
@ -46,12 +43,12 @@ const gradcrackerUrls = locations.flatMap((location) => {
|
||||
return roles.map((role) => {
|
||||
return {
|
||||
url: `https://www.gradcracker.com/search/computing-technology/${role}-graduate-jobs-in-${location}?order=dateAdded`,
|
||||
role
|
||||
role,
|
||||
};
|
||||
});
|
||||
});
|
||||
|
||||
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`)
|
||||
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`);
|
||||
|
||||
const startUrls = gradcrackerUrls.map(({ url, role }) => ({
|
||||
url,
|
||||
|
||||
@ -26,7 +26,7 @@ interface JobOpsCrawlProgressState {
|
||||
const PROGRESS_PREFIX = "JOBOPS_PROGRESS ";
|
||||
const isEnabled = () => process.env.JOBOPS_EMIT_PROGRESS === "1";
|
||||
|
||||
let state: JobOpsCrawlProgressState = {
|
||||
const state: JobOpsCrawlProgressState = {
|
||||
listPagesProcessed: 0,
|
||||
jobCardsFound: 0,
|
||||
jobPagesEnqueued: 0,
|
||||
@ -80,4 +80,3 @@ export function markJobPageDone(params: { currentUrl: string }): void {
|
||||
state.currentUrl = params.currentUrl;
|
||||
emit();
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
import { createPlaywrightRouter, log } from "crawlee";
|
||||
import { readFileSync } from "node:fs";
|
||||
import { createPlaywrightRouter, log } from "crawlee";
|
||||
import { markJobPageDone, markListPageDone } from "./progress.js";
|
||||
|
||||
function normalizeUrl(raw: string | null | undefined): string | null {
|
||||
@ -17,8 +17,7 @@ function normalizeUrl(raw: string | null | undefined): string | null {
|
||||
|
||||
function getExistingJobUrlSet(): Set<string> {
|
||||
const filePath = process.env.JOBOPS_EXISTING_JOB_URLS_FILE;
|
||||
const raw =
|
||||
filePath
|
||||
const raw = filePath
|
||||
? (() => {
|
||||
try {
|
||||
return readFileSync(filePath, "utf-8");
|
||||
@ -41,12 +40,16 @@ function getExistingJobUrlSet(): Set<string> {
|
||||
}
|
||||
}
|
||||
|
||||
const SKIP_APPLY_FOR_EXISTING = process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1";
|
||||
const SKIP_APPLY_FOR_EXISTING =
|
||||
process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1";
|
||||
const EXISTING_JOB_URLS = getExistingJobUrlSet();
|
||||
|
||||
// Global counters for max jobs per search term
|
||||
const jobCounts = new Map<string, number>();
|
||||
const MAX_JOBS_PER_TERM = parseInt(process.env.GRADCRACKER_MAX_JOBS_PER_TERM || "0", 10);
|
||||
const MAX_JOBS_PER_TERM = parseInt(
|
||||
process.env.GRADCRACKER_MAX_JOBS_PER_TERM || "0",
|
||||
10,
|
||||
);
|
||||
|
||||
interface Job {
|
||||
title: string | null;
|
||||
@ -72,7 +75,9 @@ router.addHandler(
|
||||
if (MAX_JOBS_PER_TERM > 0) {
|
||||
const currentCount = jobCounts.get(role) || 0;
|
||||
if (currentCount >= MAX_JOBS_PER_TERM) {
|
||||
log.info(`Max jobs (${MAX_JOBS_PER_TERM}) already enqueued for role "${role}". Skipping list page.`);
|
||||
log.info(
|
||||
`Max jobs (${MAX_JOBS_PER_TERM}) already enqueued for role "${role}". Skipping list page.`,
|
||||
);
|
||||
markListPageDone({
|
||||
currentUrl: request.url,
|
||||
jobCardsFound: 0,
|
||||
@ -120,7 +125,8 @@ router.addHandler(
|
||||
let disciplines: string | null = null;
|
||||
try {
|
||||
const disciplinesEl = article.locator("h3");
|
||||
disciplines = (await disciplinesEl.textContent({ timeout: 1000 }))?.trim() ?? null;
|
||||
disciplines =
|
||||
(await disciplinesEl.textContent({ timeout: 1000 }))?.trim() ?? null;
|
||||
} catch {
|
||||
// h3 not found or timed out - that's okay, disciplines is optional
|
||||
}
|
||||
@ -195,7 +201,9 @@ router.addHandler(
|
||||
if (MAX_JOBS_PER_TERM > 0) {
|
||||
const currentCount = jobCounts.get(role) || 0;
|
||||
if (currentCount >= MAX_JOBS_PER_TERM) {
|
||||
log.info(`Reached max jobs limit (${MAX_JOBS_PER_TERM}) for role "${role}" while processing list. Stopping.`);
|
||||
log.info(
|
||||
`Reached max jobs limit (${MAX_JOBS_PER_TERM}) for role "${role}" while processing list. Stopping.`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
jobCounts.set(role, currentCount + 1);
|
||||
@ -205,7 +213,7 @@ router.addHandler(
|
||||
urls: [jobUrl],
|
||||
userData: {
|
||||
...jobs[jobs.length - 1],
|
||||
label: "gradcracker-single-job-page"
|
||||
label: "gradcracker-single-job-page",
|
||||
},
|
||||
});
|
||||
enqueuedJobs++;
|
||||
@ -216,7 +224,7 @@ router.addHandler(
|
||||
log.info(`Extracted ${jobs.length} jobs`);
|
||||
if (SKIP_APPLY_FOR_EXISTING && skippedKnownJobs > 0) {
|
||||
log.info(
|
||||
`Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`
|
||||
`Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`,
|
||||
);
|
||||
}
|
||||
|
||||
@ -226,7 +234,7 @@ router.addHandler(
|
||||
jobPagesEnqueued: enqueuedJobs,
|
||||
jobPagesSkipped: skippedKnownJobs,
|
||||
});
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
router.addHandler(
|
||||
@ -261,7 +269,9 @@ router.addHandler(
|
||||
|
||||
// Prefer page-scoped popup detection. Using the browser context's "page" event
|
||||
// can accidentally capture unrelated pages created by other concurrent requests.
|
||||
const popupPromise = page.waitForEvent("popup", { timeout: 8000 }).catch(() => null);
|
||||
const popupPromise = page
|
||||
.waitForEvent("popup", { timeout: 8000 })
|
||||
.catch(() => null);
|
||||
const navigationPromise = page
|
||||
.waitForNavigation({ timeout: 8000, waitUntil: "domcontentloaded" })
|
||||
.catch(() => null);
|
||||
@ -271,7 +281,12 @@ router.addHandler(
|
||||
await applyButton.click();
|
||||
|
||||
// Wait for URL to stabilize (same URL for 3 consecutive checks)
|
||||
const waitForUrlStable = async (targetPage: typeof page, maxWaitMs = 10000, checkIntervalMs = 100, requiredStableChecks = 3) => {
|
||||
const waitForUrlStable = async (
|
||||
targetPage: typeof page,
|
||||
maxWaitMs = 10000,
|
||||
checkIntervalMs = 100,
|
||||
requiredStableChecks = 3,
|
||||
) => {
|
||||
let lastUrl = targetPage.url();
|
||||
let stableCount = 0;
|
||||
const startTime = Date.now();
|
||||
@ -298,11 +313,15 @@ router.addHandler(
|
||||
const targetPage = maybePopup ?? page;
|
||||
|
||||
if (maybePopup) {
|
||||
await maybePopup.waitForLoadState("domcontentloaded", { timeout: 15000 }).catch(() => null);
|
||||
await maybePopup
|
||||
.waitForLoadState("domcontentloaded", { timeout: 15000 })
|
||||
.catch(() => null);
|
||||
// If the popup initially opens as about:blank, give it a moment to redirect.
|
||||
if (maybePopup.url() === "about:blank") {
|
||||
await maybePopup
|
||||
.waitForURL((u) => u.toString() !== "about:blank", { timeout: 15000 })
|
||||
.waitForURL((u) => u.toString() !== "about:blank", {
|
||||
timeout: 15000,
|
||||
})
|
||||
.catch(() => null);
|
||||
}
|
||||
} else {
|
||||
@ -317,7 +336,7 @@ router.addHandler(
|
||||
|
||||
if (applicationLink === originalUrl) {
|
||||
log.info(
|
||||
`Apply click did not change URL (still Gradcracker): ${applicationLink}`
|
||||
`Apply click did not change URL (still Gradcracker): ${applicationLink}`,
|
||||
);
|
||||
} else {
|
||||
log.info(`Captured application URL: ${applicationLink}`);
|
||||
@ -342,5 +361,5 @@ router.addHandler(
|
||||
});
|
||||
|
||||
markJobPageDone({ currentUrl: request.url });
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
@ -13,17 +13,18 @@
|
||||
* UKVISAJOBS_REFRESH_ONLY - Set to "1" to refresh tokens and exit
|
||||
*/
|
||||
|
||||
import { mkdir, writeFile, readFile } from 'fs/promises';
|
||||
import { join, dirname } from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
import type { Request } from 'playwright';
|
||||
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import type { Request } from "playwright";
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
const API_URL = 'https://my.ukvisajobs.com/ukvisa-api/api/fetch-jobs-data';
|
||||
const SIGNIN_URL = 'https://my.ukvisajobs.com/signin';
|
||||
const OPEN_JOBS_URL = 'https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&visaAcceptance=false&applicants_outside_uk=false&pageNo=1';
|
||||
const AUTH_CACHE_PATH = join(__dirname, '../storage/ukvisajobs-auth.json');
|
||||
const API_URL = "https://my.ukvisajobs.com/ukvisa-api/api/fetch-jobs-data";
|
||||
const SIGNIN_URL = "https://my.ukvisajobs.com/signin";
|
||||
const OPEN_JOBS_URL =
|
||||
"https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&visaAcceptance=false&applicants_outside_uk=false&pageNo=1";
|
||||
const AUTH_CACHE_PATH = join(__dirname, "../storage/ukvisajobs-auth.json");
|
||||
const JOBS_PER_PAGE = 15;
|
||||
const DEFAULT_MAX_JOBS = 50;
|
||||
const MAX_ALLOWED_JOBS = 200;
|
||||
@ -64,7 +65,7 @@ interface UkVisaJobsApiResponse {
|
||||
}
|
||||
|
||||
interface ExtractedJob {
|
||||
source: 'ukvisajobs';
|
||||
source: "ukvisajobs";
|
||||
sourceJobId: string;
|
||||
title: string;
|
||||
employer: string;
|
||||
@ -87,7 +88,7 @@ interface UkVisaJobsAuthSession {
|
||||
csrfToken: string;
|
||||
ciSession: string;
|
||||
fetchedAt: string;
|
||||
source: 'cache' | 'browser';
|
||||
source: "cache" | "browser";
|
||||
}
|
||||
|
||||
class UkVisaJobsAuthError extends Error {
|
||||
@ -96,7 +97,7 @@ class UkVisaJobsAuthError extends Error {
|
||||
|
||||
constructor(message: string, status: number, responseText: string) {
|
||||
super(message);
|
||||
this.name = 'UkVisaJobsAuthError';
|
||||
this.name = "UkVisaJobsAuthError";
|
||||
this.status = status;
|
||||
this.responseText = responseText;
|
||||
}
|
||||
@ -104,18 +105,19 @@ class UkVisaJobsAuthError extends Error {
|
||||
|
||||
function toStringOrNull(value: unknown): string | null {
|
||||
if (value === null || value === undefined) return null;
|
||||
if (typeof value === 'string') {
|
||||
if (typeof value === "string") {
|
||||
const trimmed = value.trim();
|
||||
return trimmed.length > 0 ? trimmed : null;
|
||||
}
|
||||
if (typeof value === 'number' || typeof value === 'boolean') return String(value);
|
||||
if (typeof value === "number" || typeof value === "boolean")
|
||||
return String(value);
|
||||
return null;
|
||||
}
|
||||
|
||||
function toNumberOrNull(value: unknown): number | null {
|
||||
if (value === null || value === undefined) return null;
|
||||
if (typeof value === 'number') return Number.isFinite(value) ? value : null;
|
||||
if (typeof value === 'string') {
|
||||
if (typeof value === "number") return Number.isFinite(value) ? value : null;
|
||||
if (typeof value === "string") {
|
||||
const trimmed = value.trim();
|
||||
if (!trimmed) return null;
|
||||
const parsed = Number(trimmed);
|
||||
@ -127,29 +129,30 @@ function toNumberOrNull(value: unknown): number | null {
|
||||
async function fetchPage(
|
||||
pageNo: number,
|
||||
session: UkVisaJobsAuthSession,
|
||||
options: { searchKeyword?: string } = {}
|
||||
options: { searchKeyword?: string } = {},
|
||||
): Promise<UkVisaJobsApiResponse> {
|
||||
// Use native FormData API (Node.js 18+)
|
||||
const formData = new FormData();
|
||||
formData.append('is_global', '0');
|
||||
formData.append('sortBy', 'desc');
|
||||
formData.append('pageNo', String(pageNo));
|
||||
formData.append('visaAcceptance', 'false');
|
||||
formData.append('applicants_outside_uk', 'false');
|
||||
formData.append('searchKeyword', options.searchKeyword || 'null');
|
||||
formData.append('token', session.token);
|
||||
formData.append("is_global", "0");
|
||||
formData.append("sortBy", "desc");
|
||||
formData.append("pageNo", String(pageNo));
|
||||
formData.append("visaAcceptance", "false");
|
||||
formData.append("applicants_outside_uk", "false");
|
||||
formData.append("searchKeyword", options.searchKeyword || "null");
|
||||
formData.append("token", session.token);
|
||||
|
||||
const cookies = buildCookieHeader(session);
|
||||
|
||||
const response = await fetch(API_URL, {
|
||||
method: 'POST',
|
||||
method: "POST",
|
||||
headers: {
|
||||
'accept': 'application/json, text/plain, */*',
|
||||
'accept-language': 'en-US,en;q=0.9',
|
||||
'cookie': cookies,
|
||||
'origin': 'https://my.ukvisajobs.com',
|
||||
'referer': `https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&pageNo=${pageNo}&visaAcceptance=false&applicants_outside_uk=false`,
|
||||
'user-agent': 'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36',
|
||||
accept: "application/json, text/plain, */*",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
cookie: cookies,
|
||||
origin: "https://my.ukvisajobs.com",
|
||||
referer: `https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&pageNo=${pageNo}&visaAcceptance=false&applicants_outside_uk=false`,
|
||||
"user-agent":
|
||||
"Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36",
|
||||
},
|
||||
body: formData,
|
||||
});
|
||||
@ -160,10 +163,12 @@ async function fetchPage(
|
||||
throw new UkVisaJobsAuthError(
|
||||
`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`,
|
||||
response.status,
|
||||
text
|
||||
text,
|
||||
);
|
||||
}
|
||||
throw new Error(`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`);
|
||||
throw new Error(
|
||||
`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`,
|
||||
);
|
||||
}
|
||||
|
||||
return response.json() as Promise<UkVisaJobsApiResponse>;
|
||||
@ -171,11 +176,16 @@ async function fetchPage(
|
||||
|
||||
function mapJob(raw: UkVisaJobsApiJob): ExtractedJob {
|
||||
// Build salary string from min/max
|
||||
let salary: string | undefined = undefined;
|
||||
let salary: string | undefined;
|
||||
const minSalary = toNumberOrNull(raw.min_salary);
|
||||
const maxSalary = toNumberOrNull(raw.max_salary);
|
||||
|
||||
if (minSalary !== null && minSalary > 0 && maxSalary !== null && maxSalary > 0) {
|
||||
if (
|
||||
minSalary !== null &&
|
||||
minSalary > 0 &&
|
||||
maxSalary !== null &&
|
||||
maxSalary > 0
|
||||
) {
|
||||
salary = `£${minSalary.toLocaleString()}-${maxSalary.toLocaleString()}`;
|
||||
if (raw.salary_interval) {
|
||||
salary += ` / ${raw.salary_interval}`;
|
||||
@ -189,24 +199,30 @@ function mapJob(raw: UkVisaJobsApiJob): ExtractedJob {
|
||||
|
||||
// Build a description from visa sponsorship fields
|
||||
const visaInfo: string[] = [];
|
||||
if (raw.visa_acceptance?.toLowerCase() === 'yes') visaInfo.push('Visa acceptance: Yes');
|
||||
if (raw.applicants_outside_uk?.toLowerCase() === 'yes') visaInfo.push('Accepts applicants outside UK');
|
||||
if (raw.likely_to_sponsor?.toLowerCase() === 'yes') visaInfo.push('Likely to sponsor');
|
||||
if (raw.definitely_sponsored?.toLowerCase() === 'yes') visaInfo.push('Definitely sponsored');
|
||||
if (raw.new_entrant?.toLowerCase() === 'yes') visaInfo.push('New entrant friendly');
|
||||
if (raw.student_graduate?.toLowerCase() === 'yes') visaInfo.push('Student/Graduate friendly');
|
||||
if (raw.visa_acceptance?.toLowerCase() === "yes")
|
||||
visaInfo.push("Visa acceptance: Yes");
|
||||
if (raw.applicants_outside_uk?.toLowerCase() === "yes")
|
||||
visaInfo.push("Accepts applicants outside UK");
|
||||
if (raw.likely_to_sponsor?.toLowerCase() === "yes")
|
||||
visaInfo.push("Likely to sponsor");
|
||||
if (raw.definitely_sponsored?.toLowerCase() === "yes")
|
||||
visaInfo.push("Definitely sponsored");
|
||||
if (raw.new_entrant?.toLowerCase() === "yes")
|
||||
visaInfo.push("New entrant friendly");
|
||||
if (raw.student_graduate?.toLowerCase() === "yes")
|
||||
visaInfo.push("Student/Graduate friendly");
|
||||
|
||||
const description = raw.description
|
||||
? raw.description
|
||||
: visaInfo.length > 0
|
||||
? `Visa sponsorship info: ${visaInfo.join(', ')}`
|
||||
? `Visa sponsorship info: ${visaInfo.join(", ")}`
|
||||
: undefined;
|
||||
|
||||
return {
|
||||
source: 'ukvisajobs',
|
||||
source: "ukvisajobs",
|
||||
sourceJobId: raw.id,
|
||||
title: raw.title || 'Unknown Title',
|
||||
employer: raw.company_name || 'Unknown Employer',
|
||||
title: raw.title || "Unknown Title",
|
||||
employer: raw.company_name || "Unknown Employer",
|
||||
employerUrl: toStringOrNull(raw.company_link) ?? undefined,
|
||||
jobUrl: raw.job_link,
|
||||
applicationLink: raw.job_link,
|
||||
@ -226,7 +242,7 @@ function buildCookieHeader(session: UkVisaJobsAuthSession): string {
|
||||
if (session.csrfToken) cookieParts.push(`csrf_token=${session.csrfToken}`);
|
||||
if (session.ciSession) cookieParts.push(`ci_session=${session.ciSession}`);
|
||||
if (session.authToken) cookieParts.push(`authToken=${session.authToken}`);
|
||||
return cookieParts.join('; ');
|
||||
return cookieParts.join("; ");
|
||||
}
|
||||
|
||||
function getLoginCredentials(): { email: string; password: string } | null {
|
||||
@ -238,23 +254,25 @@ function getLoginCredentials(): { email: string; password: string } | null {
|
||||
|
||||
async function loadCachedAuthSession(): Promise<UkVisaJobsAuthSession | null> {
|
||||
try {
|
||||
const data = await readFile(AUTH_CACHE_PATH, 'utf8');
|
||||
const data = await readFile(AUTH_CACHE_PATH, "utf8");
|
||||
const parsed = JSON.parse(data) as UkVisaJobsAuthSession;
|
||||
if (!parsed?.token) return null;
|
||||
return {
|
||||
token: parsed.token,
|
||||
authToken: parsed.authToken || parsed.token,
|
||||
csrfToken: parsed.csrfToken || '',
|
||||
ciSession: parsed.ciSession || '',
|
||||
csrfToken: parsed.csrfToken || "",
|
||||
ciSession: parsed.ciSession || "",
|
||||
fetchedAt: parsed.fetchedAt || new Date().toISOString(),
|
||||
source: 'cache',
|
||||
source: "cache",
|
||||
};
|
||||
} catch (error) {
|
||||
} catch (_error) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
async function saveCachedAuthSession(session: UkVisaJobsAuthSession): Promise<void> {
|
||||
async function saveCachedAuthSession(
|
||||
session: UkVisaJobsAuthSession,
|
||||
): Promise<void> {
|
||||
const payload = {
|
||||
token: session.token,
|
||||
authToken: session.authToken,
|
||||
@ -273,17 +291,18 @@ function extractMultipartField(body: string, field: string): string | null {
|
||||
if (index === -1) return null;
|
||||
|
||||
const afterName = body.slice(index + nameToken.length);
|
||||
let separatorIndex = afterName.indexOf('\r\n\r\n');
|
||||
let separatorIndex = afterName.indexOf("\r\n\r\n");
|
||||
let separatorLength = 4;
|
||||
if (separatorIndex === -1) {
|
||||
separatorIndex = afterName.indexOf('\n\n');
|
||||
separatorIndex = afterName.indexOf("\n\n");
|
||||
separatorLength = 2;
|
||||
}
|
||||
if (separatorIndex === -1) return null;
|
||||
|
||||
const valueStart = index + nameToken.length + separatorIndex + separatorLength;
|
||||
const valueStart =
|
||||
index + nameToken.length + separatorIndex + separatorLength;
|
||||
const remainder = body.slice(valueStart);
|
||||
const endIndex = remainder.indexOf('\r\n');
|
||||
const endIndex = remainder.indexOf("\r\n");
|
||||
if (endIndex === -1) return remainder.trim();
|
||||
return remainder.slice(0, endIndex).trim();
|
||||
}
|
||||
@ -291,13 +310,13 @@ function extractMultipartField(body: string, field: string): string | null {
|
||||
function extractTokenFromRequest(request: Request): string | null {
|
||||
const postData = request.postData();
|
||||
if (!postData) return null;
|
||||
const multipartToken = extractMultipartField(postData, 'token');
|
||||
const multipartToken = extractMultipartField(postData, "token");
|
||||
if (multipartToken) return multipartToken;
|
||||
try {
|
||||
const params = new URLSearchParams(postData);
|
||||
const token = params.get('token');
|
||||
const token = params.get("token");
|
||||
return token || null;
|
||||
} catch (error) {
|
||||
} catch (_error) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@ -306,60 +325,75 @@ function isAuthErrorResponse(status: number, bodyText: string): boolean {
|
||||
if (status === 401 || status === 403) return true;
|
||||
if (status !== 400) return false;
|
||||
try {
|
||||
const parsed = JSON.parse(bodyText) as { errorType?: string; message?: string };
|
||||
if (parsed?.errorType === 'expired') return true;
|
||||
if (parsed?.message && parsed.message.toLowerCase().includes('expired')) return true;
|
||||
} catch (error) {
|
||||
const parsed = JSON.parse(bodyText) as {
|
||||
errorType?: string;
|
||||
message?: string;
|
||||
};
|
||||
if (parsed?.errorType === "expired") return true;
|
||||
if (parsed?.message?.toLowerCase().includes("expired")) return true;
|
||||
} catch (_error) {
|
||||
// ignore JSON parse failures
|
||||
}
|
||||
return bodyText.toLowerCase().includes('expired');
|
||||
return bodyText.toLowerCase().includes("expired");
|
||||
}
|
||||
|
||||
async function loginWithBrowser(email: string, password: string): Promise<UkVisaJobsAuthSession> {
|
||||
async function loginWithBrowser(
|
||||
email: string,
|
||||
password: string,
|
||||
): Promise<UkVisaJobsAuthSession> {
|
||||
const [{ launchOptions }, { firefox }] = await Promise.all([
|
||||
import('camoufox-js'),
|
||||
import('playwright'),
|
||||
import("camoufox-js"),
|
||||
import("playwright"),
|
||||
]);
|
||||
const headless = process.env.UKVISAJOBS_HEADLESS !== 'false';
|
||||
const browser = await firefox.launch(await launchOptions({
|
||||
const headless = process.env.UKVISAJOBS_HEADLESS !== "false";
|
||||
const browser = await firefox.launch(
|
||||
await launchOptions({
|
||||
headless,
|
||||
humanize: true,
|
||||
geoip: true,
|
||||
}));
|
||||
}),
|
||||
);
|
||||
const context = await browser.newContext();
|
||||
const page = await context.newPage();
|
||||
|
||||
try {
|
||||
await page.goto(SIGNIN_URL, { waitUntil: 'domcontentloaded' });
|
||||
await page.waitForSelector('#email', { timeout: 15000 });
|
||||
await page.fill('#email', email);
|
||||
await page.fill('#password', password);
|
||||
await page.keyboard.press('Enter');
|
||||
await page.goto(SIGNIN_URL, { waitUntil: "domcontentloaded" });
|
||||
await page.waitForSelector("#email", { timeout: 15000 });
|
||||
await page.fill("#email", email);
|
||||
await page.fill("#password", password);
|
||||
await page.keyboard.press("Enter");
|
||||
await page.waitForTimeout(7000);
|
||||
|
||||
const requestPromise = page.waitForRequest(
|
||||
(request) => request.url().includes('/ukvisa-api/api/fetch-jobs-data') && request.method() === 'POST',
|
||||
{ timeout: 30000 }
|
||||
(request) =>
|
||||
request.url().includes("/ukvisa-api/api/fetch-jobs-data") &&
|
||||
request.method() === "POST",
|
||||
{ timeout: 30000 },
|
||||
);
|
||||
|
||||
await page.goto(OPEN_JOBS_URL, { waitUntil: 'networkidle' });
|
||||
await page.goto(OPEN_JOBS_URL, { waitUntil: "networkidle" });
|
||||
await page.waitForTimeout(5000);
|
||||
|
||||
let fetchRequest: Request | null = null;
|
||||
try {
|
||||
fetchRequest = await requestPromise;
|
||||
} catch (error) {
|
||||
} catch (_error) {
|
||||
fetchRequest = null;
|
||||
}
|
||||
|
||||
const cookies = await context.cookies('https://my.ukvisajobs.com');
|
||||
const csrfToken = cookies.find((cookie) => cookie.name === 'csrf_token')?.value || '';
|
||||
const ciSession = cookies.find((cookie) => cookie.name === 'ci_session')?.value || '';
|
||||
const authToken = cookies.find((cookie) => cookie.name === 'authToken')?.value || '';
|
||||
const token = fetchRequest ? extractTokenFromRequest(fetchRequest) : authToken;
|
||||
const cookies = await context.cookies("https://my.ukvisajobs.com");
|
||||
const csrfToken =
|
||||
cookies.find((cookie) => cookie.name === "csrf_token")?.value || "";
|
||||
const ciSession =
|
||||
cookies.find((cookie) => cookie.name === "ci_session")?.value || "";
|
||||
const authToken =
|
||||
cookies.find((cookie) => cookie.name === "authToken")?.value || "";
|
||||
const token = fetchRequest
|
||||
? extractTokenFromRequest(fetchRequest)
|
||||
: authToken;
|
||||
|
||||
if (!token) {
|
||||
throw new Error('Failed to locate auth token from browser session.');
|
||||
throw new Error("Failed to locate auth token from browser session.");
|
||||
}
|
||||
|
||||
return {
|
||||
@ -368,7 +402,7 @@ async function loginWithBrowser(email: string, password: string): Promise<UkVisa
|
||||
csrfToken,
|
||||
ciSession,
|
||||
fetchedAt: new Date().toISOString(),
|
||||
source: 'browser',
|
||||
source: "browser",
|
||||
};
|
||||
} finally {
|
||||
await browser.close();
|
||||
@ -376,38 +410,48 @@ async function loginWithBrowser(email: string, password: string): Promise<UkVisa
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
console.log('🇬🇧 UK Visa Jobs Extractor starting...');
|
||||
console.log("🇬🇧 UK Visa Jobs Extractor starting...");
|
||||
const credentials = getLoginCredentials();
|
||||
const searchKeyword = process.env.UKVISAJOBS_SEARCH_KEYWORD || undefined;
|
||||
const refreshOnly = process.env.UKVISAJOBS_REFRESH_ONLY === '1';
|
||||
const refreshOnly = process.env.UKVISAJOBS_REFRESH_ONLY === "1";
|
||||
|
||||
let authSession = await loadCachedAuthSession();
|
||||
|
||||
if (refreshOnly) {
|
||||
if (!credentials) {
|
||||
console.error('ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set');
|
||||
console.error(
|
||||
"ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set",
|
||||
);
|
||||
process.exit(1);
|
||||
}
|
||||
console.log(' Refresh-only mode: logging in to refresh tokens...');
|
||||
authSession = await loginWithBrowser(credentials.email, credentials.password);
|
||||
console.log(" Refresh-only mode: logging in to refresh tokens...");
|
||||
authSession = await loginWithBrowser(
|
||||
credentials.email,
|
||||
credentials.password,
|
||||
);
|
||||
await saveCachedAuthSession(authSession);
|
||||
console.log(' Auth session refreshed.');
|
||||
console.log(" Auth session refreshed.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!authSession) {
|
||||
if (!credentials) {
|
||||
console.error('ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set');
|
||||
console.error(
|
||||
"ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set",
|
||||
);
|
||||
process.exit(1);
|
||||
}
|
||||
console.log(' No cached session found. Logging in to refresh tokens...');
|
||||
authSession = await loginWithBrowser(credentials.email, credentials.password);
|
||||
console.log(" No cached session found. Logging in to refresh tokens...");
|
||||
authSession = await loginWithBrowser(
|
||||
credentials.email,
|
||||
credentials.password,
|
||||
);
|
||||
await saveCachedAuthSession(authSession);
|
||||
}
|
||||
|
||||
const cookies = buildCookieHeader(authSession);
|
||||
console.log(` Auth source: ${authSession.source}`);
|
||||
console.log(` Cookies configured: ${cookies ? 'Yes' : 'No'}`);
|
||||
console.log(` Cookies configured: ${cookies ? "Yes" : "No"}`);
|
||||
console.log(` Token length: ${authSession.token.length}`);
|
||||
|
||||
// Get max jobs from environment
|
||||
@ -435,20 +479,30 @@ async function main(): Promise<void> {
|
||||
} catch (error) {
|
||||
if (!credentials) {
|
||||
if (error instanceof UkVisaJobsAuthError) {
|
||||
throw new Error('UKVisaJobs auth expired. Set UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD to refresh.');
|
||||
throw new Error(
|
||||
"UKVisaJobs auth expired. Set UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD to refresh.",
|
||||
);
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
|
||||
const reason = error instanceof UkVisaJobsAuthError ? 'Auth expired.' : 'Fetch failed.';
|
||||
const reason =
|
||||
error instanceof UkVisaJobsAuthError
|
||||
? "Auth expired."
|
||||
: "Fetch failed.";
|
||||
console.log(` ${reason} Refreshing tokens and retrying...`);
|
||||
authSession = await loginWithBrowser(credentials.email, credentials.password);
|
||||
authSession = await loginWithBrowser(
|
||||
credentials.email,
|
||||
credentials.password,
|
||||
);
|
||||
await saveCachedAuthSession(authSession);
|
||||
response = await fetchPage(pageNo, authSession, { searchKeyword });
|
||||
}
|
||||
|
||||
if (response.status !== 1) {
|
||||
console.warn(` ⚠️ API returned status ${response.status} on page ${pageNo}`);
|
||||
console.warn(
|
||||
` ⚠️ API returned status ${response.status} on page ${pageNo}`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -487,33 +541,32 @@ async function main(): Promise<void> {
|
||||
console.log(`✅ Scraped ${allJobs.length} jobs`);
|
||||
|
||||
// Write output to storage directory (similar to Crawlee dataset structure)
|
||||
const storageDir = join(__dirname, '../storage/datasets/default');
|
||||
const storageDir = join(__dirname, "../storage/datasets/default");
|
||||
await mkdir(storageDir, { recursive: true });
|
||||
|
||||
// Write each job as a separate JSON file (Crawlee dataset format)
|
||||
for (let i = 0; i < allJobs.length; i++) {
|
||||
const filename = join(storageDir, `${String(i + 1).padStart(6, '0')}.json`);
|
||||
const filename = join(
|
||||
storageDir,
|
||||
`${String(i + 1).padStart(6, "0")}.json`,
|
||||
);
|
||||
await writeFile(filename, JSON.stringify(allJobs[i], null, 2));
|
||||
}
|
||||
|
||||
// Also write a combined output file for easier consumption
|
||||
const outputFile = join(storageDir, 'jobs.json');
|
||||
const outputFile = join(storageDir, "jobs.json");
|
||||
await writeFile(outputFile, JSON.stringify(allJobs, null, 2));
|
||||
|
||||
console.log(` Output written to: ${storageDir}`);
|
||||
console.log(` Jobs file: ${outputFile}`);
|
||||
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : 'Unknown error';
|
||||
const message = error instanceof Error ? error.message : "Unknown error";
|
||||
console.error(`❌ Error: ${message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
main().catch((error) => {
|
||||
console.error('Fatal error:', error);
|
||||
console.error("Fatal error:", error);
|
||||
process.exit(1);
|
||||
});
|
||||
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user