Merge pull request #49 from DaKheera47/formatting-on-push
- Don't run the Docker image build for every PR; the GHCR workflow now triggers only on version tags.
- Automated Linting: Uses Biome to ensure code quality and consistency across the repository.
- Test Orchestration: Executes unit tests for the orchestrator service using Vitest.
- Parallel Build Verification: Implements a matrix build strategy to verify orchestrator and extractors simultaneously, reducing total CI runtime.
- Optimized Resource Usage:
  - Concurrency Control: Automatically cancels outdated runs when new commits are pushed.
  - Selective Script Execution: Skips heavy binary downloads (Camoufox) during extractor builds to speed up verification.
  - Granular Caching: Configures project-specific dependency caching for faster installation.
This commit is contained in:
commit
f92cdbf0df
67
.github/workflows/ci.yml
vendored
Normal file
67
.github/workflows/ci.yml
vendored
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
name: CI
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [ main ]
|
||||||
|
pull_request:
|
||||||
|
branches: [ main ]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ci-${{ github.workflow }}-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
lint:
|
||||||
|
name: Linting (Biome)
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Setup Biome
|
||||||
|
uses: biomejs/setup-biome@v2
|
||||||
|
with:
|
||||||
|
version: 2.3.12
|
||||||
|
- name: Run Biome
|
||||||
|
run: biome ci .
|
||||||
|
|
||||||
|
test-orchestrator:
|
||||||
|
name: Orchestrator Tests
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Setup Node
|
||||||
|
uses: actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version: 20
|
||||||
|
cache: 'npm'
|
||||||
|
cache-dependency-path: orchestrator/package-lock.json
|
||||||
|
- name: Install dependencies
|
||||||
|
run: npm ci
|
||||||
|
working-directory: orchestrator
|
||||||
|
- name: Run Vitest
|
||||||
|
run: npm run test:run
|
||||||
|
working-directory: orchestrator
|
||||||
|
|
||||||
|
build:
|
||||||
|
name: Build Verification
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
project: [orchestrator, extractors/gradcracker, extractors/ukvisajobs]
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Setup Node
|
||||||
|
uses: actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version: 20
|
||||||
|
cache: 'npm'
|
||||||
|
cache-dependency-path: ${{ matrix.project }}/package-lock.json
|
||||||
|
|
||||||
|
- name: Build ${{ matrix.project }}
|
||||||
|
run: |
|
||||||
|
if [[ "${{ matrix.project }}" == extractors/* ]]; then
|
||||||
|
npm ci --ignore-scripts
|
||||||
|
else
|
||||||
|
npm ci
|
||||||
|
fi
|
||||||
|
npm run build
|
||||||
|
working-directory: ${{ matrix.project }}
|
||||||
20
.github/workflows/ghcr.yml
vendored
20
.github/workflows/ghcr.yml
vendored
@ -1,12 +1,9 @@
|
|||||||
# build and push releases to ghcr
|
# build and push releases to ghcr
|
||||||
# build for PRs only to test failures
|
|
||||||
|
|
||||||
name: build-and-push-ghcr
|
name: build-and-push-ghcr
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
tags: ["v*"]
|
tags: ["v*"]
|
||||||
pull_request:
|
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
@ -30,7 +27,6 @@ jobs:
|
|||||||
uses: docker/setup-buildx-action@v3
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
- name: Log in to GHCR
|
- name: Log in to GHCR
|
||||||
if: github.event_name != 'pull_request'
|
|
||||||
uses: docker/login-action@v3
|
uses: docker/login-action@v3
|
||||||
with:
|
with:
|
||||||
registry: ghcr.io
|
registry: ghcr.io
|
||||||
@ -48,21 +44,7 @@ jobs:
|
|||||||
# Optional: also publish :latest for version tags
|
# Optional: also publish :latest for version tags
|
||||||
type=raw,value=latest
|
type=raw,value=latest
|
||||||
|
|
||||||
- name: Build (PR)
|
- name: Build and push
|
||||||
if: github.event_name == 'pull_request'
|
|
||||||
uses: docker/build-push-action@v6
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
file: ./Dockerfile
|
|
||||||
push: false
|
|
||||||
platforms: linux/amd64,linux/arm64
|
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
|
||||||
cache-from: type=gha
|
|
||||||
cache-to: type=gha,mode=max
|
|
||||||
|
|
||||||
- name: Build and push (tag)
|
|
||||||
if: github.event_name != 'pull_request'
|
|
||||||
uses: docker/build-push-action@v6
|
uses: docker/build-push-action@v6
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
|
|||||||
55
biome.json
55
biome.json
@ -1,34 +1,27 @@
|
|||||||
{
|
{
|
||||||
"$schema": "https://biomejs.dev/schemas/2.3.12/schema.json",
|
"$schema": "https://biomejs.dev/schemas/2.3.12/schema.json",
|
||||||
"formatter": {
|
"formatter": {
|
||||||
"indentStyle": "space",
|
"indentStyle": "space",
|
||||||
"indentWidth": 2
|
"indentWidth": 2
|
||||||
},
|
},
|
||||||
"files": {
|
"files": {
|
||||||
"includes": [
|
"includes": ["**", "!!**/dist"]
|
||||||
"**",
|
},
|
||||||
"!!**/dist"
|
"css": {
|
||||||
]
|
"parser": {
|
||||||
},
|
"tailwindDirectives": true
|
||||||
"css": {
|
}
|
||||||
"parser": {
|
},
|
||||||
"tailwindDirectives": true
|
"overrides": [
|
||||||
|
{
|
||||||
|
"includes": ["**/*.test.ts", "**/*.test.tsx", "**/test-utils.ts"],
|
||||||
|
"linter": {
|
||||||
|
"rules": {
|
||||||
|
"suspicious": {
|
||||||
|
"noExplicitAny": "off"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
"overrides": [
|
}
|
||||||
{
|
]
|
||||||
"includes": [
|
|
||||||
"**/*.test.ts",
|
|
||||||
"**/*.test.tsx",
|
|
||||||
"**/test-utils.ts"
|
|
||||||
],
|
|
||||||
"linter": {
|
|
||||||
"rules": {
|
|
||||||
"suspicious": {
|
|
||||||
"noExplicitAny": "off"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
@ -1,30 +1,30 @@
|
|||||||
{
|
{
|
||||||
"name": "job-flow",
|
"name": "job-flow",
|
||||||
"version": "0.0.1",
|
"version": "0.0.1",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"description": "This is an example of a Crawlee project.",
|
"description": "This is an example of a Crawlee project.",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"camoufox-js": "^0.8.0",
|
"camoufox-js": "^0.8.0",
|
||||||
"crawlee": "^3.0.0",
|
"crawlee": "^3.0.0",
|
||||||
"playwright": "*"
|
"playwright": "*"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@apify/tsconfig": "^0.1.0",
|
"@apify/tsconfig": "^0.1.0",
|
||||||
"@types/fs-extra": "^11",
|
"@types/fs-extra": "^11",
|
||||||
"@types/node": "^24.0.0",
|
"@types/node": "^24.0.0",
|
||||||
"fs-extra": "^11.3.0",
|
"fs-extra": "^11.3.0",
|
||||||
"tsx": "^4.4.0",
|
"tsx": "^4.4.0",
|
||||||
"typescript": "~5.9.0"
|
"typescript": "~5.9.0"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"start": "npm run start:dev",
|
"start": "npm run start:dev",
|
||||||
"start:prod": "node dist/main.js",
|
"start:prod": "node dist/main.js",
|
||||||
"start:dev": "tsx src/main.ts",
|
"start:dev": "tsx src/main.ts",
|
||||||
"build": "tsc",
|
"build": "tsc",
|
||||||
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
|
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
|
||||||
"get-binaries": "camoufox-js fetch",
|
"get-binaries": "camoufox-js fetch",
|
||||||
"postinstall": "npm run get-binaries"
|
"postinstall": "npm run get-binaries"
|
||||||
},
|
},
|
||||||
"author": "It's not you it's me",
|
"author": "It's not you it's me",
|
||||||
"license": "ISC"
|
"license": "ISC"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,9 +2,8 @@
|
|||||||
import { launchOptions } from "camoufox-js";
|
import { launchOptions } from "camoufox-js";
|
||||||
import { PlaywrightCrawler } from "crawlee";
|
import { PlaywrightCrawler } from "crawlee";
|
||||||
import { firefox } from "playwright";
|
import { firefox } from "playwright";
|
||||||
|
|
||||||
import { router } from "./routes.js";
|
|
||||||
import { initJobOpsProgress } from "./progress.js";
|
import { initJobOpsProgress } from "./progress.js";
|
||||||
|
import { router } from "./routes.js";
|
||||||
|
|
||||||
// locations
|
// locations
|
||||||
const locations = [
|
const locations = [
|
||||||
@ -17,10 +16,7 @@ const locations = [
|
|||||||
];
|
];
|
||||||
|
|
||||||
// roles
|
// roles
|
||||||
const defaultRoles = [
|
const defaultRoles = ["web-development", "software-systems"];
|
||||||
"web-development",
|
|
||||||
"software-systems",
|
|
||||||
];
|
|
||||||
|
|
||||||
let roles = defaultRoles;
|
let roles = defaultRoles;
|
||||||
const envRolesRaw = process.env.GRADCRACKER_SEARCH_TERMS;
|
const envRolesRaw = process.env.GRADCRACKER_SEARCH_TERMS;
|
||||||
@ -29,15 +25,16 @@ if (envRolesRaw) {
|
|||||||
try {
|
try {
|
||||||
const parsed = JSON.parse(envRolesRaw) as string[];
|
const parsed = JSON.parse(envRolesRaw) as string[];
|
||||||
if (Array.isArray(parsed) && parsed.length > 0) {
|
if (Array.isArray(parsed) && parsed.length > 0) {
|
||||||
roles = parsed.map(term =>
|
roles = parsed.map((term) =>
|
||||||
term.toLowerCase()
|
term
|
||||||
.replace(/[^a-z0-9]+/g, '-')
|
.toLowerCase()
|
||||||
.replace(/^-+|-+$/g, '')
|
.replace(/[^a-z0-9]+/g, "-")
|
||||||
|
.replace(/^-+|-+$/g, ""),
|
||||||
);
|
);
|
||||||
console.log(`Using configured search terms: ${roles.join(', ')}`);
|
console.log(`Using configured search terms: ${roles.join(", ")}`);
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Failed to parse GRADCRACKER_SEARCH_TERMS', e);
|
console.warn("Failed to parse GRADCRACKER_SEARCH_TERMS", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -46,12 +43,12 @@ const gradcrackerUrls = locations.flatMap((location) => {
|
|||||||
return roles.map((role) => {
|
return roles.map((role) => {
|
||||||
return {
|
return {
|
||||||
url: `https://www.gradcracker.com/search/computing-technology/${role}-graduate-jobs-in-${location}?order=dateAdded`,
|
url: `https://www.gradcracker.com/search/computing-technology/${role}-graduate-jobs-in-${location}?order=dateAdded`,
|
||||||
role
|
role,
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`)
|
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`);
|
||||||
|
|
||||||
const startUrls = gradcrackerUrls.map(({ url, role }) => ({
|
const startUrls = gradcrackerUrls.map(({ url, role }) => ({
|
||||||
url,
|
url,
|
||||||
|
|||||||
@ -26,7 +26,7 @@ interface JobOpsCrawlProgressState {
|
|||||||
const PROGRESS_PREFIX = "JOBOPS_PROGRESS ";
|
const PROGRESS_PREFIX = "JOBOPS_PROGRESS ";
|
||||||
const isEnabled = () => process.env.JOBOPS_EMIT_PROGRESS === "1";
|
const isEnabled = () => process.env.JOBOPS_EMIT_PROGRESS === "1";
|
||||||
|
|
||||||
let state: JobOpsCrawlProgressState = {
|
const state: JobOpsCrawlProgressState = {
|
||||||
listPagesProcessed: 0,
|
listPagesProcessed: 0,
|
||||||
jobCardsFound: 0,
|
jobCardsFound: 0,
|
||||||
jobPagesEnqueued: 0,
|
jobPagesEnqueued: 0,
|
||||||
@ -80,4 +80,3 @@ export function markJobPageDone(params: { currentUrl: string }): void {
|
|||||||
state.currentUrl = params.currentUrl;
|
state.currentUrl = params.currentUrl;
|
||||||
emit();
|
emit();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
import { createPlaywrightRouter, log } from "crawlee";
|
|
||||||
import { readFileSync } from "node:fs";
|
import { readFileSync } from "node:fs";
|
||||||
|
import { createPlaywrightRouter, log } from "crawlee";
|
||||||
import { markJobPageDone, markListPageDone } from "./progress.js";
|
import { markJobPageDone, markListPageDone } from "./progress.js";
|
||||||
|
|
||||||
function normalizeUrl(raw: string | null | undefined): string | null {
|
function normalizeUrl(raw: string | null | undefined): string | null {
|
||||||
@ -17,16 +17,15 @@ function normalizeUrl(raw: string | null | undefined): string | null {
|
|||||||
|
|
||||||
function getExistingJobUrlSet(): Set<string> {
|
function getExistingJobUrlSet(): Set<string> {
|
||||||
const filePath = process.env.JOBOPS_EXISTING_JOB_URLS_FILE;
|
const filePath = process.env.JOBOPS_EXISTING_JOB_URLS_FILE;
|
||||||
const raw =
|
const raw = filePath
|
||||||
filePath
|
? (() => {
|
||||||
? (() => {
|
try {
|
||||||
try {
|
return readFileSync(filePath, "utf-8");
|
||||||
return readFileSync(filePath, "utf-8");
|
} catch {
|
||||||
} catch {
|
return null;
|
||||||
return null;
|
}
|
||||||
}
|
})()
|
||||||
})()
|
: process.env.JOBOPS_EXISTING_JOB_URLS;
|
||||||
: process.env.JOBOPS_EXISTING_JOB_URLS;
|
|
||||||
|
|
||||||
if (!raw) return new Set();
|
if (!raw) return new Set();
|
||||||
try {
|
try {
|
||||||
@ -41,12 +40,16 @@ function getExistingJobUrlSet(): Set<string> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const SKIP_APPLY_FOR_EXISTING = process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1";
|
const SKIP_APPLY_FOR_EXISTING =
|
||||||
|
process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1";
|
||||||
const EXISTING_JOB_URLS = getExistingJobUrlSet();
|
const EXISTING_JOB_URLS = getExistingJobUrlSet();
|
||||||
|
|
||||||
// Global counters for max jobs per search term
|
// Global counters for max jobs per search term
|
||||||
const jobCounts = new Map<string, number>();
|
const jobCounts = new Map<string, number>();
|
||||||
const MAX_JOBS_PER_TERM = parseInt(process.env.GRADCRACKER_MAX_JOBS_PER_TERM || "0", 10);
|
const MAX_JOBS_PER_TERM = parseInt(
|
||||||
|
process.env.GRADCRACKER_MAX_JOBS_PER_TERM || "0",
|
||||||
|
10,
|
||||||
|
);
|
||||||
|
|
||||||
interface Job {
|
interface Job {
|
||||||
title: string | null;
|
title: string | null;
|
||||||
@ -72,7 +75,9 @@ router.addHandler(
|
|||||||
if (MAX_JOBS_PER_TERM > 0) {
|
if (MAX_JOBS_PER_TERM > 0) {
|
||||||
const currentCount = jobCounts.get(role) || 0;
|
const currentCount = jobCounts.get(role) || 0;
|
||||||
if (currentCount >= MAX_JOBS_PER_TERM) {
|
if (currentCount >= MAX_JOBS_PER_TERM) {
|
||||||
log.info(`Max jobs (${MAX_JOBS_PER_TERM}) already enqueued for role "${role}". Skipping list page.`);
|
log.info(
|
||||||
|
`Max jobs (${MAX_JOBS_PER_TERM}) already enqueued for role "${role}". Skipping list page.`,
|
||||||
|
);
|
||||||
markListPageDone({
|
markListPageDone({
|
||||||
currentUrl: request.url,
|
currentUrl: request.url,
|
||||||
jobCardsFound: 0,
|
jobCardsFound: 0,
|
||||||
@ -120,7 +125,8 @@ router.addHandler(
|
|||||||
let disciplines: string | null = null;
|
let disciplines: string | null = null;
|
||||||
try {
|
try {
|
||||||
const disciplinesEl = article.locator("h3");
|
const disciplinesEl = article.locator("h3");
|
||||||
disciplines = (await disciplinesEl.textContent({ timeout: 1000 }))?.trim() ?? null;
|
disciplines =
|
||||||
|
(await disciplinesEl.textContent({ timeout: 1000 }))?.trim() ?? null;
|
||||||
} catch {
|
} catch {
|
||||||
// h3 not found or timed out - that's okay, disciplines is optional
|
// h3 not found or timed out - that's okay, disciplines is optional
|
||||||
}
|
}
|
||||||
@ -195,7 +201,9 @@ router.addHandler(
|
|||||||
if (MAX_JOBS_PER_TERM > 0) {
|
if (MAX_JOBS_PER_TERM > 0) {
|
||||||
const currentCount = jobCounts.get(role) || 0;
|
const currentCount = jobCounts.get(role) || 0;
|
||||||
if (currentCount >= MAX_JOBS_PER_TERM) {
|
if (currentCount >= MAX_JOBS_PER_TERM) {
|
||||||
log.info(`Reached max jobs limit (${MAX_JOBS_PER_TERM}) for role "${role}" while processing list. Stopping.`);
|
log.info(
|
||||||
|
`Reached max jobs limit (${MAX_JOBS_PER_TERM}) for role "${role}" while processing list. Stopping.`,
|
||||||
|
);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
jobCounts.set(role, currentCount + 1);
|
jobCounts.set(role, currentCount + 1);
|
||||||
@ -205,7 +213,7 @@ router.addHandler(
|
|||||||
urls: [jobUrl],
|
urls: [jobUrl],
|
||||||
userData: {
|
userData: {
|
||||||
...jobs[jobs.length - 1],
|
...jobs[jobs.length - 1],
|
||||||
label: "gradcracker-single-job-page"
|
label: "gradcracker-single-job-page",
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
enqueuedJobs++;
|
enqueuedJobs++;
|
||||||
@ -216,7 +224,7 @@ router.addHandler(
|
|||||||
log.info(`Extracted ${jobs.length} jobs`);
|
log.info(`Extracted ${jobs.length} jobs`);
|
||||||
if (SKIP_APPLY_FOR_EXISTING && skippedKnownJobs > 0) {
|
if (SKIP_APPLY_FOR_EXISTING && skippedKnownJobs > 0) {
|
||||||
log.info(
|
log.info(
|
||||||
`Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`
|
`Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -226,7 +234,7 @@ router.addHandler(
|
|||||||
jobPagesEnqueued: enqueuedJobs,
|
jobPagesEnqueued: enqueuedJobs,
|
||||||
jobPagesSkipped: skippedKnownJobs,
|
jobPagesSkipped: skippedKnownJobs,
|
||||||
});
|
});
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
router.addHandler(
|
router.addHandler(
|
||||||
@ -261,7 +269,9 @@ router.addHandler(
|
|||||||
|
|
||||||
// Prefer page-scoped popup detection. Using the browser context's "page" event
|
// Prefer page-scoped popup detection. Using the browser context's "page" event
|
||||||
// can accidentally capture unrelated pages created by other concurrent requests.
|
// can accidentally capture unrelated pages created by other concurrent requests.
|
||||||
const popupPromise = page.waitForEvent("popup", { timeout: 8000 }).catch(() => null);
|
const popupPromise = page
|
||||||
|
.waitForEvent("popup", { timeout: 8000 })
|
||||||
|
.catch(() => null);
|
||||||
const navigationPromise = page
|
const navigationPromise = page
|
||||||
.waitForNavigation({ timeout: 8000, waitUntil: "domcontentloaded" })
|
.waitForNavigation({ timeout: 8000, waitUntil: "domcontentloaded" })
|
||||||
.catch(() => null);
|
.catch(() => null);
|
||||||
@ -271,7 +281,12 @@ router.addHandler(
|
|||||||
await applyButton.click();
|
await applyButton.click();
|
||||||
|
|
||||||
// Wait for URL to stabilize (same URL for 3 consecutive checks)
|
// Wait for URL to stabilize (same URL for 3 consecutive checks)
|
||||||
const waitForUrlStable = async (targetPage: typeof page, maxWaitMs = 10000, checkIntervalMs = 100, requiredStableChecks = 3) => {
|
const waitForUrlStable = async (
|
||||||
|
targetPage: typeof page,
|
||||||
|
maxWaitMs = 10000,
|
||||||
|
checkIntervalMs = 100,
|
||||||
|
requiredStableChecks = 3,
|
||||||
|
) => {
|
||||||
let lastUrl = targetPage.url();
|
let lastUrl = targetPage.url();
|
||||||
let stableCount = 0;
|
let stableCount = 0;
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
@ -298,11 +313,15 @@ router.addHandler(
|
|||||||
const targetPage = maybePopup ?? page;
|
const targetPage = maybePopup ?? page;
|
||||||
|
|
||||||
if (maybePopup) {
|
if (maybePopup) {
|
||||||
await maybePopup.waitForLoadState("domcontentloaded", { timeout: 15000 }).catch(() => null);
|
await maybePopup
|
||||||
|
.waitForLoadState("domcontentloaded", { timeout: 15000 })
|
||||||
|
.catch(() => null);
|
||||||
// If the popup initially opens as about:blank, give it a moment to redirect.
|
// If the popup initially opens as about:blank, give it a moment to redirect.
|
||||||
if (maybePopup.url() === "about:blank") {
|
if (maybePopup.url() === "about:blank") {
|
||||||
await maybePopup
|
await maybePopup
|
||||||
.waitForURL((u) => u.toString() !== "about:blank", { timeout: 15000 })
|
.waitForURL((u) => u.toString() !== "about:blank", {
|
||||||
|
timeout: 15000,
|
||||||
|
})
|
||||||
.catch(() => null);
|
.catch(() => null);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -317,7 +336,7 @@ router.addHandler(
|
|||||||
|
|
||||||
if (applicationLink === originalUrl) {
|
if (applicationLink === originalUrl) {
|
||||||
log.info(
|
log.info(
|
||||||
`Apply click did not change URL (still Gradcracker): ${applicationLink}`
|
`Apply click did not change URL (still Gradcracker): ${applicationLink}`,
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
log.info(`Captured application URL: ${applicationLink}`);
|
log.info(`Captured application URL: ${applicationLink}`);
|
||||||
@ -342,5 +361,5 @@ router.addHandler(
|
|||||||
});
|
});
|
||||||
|
|
||||||
markJobPageDone({ currentUrl: request.url });
|
markJobPageDone({ currentUrl: request.url });
|
||||||
}
|
},
|
||||||
);
|
);
|
||||||
|
|||||||
@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"extends": "@apify/tsconfig",
|
"extends": "@apify/tsconfig",
|
||||||
"compilerOptions": {
|
"compilerOptions": {
|
||||||
"module": "NodeNext",
|
"module": "NodeNext",
|
||||||
"moduleResolution": "NodeNext",
|
"moduleResolution": "NodeNext",
|
||||||
"target": "ES2022",
|
"target": "ES2022",
|
||||||
"outDir": "dist",
|
"outDir": "dist",
|
||||||
"noUnusedLocals": false,
|
"noUnusedLocals": false,
|
||||||
"lib": ["DOM"]
|
"lib": ["DOM"]
|
||||||
},
|
},
|
||||||
"include": ["./src/**/*"]
|
"include": ["./src/**/*"]
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,27 +1,27 @@
|
|||||||
{
|
{
|
||||||
"name": "ukvisajobs-extractor",
|
"name": "ukvisajobs-extractor",
|
||||||
"version": "0.0.1",
|
"version": "0.0.1",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"description": "UK Visa Jobs extractor - fetches job listings that may sponsor work visas",
|
"description": "UK Visa Jobs extractor - fetches job listings that may sponsor work visas",
|
||||||
"main": "dist/main.js",
|
"main": "dist/main.js",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"camoufox-js": "^0.8.0",
|
"camoufox-js": "^0.8.0",
|
||||||
"playwright": "^1.57.0"
|
"playwright": "^1.57.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@apify/tsconfig": "^0.1.0",
|
"@apify/tsconfig": "^0.1.0",
|
||||||
"@types/node": "^24.0.0",
|
"@types/node": "^24.0.0",
|
||||||
"tsx": "^4.4.0",
|
"tsx": "^4.4.0",
|
||||||
"typescript": "~5.9.0"
|
"typescript": "~5.9.0"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"start": "npm run start:dev",
|
"start": "npm run start:dev",
|
||||||
"start:prod": "node dist/main.js",
|
"start:prod": "node dist/main.js",
|
||||||
"start:dev": "tsx src/main.ts",
|
"start:dev": "tsx src/main.ts",
|
||||||
"build": "tsc",
|
"build": "tsc",
|
||||||
"get-binaries": "camoufox-js fetch",
|
"get-binaries": "camoufox-js fetch",
|
||||||
"postinstall": "npm run get-binaries"
|
"postinstall": "npm run get-binaries"
|
||||||
},
|
},
|
||||||
"author": "",
|
"author": "",
|
||||||
"license": "ISC"
|
"license": "ISC"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -13,507 +13,560 @@
|
|||||||
* UKVISAJOBS_REFRESH_ONLY - Set to "1" to refresh tokens and exit
|
* UKVISAJOBS_REFRESH_ONLY - Set to "1" to refresh tokens and exit
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { mkdir, writeFile, readFile } from 'fs/promises';
|
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
||||||
import { join, dirname } from 'path';
|
import { dirname, join } from "node:path";
|
||||||
import { fileURLToPath } from 'url';
|
import { fileURLToPath } from "node:url";
|
||||||
import type { Request } from 'playwright';
|
import type { Request } from "playwright";
|
||||||
|
|
||||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
const API_URL = 'https://my.ukvisajobs.com/ukvisa-api/api/fetch-jobs-data';
|
const API_URL = "https://my.ukvisajobs.com/ukvisa-api/api/fetch-jobs-data";
|
||||||
const SIGNIN_URL = 'https://my.ukvisajobs.com/signin';
|
const SIGNIN_URL = "https://my.ukvisajobs.com/signin";
|
||||||
const OPEN_JOBS_URL = 'https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&visaAcceptance=false&applicants_outside_uk=false&pageNo=1';
|
const OPEN_JOBS_URL =
|
||||||
const AUTH_CACHE_PATH = join(__dirname, '../storage/ukvisajobs-auth.json');
|
"https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&visaAcceptance=false&applicants_outside_uk=false&pageNo=1";
|
||||||
|
const AUTH_CACHE_PATH = join(__dirname, "../storage/ukvisajobs-auth.json");
|
||||||
const JOBS_PER_PAGE = 15;
|
const JOBS_PER_PAGE = 15;
|
||||||
const DEFAULT_MAX_JOBS = 50;
|
const DEFAULT_MAX_JOBS = 50;
|
||||||
const MAX_ALLOWED_JOBS = 200;
|
const MAX_ALLOWED_JOBS = 200;
|
||||||
|
|
||||||
interface UkVisaJobsApiJob {
|
interface UkVisaJobsApiJob {
|
||||||
id: string;
|
id: string;
|
||||||
title: string;
|
title: string;
|
||||||
company_name: string;
|
company_name: string;
|
||||||
company_link?: string;
|
company_link?: string;
|
||||||
job_link: string;
|
job_link: string;
|
||||||
city: string;
|
city: string;
|
||||||
created_date: string;
|
created_date: string;
|
||||||
job_expire: string;
|
job_expire: string;
|
||||||
description?: string;
|
description?: string;
|
||||||
min_salary?: string;
|
min_salary?: string;
|
||||||
max_salary?: string;
|
max_salary?: string;
|
||||||
salary_interval?: string;
|
salary_interval?: string;
|
||||||
salary_method?: string;
|
salary_method?: string;
|
||||||
degree_requirement?: string;
|
degree_requirement?: string;
|
||||||
job_type?: string;
|
job_type?: string;
|
||||||
job_level?: string;
|
job_level?: string;
|
||||||
job_industry?: string;
|
job_industry?: string;
|
||||||
visa_acceptance?: string;
|
visa_acceptance?: string;
|
||||||
applicants_outside_uk?: string;
|
applicants_outside_uk?: string;
|
||||||
likely_to_sponsor?: string;
|
likely_to_sponsor?: string;
|
||||||
definitely_sponsored?: string;
|
definitely_sponsored?: string;
|
||||||
new_entrant?: string;
|
new_entrant?: string;
|
||||||
student_graduate?: string;
|
student_graduate?: string;
|
||||||
image?: string;
|
image?: string;
|
||||||
computed_cos_total?: string;
|
computed_cos_total?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface UkVisaJobsApiResponse {
|
interface UkVisaJobsApiResponse {
|
||||||
status: number;
|
status: number;
|
||||||
totalJobs: number;
|
totalJobs: number;
|
||||||
query?: string;
|
query?: string;
|
||||||
jobs: UkVisaJobsApiJob[];
|
jobs: UkVisaJobsApiJob[];
|
||||||
}
|
}
|
||||||
|
|
||||||
interface ExtractedJob {
|
interface ExtractedJob {
|
||||||
source: 'ukvisajobs';
|
source: "ukvisajobs";
|
||||||
sourceJobId: string;
|
sourceJobId: string;
|
||||||
title: string;
|
title: string;
|
||||||
employer: string;
|
employer: string;
|
||||||
employerUrl?: string;
|
employerUrl?: string;
|
||||||
jobUrl: string;
|
jobUrl: string;
|
||||||
applicationLink: string;
|
applicationLink: string;
|
||||||
location?: string;
|
location?: string;
|
||||||
deadline?: string;
|
deadline?: string;
|
||||||
salary?: string;
|
salary?: string;
|
||||||
jobDescription?: string;
|
jobDescription?: string;
|
||||||
datePosted?: string;
|
datePosted?: string;
|
||||||
degreeRequired?: string;
|
degreeRequired?: string;
|
||||||
jobType?: string;
|
jobType?: string;
|
||||||
jobLevel?: string;
|
jobLevel?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface UkVisaJobsAuthSession {
|
interface UkVisaJobsAuthSession {
|
||||||
token: string;
|
token: string;
|
||||||
authToken: string;
|
authToken: string;
|
||||||
csrfToken: string;
|
csrfToken: string;
|
||||||
ciSession: string;
|
ciSession: string;
|
||||||
fetchedAt: string;
|
fetchedAt: string;
|
||||||
source: 'cache' | 'browser';
|
source: "cache" | "browser";
|
||||||
}
|
}
|
||||||
|
|
||||||
class UkVisaJobsAuthError extends Error {
|
class UkVisaJobsAuthError extends Error {
|
||||||
status: number;
|
status: number;
|
||||||
responseText: string;
|
responseText: string;
|
||||||
|
|
||||||
constructor(message: string, status: number, responseText: string) {
|
constructor(message: string, status: number, responseText: string) {
|
||||||
super(message);
|
super(message);
|
||||||
this.name = 'UkVisaJobsAuthError';
|
this.name = "UkVisaJobsAuthError";
|
||||||
this.status = status;
|
this.status = status;
|
||||||
this.responseText = responseText;
|
this.responseText = responseText;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function toStringOrNull(value: unknown): string | null {
|
function toStringOrNull(value: unknown): string | null {
|
||||||
if (value === null || value === undefined) return null;
|
if (value === null || value === undefined) return null;
|
||||||
if (typeof value === 'string') {
|
if (typeof value === "string") {
|
||||||
const trimmed = value.trim();
|
const trimmed = value.trim();
|
||||||
return trimmed.length > 0 ? trimmed : null;
|
return trimmed.length > 0 ? trimmed : null;
|
||||||
}
|
}
|
||||||
if (typeof value === 'number' || typeof value === 'boolean') return String(value);
|
if (typeof value === "number" || typeof value === "boolean")
|
||||||
return null;
|
return String(value);
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function toNumberOrNull(value: unknown): number | null {
|
function toNumberOrNull(value: unknown): number | null {
|
||||||
if (value === null || value === undefined) return null;
|
if (value === null || value === undefined) return null;
|
||||||
if (typeof value === 'number') return Number.isFinite(value) ? value : null;
|
if (typeof value === "number") return Number.isFinite(value) ? value : null;
|
||||||
if (typeof value === 'string') {
|
if (typeof value === "string") {
|
||||||
const trimmed = value.trim();
|
const trimmed = value.trim();
|
||||||
if (!trimmed) return null;
|
if (!trimmed) return null;
|
||||||
const parsed = Number(trimmed);
|
const parsed = Number(trimmed);
|
||||||
return Number.isFinite(parsed) ? parsed : null;
|
return Number.isFinite(parsed) ? parsed : null;
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchPage(
|
async function fetchPage(
|
||||||
pageNo: number,
|
pageNo: number,
|
||||||
session: UkVisaJobsAuthSession,
|
session: UkVisaJobsAuthSession,
|
||||||
options: { searchKeyword?: string } = {}
|
options: { searchKeyword?: string } = {},
|
||||||
): Promise<UkVisaJobsApiResponse> {
|
): Promise<UkVisaJobsApiResponse> {
|
||||||
// Use native FormData API (Node.js 18+)
|
// Use native FormData API (Node.js 18+)
|
||||||
const formData = new FormData();
|
const formData = new FormData();
|
||||||
formData.append('is_global', '0');
|
formData.append("is_global", "0");
|
||||||
formData.append('sortBy', 'desc');
|
formData.append("sortBy", "desc");
|
||||||
formData.append('pageNo', String(pageNo));
|
formData.append("pageNo", String(pageNo));
|
||||||
formData.append('visaAcceptance', 'false');
|
formData.append("visaAcceptance", "false");
|
||||||
formData.append('applicants_outside_uk', 'false');
|
formData.append("applicants_outside_uk", "false");
|
||||||
formData.append('searchKeyword', options.searchKeyword || 'null');
|
formData.append("searchKeyword", options.searchKeyword || "null");
|
||||||
formData.append('token', session.token);
|
formData.append("token", session.token);
|
||||||
|
|
||||||
const cookies = buildCookieHeader(session);
|
const cookies = buildCookieHeader(session);
|
||||||
|
|
||||||
const response = await fetch(API_URL, {
|
const response = await fetch(API_URL, {
|
||||||
method: 'POST',
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
'accept': 'application/json, text/plain, */*',
|
accept: "application/json, text/plain, */*",
|
||||||
'accept-language': 'en-US,en;q=0.9',
|
"accept-language": "en-US,en;q=0.9",
|
||||||
'cookie': cookies,
|
cookie: cookies,
|
||||||
'origin': 'https://my.ukvisajobs.com',
|
origin: "https://my.ukvisajobs.com",
|
||||||
'referer': `https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&pageNo=${pageNo}&visaAcceptance=false&applicants_outside_uk=false`,
|
referer: `https://my.ukvisajobs.com/open-jobs/1?is_global=0&sortBy=desc&pageNo=${pageNo}&visaAcceptance=false&applicants_outside_uk=false`,
|
||||||
'user-agent': 'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36',
|
"user-agent":
|
||||||
},
|
"Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Mobile Safari/537.36",
|
||||||
body: formData,
|
},
|
||||||
});
|
body: formData,
|
||||||
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
const text = await response.text();
|
const text = await response.text();
|
||||||
if (isAuthErrorResponse(response.status, text)) {
|
if (isAuthErrorResponse(response.status, text)) {
|
||||||
throw new UkVisaJobsAuthError(
|
throw new UkVisaJobsAuthError(
|
||||||
`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`,
|
`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`,
|
||||||
response.status,
|
response.status,
|
||||||
text
|
text,
|
||||||
);
|
);
|
||||||
}
|
|
||||||
throw new Error(`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`);
|
|
||||||
}
|
}
|
||||||
|
throw new Error(
|
||||||
|
`UKVisaJobs API returned ${response.status}: ${response.statusText} - ${text}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
return response.json() as Promise<UkVisaJobsApiResponse>;
|
return response.json() as Promise<UkVisaJobsApiResponse>;
|
||||||
}
|
}
|
||||||
|
|
||||||
function mapJob(raw: UkVisaJobsApiJob): ExtractedJob {
|
function mapJob(raw: UkVisaJobsApiJob): ExtractedJob {
|
||||||
// Build salary string from min/max
|
// Build salary string from min/max
|
||||||
let salary: string | undefined = undefined;
|
let salary: string | undefined;
|
||||||
const minSalary = toNumberOrNull(raw.min_salary);
|
const minSalary = toNumberOrNull(raw.min_salary);
|
||||||
const maxSalary = toNumberOrNull(raw.max_salary);
|
const maxSalary = toNumberOrNull(raw.max_salary);
|
||||||
|
|
||||||
if (minSalary !== null && minSalary > 0 && maxSalary !== null && maxSalary > 0) {
|
if (
|
||||||
salary = `£${minSalary.toLocaleString()}-${maxSalary.toLocaleString()}`;
|
minSalary !== null &&
|
||||||
if (raw.salary_interval) {
|
minSalary > 0 &&
|
||||||
salary += ` / ${raw.salary_interval}`;
|
maxSalary !== null &&
|
||||||
}
|
maxSalary > 0
|
||||||
} else if (maxSalary !== null && maxSalary > 0) {
|
) {
|
||||||
salary = `£${maxSalary.toLocaleString()}`;
|
salary = `£${minSalary.toLocaleString()}-${maxSalary.toLocaleString()}`;
|
||||||
if (raw.salary_interval) {
|
if (raw.salary_interval) {
|
||||||
salary += ` / ${raw.salary_interval}`;
|
salary += ` / ${raw.salary_interval}`;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
} else if (maxSalary !== null && maxSalary > 0) {
|
||||||
|
salary = `£${maxSalary.toLocaleString()}`;
|
||||||
|
if (raw.salary_interval) {
|
||||||
|
salary += ` / ${raw.salary_interval}`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Build a description from visa sponsorship fields
|
// Build a description from visa sponsorship fields
|
||||||
const visaInfo: string[] = [];
|
const visaInfo: string[] = [];
|
||||||
if (raw.visa_acceptance?.toLowerCase() === 'yes') visaInfo.push('Visa acceptance: Yes');
|
if (raw.visa_acceptance?.toLowerCase() === "yes")
|
||||||
if (raw.applicants_outside_uk?.toLowerCase() === 'yes') visaInfo.push('Accepts applicants outside UK');
|
visaInfo.push("Visa acceptance: Yes");
|
||||||
if (raw.likely_to_sponsor?.toLowerCase() === 'yes') visaInfo.push('Likely to sponsor');
|
if (raw.applicants_outside_uk?.toLowerCase() === "yes")
|
||||||
if (raw.definitely_sponsored?.toLowerCase() === 'yes') visaInfo.push('Definitely sponsored');
|
visaInfo.push("Accepts applicants outside UK");
|
||||||
if (raw.new_entrant?.toLowerCase() === 'yes') visaInfo.push('New entrant friendly');
|
if (raw.likely_to_sponsor?.toLowerCase() === "yes")
|
||||||
if (raw.student_graduate?.toLowerCase() === 'yes') visaInfo.push('Student/Graduate friendly');
|
visaInfo.push("Likely to sponsor");
|
||||||
|
if (raw.definitely_sponsored?.toLowerCase() === "yes")
|
||||||
|
visaInfo.push("Definitely sponsored");
|
||||||
|
if (raw.new_entrant?.toLowerCase() === "yes")
|
||||||
|
visaInfo.push("New entrant friendly");
|
||||||
|
if (raw.student_graduate?.toLowerCase() === "yes")
|
||||||
|
visaInfo.push("Student/Graduate friendly");
|
||||||
|
|
||||||
const description = raw.description
|
const description = raw.description
|
||||||
? raw.description
|
? raw.description
|
||||||
: visaInfo.length > 0
|
: visaInfo.length > 0
|
||||||
? `Visa sponsorship info: ${visaInfo.join(', ')}`
|
? `Visa sponsorship info: ${visaInfo.join(", ")}`
|
||||||
: undefined;
|
: undefined;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
source: 'ukvisajobs',
|
source: "ukvisajobs",
|
||||||
sourceJobId: raw.id,
|
sourceJobId: raw.id,
|
||||||
title: raw.title || 'Unknown Title',
|
title: raw.title || "Unknown Title",
|
||||||
employer: raw.company_name || 'Unknown Employer',
|
employer: raw.company_name || "Unknown Employer",
|
||||||
employerUrl: toStringOrNull(raw.company_link) ?? undefined,
|
employerUrl: toStringOrNull(raw.company_link) ?? undefined,
|
||||||
jobUrl: raw.job_link,
|
jobUrl: raw.job_link,
|
||||||
applicationLink: raw.job_link,
|
applicationLink: raw.job_link,
|
||||||
location: raw.city || undefined,
|
location: raw.city || undefined,
|
||||||
deadline: raw.job_expire || undefined,
|
deadline: raw.job_expire || undefined,
|
||||||
salary,
|
salary,
|
||||||
jobDescription: description,
|
jobDescription: description,
|
||||||
datePosted: raw.created_date || undefined,
|
datePosted: raw.created_date || undefined,
|
||||||
degreeRequired: toStringOrNull(raw.degree_requirement) ?? undefined,
|
degreeRequired: toStringOrNull(raw.degree_requirement) ?? undefined,
|
||||||
jobType: toStringOrNull(raw.job_type) ?? undefined,
|
jobType: toStringOrNull(raw.job_type) ?? undefined,
|
||||||
jobLevel: toStringOrNull(raw.job_level) ?? undefined,
|
jobLevel: toStringOrNull(raw.job_level) ?? undefined,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildCookieHeader(session: UkVisaJobsAuthSession): string {
|
function buildCookieHeader(session: UkVisaJobsAuthSession): string {
|
||||||
const cookieParts: string[] = [];
|
const cookieParts: string[] = [];
|
||||||
if (session.csrfToken) cookieParts.push(`csrf_token=${session.csrfToken}`);
|
if (session.csrfToken) cookieParts.push(`csrf_token=${session.csrfToken}`);
|
||||||
if (session.ciSession) cookieParts.push(`ci_session=${session.ciSession}`);
|
if (session.ciSession) cookieParts.push(`ci_session=${session.ciSession}`);
|
||||||
if (session.authToken) cookieParts.push(`authToken=${session.authToken}`);
|
if (session.authToken) cookieParts.push(`authToken=${session.authToken}`);
|
||||||
return cookieParts.join('; ');
|
return cookieParts.join("; ");
|
||||||
}
|
}
|
||||||
|
|
||||||
function getLoginCredentials(): { email: string; password: string } | null {
|
function getLoginCredentials(): { email: string; password: string } | null {
|
||||||
const email = process.env.UKVISAJOBS_EMAIL;
|
const email = process.env.UKVISAJOBS_EMAIL;
|
||||||
const password = process.env.UKVISAJOBS_PASSWORD;
|
const password = process.env.UKVISAJOBS_PASSWORD;
|
||||||
if (!email || !password) return null;
|
if (!email || !password) return null;
|
||||||
return { email, password };
|
return { email, password };
|
||||||
}
|
}
|
||||||
|
|
||||||
async function loadCachedAuthSession(): Promise<UkVisaJobsAuthSession | null> {
|
async function loadCachedAuthSession(): Promise<UkVisaJobsAuthSession | null> {
|
||||||
try {
|
try {
|
||||||
const data = await readFile(AUTH_CACHE_PATH, 'utf8');
|
const data = await readFile(AUTH_CACHE_PATH, "utf8");
|
||||||
const parsed = JSON.parse(data) as UkVisaJobsAuthSession;
|
const parsed = JSON.parse(data) as UkVisaJobsAuthSession;
|
||||||
if (!parsed?.token) return null;
|
if (!parsed?.token) return null;
|
||||||
return {
|
return {
|
||||||
token: parsed.token,
|
token: parsed.token,
|
||||||
authToken: parsed.authToken || parsed.token,
|
authToken: parsed.authToken || parsed.token,
|
||||||
csrfToken: parsed.csrfToken || '',
|
csrfToken: parsed.csrfToken || "",
|
||||||
ciSession: parsed.ciSession || '',
|
ciSession: parsed.ciSession || "",
|
||||||
fetchedAt: parsed.fetchedAt || new Date().toISOString(),
|
fetchedAt: parsed.fetchedAt || new Date().toISOString(),
|
||||||
source: 'cache',
|
source: "cache",
|
||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (_error) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function saveCachedAuthSession(session: UkVisaJobsAuthSession): Promise<void> {
|
async function saveCachedAuthSession(
|
||||||
const payload = {
|
session: UkVisaJobsAuthSession,
|
||||||
token: session.token,
|
): Promise<void> {
|
||||||
authToken: session.authToken,
|
const payload = {
|
||||||
csrfToken: session.csrfToken,
|
token: session.token,
|
||||||
ciSession: session.ciSession,
|
authToken: session.authToken,
|
||||||
fetchedAt: session.fetchedAt,
|
csrfToken: session.csrfToken,
|
||||||
source: session.source,
|
ciSession: session.ciSession,
|
||||||
};
|
fetchedAt: session.fetchedAt,
|
||||||
await mkdir(dirname(AUTH_CACHE_PATH), { recursive: true });
|
source: session.source,
|
||||||
await writeFile(AUTH_CACHE_PATH, JSON.stringify(payload, null, 2));
|
};
|
||||||
|
await mkdir(dirname(AUTH_CACHE_PATH), { recursive: true });
|
||||||
|
await writeFile(AUTH_CACHE_PATH, JSON.stringify(payload, null, 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
function extractMultipartField(body: string, field: string): string | null {
|
function extractMultipartField(body: string, field: string): string | null {
|
||||||
const nameToken = `name="${field}"`;
|
const nameToken = `name="${field}"`;
|
||||||
const index = body.indexOf(nameToken);
|
const index = body.indexOf(nameToken);
|
||||||
if (index === -1) return null;
|
if (index === -1) return null;
|
||||||
|
|
||||||
const afterName = body.slice(index + nameToken.length);
|
const afterName = body.slice(index + nameToken.length);
|
||||||
let separatorIndex = afterName.indexOf('\r\n\r\n');
|
let separatorIndex = afterName.indexOf("\r\n\r\n");
|
||||||
let separatorLength = 4;
|
let separatorLength = 4;
|
||||||
if (separatorIndex === -1) {
|
if (separatorIndex === -1) {
|
||||||
separatorIndex = afterName.indexOf('\n\n');
|
separatorIndex = afterName.indexOf("\n\n");
|
||||||
separatorLength = 2;
|
separatorLength = 2;
|
||||||
}
|
}
|
||||||
if (separatorIndex === -1) return null;
|
if (separatorIndex === -1) return null;
|
||||||
|
|
||||||
const valueStart = index + nameToken.length + separatorIndex + separatorLength;
|
const valueStart =
|
||||||
const remainder = body.slice(valueStart);
|
index + nameToken.length + separatorIndex + separatorLength;
|
||||||
const endIndex = remainder.indexOf('\r\n');
|
const remainder = body.slice(valueStart);
|
||||||
if (endIndex === -1) return remainder.trim();
|
const endIndex = remainder.indexOf("\r\n");
|
||||||
return remainder.slice(0, endIndex).trim();
|
if (endIndex === -1) return remainder.trim();
|
||||||
|
return remainder.slice(0, endIndex).trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
function extractTokenFromRequest(request: Request): string | null {
|
function extractTokenFromRequest(request: Request): string | null {
|
||||||
const postData = request.postData();
|
const postData = request.postData();
|
||||||
if (!postData) return null;
|
if (!postData) return null;
|
||||||
const multipartToken = extractMultipartField(postData, 'token');
|
const multipartToken = extractMultipartField(postData, "token");
|
||||||
if (multipartToken) return multipartToken;
|
if (multipartToken) return multipartToken;
|
||||||
try {
|
try {
|
||||||
const params = new URLSearchParams(postData);
|
const params = new URLSearchParams(postData);
|
||||||
const token = params.get('token');
|
const token = params.get("token");
|
||||||
return token || null;
|
return token || null;
|
||||||
} catch (error) {
|
} catch (_error) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function isAuthErrorResponse(status: number, bodyText: string): boolean {
|
function isAuthErrorResponse(status: number, bodyText: string): boolean {
|
||||||
if (status === 401 || status === 403) return true;
|
if (status === 401 || status === 403) return true;
|
||||||
if (status !== 400) return false;
|
if (status !== 400) return false;
|
||||||
try {
|
try {
|
||||||
const parsed = JSON.parse(bodyText) as { errorType?: string; message?: string };
|
const parsed = JSON.parse(bodyText) as {
|
||||||
if (parsed?.errorType === 'expired') return true;
|
errorType?: string;
|
||||||
if (parsed?.message && parsed.message.toLowerCase().includes('expired')) return true;
|
message?: string;
|
||||||
} catch (error) {
|
};
|
||||||
// ignore JSON parse failures
|
if (parsed?.errorType === "expired") return true;
|
||||||
}
|
if (parsed?.message?.toLowerCase().includes("expired")) return true;
|
||||||
return bodyText.toLowerCase().includes('expired');
|
} catch (_error) {
|
||||||
|
// ignore JSON parse failures
|
||||||
|
}
|
||||||
|
return bodyText.toLowerCase().includes("expired");
|
||||||
}
|
}
|
||||||
|
|
||||||
async function loginWithBrowser(email: string, password: string): Promise<UkVisaJobsAuthSession> {
|
async function loginWithBrowser(
|
||||||
const [{ launchOptions }, { firefox }] = await Promise.all([
|
email: string,
|
||||||
import('camoufox-js'),
|
password: string,
|
||||||
import('playwright'),
|
): Promise<UkVisaJobsAuthSession> {
|
||||||
]);
|
const [{ launchOptions }, { firefox }] = await Promise.all([
|
||||||
const headless = process.env.UKVISAJOBS_HEADLESS !== 'false';
|
import("camoufox-js"),
|
||||||
const browser = await firefox.launch(await launchOptions({
|
import("playwright"),
|
||||||
headless,
|
]);
|
||||||
humanize: true,
|
const headless = process.env.UKVISAJOBS_HEADLESS !== "false";
|
||||||
geoip: true,
|
const browser = await firefox.launch(
|
||||||
}));
|
await launchOptions({
|
||||||
const context = await browser.newContext();
|
headless,
|
||||||
const page = await context.newPage();
|
humanize: true,
|
||||||
|
geoip: true,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
const context = await browser.newContext();
|
||||||
|
const page = await context.newPage();
|
||||||
|
|
||||||
|
try {
|
||||||
|
await page.goto(SIGNIN_URL, { waitUntil: "domcontentloaded" });
|
||||||
|
await page.waitForSelector("#email", { timeout: 15000 });
|
||||||
|
await page.fill("#email", email);
|
||||||
|
await page.fill("#password", password);
|
||||||
|
await page.keyboard.press("Enter");
|
||||||
|
await page.waitForTimeout(7000);
|
||||||
|
|
||||||
|
const requestPromise = page.waitForRequest(
|
||||||
|
(request) =>
|
||||||
|
request.url().includes("/ukvisa-api/api/fetch-jobs-data") &&
|
||||||
|
request.method() === "POST",
|
||||||
|
{ timeout: 30000 },
|
||||||
|
);
|
||||||
|
|
||||||
|
await page.goto(OPEN_JOBS_URL, { waitUntil: "networkidle" });
|
||||||
|
await page.waitForTimeout(5000);
|
||||||
|
|
||||||
|
let fetchRequest: Request | null = null;
|
||||||
try {
|
try {
|
||||||
await page.goto(SIGNIN_URL, { waitUntil: 'domcontentloaded' });
|
fetchRequest = await requestPromise;
|
||||||
await page.waitForSelector('#email', { timeout: 15000 });
|
} catch (_error) {
|
||||||
await page.fill('#email', email);
|
fetchRequest = null;
|
||||||
await page.fill('#password', password);
|
|
||||||
await page.keyboard.press('Enter');
|
|
||||||
await page.waitForTimeout(7000);
|
|
||||||
|
|
||||||
const requestPromise = page.waitForRequest(
|
|
||||||
(request) => request.url().includes('/ukvisa-api/api/fetch-jobs-data') && request.method() === 'POST',
|
|
||||||
{ timeout: 30000 }
|
|
||||||
);
|
|
||||||
|
|
||||||
await page.goto(OPEN_JOBS_URL, { waitUntil: 'networkidle' });
|
|
||||||
await page.waitForTimeout(5000);
|
|
||||||
|
|
||||||
let fetchRequest: Request | null = null;
|
|
||||||
try {
|
|
||||||
fetchRequest = await requestPromise;
|
|
||||||
} catch (error) {
|
|
||||||
fetchRequest = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const cookies = await context.cookies('https://my.ukvisajobs.com');
|
|
||||||
const csrfToken = cookies.find((cookie) => cookie.name === 'csrf_token')?.value || '';
|
|
||||||
const ciSession = cookies.find((cookie) => cookie.name === 'ci_session')?.value || '';
|
|
||||||
const authToken = cookies.find((cookie) => cookie.name === 'authToken')?.value || '';
|
|
||||||
const token = fetchRequest ? extractTokenFromRequest(fetchRequest) : authToken;
|
|
||||||
|
|
||||||
if (!token) {
|
|
||||||
throw new Error('Failed to locate auth token from browser session.');
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
token,
|
|
||||||
authToken: authToken || token,
|
|
||||||
csrfToken,
|
|
||||||
ciSession,
|
|
||||||
fetchedAt: new Date().toISOString(),
|
|
||||||
source: 'browser',
|
|
||||||
};
|
|
||||||
} finally {
|
|
||||||
await browser.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const cookies = await context.cookies("https://my.ukvisajobs.com");
|
||||||
|
const csrfToken =
|
||||||
|
cookies.find((cookie) => cookie.name === "csrf_token")?.value || "";
|
||||||
|
const ciSession =
|
||||||
|
cookies.find((cookie) => cookie.name === "ci_session")?.value || "";
|
||||||
|
const authToken =
|
||||||
|
cookies.find((cookie) => cookie.name === "authToken")?.value || "";
|
||||||
|
const token = fetchRequest
|
||||||
|
? extractTokenFromRequest(fetchRequest)
|
||||||
|
: authToken;
|
||||||
|
|
||||||
|
if (!token) {
|
||||||
|
throw new Error("Failed to locate auth token from browser session.");
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
token,
|
||||||
|
authToken: authToken || token,
|
||||||
|
csrfToken,
|
||||||
|
ciSession,
|
||||||
|
fetchedAt: new Date().toISOString(),
|
||||||
|
source: "browser",
|
||||||
|
};
|
||||||
|
} finally {
|
||||||
|
await browser.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function main(): Promise<void> {
|
async function main(): Promise<void> {
|
||||||
console.log('🇬🇧 UK Visa Jobs Extractor starting...');
|
console.log("🇬🇧 UK Visa Jobs Extractor starting...");
|
||||||
const credentials = getLoginCredentials();
|
const credentials = getLoginCredentials();
|
||||||
const searchKeyword = process.env.UKVISAJOBS_SEARCH_KEYWORD || undefined;
|
const searchKeyword = process.env.UKVISAJOBS_SEARCH_KEYWORD || undefined;
|
||||||
const refreshOnly = process.env.UKVISAJOBS_REFRESH_ONLY === '1';
|
const refreshOnly = process.env.UKVISAJOBS_REFRESH_ONLY === "1";
|
||||||
|
|
||||||
let authSession = await loadCachedAuthSession();
|
let authSession = await loadCachedAuthSession();
|
||||||
|
|
||||||
if (refreshOnly) {
|
if (refreshOnly) {
|
||||||
|
if (!credentials) {
|
||||||
|
console.error(
|
||||||
|
"ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set",
|
||||||
|
);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
console.log(" Refresh-only mode: logging in to refresh tokens...");
|
||||||
|
authSession = await loginWithBrowser(
|
||||||
|
credentials.email,
|
||||||
|
credentials.password,
|
||||||
|
);
|
||||||
|
await saveCachedAuthSession(authSession);
|
||||||
|
console.log(" Auth session refreshed.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!authSession) {
|
||||||
|
if (!credentials) {
|
||||||
|
console.error(
|
||||||
|
"ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set",
|
||||||
|
);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
console.log(" No cached session found. Logging in to refresh tokens...");
|
||||||
|
authSession = await loginWithBrowser(
|
||||||
|
credentials.email,
|
||||||
|
credentials.password,
|
||||||
|
);
|
||||||
|
await saveCachedAuthSession(authSession);
|
||||||
|
}
|
||||||
|
|
||||||
|
const cookies = buildCookieHeader(authSession);
|
||||||
|
console.log(` Auth source: ${authSession.source}`);
|
||||||
|
console.log(` Cookies configured: ${cookies ? "Yes" : "No"}`);
|
||||||
|
console.log(` Token length: ${authSession.token.length}`);
|
||||||
|
|
||||||
|
// Get max jobs from environment
|
||||||
|
const maxJobsEnv = toNumberOrNull(process.env.UKVISAJOBS_MAX_JOBS);
|
||||||
|
const maxJobs = Math.min(maxJobsEnv ?? DEFAULT_MAX_JOBS, MAX_ALLOWED_JOBS);
|
||||||
|
const maxPages = Math.ceil(maxJobs / JOBS_PER_PAGE);
|
||||||
|
|
||||||
|
console.log(` Max jobs: ${maxJobs} (${maxPages} pages)`);
|
||||||
|
if (searchKeyword) {
|
||||||
|
console.log(` Search keyword: ${searchKeyword}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const allJobs: ExtractedJob[] = [];
|
||||||
|
const seenIds = new Set<string>();
|
||||||
|
let totalAvailable = 0;
|
||||||
|
let pageNo = 1;
|
||||||
|
|
||||||
|
try {
|
||||||
|
while (pageNo <= maxPages && allJobs.length < maxJobs) {
|
||||||
|
console.log(` Fetching page ${pageNo}/${maxPages}...`);
|
||||||
|
|
||||||
|
let response: UkVisaJobsApiResponse;
|
||||||
|
try {
|
||||||
|
response = await fetchPage(pageNo, authSession, { searchKeyword });
|
||||||
|
} catch (error) {
|
||||||
if (!credentials) {
|
if (!credentials) {
|
||||||
console.error('ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set');
|
if (error instanceof UkVisaJobsAuthError) {
|
||||||
process.exit(1);
|
throw new Error(
|
||||||
|
"UKVisaJobs auth expired. Set UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD to refresh.",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
}
|
}
|
||||||
console.log(' Refresh-only mode: logging in to refresh tokens...');
|
|
||||||
authSession = await loginWithBrowser(credentials.email, credentials.password);
|
const reason =
|
||||||
|
error instanceof UkVisaJobsAuthError
|
||||||
|
? "Auth expired."
|
||||||
|
: "Fetch failed.";
|
||||||
|
console.log(` ${reason} Refreshing tokens and retrying...`);
|
||||||
|
authSession = await loginWithBrowser(
|
||||||
|
credentials.email,
|
||||||
|
credentials.password,
|
||||||
|
);
|
||||||
await saveCachedAuthSession(authSession);
|
await saveCachedAuthSession(authSession);
|
||||||
console.log(' Auth session refreshed.');
|
response = await fetchPage(pageNo, authSession, { searchKeyword });
|
||||||
return;
|
}
|
||||||
|
|
||||||
|
if (response.status !== 1) {
|
||||||
|
console.warn(
|
||||||
|
` âš ï¸ API returned status ${response.status} on page ${pageNo}`,
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pageNo === 1) {
|
||||||
|
totalAvailable = response.totalJobs;
|
||||||
|
console.log(` Total available: ${totalAvailable} jobs`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!response.jobs || response.jobs.length === 0) {
|
||||||
|
console.log(` No more jobs on page ${pageNo}`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const rawJob of response.jobs) {
|
||||||
|
if (allJobs.length >= maxJobs) break;
|
||||||
|
|
||||||
|
// Deduplicate by ID
|
||||||
|
if (seenIds.has(rawJob.id)) continue;
|
||||||
|
seenIds.add(rawJob.id);
|
||||||
|
|
||||||
|
const mapped = mapJob(rawJob);
|
||||||
|
allJobs.push(mapped);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we got fewer jobs than a full page, we're at the end
|
||||||
|
if (response.jobs.length < JOBS_PER_PAGE) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
pageNo++;
|
||||||
|
|
||||||
|
// Small delay to be nice to the API
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, 500));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!authSession) {
|
console.log(`✅ Scraped ${allJobs.length} jobs`);
|
||||||
if (!credentials) {
|
|
||||||
console.error('ERROR: UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD must be set');
|
// Write output to storage directory (similar to Crawlee dataset structure)
|
||||||
process.exit(1);
|
const storageDir = join(__dirname, "../storage/datasets/default");
|
||||||
}
|
await mkdir(storageDir, { recursive: true });
|
||||||
console.log(' No cached session found. Logging in to refresh tokens...');
|
|
||||||
authSession = await loginWithBrowser(credentials.email, credentials.password);
|
// Write each job as a separate JSON file (Crawlee dataset format)
|
||||||
await saveCachedAuthSession(authSession);
|
for (let i = 0; i < allJobs.length; i++) {
|
||||||
|
const filename = join(
|
||||||
|
storageDir,
|
||||||
|
`${String(i + 1).padStart(6, "0")}.json`,
|
||||||
|
);
|
||||||
|
await writeFile(filename, JSON.stringify(allJobs[i], null, 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
const cookies = buildCookieHeader(authSession);
|
// Also write a combined output file for easier consumption
|
||||||
console.log(` Auth source: ${authSession.source}`);
|
const outputFile = join(storageDir, "jobs.json");
|
||||||
console.log(` Cookies configured: ${cookies ? 'Yes' : 'No'}`);
|
await writeFile(outputFile, JSON.stringify(allJobs, null, 2));
|
||||||
console.log(` Token length: ${authSession.token.length}`);
|
|
||||||
|
|
||||||
// Get max jobs from environment
|
console.log(` Output written to: ${storageDir}`);
|
||||||
const maxJobsEnv = toNumberOrNull(process.env.UKVISAJOBS_MAX_JOBS);
|
console.log(` Jobs file: ${outputFile}`);
|
||||||
const maxJobs = Math.min(maxJobsEnv ?? DEFAULT_MAX_JOBS, MAX_ALLOWED_JOBS);
|
} catch (error) {
|
||||||
const maxPages = Math.ceil(maxJobs / JOBS_PER_PAGE);
|
const message = error instanceof Error ? error.message : "Unknown error";
|
||||||
|
console.error(`⌠Error: ${message}`);
|
||||||
console.log(` Max jobs: ${maxJobs} (${maxPages} pages)`);
|
process.exit(1);
|
||||||
if (searchKeyword) {
|
}
|
||||||
console.log(` Search keyword: ${searchKeyword}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
const allJobs: ExtractedJob[] = [];
|
|
||||||
const seenIds = new Set<string>();
|
|
||||||
let totalAvailable = 0;
|
|
||||||
let pageNo = 1;
|
|
||||||
|
|
||||||
try {
|
|
||||||
while (pageNo <= maxPages && allJobs.length < maxJobs) {
|
|
||||||
console.log(` Fetching page ${pageNo}/${maxPages}...`);
|
|
||||||
|
|
||||||
let response: UkVisaJobsApiResponse;
|
|
||||||
try {
|
|
||||||
response = await fetchPage(pageNo, authSession, { searchKeyword });
|
|
||||||
} catch (error) {
|
|
||||||
if (!credentials) {
|
|
||||||
if (error instanceof UkVisaJobsAuthError) {
|
|
||||||
throw new Error('UKVisaJobs auth expired. Set UKVISAJOBS_EMAIL and UKVISAJOBS_PASSWORD to refresh.');
|
|
||||||
}
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
|
|
||||||
const reason = error instanceof UkVisaJobsAuthError ? 'Auth expired.' : 'Fetch failed.';
|
|
||||||
console.log(` ${reason} Refreshing tokens and retrying...`);
|
|
||||||
authSession = await loginWithBrowser(credentials.email, credentials.password);
|
|
||||||
await saveCachedAuthSession(authSession);
|
|
||||||
response = await fetchPage(pageNo, authSession, { searchKeyword });
|
|
||||||
}
|
|
||||||
|
|
||||||
if (response.status !== 1) {
|
|
||||||
console.warn(` âš ï¸ API returned status ${response.status} on page ${pageNo}`);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (pageNo === 1) {
|
|
||||||
totalAvailable = response.totalJobs;
|
|
||||||
console.log(` Total available: ${totalAvailable} jobs`);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!response.jobs || response.jobs.length === 0) {
|
|
||||||
console.log(` No more jobs on page ${pageNo}`);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const rawJob of response.jobs) {
|
|
||||||
if (allJobs.length >= maxJobs) break;
|
|
||||||
|
|
||||||
// Deduplicate by ID
|
|
||||||
if (seenIds.has(rawJob.id)) continue;
|
|
||||||
seenIds.add(rawJob.id);
|
|
||||||
|
|
||||||
const mapped = mapJob(rawJob);
|
|
||||||
allJobs.push(mapped);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we got fewer jobs than a full page, we're at the end
|
|
||||||
if (response.jobs.length < JOBS_PER_PAGE) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
pageNo++;
|
|
||||||
|
|
||||||
// Small delay to be nice to the API
|
|
||||||
await new Promise((resolve) => setTimeout(resolve, 500));
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log(`✅ Scraped ${allJobs.length} jobs`);
|
|
||||||
|
|
||||||
// Write output to storage directory (similar to Crawlee dataset structure)
|
|
||||||
const storageDir = join(__dirname, '../storage/datasets/default');
|
|
||||||
await mkdir(storageDir, { recursive: true });
|
|
||||||
|
|
||||||
// Write each job as a separate JSON file (Crawlee dataset format)
|
|
||||||
for (let i = 0; i < allJobs.length; i++) {
|
|
||||||
const filename = join(storageDir, `${String(i + 1).padStart(6, '0')}.json`);
|
|
||||||
await writeFile(filename, JSON.stringify(allJobs[i], null, 2));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Also write a combined output file for easier consumption
|
|
||||||
const outputFile = join(storageDir, 'jobs.json');
|
|
||||||
await writeFile(outputFile, JSON.stringify(allJobs, null, 2));
|
|
||||||
|
|
||||||
console.log(` Output written to: ${storageDir}`);
|
|
||||||
console.log(` Jobs file: ${outputFile}`);
|
|
||||||
|
|
||||||
} catch (error) {
|
|
||||||
const message = error instanceof Error ? error.message : 'Unknown error';
|
|
||||||
console.error(`⌠Error: ${message}`);
|
|
||||||
process.exit(1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
main().catch((error) => {
|
main().catch((error) => {
|
||||||
console.error('Fatal error:', error);
|
console.error("Fatal error:", error);
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"extends": "@apify/tsconfig",
|
"extends": "@apify/tsconfig",
|
||||||
"compilerOptions": {
|
"compilerOptions": {
|
||||||
"module": "NodeNext",
|
"module": "NodeNext",
|
||||||
"moduleResolution": "NodeNext",
|
"moduleResolution": "NodeNext",
|
||||||
"target": "ES2022",
|
"target": "ES2022",
|
||||||
"outDir": "dist",
|
"outDir": "dist",
|
||||||
"noUnusedLocals": false,
|
"noUnusedLocals": false,
|
||||||
"lib": ["DOM"]
|
"lib": ["DOM"]
|
||||||
},
|
},
|
||||||
"include": ["./src/**/*"]
|
"include": ["./src/**/*"]
|
||||||
}
|
}
|
||||||
|
|||||||
@ -122,6 +122,10 @@ describe.sequential("Jobs API routes", () => {
|
|||||||
employer: "Acme",
|
employer: "Acme",
|
||||||
jobUrl: "https://example.com/job/5",
|
jobUrl: "https://example.com/job/5",
|
||||||
jobDescription: "Test description",
|
jobDescription: "Test description",
|
||||||
|
});
|
||||||
|
|
||||||
|
const { updateJob } = await import("../../repositories/jobs.js");
|
||||||
|
await updateJob(job.id, {
|
||||||
suitabilityScore: 55,
|
suitabilityScore: 55,
|
||||||
suitabilityReason: "Old fit",
|
suitabilityReason: "Old fit",
|
||||||
});
|
});
|
||||||
|
|||||||
@ -61,6 +61,7 @@ describe.sequential("Profile API routes", () => {
|
|||||||
id: "proj1",
|
id: "proj1",
|
||||||
name: "Project 1",
|
name: "Project 1",
|
||||||
description: "Desc 1",
|
description: "Desc 1",
|
||||||
|
summary: "Summary 1",
|
||||||
date: "2024",
|
date: "2024",
|
||||||
visible: true,
|
visible: true,
|
||||||
},
|
},
|
||||||
@ -68,6 +69,7 @@ describe.sequential("Profile API routes", () => {
|
|||||||
id: "proj2",
|
id: "proj2",
|
||||||
name: "Project 2",
|
name: "Project 2",
|
||||||
description: "Desc 2",
|
description: "Desc 2",
|
||||||
|
summary: "Summary 2",
|
||||||
date: "2023",
|
date: "2023",
|
||||||
visible: false,
|
visible: false,
|
||||||
},
|
},
|
||||||
|
|||||||
@ -4,6 +4,11 @@ import { pickProjectIdsForJob } from "./projectSelection.js";
|
|||||||
import { scoreJobSuitability } from "./scorer.js";
|
import { scoreJobSuitability } from "./scorer.js";
|
||||||
|
|
||||||
// --- Mocks ---
|
// --- Mocks ---
|
||||||
|
vi.mock("../repositories/settings.js", () => ({
|
||||||
|
getSetting: vi.fn().mockResolvedValue(null),
|
||||||
|
getAllSettings: vi.fn().mockResolvedValue({}),
|
||||||
|
}));
|
||||||
|
|
||||||
// We need to mock 'fetch' globally for these tests
|
// We need to mock 'fetch' globally for these tests
|
||||||
const globalFetch = global.fetch;
|
const globalFetch = global.fetch;
|
||||||
|
|
||||||
|
|||||||
@ -136,7 +136,7 @@ export async function callOpenRouter<T>(
|
|||||||
const shouldRetry =
|
const shouldRetry =
|
||||||
message.includes("parse") ||
|
message.includes("parse") ||
|
||||||
status === 429 ||
|
status === 429 ||
|
||||||
(status >= 500 && status <= 599) ||
|
(status !== undefined && status >= 500 && status <= 599) ||
|
||||||
message.toLowerCase().includes("timeout") ||
|
message.toLowerCase().includes("timeout") ||
|
||||||
message.toLowerCase().includes("fetch failed");
|
message.toLowerCase().includes("fetch failed");
|
||||||
|
|
||||||
|
|||||||
@ -76,6 +76,13 @@ vi.mock("fs/promises", async () => {
|
|||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
|
vi.mock("node:fs/promises", async () => {
|
||||||
|
return {
|
||||||
|
default: mocks,
|
||||||
|
...mocks,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
vi.mock("fs", () => ({
|
vi.mock("fs", () => ({
|
||||||
existsSync: vi.fn().mockReturnValue(true),
|
existsSync: vi.fn().mockReturnValue(true),
|
||||||
createWriteStream: vi.fn().mockReturnValue({
|
createWriteStream: vi.fn().mockReturnValue({
|
||||||
@ -93,6 +100,23 @@ vi.mock("fs", () => ({
|
|||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
vi.mock("node:fs", () => ({
|
||||||
|
existsSync: vi.fn().mockReturnValue(true),
|
||||||
|
createWriteStream: vi.fn().mockReturnValue({
|
||||||
|
on: vi.fn(),
|
||||||
|
write: vi.fn(),
|
||||||
|
end: vi.fn(),
|
||||||
|
}),
|
||||||
|
default: {
|
||||||
|
existsSync: vi.fn().mockReturnValue(true),
|
||||||
|
createWriteStream: vi.fn().mockReturnValue({
|
||||||
|
on: vi.fn(),
|
||||||
|
write: vi.fn(),
|
||||||
|
end: vi.fn(),
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
vi.mock("../repositories/settings.js", () => ({
|
vi.mock("../repositories/settings.js", () => ({
|
||||||
getSetting: vi.fn().mockImplementation((key: string) => {
|
getSetting: vi.fn().mockImplementation((key: string) => {
|
||||||
if (key === "rxresumeEmail") return Promise.resolve("test@example.com");
|
if (key === "rxresumeEmail") return Promise.resolve("test@example.com");
|
||||||
@ -126,7 +150,9 @@ vi.mock("./resumeProjects.js", () => ({
|
|||||||
|
|
||||||
// Mock the RxResumeClient
|
// Mock the RxResumeClient
|
||||||
vi.mock("./rxresume-client.js", () => ({
|
vi.mock("./rxresume-client.js", () => ({
|
||||||
RxResumeClient: vi.fn().mockImplementation(() => mockRxResumeClient),
|
RxResumeClient: vi.fn().mockImplementation(function (this: any) {
|
||||||
|
return mockRxResumeClient;
|
||||||
|
}),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// Mock stream pipeline for downloading PDF
|
// Mock stream pipeline for downloading PDF
|
||||||
@ -137,6 +163,13 @@ vi.mock("stream/promises", () => ({
|
|||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
vi.mock("node:stream/promises", () => ({
|
||||||
|
pipeline: vi.fn().mockResolvedValue(undefined),
|
||||||
|
default: {
|
||||||
|
pipeline: vi.fn().mockResolvedValue(undefined),
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
// Mock stream Readable
|
// Mock stream Readable
|
||||||
vi.mock("stream", () => ({
|
vi.mock("stream", () => ({
|
||||||
Readable: {
|
Readable: {
|
||||||
@ -153,6 +186,21 @@ vi.mock("stream", () => ({
|
|||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
vi.mock("node:stream", () => ({
|
||||||
|
Readable: {
|
||||||
|
fromWeb: vi.fn().mockReturnValue({
|
||||||
|
pipe: vi.fn(),
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
default: {
|
||||||
|
Readable: {
|
||||||
|
fromWeb: vi.fn().mockReturnValue({
|
||||||
|
pipe: vi.fn(),
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
// Mock global fetch for PDF download
|
// Mock global fetch for PDF download
|
||||||
vi.stubGlobal(
|
vi.stubGlobal(
|
||||||
"fetch",
|
"fetch",
|
||||||
@ -217,12 +265,20 @@ describe("PDF Service Skills Validation", () => {
|
|||||||
sections: {
|
sections: {
|
||||||
...mockProfile.sections,
|
...mockProfile.sections,
|
||||||
skills: {
|
skills: {
|
||||||
|
...mockProfile.sections.skills,
|
||||||
items: [
|
items: [
|
||||||
{ name: "Invalid Skill" }, // Missing visible, description, id, level
|
{
|
||||||
|
id: "invalid-1",
|
||||||
|
name: "Invalid Skill",
|
||||||
|
description: "",
|
||||||
|
level: 1,
|
||||||
|
keywords: [],
|
||||||
|
visible: true,
|
||||||
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
};
|
} as any;
|
||||||
vi.mocked(getProfile).mockResolvedValueOnce(invalidProfile);
|
vi.mocked(getProfile).mockResolvedValueOnce(invalidProfile);
|
||||||
|
|
||||||
// No tailoring, pass dummy path to bypass getProfile cache and use readFile mock
|
// No tailoring, pass dummy path to bypass getProfile cache and use readFile mock
|
||||||
@ -246,14 +302,36 @@ describe("PDF Service Skills Validation", () => {
|
|||||||
sections: {
|
sections: {
|
||||||
...mockProfile.sections,
|
...mockProfile.sections,
|
||||||
skills: {
|
skills: {
|
||||||
|
...mockProfile.sections.skills,
|
||||||
items: [
|
items: [
|
||||||
{ name: "Skill 1", keywords: ["a"] },
|
{
|
||||||
{ name: "Skill 2", keywords: ["b"] },
|
id: "",
|
||||||
{ name: "Skill 3", keywords: ["c"] },
|
name: "Skill 1",
|
||||||
|
keywords: ["a"],
|
||||||
|
description: "",
|
||||||
|
level: 1,
|
||||||
|
visible: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: "",
|
||||||
|
name: "Skill 2",
|
||||||
|
keywords: ["b"],
|
||||||
|
description: "",
|
||||||
|
level: 1,
|
||||||
|
visible: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: "",
|
||||||
|
name: "Skill 3",
|
||||||
|
keywords: ["c"],
|
||||||
|
description: "",
|
||||||
|
level: 1,
|
||||||
|
visible: true,
|
||||||
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
};
|
} as any;
|
||||||
vi.mocked(getProfile).mockResolvedValueOnce(profileWithoutIds);
|
vi.mocked(getProfile).mockResolvedValueOnce(profileWithoutIds);
|
||||||
|
|
||||||
await generatePdf("job-cuid2-test", {}, "Job Desc", "dummy.json");
|
await generatePdf("job-cuid2-test", {}, "Job Desc", "dummy.json");
|
||||||
@ -285,10 +363,20 @@ describe("PDF Service Skills Validation", () => {
|
|||||||
sections: {
|
sections: {
|
||||||
...mockProfile.sections,
|
...mockProfile.sections,
|
||||||
skills: {
|
skills: {
|
||||||
items: [{ name: "Skill Without ID", keywords: ["test"] }],
|
...mockProfile.sections.skills,
|
||||||
|
items: [
|
||||||
|
{
|
||||||
|
id: "",
|
||||||
|
name: "Skill Without ID",
|
||||||
|
keywords: ["test"],
|
||||||
|
description: "",
|
||||||
|
level: 1,
|
||||||
|
visible: true,
|
||||||
|
},
|
||||||
|
],
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
};
|
} as any;
|
||||||
vi.mocked(getProfile).mockResolvedValueOnce(profileWithoutIds);
|
vi.mocked(getProfile).mockResolvedValueOnce(profileWithoutIds);
|
||||||
|
|
||||||
await generatePdf("job-no-skill-prefix", {}, "Job Desc", "dummy.json");
|
await generatePdf("job-no-skill-prefix", {}, "Job Desc", "dummy.json");
|
||||||
|
|||||||
@ -71,6 +71,13 @@ vi.mock("fs/promises", async () => {
|
|||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
|
vi.mock("node:fs/promises", async () => {
|
||||||
|
return {
|
||||||
|
default: mocks,
|
||||||
|
...mocks,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
vi.mock("fs", () => ({
|
vi.mock("fs", () => ({
|
||||||
existsSync: vi.fn().mockReturnValue(true),
|
existsSync: vi.fn().mockReturnValue(true),
|
||||||
createWriteStream: vi.fn().mockReturnValue({
|
createWriteStream: vi.fn().mockReturnValue({
|
||||||
@ -88,6 +95,23 @@ vi.mock("fs", () => ({
|
|||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
vi.mock("node:fs", () => ({
|
||||||
|
existsSync: vi.fn().mockReturnValue(true),
|
||||||
|
createWriteStream: vi.fn().mockReturnValue({
|
||||||
|
on: vi.fn(),
|
||||||
|
write: vi.fn(),
|
||||||
|
end: vi.fn(),
|
||||||
|
}),
|
||||||
|
default: {
|
||||||
|
existsSync: vi.fn().mockReturnValue(true),
|
||||||
|
createWriteStream: vi.fn().mockReturnValue({
|
||||||
|
on: vi.fn(),
|
||||||
|
write: vi.fn(),
|
||||||
|
end: vi.fn(),
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
vi.mock("../repositories/settings.js", () => ({
|
vi.mock("../repositories/settings.js", () => ({
|
||||||
getSetting: vi.fn().mockImplementation((key: string) => {
|
getSetting: vi.fn().mockImplementation((key: string) => {
|
||||||
if (key === "rxresumeEmail") return Promise.resolve("test@example.com");
|
if (key === "rxresumeEmail") return Promise.resolve("test@example.com");
|
||||||
@ -125,7 +149,9 @@ vi.mock("./resumeProjects.js", () => ({
|
|||||||
|
|
||||||
// Mock the RxResumeClient
|
// Mock the RxResumeClient
|
||||||
vi.mock("./rxresume-client.js", () => ({
|
vi.mock("./rxresume-client.js", () => ({
|
||||||
RxResumeClient: vi.fn().mockImplementation(() => mockRxResumeClient),
|
RxResumeClient: vi.fn().mockImplementation(function (this: any) {
|
||||||
|
return mockRxResumeClient;
|
||||||
|
}),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// Mock stream pipeline for downloading PDF
|
// Mock stream pipeline for downloading PDF
|
||||||
@ -136,6 +162,13 @@ vi.mock("stream/promises", () => ({
|
|||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
vi.mock("node:stream/promises", () => ({
|
||||||
|
pipeline: vi.fn().mockResolvedValue(undefined),
|
||||||
|
default: {
|
||||||
|
pipeline: vi.fn().mockResolvedValue(undefined),
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
// Mock stream Readable
|
// Mock stream Readable
|
||||||
vi.mock("stream", () => ({
|
vi.mock("stream", () => ({
|
||||||
Readable: {
|
Readable: {
|
||||||
@ -152,6 +185,21 @@ vi.mock("stream", () => ({
|
|||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
vi.mock("node:stream", () => ({
|
||||||
|
Readable: {
|
||||||
|
fromWeb: vi.fn().mockReturnValue({
|
||||||
|
pipe: vi.fn(),
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
default: {
|
||||||
|
Readable: {
|
||||||
|
fromWeb: vi.fn().mockReturnValue({
|
||||||
|
pipe: vi.fn(),
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
// Mock global fetch
|
// Mock global fetch
|
||||||
vi.stubGlobal(
|
vi.stubGlobal(
|
||||||
"fetch",
|
"fetch",
|
||||||
|
|||||||
@ -49,7 +49,7 @@ export async function getProfile(forceRefresh = false): Promise<ResumeProfile> {
|
|||||||
throw new Error("Resume data is empty or invalid");
|
throw new Error("Resume data is empty or invalid");
|
||||||
}
|
}
|
||||||
|
|
||||||
cachedProfile = resume.data;
|
cachedProfile = resume.data as unknown as ResumeProfile;
|
||||||
cachedResumeId = rxresumeBaseResumeId;
|
cachedResumeId = rxresumeBaseResumeId;
|
||||||
console.log(`✅ Profile loaded from RxResume v4 API`);
|
console.log(`✅ Profile loaded from RxResume v4 API`);
|
||||||
return cachedProfile;
|
return cachedProfile;
|
||||||
|
|||||||
@ -29,15 +29,24 @@ describe("Resume Projects Logic", () => {
|
|||||||
{
|
{
|
||||||
id: "p1",
|
id: "p1",
|
||||||
name: "Proj 1",
|
name: "Proj 1",
|
||||||
|
description: "Desc 1",
|
||||||
summary: "<p>Desc 1</p>",
|
summary: "<p>Desc 1</p>",
|
||||||
|
date: "2024",
|
||||||
visible: true,
|
visible: true,
|
||||||
},
|
},
|
||||||
{ id: "p2", name: "Proj 2", summary: "Desc 2", visible: false },
|
{
|
||||||
{ name: "No ID" }, // Should be skipped
|
id: "p2",
|
||||||
|
name: "Proj 2",
|
||||||
|
description: "Desc 2",
|
||||||
|
summary: "Desc 2",
|
||||||
|
date: "2023",
|
||||||
|
visible: false,
|
||||||
|
},
|
||||||
|
{ name: "No ID" } as any, // Should be skipped
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
};
|
} as any;
|
||||||
|
|
||||||
const { catalog, selectionItems } =
|
const { catalog, selectionItems } =
|
||||||
rp.extractProjectsFromProfile(profile);
|
rp.extractProjectsFromProfile(profile);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user