diff --git a/job-extractor/src/routes.ts b/job-extractor/src/routes.ts index b2205f3..27a9729 100644 --- a/job-extractor/src/routes.ts +++ b/job-extractor/src/routes.ts @@ -1,4 +1,47 @@ import { createPlaywrightRouter, log } from "crawlee"; +import { readFileSync } from "node:fs"; + +function normalizeUrl(raw: string | null | undefined): string | null { + if (!raw) return null; + try { + const url = new URL(raw); + url.hash = ""; + // Keep search params (some sites encode job IDs there); just normalize trailing slash. + const normalized = url.toString().replace(/\/$/, ""); + return normalized; + } catch { + return raw.replace(/\/$/, ""); + } +} + +function getExistingJobUrlSet(): Set { + const filePath = process.env.JOBOPS_EXISTING_JOB_URLS_FILE; + const raw = + filePath + ? (() => { + try { + return readFileSync(filePath, "utf-8"); + } catch { + return null; + } + })() + : process.env.JOBOPS_EXISTING_JOB_URLS; + + if (!raw) return new Set(); + try { + const parsed = JSON.parse(raw); + if (!Array.isArray(parsed)) return new Set(); + const normalized = parsed + .map((u) => normalizeUrl(typeof u === "string" ? u : null)) + .filter((u): u is string => Boolean(u)); + return new Set(normalized); + } catch { + return new Set(); + } +} + +const SKIP_APPLY_FOR_EXISTING = process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1"; +const EXISTING_JOB_URLS = getExistingJobUrlSet(); interface Job { title: string | null; @@ -37,6 +80,8 @@ router.addHandler( const articles = await page.locator("article[wire\\:key]").all(); const jobs: Job[] = []; + let skippedKnownJobs = 0; + let enqueuedJobs = 0; console.log(`${articles.length} jobs found`); @@ -117,17 +162,33 @@ router.addHandler( // append more links to crawl: single job pages if (jobUrl) { - await enqueueLinks({ - urls: [jobUrl], - userData: { - ...jobs[jobs.length - 1], - label: "gradcracker-single-job-page" - }, - }); + const jobUrlNormalized = normalizeUrl(jobUrl); + const isKnownJob = + SKIP_APPLY_FOR_EXISTING && + jobUrlNormalized !== null && + EXISTING_JOB_URLS.has(jobUrlNormalized); + + if (isKnownJob) { + skippedKnownJobs++; + } else { + await enqueueLinks({ + urls: [jobUrl], + userData: { + ...jobs[jobs.length - 1], + label: "gradcracker-single-job-page" + }, + }); + enqueuedJobs++; + } } } log.info(`Extracted ${jobs.length} jobs`); + if (SKIP_APPLY_FOR_EXISTING && skippedKnownJobs > 0) { + log.info( + `Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.` + ); + } } ); @@ -149,10 +210,16 @@ router.addHandler( const applyButton = page.locator('a[dusk="apply-button"]'); const hasApplyButton = (await applyButton.count()) > 0; + const requestUrlNormalized = normalizeUrl(request.url); + const isKnownJob = + SKIP_APPLY_FOR_EXISTING && + requestUrlNormalized !== null && + EXISTING_JOB_URLS.has(requestUrlNormalized); + let applicationLink: string | null = null; let spawnedPage: typeof page | null = null; - if (hasApplyButton) { + if (hasApplyButton && !isKnownJob) { const originalUrl = page.url(); // Prefer page-scoped popup detection. Using the browser context's "page" event @@ -224,8 +291,10 @@ router.addHandler( await spawnedPage.close().catch(() => null); } } - } else { + } else if (!hasApplyButton) { log.warning(`Apply button not found on page: ${request.url}`); + } else { + log.info(`Skipping apply click for known job: ${request.url}`); } await pushData({ diff --git a/orchestrator/src/server/pipeline/orchestrator.ts b/orchestrator/src/server/pipeline/orchestrator.ts index 0a7d75c..99a78a8 100644 --- a/orchestrator/src/server/pipeline/orchestrator.ts +++ b/orchestrator/src/server/pipeline/orchestrator.ts @@ -72,7 +72,8 @@ export async function runPipeline(config: Partial = {}): Promise // Step 2: Run crawler console.log('\n🕷️ Running crawler...'); progressHelpers.startCrawling(); - const crawlerResult = await runCrawler(); + const existingJobUrls = await jobsRepo.getAllJobUrls(); + const crawlerResult = await runCrawler({ existingJobUrls }); if (!crawlerResult.success) { throw new Error(`Crawler failed: ${crawlerResult.error}`); diff --git a/orchestrator/src/server/repositories/jobs.ts b/orchestrator/src/server/repositories/jobs.ts index a280893..19a3461 100644 --- a/orchestrator/src/server/repositories/jobs.ts +++ b/orchestrator/src/server/repositories/jobs.ts @@ -37,6 +37,14 @@ export async function getJobByUrl(jobUrl: string): Promise { return row ? mapRowToJob(row) : null; } +/** + * Get all known job URLs (for deduplication / crawler optimizations). + */ +export async function getAllJobUrls(): Promise { + const rows = await db.select({ jobUrl: jobs.jobUrl }).from(jobs); + return rows.map(r => r.jobUrl); +} + /** * Create a new job (or return existing if URL matches). */ diff --git a/orchestrator/src/server/services/crawler.ts b/orchestrator/src/server/services/crawler.ts index 7de78ac..acb8915 100644 --- a/orchestrator/src/server/services/crawler.ts +++ b/orchestrator/src/server/services/crawler.ts @@ -6,12 +6,13 @@ import { spawn } from 'child_process'; import { join, dirname } from 'path'; import { fileURLToPath } from 'url'; -import { readdir, readFile } from 'fs/promises'; +import { mkdir, readdir, readFile, writeFile } from 'fs/promises'; import type { CreateJobInput } from '../../shared/types.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); const CRAWLER_DIR = join(__dirname, '../../../../job-extractor'); const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default'); +const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops'); export interface CrawlerResult { success: boolean; @@ -19,15 +20,33 @@ export interface CrawlerResult { error?: string; } +export interface RunCrawlerOptions { + /** + * List of job page URLs already present in the orchestrator DB. + * Used by the crawler to avoid expensive/undesired interactions (e.g. apply button click). + */ + existingJobUrls?: string[]; +} + +async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined): Promise { + if (!existingJobUrls || existingJobUrls.length === 0) return null; + await mkdir(JOBOPS_STORAGE_DIR, { recursive: true }); + const filePath = join(JOBOPS_STORAGE_DIR, 'existing-job-urls.json'); + await writeFile(filePath, JSON.stringify(existingJobUrls), 'utf-8'); + return filePath; +} + /** * Run the job-extractor crawler and return discovered jobs. */ -export async function runCrawler(): Promise { +export async function runCrawler(options: RunCrawlerOptions = {}): Promise { console.log('🕷️ Starting job crawler...'); try { // Clear previous results await clearStorageDataset(); + + const existingJobUrlsFile = await writeExistingJobUrlsFile(options.existingJobUrls); // Run the crawler await new Promise((resolve, reject) => { @@ -35,6 +54,11 @@ export async function runCrawler(): Promise { cwd: CRAWLER_DIR, shell: true, stdio: 'inherit', + env: { + ...process.env, + JOBOPS_SKIP_APPLY_FOR_EXISTING: '1', + ...(existingJobUrlsFile ? { JOBOPS_EXISTING_JOB_URLS_FILE: existingJobUrlsFile } : {}), + }, }); child.on('close', (code) => {