don't recrawl jobs if already seen

This commit is contained in:
DaKheera47 2025-12-14 15:35:29 +00:00
parent 9122f0f9aa
commit 59c943b4b6
4 changed files with 114 additions and 12 deletions

View File

@ -1,4 +1,47 @@
import { createPlaywrightRouter, log } from "crawlee";
import { readFileSync } from "node:fs";
/**
 * Canonicalize a job URL so that equivalent links compare equal as set keys.
 *
 * The fragment is dropped, the query string is kept verbatim (some sites encode
 * job IDs there), and a single trailing slash is removed from the path.
 *
 * @param raw - URL text to normalize; may be null, undefined, or empty.
 * @returns The normalized URL, or null when `raw` is null, undefined, or empty.
 */
function normalizeUrl(raw: string | null | undefined): string | null {
  if (!raw) return null;
  try {
    const url = new URL(raw);
    url.hash = "";
    // Detach the query before trimming: otherwise the trailing-slash rule would
    // miss a path like "/job/1/?ref=x" and would eat the slash of "?next=/".
    const query = url.search;
    url.search = "";
    return url.toString().replace(/\/$/, "") + query;
  } catch {
    // Not parseable as an absolute URL: best effort, trim one trailing slash.
    return raw.replace(/\/$/, "");
  }
}
/**
 * Build the set of already-known job URLs handed to the crawler via the environment.
 *
 * When JOBOPS_EXISTING_JOB_URLS_FILE is set, the JSON payload is read from that
 * file and the inline variable is ignored; otherwise JOBOPS_EXISTING_JOB_URLS is
 * used. The payload must be a JSON array: string entries are normalized with
 * normalizeUrl, everything else is dropped. A missing, unreadable, or malformed
 * payload yields an empty set rather than an error.
 *
 * @returns Normalized known job URLs (possibly empty).
 */
function getExistingJobUrlSet(): Set<string> {
  const urlsFile = process.env.JOBOPS_EXISTING_JOB_URLS_FILE;

  let payload: string | null | undefined;
  if (urlsFile) {
    try {
      payload = readFileSync(urlsFile, "utf-8");
    } catch {
      // Unreadable file: treated the same as "no known URLs".
      payload = null;
    }
  } else {
    payload = process.env.JOBOPS_EXISTING_JOB_URLS;
  }

  if (!payload) return new Set();

  try {
    const decoded = JSON.parse(payload);
    if (!Array.isArray(decoded)) return new Set();

    const known = new Set<string>();
    for (const entry of decoded) {
      const url = normalizeUrl(typeof entry === "string" ? entry : null);
      if (url) known.add(url);
    }
    return known;
  } catch {
    // Invalid JSON: fall back to an empty set.
    return new Set();
  }
}
// Opt-in flag: true only when JOBOPS_SKIP_APPLY_FOR_EXISTING is exactly "1".
const SKIP_APPLY_FOR_EXISTING = process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1";
// Normalized known job URLs, computed once when this module is loaded.
const EXISTING_JOB_URLS = getExistingJobUrlSet();
interface Job {
title: string | null;
@ -37,6 +80,8 @@ router.addHandler(
const articles = await page.locator("article[wire\\:key]").all();
const jobs: Job[] = [];
let skippedKnownJobs = 0;
let enqueuedJobs = 0;
console.log(`${articles.length} jobs found`);
@ -117,17 +162,33 @@ router.addHandler(
// append more links to crawl: single job pages
if (jobUrl) {
await enqueueLinks({
urls: [jobUrl],
userData: {
...jobs[jobs.length - 1],
label: "gradcracker-single-job-page"
},
});
const jobUrlNormalized = normalizeUrl(jobUrl);
const isKnownJob =
SKIP_APPLY_FOR_EXISTING &&
jobUrlNormalized !== null &&
EXISTING_JOB_URLS.has(jobUrlNormalized);
if (isKnownJob) {
skippedKnownJobs++;
} else {
await enqueueLinks({
urls: [jobUrl],
userData: {
...jobs[jobs.length - 1],
label: "gradcracker-single-job-page"
},
});
enqueuedJobs++;
}
}
}
log.info(`Extracted ${jobs.length} jobs`);
if (SKIP_APPLY_FOR_EXISTING && skippedKnownJobs > 0) {
log.info(
`Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`
);
}
}
);
@ -149,10 +210,16 @@ router.addHandler(
const applyButton = page.locator('a[dusk="apply-button"]');
const hasApplyButton = (await applyButton.count()) > 0;
const requestUrlNormalized = normalizeUrl(request.url);
const isKnownJob =
SKIP_APPLY_FOR_EXISTING &&
requestUrlNormalized !== null &&
EXISTING_JOB_URLS.has(requestUrlNormalized);
let applicationLink: string | null = null;
let spawnedPage: typeof page | null = null;
if (hasApplyButton) {
if (hasApplyButton && !isKnownJob) {
const originalUrl = page.url();
// Prefer page-scoped popup detection. Using the browser context's "page" event
@ -224,8 +291,10 @@ router.addHandler(
await spawnedPage.close().catch(() => null);
}
}
} else {
} else if (!hasApplyButton) {
log.warning(`Apply button not found on page: ${request.url}`);
} else {
log.info(`Skipping apply click for known job: ${request.url}`);
}
await pushData({

View File

@ -72,7 +72,8 @@ export async function runPipeline(config: Partial<PipelineConfig> = {}): Promise
// Step 2: Run crawler
console.log('\n🕷 Running crawler...');
progressHelpers.startCrawling();
const crawlerResult = await runCrawler();
const existingJobUrls = await jobsRepo.getAllJobUrls();
const crawlerResult = await runCrawler({ existingJobUrls });
if (!crawlerResult.success) {
throw new Error(`Crawler failed: ${crawlerResult.error}`);

View File

@ -37,6 +37,14 @@ export async function getJobByUrl(jobUrl: string): Promise<Job | null> {
return row ? mapRowToJob(row) : null;
}
/**
 * Fetch the URL of every stored job (for deduplication / crawler optimizations).
 *
 * Only the `jobUrl` column is selected from `jobs`.
 *
 * @returns One URL per row, in the order the query returned them.
 */
export async function getAllJobUrls(): Promise<string[]> {
  const urlRows = await db.select({ jobUrl: jobs.jobUrl }).from(jobs);
  const urls: string[] = [];
  for (const { jobUrl } of urlRows) {
    urls.push(jobUrl);
  }
  return urls;
}
/**
* Create a new job (or return existing if URL matches).
*/

View File

@ -6,12 +6,13 @@
import { spawn } from 'child_process';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { readdir, readFile } from 'fs/promises';
import { mkdir, readdir, readFile, writeFile } from 'fs/promises';
import type { CreateJobInput } from '../../shared/types.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
const CRAWLER_DIR = join(__dirname, '../../../../job-extractor');
const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default');
const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops');
export interface CrawlerResult {
success: boolean;
@ -19,15 +20,33 @@ export interface CrawlerResult {
error?: string;
}
/** Optional inputs accepted by `runCrawler`. */
export interface RunCrawlerOptions {
  /**
   * Job page URLs that the orchestrator DB already holds. The crawler uses
   * them to avoid expensive/undesired interactions (e.g. apply button click).
   */
  existingJobUrls?: Array<string>;
}
/**
 * Persist the known job URLs as a JSON array under JOBOPS_STORAGE_DIR.
 *
 * Nothing is written when the list is missing or empty. Otherwise the storage
 * directory is created if needed and `existing-job-urls.json` is (over)written.
 *
 * @param existingJobUrls - URLs to persist.
 * @returns Path of the written file, or null when there was nothing to write.
 */
async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined): Promise<string | null> {
  if (!existingJobUrls?.length) {
    return null;
  }

  const targetPath = join(JOBOPS_STORAGE_DIR, 'existing-job-urls.json');
  const serialized = JSON.stringify(existingJobUrls);

  await mkdir(JOBOPS_STORAGE_DIR, { recursive: true });
  await writeFile(targetPath, serialized, 'utf-8');
  return targetPath;
}
/**
* Run the job-extractor crawler and return discovered jobs.
*/
export async function runCrawler(): Promise<CrawlerResult> {
export async function runCrawler(options: RunCrawlerOptions = {}): Promise<CrawlerResult> {
console.log('🕷️ Starting job crawler...');
try {
// Clear previous results
await clearStorageDataset();
const existingJobUrlsFile = await writeExistingJobUrlsFile(options.existingJobUrls);
// Run the crawler
await new Promise<void>((resolve, reject) => {
@ -35,6 +54,11 @@ export async function runCrawler(): Promise<CrawlerResult> {
cwd: CRAWLER_DIR,
shell: true,
stdio: 'inherit',
env: {
...process.env,
JOBOPS_SKIP_APPLY_FOR_EXISTING: '1',
...(existingJobUrlsFile ? { JOBOPS_EXISTING_JOB_URLS_FILE: existingJobUrlsFile } : {}),
},
});
child.on('close', (code) => {