don't recrawl jobs if already seen

2025-12-14 15:35:29 +00:00 · 2025-12-14 15:35:29 +00:00 · 59c943b4b6
commit 59c943b4b6
parent 9122f0f9aa
4 changed files with 114 additions and 12 deletions
--- a/job-extractor/src/routes.ts
+++ b/job-extractor/src/routes.ts
@ -1,4 +1,47 @@
 import { createPlaywrightRouter, log } from "crawlee";
+import { readFileSync } from "node:fs";
+
+function normalizeUrl(raw: string | null | undefined): string | null { // Canonical form of a job URL for set-membership checks; null for null/undefined/empty input.
+  if (!raw) return null;
+  try {
+    const url = new URL(raw);
+    url.hash = ""; // drop the #fragment so it does not affect comparison
+    // Query parameters are kept (some sites encode job IDs there); only a single trailing slash is removed.
+    const normalized = url.toString().replace(/\/$/, ""); // NOTE(review): the regex is anchored at the end of the whole serialized URL, so a slash that precedes a query string ("/job/?id=1") is left in place
+    return normalized;
+  } catch { // reached when `new URL` rejects the string (e.g. a relative path)
+    return raw.replace(/\/$/, ""); // fallback: strip one trailing slash only; NOTE(review): a #fragment survives on this path
+  }
+}
+
+function getExistingJobUrlSet(): Set<string> { // Builds the set of already-known job URLs from the environment; an empty set is returned whenever the input is absent or malformed.
+  const filePath = process.env.JOBOPS_EXISTING_JOB_URLS_FILE;
+  const raw = // JSON text: the file's contents when a file path is configured, otherwise the inline env value
+    filePath
+      ? (() => {
+          try {
+            return readFileSync(filePath, "utf-8");
+          } catch { // an unreadable file is treated as "no list"
+            return null; // NOTE(review): the failure is not logged and there is no fallback to JOBOPS_EXISTING_JOB_URLS
+          }
+        })()
+      : process.env.JOBOPS_EXISTING_JOB_URLS;
+
+  if (!raw) return new Set();
+  try {
+    const parsed = JSON.parse(raw);
+    if (!Array.isArray(parsed)) return new Set(); // only a JSON array is accepted
+    const normalized = parsed
+      .map((u) => normalizeUrl(typeof u === "string" ? u : null)) // non-string entries are mapped to null
+      .filter((u): u is string => Boolean(u)); // drop nulls and empty strings
+    return new Set(normalized);
+  } catch { // invalid JSON
+    return new Set();
+  }
+}
+
+const SKIP_APPLY_FOR_EXISTING = process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1"; // feature flag: on only for the exact string "1"
+const EXISTING_JOB_URLS = getExistingJobUrlSet(); // normalized known-job URLs, evaluated once when this module is loaded

 interface Job {
  title: string | null;
@ -37,6 +80,8 @@ router.addHandler(

    const articles = await page.locator("article[wire\\:key]").all();
    const jobs: Job[] = [];
+    let skippedKnownJobs = 0;
+    let enqueuedJobs = 0;

    console.log(`${articles.length} jobs found`);

@ -117,17 +162,33 @@ router.addHandler(

      // append more links to crawl: single job pages
      if (jobUrl) {
-        await enqueueLinks({
-          urls: [jobUrl],
-          userData: {
-            ...jobs[jobs.length - 1],
-            label: "gradcracker-single-job-page"
-          },
-        });
+        const jobUrlNormalized = normalizeUrl(jobUrl);
+        const isKnownJob =
+          SKIP_APPLY_FOR_EXISTING &&
+          jobUrlNormalized !== null &&
+          EXISTING_JOB_URLS.has(jobUrlNormalized);
+
+        if (isKnownJob) {
+          skippedKnownJobs++;
+        } else {
+          await enqueueLinks({
+            urls: [jobUrl],
+            userData: {
+              ...jobs[jobs.length - 1],
+              label: "gradcracker-single-job-page"
+            },
+          });
+          enqueuedJobs++;
+        }
      }
    }

    log.info(`Extracted ${jobs.length} jobs`);
+    if (SKIP_APPLY_FOR_EXISTING && skippedKnownJobs > 0) {
+      log.info(
+        `Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`
+      );
+    }
  }
 );

@ -149,10 +210,16 @@ router.addHandler(
    const applyButton = page.locator('a[dusk="apply-button"]');
    const hasApplyButton = (await applyButton.count()) > 0;

+    const requestUrlNormalized = normalizeUrl(request.url);
+    const isKnownJob =
+      SKIP_APPLY_FOR_EXISTING &&
+      requestUrlNormalized !== null &&
+      EXISTING_JOB_URLS.has(requestUrlNormalized);
+
    let applicationLink: string | null = null;
    let spawnedPage: typeof page | null = null;

-    if (hasApplyButton) {
+    if (hasApplyButton && !isKnownJob) {
      const originalUrl = page.url();

      // Prefer page-scoped popup detection. Using the browser context's "page" event
@ -224,8 +291,10 @@ router.addHandler(
          await spawnedPage.close().catch(() => null);
        }
      }
-    } else {
+    } else if (!hasApplyButton) {
      log.warning(`Apply button not found on page: ${request.url}`);
+    } else {
+      log.info(`Skipping apply click for known job: ${request.url}`);
    }

    await pushData({
--- a/orchestrator/src/server/pipeline/orchestrator.ts
+++ b/orchestrator/src/server/pipeline/orchestrator.ts
@ -72,7 +72,8 @@ export async function runPipeline(config: Partial<PipelineConfig> = {}): Promise
    // Step 2: Run crawler
    console.log('\n🕷️ Running crawler...');
    progressHelpers.startCrawling();
-    const crawlerResult = await runCrawler();
+    const existingJobUrls = await jobsRepo.getAllJobUrls();
+    const crawlerResult = await runCrawler({ existingJobUrls });
    
    if (!crawlerResult.success) {
      throw new Error(`Crawler failed: ${crawlerResult.error}`);
--- a/orchestrator/src/server/repositories/jobs.ts
+++ b/orchestrator/src/server/repositories/jobs.ts
@ -37,6 +37,14 @@ export async function getJobByUrl(jobUrl: string): Promise<Job | null> {
  return row ? mapRowToJob(row) : null;
 }

+/**
+ * Get all known job URLs (for deduplication / crawler optimizations): the `jobUrl` of every row in `jobs`, returned as stored (not normalized or de-duplicated here).
+ */
+export async function getAllJobUrls(): Promise<string[]> {
+  const rows = await db.select({ jobUrl: jobs.jobUrl }).from(jobs); // no filter is applied, so every row is read
+  return rows.map(r => r.jobUrl);
+}
+
 /**
 * Create a new job (or return existing if URL matches).
 */
--- a/orchestrator/src/server/services/crawler.ts
+++ b/orchestrator/src/server/services/crawler.ts
@ -6,12 +6,13 @@
 import { spawn } from 'child_process';
 import { join, dirname } from 'path';
 import { fileURLToPath } from 'url';
-import { readdir, readFile } from 'fs/promises';
+import { mkdir, readdir, readFile, writeFile } from 'fs/promises';
 import type { CreateJobInput } from '../../shared/types.js';

 const __dirname = dirname(fileURLToPath(import.meta.url));
 const CRAWLER_DIR = join(__dirname, '../../../../job-extractor');
 const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default');
+const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops');

 export interface CrawlerResult {
  success: boolean;
@ -19,22 +20,45 @@ export interface CrawlerResult {
  error?: string;
 }

+export interface RunCrawlerOptions {
+  /**
+   * Job page URLs already present in the orchestrator DB. When non-empty, the list is written to a JSON file whose path is handed to the crawler process via `JOBOPS_EXISTING_JOB_URLS_FILE`.
+   * Used by the crawler to avoid expensive/undesired interactions (e.g. apply button click). Omitted or empty: no file is written and no path is passed.
+   */
+  existingJobUrls?: string[];
+}
+
+async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined): Promise<string | null> { // Serializes the URL list to existing-job-urls.json inside JOBOPS_STORAGE_DIR; resolves to that file's path, or null when there is nothing to write.
+  if (!existingJobUrls || existingJobUrls.length === 0) return null; // NOTE(review): a file left by an earlier run is not removed on this path (runCrawler only passes a path when one is returned)
+  await mkdir(JOBOPS_STORAGE_DIR, { recursive: true }); // recursive: creates missing parents and does not fail if the directory already exists
+  const filePath = join(JOBOPS_STORAGE_DIR, 'existing-job-urls.json');
+  await writeFile(filePath, JSON.stringify(existingJobUrls), 'utf-8'); // fixed file name, so each run replaces the previous contents
+  return filePath;
+}
+
 /**
 * Run the job-extractor crawler and return discovered jobs.
 */
-export async function runCrawler(): Promise<CrawlerResult> {
+export async function runCrawler(options: RunCrawlerOptions = {}): Promise<CrawlerResult> {
  console.log('🕷️ Starting job crawler...');
  
  try {
    // Clear previous results
    await clearStorageDataset();

+    const existingJobUrlsFile = await writeExistingJobUrlsFile(options.existingJobUrls);
+    
    // Run the crawler
    await new Promise<void>((resolve, reject) => {
      const child = spawn('npm', ['run', 'start'], {
        cwd: CRAWLER_DIR,
        shell: true,
        stdio: 'inherit',
+        env: {
+          ...process.env,
+          JOBOPS_SKIP_APPLY_FOR_EXISTING: '1',
+          ...(existingJobUrlsFile ? { JOBOPS_EXISTING_JOB_URLS_FILE: existingJobUrlsFile } : {}),
+        },
      });
      
      child.on('close', (code) => {