Jobber/orchestrator/src/server/services/crawler.ts

/**
 * Service for running the Gradcracker crawler (extractors/gradcracker).
 * Wraps the existing Crawlee-based crawler.
 */

import { spawn } from 'child_process';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { mkdir, readdir, readFile, writeFile } from 'fs/promises';
import { createInterface } from 'readline';
import type { CreateJobInput } from '../../shared/types.js';

const __dirname = dirname(fileURLToPath(import.meta.url));
const CRAWLER_DIR = join(__dirname, '../../../../extractors/gradcracker');
const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default');
const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops');

export interface CrawlerResult {
  success: boolean;
  jobs: CreateJobInput[];
  error?: string;
}

export interface RunCrawlerOptions {
  /**
   * List of job page URLs already present in the orchestrator DB.
   * Used by the crawler to avoid expensive/undesired interactions (e.g. apply button click).
   */
  existingJobUrls?: string[];

  /**
   * Optional callback for live crawl progress emitted by the Gradcracker extractor.
   */
  onProgress?: (update: JobExtractorProgress) => void;

  /**
   * List of search terms to be used as roles for URL generation.
   */
  searchTerms?: string[];

  /**
   * Max jobs to fetch per search term.
   */
  maxJobsPerTerm?: number;
}

interface JobExtractorProgress {
  phase?: 'list' | 'job';
  currentUrl?: string;
  listPagesProcessed?: number;
  listPagesTotal?: number;
  jobCardsFound?: number;
  jobPagesEnqueued?: number;
  jobPagesSkipped?: number;
  jobPagesProcessed?: number;
  ts?: string;
}

const JOBOPS_PROGRESS_PREFIX = 'JOBOPS_PROGRESS ';

async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined): Promise<string | null> {
  if (!existingJobUrls || existingJobUrls.length === 0) return null;
  await mkdir(JOBOPS_STORAGE_DIR, { recursive: true });
  const filePath = join(JOBOPS_STORAGE_DIR, 'existing-job-urls.json');
  await writeFile(filePath, JSON.stringify(existingJobUrls), 'utf-8');
  return filePath;
}

/**
 * Run the Gradcracker crawler and return discovered jobs.
 */
export async function runCrawler(options: RunCrawlerOptions = {}): Promise<CrawlerResult> {
  console.log('🕷️ Starting job crawler...');

  try {
    // Clear previous results
    await clearStorageDataset();

    const existingJobUrlsFile = await writeExistingJobUrlsFile(options.existingJobUrls);

    // Run the crawler
    await new Promise<void>((resolve, reject) => {
      const child = spawn('npm', ['run', 'start'], {
        cwd: CRAWLER_DIR,
        shell: true,
        stdio: ['ignore', 'pipe', 'pipe'],
        env: {
          ...process.env,
          JOBOPS_SKIP_APPLY_FOR_EXISTING: '1',
          JOBOPS_EMIT_PROGRESS: '1',
          GRADCRACKER_SEARCH_TERMS: options.searchTerms ? JSON.stringify(options.searchTerms) : '',
          GRADCRACKER_MAX_JOBS_PER_TERM: options.maxJobsPerTerm ? String(options.maxJobsPerTerm) : '',
          ...(existingJobUrlsFile ? { JOBOPS_EXISTING_JOB_URLS_FILE: existingJobUrlsFile } : {}),
        },
      });

      const handleLine = (line: string, stream: NodeJS.WriteStream) => {
        if (line.startsWith(JOBOPS_PROGRESS_PREFIX)) {
          const raw = line.slice(JOBOPS_PROGRESS_PREFIX.length).trim();
          try {
            const parsed = JSON.parse(raw) as JobExtractorProgress;
            options.onProgress?.(parsed);
          } catch {
            // Ignore malformed progress lines
          }
          return;
        }
        stream.write(`${line}\n`);
      };

      const stdoutRl = child.stdout ? createInterface({ input: child.stdout }) : null;
      const stderrRl = child.stderr ? createInterface({ input: child.stderr }) : null;

      stdoutRl?.on('line', (line) => handleLine(line, process.stdout));
      stderrRl?.on('line', (line) => handleLine(line, process.stderr));

      child.on('close', (code) => {
        stdoutRl?.close();
        stderrRl?.close();
        if (code === 0) {
          resolve();
        } else {
          reject(new Error(`Crawler exited with code ${code}`));
        }
      });

      child.on('error', reject);
    });

    // Read crawled jobs from storage
    const jobs = await readCrawledJobs();

    console.log(`✅ Crawler completed. Found ${jobs.length} jobs.`);

    return { success: true, jobs };
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Unknown error';
    console.error('❌ Crawler failed:', message);
    return { success: false, jobs: [], error: message };
  }
}

/**
 * Read crawled jobs from the Crawlee storage dataset.
 */
async function readCrawledJobs(): Promise<CreateJobInput[]> {
  try {
    const files = await readdir(STORAGE_DIR);
    const jsonFiles = files.filter(f => f.endsWith('.json'));

    const jobs: CreateJobInput[] = [];

    for (const file of jsonFiles) {
      const content = await readFile(join(STORAGE_DIR, file), 'utf-8');
      const data = JSON.parse(content);

      // Map crawler output to our job input format
      jobs.push({
        source: 'gradcracker',
        title: data.title || 'Unknown Title',
        employer: data.employer || 'Unknown Employer',
        employerUrl: data.employerUrl,
        jobUrl: data.url || data.jobUrl,
        applicationLink: data.applicationLink,
        disciplines: data.disciplines,
        deadline: data.deadline,
        salary: data.salary,
        location: data.location,
        degreeRequired: data.degreeRequired,
        starting: data.starting,
        jobDescription: data.jobDescription,
      });
    }

    return jobs;
  } catch (error) {
    console.error('Failed to read crawled jobs:', error);
    return [];
  }
}

/**
 * Clear previous crawl results.
 */
async function clearStorageDataset(): Promise<void> {
  const { rm } = await import('fs/promises');
  try {
    await rm(STORAGE_DIR, { recursive: true, force: true });
  } catch {
    // Ignore if directory doesn't exist
  }
}