2026-01-15 19:17:23 +00:00

193 lines
5.9 KiB
TypeScript

/**
* Service for running the Gradcracker crawler (extractors/gradcracker).
* Wraps the existing Crawlee-based crawler.
*/
import { spawn } from 'child_process';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { mkdir, readdir, readFile, writeFile } from 'fs/promises';
import { createInterface } from 'readline';
import type { CreateJobInput } from '../../shared/types.js';
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Directory of the Gradcracker extractor, resolved relative to this file; used as the
// working directory of the spawned crawler process.
const CRAWLER_DIR = join(__dirname, '../../../../extractors/gradcracker');
// Dataset directory that readCrawledJobs() scans for .json files and that
// clearStorageDataset() removes before each run.
const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default');
// Directory in which writeExistingJobUrlsFile() places its JSON file.
const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops');
/**
* Outcome returned by runCrawler().
*/
export interface CrawlerResult {
/** True when the crawler process exited with code 0 and its dataset was read back; false when runCrawler() caught an error. */
success: boolean;
/** Jobs mapped from the crawler's dataset files; runCrawler() returns an empty array on failure. */
jobs: CreateJobInput[];
/** Error message; runCrawler() sets it only on the failure path. */
error?: string;
}
/**
* Options accepted by runCrawler(). Every field is optional.
*/
export interface RunCrawlerOptions {
/**
* List of job page URLs already present in the orchestrator DB.
* Used by the crawler to avoid expensive/undesired interactions (e.g. apply button click).
* When non-empty, the list is written to a JSON file whose path is handed to the
* crawler process through the JOBOPS_EXISTING_JOB_URLS_FILE environment variable.
*/
existingJobUrls?: string[];
/**
* Optional callback for live crawl progress emitted by the Gradcracker extractor.
* Called once per successfully parsed progress line. Exceptions thrown by the
* callback are swallowed by runCrawler() together with JSON parse errors.
*/
onProgress?: (update: JobExtractorProgress) => void;
/**
* List of search terms to be used as roles for URL generation.
* Passed to the crawler process as a JSON string in GRADCRACKER_SEARCH_TERMS
* (an empty string when omitted).
*/
searchTerms?: string[];
/**
* Max jobs to fetch per search term.
* Passed to the crawler process in GRADCRACKER_MAX_JOBS_PER_TERM. Any falsy value,
* including 0, is sent as an empty string.
*/
maxJobsPerTerm?: number;
}
/**
* Shape of the JSON payload carried by a progress line of the crawler's output.
* runCrawler() casts the parsed JSON to this type without validating it, so every
* field is optional and may be absent at runtime.
* NOTE(review): the field descriptions below are inferred from the field names; the
* extractor that emits them is not part of this file — confirm against it.
*/
interface JobExtractorProgress {
// Presumably which kind of page is being processed: a listing page or a job page.
phase?: 'list' | 'job';
// Presumably the URL currently being crawled.
currentUrl?: string;
// Presumably counters for listing pages handled so far and in total.
listPagesProcessed?: number;
listPagesTotal?: number;
// Presumably the number of job cards discovered on listing pages.
jobCardsFound?: number;
// Presumably counters for job pages queued, skipped and processed.
jobPagesEnqueued?: number;
jobPagesSkipped?: number;
jobPagesProcessed?: number;
// Presumably a timestamp string for the update; format not visible here.
ts?: string;
}
// Marker (including its trailing space) that identifies a progress line in the crawler's
// output. runCrawler() strips this prefix and JSON-parses the remainder instead of
// echoing the line to the console.
const JOBOPS_PROGRESS_PREFIX = 'JOBOPS_PROGRESS ';
/**
 * Persist the already-known job URLs as a JSON array on disk so the crawler
 * process can be pointed at the file.
 *
 * @param existingJobUrls - URLs to write; may be undefined.
 * @returns Path of the written file, or `null` when the list is missing or
 * empty (in which case nothing is written).
 */
async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined): Promise<string | null> {
  if (existingJobUrls && existingJobUrls.length !== 0) {
    // Make sure the target directory is present before writing into it.
    await mkdir(JOBOPS_STORAGE_DIR, { recursive: true });

    const targetPath = join(JOBOPS_STORAGE_DIR, 'existing-job-urls.json');
    const serializedUrls = JSON.stringify(existingJobUrls);
    await writeFile(targetPath, serializedUrls, 'utf-8');

    return targetPath;
  }

  return null;
}
/**
 * Run the Gradcracker crawler and return discovered jobs.
 *
 * Steps: clear the previous dataset, optionally write the existing-job-URL
 * file, run `npm run start` in the crawler directory, then read the resulting
 * dataset back from disk.
 *
 * Crawler output is relayed line by line to this process's stdout/stderr,
 * except lines starting with the progress prefix, which are parsed as JSON and
 * handed to `options.onProgress`.
 *
 * @param options - Optional crawl settings.
 * @returns The crawl outcome. Failures (spawn error, non-zero exit, termination
 * by signal, ...) are caught, logged and reported as
 * `{ success: false, jobs: [], error }` rather than thrown.
 */
export async function runCrawler(options: RunCrawlerOptions = {}): Promise<CrawlerResult> {
  console.log('🕷️ Starting job crawler...');

  try {
    // Clear previous results
    await clearStorageDataset();

    const existingJobUrlsFile = await writeExistingJobUrlsFile(options.existingJobUrls);

    // Run the crawler
    await new Promise<void>((resolve, reject) => {
      const child = spawn('npm', ['run', 'start'], {
        cwd: CRAWLER_DIR,
        shell: true,
        stdio: ['ignore', 'pipe', 'pipe'],
        env: {
          ...process.env,
          JOBOPS_SKIP_APPLY_FOR_EXISTING: '1',
          JOBOPS_EMIT_PROGRESS: '1',
          // Omitted options are sent as empty strings, which also overrides any
          // value inherited from process.env. A maxJobsPerTerm of 0 is falsy and
          // is therefore sent as an empty string too.
          GRADCRACKER_SEARCH_TERMS: options.searchTerms ? JSON.stringify(options.searchTerms) : '',
          GRADCRACKER_MAX_JOBS_PER_TERM: options.maxJobsPerTerm ? String(options.maxJobsPerTerm) : '',
          ...(existingJobUrlsFile ? { JOBOPS_EXISTING_JOB_URLS_FILE: existingJobUrlsFile } : {}),
        },
      });

      // Route one line of crawler output: progress lines go to the callback,
      // everything else is echoed to the matching stream of this process.
      const handleLine = (line: string, stream: NodeJS.WriteStream) => {
        if (line.startsWith(JOBOPS_PROGRESS_PREFIX)) {
          const raw = line.slice(JOBOPS_PROGRESS_PREFIX.length).trim();
          try {
            const parsed = JSON.parse(raw) as JobExtractorProgress;
            options.onProgress?.(parsed);
          } catch {
            // Ignore malformed progress lines (this also swallows errors thrown
            // by the onProgress callback itself).
          }
          return;
        }
        stream.write(`${line}\n`);
      };

      const stdoutRl = child.stdout ? createInterface({ input: child.stdout }) : null;
      const stderrRl = child.stderr ? createInterface({ input: child.stderr }) : null;

      stdoutRl?.on('line', (line) => handleLine(line, process.stdout));
      stderrRl?.on('line', (line) => handleLine(line, process.stderr));

      child.on('close', (code, signal) => {
        stdoutRl?.close();
        stderrRl?.close();

        if (code === 0) {
          resolve();
          return;
        }

        // `code` is null when the process was terminated by a signal; report
        // the signal instead of the meaningless "code null".
        const reason =
          code === null ? `was terminated by signal ${signal}` : `exited with code ${code}`;
        reject(new Error(`Crawler ${reason}`));
      });

      // Emitted e.g. when the process could not be spawned.
      child.on('error', reject);
    });

    // Read crawled jobs from storage
    const jobs = await readCrawledJobs();
    console.log(`✅ Crawler completed. Found ${jobs.length} jobs.`);

    return { success: true, jobs };
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Unknown error';
    console.error('❌ Crawler failed:', message);
    return { success: false, jobs: [], error: message };
  }
}
/**
 * Read crawled jobs from the Crawlee storage dataset.
 *
 * Every `.json` file directly inside the dataset directory is parsed and
 * mapped to a job input. A file that cannot be read, parsed or mapped is
 * logged and skipped, so one bad record does not discard the others.
 *
 * @returns The mapped jobs; an empty array when the dataset directory cannot
 * be listed (for example because it does not exist).
 */
async function readCrawledJobs(): Promise<CreateJobInput[]> {
  let files: string[];
  try {
    files = await readdir(STORAGE_DIR);
  } catch (error) {
    console.error('Failed to read crawled jobs:', error);
    return [];
  }

  const jobs: CreateJobInput[] = [];

  for (const file of files) {
    if (!file.endsWith('.json')) continue;

    try {
      const content = await readFile(join(STORAGE_DIR, file), 'utf-8');
      // NOTE(review): the parsed record is untyped and not validated against
      // CreateJobInput; only title and employer get fallbacks.
      const data = JSON.parse(content);

      // Map crawler output to our job input format
      jobs.push({
        source: 'gradcracker',
        title: data.title || 'Unknown Title',
        employer: data.employer || 'Unknown Employer',
        employerUrl: data.employerUrl,
        jobUrl: data.url || data.jobUrl,
        applicationLink: data.applicationLink,
        disciplines: data.disciplines,
        deadline: data.deadline,
        salary: data.salary,
        location: data.location,
        degreeRequired: data.degreeRequired,
        starting: data.starting,
        jobDescription: data.jobDescription,
      });
    } catch (error) {
      // Keep going: the remaining files may still hold valid jobs.
      console.error(`Failed to read crawled job file ${file}:`, error);
    }
  }

  return jobs;
}
/**
 * Clear previous crawl results by removing the dataset directory.
 *
 * Best-effort: a failure is logged as a warning and does not abort the run.
 * Files that could not be removed stay in the dataset directory and will be
 * picked up again by readCrawledJobs().
 */
async function clearStorageDataset(): Promise<void> {
  const { rm } = await import('fs/promises');
  try {
    // `force: true` already turns a missing directory into a no-op, so anything
    // that reaches the catch below is a genuine failure (permissions, busy files, ...).
    await rm(STORAGE_DIR, { recursive: true, force: true });
  } catch (error) {
    console.warn('Failed to clear previous crawl results:', error);
  }
}