performance improvement for gradcracker fetcher

This commit is contained in:
DaKheera47 2025-12-12 00:30:46 +00:00
parent 051c09d943
commit 2e52c37bcb

View File

@@ -21,7 +21,7 @@ router.addHandler(
log.info(`Processing: ${request.url}`); log.info(`Processing: ${request.url}`);
// Wait until the job cards are rendered // Wait until the job cards are rendered
await page.waitForSelector("article[wire\\:key]", { timeout: 30000 }); await page.waitForSelector("article[wire\\:key]", { timeout: 10000 });
// Add delay to see the page load // Add delay to see the page load
await page.waitForTimeout(3000); await page.waitForTimeout(3000);
@@ -38,6 +38,9 @@ router.addHandler(
const articles = await page.locator("article[wire\\:key]").all(); const articles = await page.locator("article[wire\\:key]").all();
const jobs: Job[] = []; const jobs: Job[] = [];
console.log(`${articles.length} jobs found`);
let idx = 1;
for (const article of articles) { for (const article of articles) {
const titleLocator = article.locator("h2 a"); const titleLocator = article.locator("h2 a");
const title = (await titleLocator.textContent())?.trim() ?? null; const title = (await titleLocator.textContent())?.trim() ?? null;
@@ -49,8 +52,13 @@ router.addHandler(
const employerAnchor = article.locator("figure a"); const employerAnchor = article.locator("figure a");
const employerUrl = toAbsolute(await employerAnchor.getAttribute("href")); const employerUrl = toAbsolute(await employerAnchor.getAttribute("href"));
const disciplinesEl = article.locator("h3"); let disciplines: string | null = null;
const disciplines = (await disciplinesEl.textContent())?.trim() ?? null; try {
const disciplinesEl = article.locator("h3");
disciplines = (await disciplinesEl.textContent({ timeout: 1000 }))?.trim() ?? null;
} catch {
// h3 not found or timed out - that's okay, disciplines is optional
}
// Find the "Deadline: ..." pill // Find the "Deadline: ..." pill
const deadlineLocator = article const deadlineLocator = article
@@ -90,6 +98,8 @@ router.addHandler(
const degreeRequired = await getDdText("Degree required"); const degreeRequired = await getDdText("Degree required");
const starting = await getDdText("Starting"); const starting = await getDdText("Starting");
console.log(`Got job ${idx}/${articles.length}: ${title}`);
jobs.push({ jobs.push({
title, title,
jobUrl, jobUrl,
@@ -103,6 +113,8 @@ router.addHandler(
starting, starting,
}); });
idx++;
// append more links to crawl: single job pages // append more links to crawl: single job pages
if (jobUrl) { if (jobUrl) {
await enqueueLinks({ await enqueueLinks({
@@ -126,7 +138,7 @@ router.addHandler(
log.info(`Processing single job page: ${request.url}`); log.info(`Processing single job page: ${request.url}`);
// Wait for job content to be present // Wait for job content to be present
await page.waitForSelector(".body-content", { timeout: 30000 }); await page.waitForSelector(".body-content", { timeout: 10000 });
// Optional delay if you want to visually see it while debugging // Optional delay if you want to visually see it while debugging
await page.waitForTimeout(2000); await page.waitForTimeout(2000);