Performance improvement for gradcracker fetcher: shorten selector wait timeouts and bound the optional disciplines lookup
This commit is contained in:
parent
051c09d943
commit
2e52c37bcb
@ -21,7 +21,7 @@ router.addHandler(
|
|||||||
log.info(`Processing: ${request.url}`);
|
log.info(`Processing: ${request.url}`);
|
||||||
|
|
||||||
// Wait until the job cards are rendered
|
// Wait until the job cards are rendered
|
||||||
await page.waitForSelector("article[wire\\:key]", { timeout: 30000 });
|
await page.waitForSelector("article[wire\\:key]", { timeout: 10000 });
|
||||||
|
|
||||||
// Add delay to see the page load
|
// Add delay to see the page load
|
||||||
await page.waitForTimeout(3000);
|
await page.waitForTimeout(3000);
|
||||||
@ -38,6 +38,9 @@ router.addHandler(
|
|||||||
const articles = await page.locator("article[wire\\:key]").all();
|
const articles = await page.locator("article[wire\\:key]").all();
|
||||||
const jobs: Job[] = [];
|
const jobs: Job[] = [];
|
||||||
|
|
||||||
|
console.log(`${articles.length} jobs found`);
|
||||||
|
|
||||||
|
let idx = 1;
|
||||||
for (const article of articles) {
|
for (const article of articles) {
|
||||||
const titleLocator = article.locator("h2 a");
|
const titleLocator = article.locator("h2 a");
|
||||||
const title = (await titleLocator.textContent())?.trim() ?? null;
|
const title = (await titleLocator.textContent())?.trim() ?? null;
|
||||||
@ -49,8 +52,13 @@ router.addHandler(
|
|||||||
const employerAnchor = article.locator("figure a");
|
const employerAnchor = article.locator("figure a");
|
||||||
const employerUrl = toAbsolute(await employerAnchor.getAttribute("href"));
|
const employerUrl = toAbsolute(await employerAnchor.getAttribute("href"));
|
||||||
|
|
||||||
const disciplinesEl = article.locator("h3");
|
let disciplines: string | null = null;
|
||||||
const disciplines = (await disciplinesEl.textContent())?.trim() ?? null;
|
try {
|
||||||
|
const disciplinesEl = article.locator("h3");
|
||||||
|
disciplines = (await disciplinesEl.textContent({ timeout: 1000 }))?.trim() ?? null;
|
||||||
|
} catch {
|
||||||
|
// h3 not found or timed out - that's okay, disciplines is optional
|
||||||
|
}
|
||||||
|
|
||||||
// Find the "Deadline: ..." pill
|
// Find the "Deadline: ..." pill
|
||||||
const deadlineLocator = article
|
const deadlineLocator = article
|
||||||
@ -90,6 +98,8 @@ router.addHandler(
|
|||||||
const degreeRequired = await getDdText("Degree required");
|
const degreeRequired = await getDdText("Degree required");
|
||||||
const starting = await getDdText("Starting");
|
const starting = await getDdText("Starting");
|
||||||
|
|
||||||
|
console.log(`Got job ${idx}/${articles.length}: ${title}`);
|
||||||
|
|
||||||
jobs.push({
|
jobs.push({
|
||||||
title,
|
title,
|
||||||
jobUrl,
|
jobUrl,
|
||||||
@ -103,6 +113,8 @@ router.addHandler(
|
|||||||
starting,
|
starting,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
idx++;
|
||||||
|
|
||||||
// append more links to crawl: single job pages
|
// append more links to crawl: single job pages
|
||||||
if (jobUrl) {
|
if (jobUrl) {
|
||||||
await enqueueLinks({
|
await enqueueLinks({
|
||||||
@ -126,7 +138,7 @@ router.addHandler(
|
|||||||
log.info(`Processing single job page: ${request.url}`);
|
log.info(`Processing single job page: ${request.url}`);
|
||||||
|
|
||||||
// Wait for job content to be present
|
// Wait for job content to be present
|
||||||
await page.waitForSelector(".body-content", { timeout: 30000 });
|
await page.waitForSelector(".body-content", { timeout: 10000 });
|
||||||
|
|
||||||
// Optional delay if you want to visually see it while debugging
|
// Optional delay if you want to visually see it while debugging
|
||||||
await page.waitForTimeout(2000);
|
await page.waitForTimeout(2000);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user