366 lines
11 KiB
TypeScript
366 lines
11 KiB
TypeScript
import { readFileSync } from "node:fs";
|
|
import { createPlaywrightRouter, log } from "crawlee";
|
|
import { markJobPageDone, markListPageDone } from "./progress.js";
|
|
|
|
/**
 * Canonicalises a URL so that the same job page always compares equal.
 *
 * - The fragment (`#...`) is dropped.
 * - The query string is kept verbatim (some sites encode job IDs there).
 * - A single trailing slash is removed from the *path*. When a query string is
 *   present the slash is removed from the path in front of it, and the query
 *   value itself is never touched.
 *
 * @param raw - URL to normalise; may be empty, `null` or `undefined`.
 * @returns The normalised URL, or `null` when `raw` is empty/nullish. Input
 *   that cannot be parsed as a URL is returned with one trailing slash removed.
 */
function normalizeUrl(raw: string | null | undefined): string | null {
  if (!raw) return null;

  try {
    const url = new URL(raw);
    url.hash = "";

    if (url.search === "") {
      // No query: the serialized URL ends with the path, so stripping the last
      // character is enough (this also turns "https://host/" into "https://host").
      return url.toString().replace(/\/$/, "");
    }

    // With a query the serialized URL ends with the query value, so the slash
    // has to be removed from the path explicitly. The root path "/" is kept.
    if (url.pathname.length > 1 && url.pathname.endsWith("/")) {
      url.pathname = url.pathname.slice(0, -1);
    }
    return url.toString();
  } catch {
    // Not an absolute URL: best effort, only trim the trailing slash.
    return raw.replace(/\/$/, "");
  }
}
|
|
|
|
/**
 * Builds the set of job URLs that are already known to the caller.
 *
 * The source is a JSON array of strings, taken from the file named by
 * `JOBOPS_EXISTING_JOB_URLS_FILE` when that variable is set, otherwise from
 * the `JOBOPS_EXISTING_JOB_URLS` variable itself. Every entry is passed
 * through `normalizeUrl`; non-string and empty entries are dropped.
 *
 * This is best-effort: an unreadable file, invalid JSON or a non-array
 * payload all yield an empty set rather than an error.
 *
 * @returns Normalised known job URLs (possibly empty).
 */
function getExistingJobUrlSet(): Set<string> {
  const urlsFile = process.env.JOBOPS_EXISTING_JOB_URLS_FILE;

  let payload: string | null | undefined;
  if (urlsFile) {
    // A configured file takes precedence; a failed read does NOT fall back to
    // the inline environment variable.
    try {
      payload = readFileSync(urlsFile, "utf-8");
    } catch {
      payload = null;
    }
  } else {
    payload = process.env.JOBOPS_EXISTING_JOB_URLS;
  }

  if (!payload) return new Set();

  try {
    const decoded: unknown = JSON.parse(payload);
    if (!Array.isArray(decoded)) return new Set();

    const known = new Set<string>();
    for (const entry of decoded) {
      const cleaned = normalizeUrl(typeof entry === "string" ? entry : null);
      if (cleaned) known.add(cleaned);
    }
    return known;
  } catch {
    return new Set();
  }
}
|
|
|
|
// Opt-in switch: only the exact value "1" enables skipping of already-known jobs.
const SKIP_APPLY_FOR_EXISTING =
  process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1";

// Normalised URLs of jobs that are already known; loaded once at module load.
const EXISTING_JOB_URLS = getExistingJobUrlSet();

// Number of job pages enqueued so far, keyed by search term (role).
const jobCounts = new Map<string, number>();

// Per-term cap on enqueued job pages. The handlers only enforce it when the
// value is greater than zero, so "0", an unset variable or an unparseable
// value all mean "no limit".
const MAX_JOBS_PER_TERM = Number.parseInt(
  process.env.GRADCRACKER_MAX_JOBS_PER_TERM || "0",
  10,
);
|
|
|
|
/**
 * Summary of one job card as scraped from a list page.
 * Every field is `null` when the corresponding element or text was not found.
 */
interface Job {
  /** Text of the card's `h2 a` link. */
  title: string | null;
  /** Absolute URL of the job page (href of the `h2 a` link). */
  jobUrl: string | null;
  /** Employer name, read from the `alt` text of the card's `figure img`. */
  employer: string | null;
  /** Absolute URL taken from the card's `figure a` link. */
  employerUrl: string | null;
  /** Text of the card's `h3`, when present. */
  disciplines: string | null;
  /** Text following "Deadline:" in the deadline pill. */
  deadline: string | null;
  /** `dd` text for the "Salary" entry. */
  salary: string | null;
  /** `dd` text for the "Location" entry. */
  location: string | null;
  /** `dd` text for the "Degree required" entry. */
  degreeRequired: string | null;
  /** `dd` text for the "Starting" entry. */
  starting: string | null;
}

/** Playwright router that the list-page and single-job-page handlers register on. */
export const router = createPlaywrightRouter();
|
|
|
|
/**
 * List-page handler.
 *
 * Scrapes every job card on a search-result page into a `Job` summary and
 * enqueues the job's detail page (carrying the summary in `userData`), unless
 * the job is already known or the per-role limit has been reached. Reports the
 * page's totals through `markListPageDone` when finished.
 */
router.addHandler(
  "gradcracker-list-page",
  async ({ page, request, enqueueLinks }) => {
    const { role } = request.userData;
    log.info(`Processing: ${request.url} (Role: ${role})`);

    // Bail out before touching the page when this role has already hit its cap.
    if (MAX_JOBS_PER_TERM > 0 && (jobCounts.get(role) || 0) >= MAX_JOBS_PER_TERM) {
      log.info(
        `Max jobs (${MAX_JOBS_PER_TERM}) already enqueued for role "${role}". Skipping list page.`,
      );
      markListPageDone({
        currentUrl: request.url,
        jobCardsFound: 0,
        jobPagesEnqueued: 0,
        jobPagesSkipped: 0,
      });
      return;
    }

    const cardSelector = "article[wire\\:key]";

    // Job cards are rendered client-side; wait for at least one to show up.
    await page.waitForSelector(cardSelector, { timeout: 10000 });

    // Fixed pause after the cards appear (lets the page finish loading).
    await page.waitForTimeout(3000);

    // Resolves an href against the loaded page URL; unparseable values are
    // passed through unchanged.
    const resolveHref = (href: string | null) => {
      if (!href) return null;
      try {
        return new URL(href, request.loadedUrl).href;
      } catch {
        return href;
      }
    };

    const cards = await page.locator(cardSelector).all();
    type Card = (typeof cards)[number];

    // Reads the <dd> that directly follows the <dt> whose text is exactly
    // `label` (surrounding whitespace ignored), with whitespace collapsed.
    const readDefinition = async (card: Card, label: string) => {
      const term = card
        .locator("dt")
        .filter({ hasText: new RegExp(`^\\s*${label}\\s*$`) });
      if ((await term.count()) === 0) return null;

      const definition = term.locator("+ dd");
      if ((await definition.count()) === 0) return null;

      const text = await definition.textContent();
      if (!text) return null;
      return text.replace(/\s+/g, " ").trim() || null;
    };

    // Extracts all summary fields from a single job card.
    const readCard = async (card: Card): Promise<Job> => {
      const titleLink = card.locator("h2 a");
      const title = (await titleLink.textContent())?.trim() ?? null;
      const jobUrl = resolveHref(await titleLink.getAttribute("href"));

      const employer =
        (await card.locator("figure img").getAttribute("alt"))?.trim() ?? null;
      const employerUrl = resolveHref(
        await card.locator("figure a").getAttribute("href"),
      );

      // Disciplines are optional: a missing <h3> just times out quickly.
      let disciplines: string | null = null;
      try {
        disciplines =
          (await card.locator("h3").textContent({ timeout: 1000 }))?.trim() ??
          null;
      } catch {
        // No <h3> on this card; leave disciplines as null.
      }

      // The deadline lives in a "Deadline: ..." pill; keep only its first line.
      let deadline: string | null = null;
      const deadlinePill = card.locator("div", { hasText: "Deadline:" }).first();
      if ((await deadlinePill.count()) > 0) {
        const pillText = (await deadlinePill.textContent()) ?? "";
        deadline =
          pillText.replace("Deadline:", "").split("\n")[0].trim() || null;
      }

      const salary = await readDefinition(card, "Salary");
      const location = await readDefinition(card, "Location");
      const degreeRequired = await readDefinition(card, "Degree required");
      const starting = await readDefinition(card, "Starting");

      return {
        title,
        jobUrl,
        employer,
        employerUrl,
        disciplines,
        deadline,
        salary,
        location,
        degreeRequired,
        starting,
      };
    };

    const jobs: Job[] = [];
    let skippedKnownJobs = 0;
    let enqueuedJobs = 0;

    console.log(`${cards.length} jobs found`);

    for (const [position, card] of cards.entries()) {
      const job = await readCard(card);
      console.log(`Got job ${position + 1}/${cards.length}: ${job.title}`);
      jobs.push(job);

      // Without a URL there is no detail page to crawl.
      if (!job.jobUrl) continue;

      const jobUrlNormalized = normalizeUrl(job.jobUrl);
      const alreadyKnown =
        SKIP_APPLY_FOR_EXISTING &&
        jobUrlNormalized !== null &&
        EXISTING_JOB_URLS.has(jobUrlNormalized);
      if (alreadyKnown) {
        skippedKnownJobs++;
        continue;
      }

      // Enforce the per-role cap; known jobs above do not count towards it.
      if (MAX_JOBS_PER_TERM > 0) {
        const enqueuedForRole = jobCounts.get(role) || 0;
        if (enqueuedForRole >= MAX_JOBS_PER_TERM) {
          log.info(
            `Reached max jobs limit (${MAX_JOBS_PER_TERM}) for role "${role}" while processing list. Stopping.`,
          );
          break;
        }
        jobCounts.set(role, enqueuedForRole + 1);
      }

      // Hand the summary to the detail-page handler via userData.
      await enqueueLinks({
        urls: [job.jobUrl],
        userData: {
          ...job,
          label: "gradcracker-single-job-page",
        },
      });
      enqueuedJobs++;
    }

    log.info(`Extracted ${jobs.length} jobs`);
    if (SKIP_APPLY_FOR_EXISTING && skippedKnownJobs > 0) {
      log.info(
        `Skipping ${skippedKnownJobs} already-known job pages; enqueued ${enqueuedJobs} new job pages.`,
      );
    }

    markListPageDone({
      currentUrl: request.url,
      jobCardsFound: jobs.length,
      jobPagesEnqueued: enqueuedJobs,
      jobPagesSkipped: skippedKnownJobs,
    });
  },
);
|
|
|
|
/**
 * Single-job-page handler.
 *
 * Reads the job description, then (unless the job is already known or the
 * page has no apply button) clicks "apply" and records the URL the click
 * leads to, whether it opens in a popup or navigates the same tab. The result
 * is pushed to the dataset together with the summary collected on the list
 * page, and `markJobPageDone` is called.
 *
 * Any popup opened by the click is always closed, including on errors.
 */
router.addHandler(
  "gradcracker-single-job-page",
  async ({ page, request, pushData, log }) => {
    // `label` is routing metadata; it is pulled out only to keep it out of the
    // stored record.
    const { label, ...jobSummary } = request.userData;
    log.info(`Processing single job page: ${request.url}`);

    // Wait for job content to be present
    await page.waitForSelector(".body-content", { timeout: 10000 });

    // Fixed pause before scraping (originally added as a visual debugging aid).
    await page.waitForTimeout(2000);

    const jobDescription =
      (await page.locator(".body-content").textContent())?.trim() || null;

    const applyButton = page.locator('a[dusk="apply-button"]');
    const hasApplyButton = (await applyButton.count()) > 0;

    const requestUrlNormalized = normalizeUrl(request.url);
    const isKnownJob =
      SKIP_APPLY_FOR_EXISTING &&
      requestUrlNormalized !== null &&
      EXISTING_JOB_URLS.has(requestUrlNormalized);

    let applicationLink: string | null = null;

    if (hasApplyButton && !isKnownJob) {
      const originalUrl = page.url();

      // True while a URL is still hosted on Gradcracker. Only the hostname is
      // inspected so that external URLs which merely mention the site (e.g. a
      // "?utm_source=gradcracker" tracking parameter) count as external.
      const isOnGradcracker = (candidate: string): boolean => {
        try {
          return new URL(candidate).hostname.includes("gradcracker");
        } catch {
          return candidate.includes("gradcracker");
        }
      };

      // Polls `targetPage` until its URL is off Gradcracker and unchanged for
      // `requiredStableChecks` consecutive checks, until `maxWaitMs` elapses,
      // or until `shouldStop()` reports that polling has become pointless.
      const waitForUrlStable = async (
        targetPage: typeof page,
        shouldStop: () => boolean = () => false,
        maxWaitMs = 10000,
        checkIntervalMs = 100,
        requiredStableChecks = 3,
      ) => {
        let lastUrl = targetPage.url();
        let stableCount = 0;
        const startTime = Date.now();

        while (Date.now() - startTime < maxWaitMs && !shouldStop()) {
          await targetPage.waitForTimeout(checkIntervalMs);
          const currentUrl = targetPage.url();
          if (currentUrl === lastUrl && !isOnGradcracker(currentUrl)) {
            stableCount++;
            if (stableCount >= requiredStableChecks) return currentUrl;
          } else {
            stableCount = 1;
            lastUrl = currentUrl;
          }
        }
        return lastUrl;
      };

      // Prefer page-scoped popup detection. Using the browser context's "page" event
      // can accidentally capture unrelated pages created by other concurrent requests.
      // Both listeners are armed BEFORE the click so neither event can be missed,
      // and both resolve to null instead of rejecting on timeout.
      let popupOpened = false;
      const popupPromise = page
        .waitForEvent("popup", { timeout: 8000 })
        .then((popup) => {
          popupOpened = true;
          return popup;
        })
        .catch(() => null);
      const navigationPromise = page
        .waitForNavigation({ timeout: 8000, waitUntil: "domcontentloaded" })
        .catch(() => null);

      try {
        // Popup vs. same-tab navigation is resolved explicitly below.
        await applyButton.click();

        // Same-tab case: wait for the URL to leave Gradcracker and settle.
        // Popup case: this tab never leaves Gradcracker, so stop polling as
        // soon as the popup is detected instead of running into the timeout.
        await waitForUrlStable(page, () => popupOpened);

        const maybePopup = await popupPromise;
        const targetPage = maybePopup ?? page;

        if (maybePopup) {
          await maybePopup
            .waitForLoadState("domcontentloaded", { timeout: 15000 })
            .catch(() => null);
          // If the popup initially opens as about:blank, give it a moment to redirect.
          if (maybePopup.url() === "about:blank") {
            await maybePopup
              .waitForURL((u) => u.toString() !== "about:blank", {
                timeout: 15000,
              })
              .catch(() => null);
          }
          // Let redirects inside the popup settle using the same criterion as
          // the same-tab case. The popup may be closed underneath us; that
          // must not fail the request.
          await waitForUrlStable(maybePopup).catch(() => null);
        } else {
          // Same-tab navigation case.
          await navigationPromise;
          await page
            .waitForURL((u) => u.toString() !== originalUrl, { timeout: 15000 })
            .catch(() => null);
        }

        applicationLink = targetPage.url();

        if (applicationLink === originalUrl) {
          log.info(
            `Apply click did not change URL (still Gradcracker): ${applicationLink}`,
          );
        } else {
          log.info(`Captured application URL: ${applicationLink}`);
        }
      } finally {
        // Ensure we don't leak tabs on retries/errors. The popup promise never
        // rejects, so awaiting it here also covers failures that happened
        // before the popup was picked up above.
        const spawnedPage = await popupPromise;
        if (spawnedPage && spawnedPage !== page) {
          await spawnedPage.close().catch(() => null);
        }
      }
    } else if (!hasApplyButton) {
      log.warning(`Apply button not found on page: ${request.url}`);
    } else {
      log.info(`Skipping apply click for known job: ${request.url}`);
    }

    await pushData({
      ...jobSummary,
      url: request.url, // Gradcracker job page
      applicationLink, // External or same-page URL after click
      jobDescription,
    });

    markJobPageDone({ currentUrl: request.url });
  },
);
|