// 2025-12-14 22:44:37 +00:00
//
// 67 lines
// 1.7 KiB
// TypeScript

// For more information, see https://crawlee.dev/
import { launchOptions } from "camoufox-js";
import { PlaywrightCrawler } from "crawlee";
import { firefox } from "playwright";
import { router } from "./routes.js";
import { initJobOpsProgress } from "./progress.js";
// Region slugs that appear at the end of a Gradcracker search path.
const locations = [
  "london-and-south-east",
  "north-west",
  "yorkshire",
  "east-midlands",
  "west-midlands",
  "south-west",
];

// Role slugs that appear at the start of the "<role>-graduate-jobs-in-<location>" path segment.
const roles = [
  "web-development",
  "software-systems",
];

// One search URL per (location, role) pair, ordered location-major:
// every role for the first location, then every role for the next, and so on.
const gradcrackerUrls = [];
for (const region of locations) {
  for (const discipline of roles) {
    gradcrackerUrls.push(
      `https://www.gradcracker.com/search/computing-technology/${discipline}-graduate-jobs-in-${region}?order=dateAdded`,
    );
  }
}
// Report how many search pages were generated.
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`);

// Wrap every search URL in a request object carrying a label in userData.
// NOTE(review): the label presumably selects the matching handler in ./routes.js — confirm there.
const startUrls = gradcrackerUrls.map((searchUrl) => {
  const request = {
    url: searchUrl,
    userData: { label: "gradcracker-list-page" },
  };
  return request;
});

// Pass the number of start requests to the progress module before crawling begins.
initJobOpsProgress(startUrls.length);
// Resolve the Camoufox launch options up front; they are passed to
// Playwright's Firefox launcher through the crawler's launch context below.
const camoufoxLaunchOptions = await launchOptions({
  headless: true,
  humanize: true,
  geoip: true,
});

const crawler = new PlaywrightCrawler({
  // Optional proxy support (disabled):
  // proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['...'] }),

  // Every request is dispatched through the router imported from ./routes.js.
  requestHandler: router,

  // Optional cap on the total number of requests (disabled, so the crawl is unbounded):
  // maxRequestsPerCrawl: 2000,

  // Keep concurrency low (between 1 and 2 parallel requests) to slow the crawl down.
  minConcurrency: 1,
  maxConcurrency: 2,

  // Timeouts, both expressed in seconds: page navigation and the request handler.
  navigationTimeoutSecs: 60,
  requestHandlerTimeoutSecs: 100,

  browserPoolOptions: {
    // Disable the default fingerprint spoofing to avoid conflicts with Camoufox.
    useFingerprints: false,
  },

  launchContext: {
    launcher: firefox,
    launchOptions: camoufoxLaunchOptions,
  },
});

// Start crawling from the generated start requests and wait for the run to finish.
await crawler.run(startUrls);