67 lines
1.7 KiB
TypeScript
67 lines
1.7 KiB
TypeScript
// For more information, see https://crawlee.dev/
|
|
import { launchOptions } from "camoufox-js";
|
|
import { PlaywrightCrawler } from "crawlee";
|
|
import { firefox } from "playwright";
|
|
|
|
import { router } from "./routes.js";
|
|
import { initJobOpsProgress } from "./progress.js";
|
|
|
|
/**
 * Gradcracker region slugs to search. Each value is interpolated verbatim into
 * the "-graduate-jobs-in-<location>" part of a search URL.
 */
const locations = [
  "london-and-south-east",
  "north-west",
  "yorkshire",
  "east-midlands",
  "west-midlands",
  "south-west",
];
|
|
|
|
/**
 * Gradcracker role slugs to search. Each value is interpolated verbatim into
 * the "<role>-graduate-jobs-in-" part of a search URL.
 */
const roles = [
  "web-development",
  "software-systems",
];
|
|
|
|
// One search URL per (location, role) pair, requested with order=dateAdded.
// Locations form the outer loop, so all roles for a location are adjacent.
const gradcrackerUrls = locations.flatMap((location) =>
  roles.map(
    (role) =>
      `https://www.gradcracker.com/search/computing-technology/${role}-graduate-jobs-in-${location}?order=dateAdded`,
  ),
);

console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`);
|
|
|
|
// Wrap every search URL in a request object tagged with the
// "gradcracker-list-page" label (presumably matched by a handler in
// ./routes.js — verify there).
const startUrls = gradcrackerUrls.map((url) => ({
  url,
  userData: { label: "gradcracker-list-page" },
}));

// Tell the progress reporter how many start requests will be enqueued.
initJobOpsProgress(startUrls.length);
|
|
|
|
// Playwright-driven crawler that launches Firefox with Camoufox launch options.
const crawler = new PlaywrightCrawler({
  // proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['...'] }),
  requestHandler: router,
  // Uncomment to cap the number of requests per crawl; it is left disabled so
  // the crawl is not cut short.
  // maxRequestsPerCrawl: 2000,
  // Keep concurrency low to slow the crawl down. These options bound how many
  // requests run in parallel; they do not insert a delay between requests.
  minConcurrency: 1,
  maxConcurrency: 2,
  // Timeouts, both expressed in seconds (they are limits, not delays).
  navigationTimeoutSecs: 60,
  requestHandlerTimeoutSecs: 100,
  browserPoolOptions: {
    // Disable the default fingerprint spoofing to avoid conflicts with Camoufox.
    useFingerprints: false,
  },
  launchContext: {
    launcher: firefox,
    // Camoufox builds the Firefox launch options asynchronously, hence the
    // top-level await.
    launchOptions: await launchOptions({
      headless: true,
      humanize: true,
      geoip: true,
    }),
  },
});

// Start crawling from the generated list pages and wait until the run ends.
await crawler.run(startUrls);
|