scraping more gradcracker urls
This commit is contained in:
parent
302fadb494
commit
d743aacd1a
@ -1,28 +1,51 @@
|
|||||||
// For more information, see https://crawlee.dev/
|
// For more information, see https://crawlee.dev/
|
||||||
import { launchOptions } from "camoufox-js";
|
import { launchOptions } from "camoufox-js";
|
||||||
import { PlaywrightCrawler, ProxyConfiguration } from "crawlee";
|
import { PlaywrightCrawler } from "crawlee";
|
||||||
import { firefox } from "playwright";
|
import { firefox } from "playwright";
|
||||||
|
|
||||||
import { router } from "./routes.js";
|
import { router } from "./routes.js";
|
||||||
|
|
||||||
const startUrls = [
|
// locations
|
||||||
{
|
const locations = [
|
||||||
url: "https://www.gradcracker.com/search/computing-technology/web-development-graduate-jobs-in-north-west?order=dateAdded",
|
"london-and-south-east",
|
||||||
userData: { label: "gradcracker-list-page" },
|
"north-west",
|
||||||
},
|
"yorkshire",
|
||||||
|
"east-midlands",
|
||||||
|
"west-midlands",
|
||||||
|
"south-west",
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// roles
|
||||||
|
const roles = [
|
||||||
|
"web-development",
|
||||||
|
"software-systems",
|
||||||
|
];
|
||||||
|
|
||||||
|
// every combination of location and role (one search URL per pair)
|
||||||
|
const gradcrackerUrls = locations.flatMap((location) => {
|
||||||
|
return roles.map((role) => {
|
||||||
|
return `https://www.gradcracker.com/search/computing-technology/${role}-graduate-jobs-in-${location}?order=dateAdded`;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`); // one URL per location/role pair
|
||||||
|
|
||||||
|
const startUrls = gradcrackerUrls.map((url) => ({
|
||||||
|
url,
|
||||||
|
userData: { label: "gradcracker-list-page" },
|
||||||
|
}));
|
||||||
|
|
||||||
const crawler = new PlaywrightCrawler({
|
const crawler = new PlaywrightCrawler({
|
||||||
// proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['...'] }),
|
// proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['...'] }),
|
||||||
requestHandler: router,
|
requestHandler: router,
|
||||||
// Comment out this option to scrape the full website.
|
// Comment out this option to scrape the full website.
|
||||||
maxRequestsPerCrawl: 20,
|
maxRequestsPerCrawl: 2000,
|
||||||
// Limit concurrency to slow down the crawl (this does not add a per-request delay)
|
// Limit concurrency to slow down the crawl (this does not add a per-request delay)
|
||||||
minConcurrency: 1,
|
minConcurrency: 1,
|
||||||
maxConcurrency: 10,
|
maxConcurrency: 1,
|
||||||
navigationTimeoutSecs: 60,
|
navigationTimeoutSecs: 60,
|
||||||
// Maximum time (in seconds) a single request handler may run before it times out
|
// Maximum time (in seconds) a single request handler may run before it times out
|
||||||
// requestHandlerTimeoutSecs: 50,
|
requestHandlerTimeoutSecs: 100,
|
||||||
browserPoolOptions: {
|
browserPoolOptions: {
|
||||||
// Disable the default fingerprint spoofing to avoid conflicts with Camoufox.
|
// Disable the default fingerprint spoofing to avoid conflicts with Camoufox.
|
||||||
useFingerprints: false,
|
useFingerprints: false,
|
||||||
@ -30,12 +53,9 @@ const crawler = new PlaywrightCrawler({
|
|||||||
launchContext: {
|
launchContext: {
|
||||||
launcher: firefox,
|
launcher: firefox,
|
||||||
launchOptions: await launchOptions({
|
launchOptions: await launchOptions({
|
||||||
headless: true,
|
headless: false,
|
||||||
// block_images: true,
|
humanize: true,
|
||||||
// Pass your own Camoufox parameters here...
|
geoip: true,
|
||||||
// block_images: true,
|
|
||||||
// fonts: ['Times New Roman'],
|
|
||||||
// ...
|
|
||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|||||||
@ -108,12 +108,13 @@ router.addHandler(
|
|||||||
await enqueueLinks({
|
await enqueueLinks({
|
||||||
urls: [jobUrl],
|
urls: [jobUrl],
|
||||||
userData: {
|
userData: {
|
||||||
...jobs[jobs.length - 1],
|
...jobs[jobs.length - 1],
|
||||||
label: "gradcracker-single-job-page"
|
label: "gradcracker-single-job-page"
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info(`Extracted ${jobs.length} jobs`);
|
log.info(`Extracted ${jobs.length} jobs`);
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user