86 lines
2.3 KiB
TypeScript
86 lines
2.3 KiB
TypeScript
// For more information, see https://crawlee.dev/
|
|
import { launchOptions } from "camoufox-js";
|
|
import { PlaywrightCrawler } from "crawlee";
|
|
import { firefox } from "playwright";
|
|
import { initJobOpsProgress } from "./progress.js";
|
|
import { router } from "./routes.js";
|
|
|
|
// locations
|
|
const locations = [
|
|
"london-and-south-east",
|
|
"north-west",
|
|
"yorkshire",
|
|
"east-midlands",
|
|
"west-midlands",
|
|
"south-west",
|
|
];
|
|
|
|
// roles
|
|
const defaultRoles = ["web-development", "software-systems"];
|
|
|
|
let roles = defaultRoles;
|
|
const envRolesRaw = process.env.GRADCRACKER_SEARCH_TERMS;
|
|
|
|
if (envRolesRaw) {
|
|
try {
|
|
const parsed = JSON.parse(envRolesRaw) as string[];
|
|
if (Array.isArray(parsed) && parsed.length > 0) {
|
|
roles = parsed.map((term) =>
|
|
term
|
|
.toLowerCase()
|
|
.replace(/[^a-z0-9]+/g, "-")
|
|
.replace(/^-+|-+$/g, ""),
|
|
);
|
|
console.log(`Using configured search terms: ${roles.join(", ")}`);
|
|
}
|
|
} catch (e) {
|
|
console.warn("Failed to parse GRADCRACKER_SEARCH_TERMS", e);
|
|
}
|
|
}
|
|
|
|
// combo of locations and roles
|
|
const gradcrackerUrls = locations.flatMap((location) => {
|
|
return roles.map((role) => {
|
|
return {
|
|
url: `https://www.gradcracker.com/search/computing-technology/${role}-graduate-jobs-in-${location}?order=dateAdded`,
|
|
role,
|
|
};
|
|
});
|
|
});
|
|
|
|
console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`);
|
|
|
|
const startUrls = gradcrackerUrls.map(({ url, role }) => ({
|
|
url,
|
|
userData: { label: "gradcracker-list-page", role },
|
|
}));
|
|
|
|
initJobOpsProgress(startUrls.length);
|
|
|
|
const crawler = new PlaywrightCrawler({
|
|
// proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['...'] }),
|
|
requestHandler: router,
|
|
// Comment this option to scrape the full website.
|
|
// maxRequestsPerCrawl: 2000,
|
|
// Add delay between requests to slow down the process
|
|
minConcurrency: 1,
|
|
maxConcurrency: 2,
|
|
navigationTimeoutSecs: 60,
|
|
// Add delay between requests (in milliseconds)
|
|
requestHandlerTimeoutSecs: 100,
|
|
browserPoolOptions: {
|
|
// Disable the default fingerprint spoofing to avoid conflicts with Camoufox.
|
|
useFingerprints: false,
|
|
},
|
|
launchContext: {
|
|
launcher: firefox,
|
|
launchOptions: await launchOptions({
|
|
headless: true,
|
|
humanize: true,
|
|
geoip: true,
|
|
}),
|
|
},
|
|
});
|
|
|
|
await crawler.run(startUrls);
|