/**
 * SkipTheDrive Parsing Strategy
 *
 * Uses core-parser for browser management and ai-analyzer for utilities
 */

// Shared helpers (logging, text clean-up, keyword and location checks)
// provided by the ai-analyzer package.
const {
  logger,
  cleanText,
  containsAnyKeyword,
  validateLocationAgainstFilters,
} = require("ai-analyzer");

/**
 * SkipTheDrive URL builder
 *
 * Builds the search-results URL for one keyword.
 *
 * @param {string} keyword - Search phrase; `null`/`undefined` is sent as an
 *   empty query instead of the literal text "null"/"undefined".
 * @param {string} [orderBy="date"] - Value of the `orderby` query parameter.
 * @param {string[]|string} [jobTypes=[]] - Job types to filter by. A single
 *   string is accepted as shorthand for a one-element array. Blank entries
 *   are ignored; when nothing remains, `job_type` is omitted.
 * @returns {string} Absolute search URL with an encoded query string.
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  const url = new URL("https://www.skipthedrive.com/");

  // Guard against null/undefined so the site is not searched for "undefined".
  const query = keyword == null ? "" : String(keyword).trim();
  url.searchParams.set("s", query);
  url.searchParams.set("orderby", orderBy);

  // Normalise to a list first: calling .join() on a bare string would throw.
  const requestedTypes = Array.isArray(jobTypes) ? jobTypes : [jobTypes];
  const typeList = requestedTypes
    .filter((type) => type != null)
    .map((type) => String(type).trim())
    .filter((type) => type !== "");

  if (typeList.length > 0) {
    url.searchParams.set("job_type", typeList.join(","));
  }

  return url.toString();
}

/**
 * SkipTheDrive parsing strategy function
 *
 * For every keyword: opens the search results, walks up to `maxPages` result
 * pages, de-duplicates the extracted jobs across all keywords and splits them
 * into accepted and location-rejected lists.
 *
 * @param {object} coreParser - Browser manager. This function calls its
 *   `createPage`, `navigateTo` and `waitForSelector` methods.
 * @param {object} [options]
 * @param {string[]|string} [options.keywords] - Search keywords; a single
 *   string is treated as a one-element list.
 * @param {*} [options.locationFilter=null] - When truthy, each job location
 *   is checked with `validateLocationAgainstFilters`.
 * @param {number} [options.maxPages=5] - Maximum result pages per keyword.
 * @param {string[]} [options.jobTypes=[]] - Forwarded to `buildSearchUrl`.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 * @throws Re-throws any error raised outside the per-keyword handling
 *   (for example a failure to create the page) after logging it.
 */
async function skipthedriveStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer", "programmer"],
    locationFilter = null,
    maxPages = 5,
    jobTypes = [],
  } = options;

  // A bare string would otherwise be iterated character by character.
  const keywordList = (Array.isArray(keywords) ? keywords : [keywords]).filter(
    (entry) => entry != null
  );

  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set();

  // Listings without an article id come back with jobId === "". Keying the
  // de-duplication on jobId alone would collapse all of them into one entry,
  // so fall back to the URL and finally to a title/company/location triple.
  const dedupeKeyFor = (job) =>
    job.jobId || job.jobUrl || `${job.title}|${job.company}|${job.location}`;

  try {
    // Create main page
    const page = await coreParser.createPage("skipthedrive-main");

    logger.info("🚀 Starting SkipTheDrive parser...");
    logger.info(`🔍 Keywords: ${keywordList.join(", ")}`);
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(`📄 Max Pages: ${maxPages}`);

    // Search for each keyword
    for (const keyword of keywordList) {
      logger.info(`\n🔍 Searching for: ${keyword}`);

      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);

      try {
        // Navigate to search results
        await coreParser.navigateTo(searchUrl, {
          pageId: "skipthedrive-main",
          retries: 2,
          timeout: 30000,
        });

        // Wait for job listings to load; a failed wait is treated as
        // "no results" for this keyword rather than as a fatal error.
        let hasResults;
        try {
          hasResults = await coreParser.waitForSelector(
            "#loops-wrapper",
            { timeout: 5000 },
            "skipthedrive-main"
          );
        } catch {
          logger.warning(`No results found for keyword: ${keyword}`);
          hasResults = false;
        }

        if (!hasResults) {
          continue;
        }

        // Process multiple pages
        for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);

          // Extract jobs from current page
          const pageJobs = await extractJobsFromPage(
            page,
            keyword,
            locationFilter
          );

          for (const job of pageJobs) {
            // Skip duplicates (also across keywords)
            const dedupeKey = dedupeKeyFor(job);
            if (seenJobs.has(dedupeKey)) continue;
            seenJobs.add(dedupeKey);

            // Validate location if filtering enabled
            if (
              locationFilter &&
              !validateLocationAgainstFilters(job.location, locationFilter)
            ) {
              rejectedResults.push({
                ...job,
                rejectionReason: "Location filter mismatch",
              });
              continue;
            }

            results.push(job);
          }

          // Page budget exhausted: nothing more to navigate to.
          if (currentPage >= maxPages) break;

          // Check for next page
          if (!(await hasNextPageAvailable(page))) break;

          // If the helper explicitly reports failure (resolves to `false`),
          // stop instead of re-scraping the same page. Any other value is
          // treated as success.
          const moved = await navigateToNextPage(page, currentPage + 1);
          if (moved === false) break;

          // Wait for new page to load
          await page.waitForTimeout(2000);
        }
      } catch (error) {
        // One failing keyword must not abort the remaining keywords.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      }
    }

    logger.info(
      `🎯 SkipTheDrive parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
    );

    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywordList.join(", "),
        locationFilter,
        source: "skipthedrive",
      },
    };
  } catch (error) {
    logger.error(`❌ SkipTheDrive parsing failed: ${error.message}`);
    throw error;
  }
}

/**
 * Extract jobs from current page
 *
 * Collects every `article.job_listing` element on the page and converts each
 * one with `extractJobData`. Failures are logged, never thrown: a failing
 * element is skipped, and a failing page lookup yields whatever was collected
 * so far (normally an empty list).
 *
 * @param {object} page - Page handle exposing `$$(selector)`.
 * @param {string} keyword - Keyword recorded on each extracted job.
 * @param {*} locationFilter - Accepted for signature compatibility; not used
 *   inside this function.
 * @returns {Promise<object[]>} Extracted job records.
 */
async function extractJobsFromPage(page, keyword, locationFilter) {
  const collected = [];

  // Converts one listing element; resolves to null when extraction throws.
  const tryExtract = async (listing) => {
    try {
      return await extractJobData(listing, keyword);
    } catch (error) {
      logger.warning(`Failed to extract job data: ${error.message}`);
      return null;
    }
  };

  try {
    // Get all job article elements
    const listings = await page.$$("article.job_listing");

    for (const listing of listings) {
      const parsed = await tryExtract(listing);
      if (parsed) {
        collected.push(parsed);
      }
    }
  } catch (error) {
    logger.error(`Failed to extract jobs from page: ${error.message}`);
  }

  return collected;
}

/**
 * Convert a relative "posted" label into a day count and a calendar date.
 *
 * Recognises "N day(s) ago", "N week(s) ago", "N+ days ago", "today" and
 * "yesterday", case-insensitively. Anything else yields nulls.
 *
 * @param {string} dateText - Label text as shown on the listing.
 * @param {Date} [now=new Date()] - Reference instant.
 * @returns {{daysAgo: number|null, datePosted: string|null}} `datePosted` is
 *   a `YYYY-MM-DD` string in UTC.
 */
function parseRelativePostedDate(dateText, now = new Date()) {
  const text = (dateText || "").toLowerCase();
  let daysAgo = null;

  const match = text.match(/(\d+)\+?\s+(day|week)s?\s+ago/);
  if (match) {
    const count = Number.parseInt(match[1], 10);
    daysAgo = match[2] === "week" ? count * 7 : count;
  } else if (/\btoday\b/.test(text)) {
    daysAgo = 0;
  } else if (/\byesterday\b/.test(text)) {
    daysAgo = 1;
  }

  if (daysAgo === null) {
    return { daysAgo: null, datePosted: null };
  }

  // toISOString() reports UTC, so do the day arithmetic in UTC as well;
  // mixing local-time arithmetic with a UTC read-out can shift the date.
  const posted = new Date(now.getTime());
  posted.setUTCDate(posted.getUTCDate() - daysAgo);
  return { daysAgo, datePosted: posted.toISOString().split("T")[0] };
}

/**
 * Extract data from individual job element
 *
 * Reads id, title, URL, company, location, posted date, description and the
 * featured flag from one listing element. Missing sub-elements produce empty
 * strings (or `null` for the parsed date fields).
 *
 * @param {object} jobElement - Element handle exposing `getAttribute(name)`
 *   and `$(selector)`; matched children expose `textContent()` and
 *   `getAttribute(name)`.
 * @param {string} keyword - Search keyword stored on the record.
 * @returns {Promise<object|null>} The job record, or `null` when extraction
 *   throws (the error is logged as a warning).
 */
async function extractJobData(jobElement, keyword) {
  // Cleaned text of a child node, or "" when the node or its text is missing.
  const textOf = async (node) =>
    node ? cleanText((await node.textContent()) || "") : "";

  try {
    // Extract job ID (article ids look like "post-<id>")
    const articleId = (await jobElement.getAttribute("id")) || "";
    const jobId = articleId ? articleId.replace("post-", "") : "";

    // Extract title and link; a missing href becomes "" rather than null
    const titleElement = await jobElement.$(".job_listing-title a");
    const title = await textOf(titleElement);
    const jobUrl = titleElement
      ? (await titleElement.getAttribute("href")) || ""
      : "";

    // Extract company
    const company = await textOf(await jobElement.$(".company"));

    // Extract location
    const location = await textOf(await jobElement.$(".location"));

    // Extract date posted
    const dateText = await textOf(await jobElement.$(".job-date"));

    // Extract description
    const description = await textOf(
      await jobElement.$(".job_listing-description")
    );

    // Check if featured
    const featuredElement = await jobElement.$(".featured");
    const isFeatured = featuredElement !== null;

    // Parse date
    const { daysAgo, datePosted } = parseRelativePostedDate(dateText);

    return {
      jobId,
      title,
      company,
      location,
      jobUrl,
      datePosted,
      dateText,
      daysAgo,
      description,
      isFeatured,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "skipthedrive",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}

/**
 * Check if next page is available
 *
 * Looks for a `.next-page` element on the page.
 *
 * @param {object} page - Page handle exposing `$(selector)`.
 * @returns {Promise<boolean>} `true` when the element exists; `false` when it
 *   is absent or the lookup throws (the error is logged as a warning).
 */
async function hasNextPageAvailable(page) {
  try {
    const nextButton = await page.$(".next-page");
    // Loose null check so an `undefined` result also counts as "no button".
    return nextButton != null;
  } catch (error) {
    // Log instead of failing silently: a lookup error ends pagination early
    // and would otherwise be indistinguishable from a genuine last page.
    logger.warning(`Failed to check for next page: ${error.message}`);
    return false;
  }
}

/**
 * Navigate to next page
 *
 * Clicks the `.next-page` element if it exists. Never throws: problems are
 * logged as warnings.
 *
 * @param {object} page - Page handle exposing `$(selector)`.
 * @param {number} pageNumber - Target page number, used only in log messages.
 * @returns {Promise<boolean>} `true` when the control was clicked, `false`
 *   when it was missing or the click/lookup failed.
 */
async function navigateToNextPage(page, pageNumber) {
  try {
    const nextButton = await page.$(".next-page");

    if (!nextButton) {
      logger.warning(
        `Failed to navigate to page ${pageNumber}: next-page control not found`
      );
      return false;
    }

    await nextButton.click();
    return true;
  } catch (error) {
    logger.warning(
      `Failed to navigate to page ${pageNumber}: ${error.message}`
    );
    return false;
  }
}

// Public API: the strategy entry point plus the helpers exposed alongside it.
const publicApi = {};
publicApi.skipthedriveStrategy = skipthedriveStrategy;
publicApi.buildSearchUrl = buildSearchUrl;
publicApi.extractJobsFromPage = extractJobsFromPage;
publicApi.extractJobData = extractJobData;

module.exports = publicApi;