// linkedout/job-search-parser/strategies/skipthedrive-strategy.js
//
// 303 lines
// 7.8 KiB
// JavaScript

/**
* SkipTheDrive Parsing Strategy
*
* Uses core-parser for browser management and ai-analyzer for utilities
*/
const {
logger,
cleanText,
containsAnyKeyword,
validateLocationAgainstFilters,
} = require("ai-analyzer");
/**
 * Build the SkipTheDrive search URL for one keyword.
 *
 * @param {string} keyword - Search term, sent as the `s` query parameter.
 * @param {string} [orderBy="date"] - Value of the `orderby` query parameter.
 * @param {string[]} [jobTypes=[]] - When non-empty, joined with commas and
 *   sent as a single `job_type` query parameter.
 * @returns {string} Absolute search URL with an encoded query string.
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  const query = new URLSearchParams([
    ["s", keyword],
    ["orderby", orderBy],
  ]);

  // Only add the type filter when the caller actually supplied types.
  const hasTypeFilter = Boolean(jobTypes) && jobTypes.length > 0;
  if (hasTypeFilter) {
    const joinedTypes = jobTypes.join(",");
    query.append("job_type", joinedTypes);
  }

  const queryString = query.toString();
  return `https://www.skipthedrive.com/?${queryString}`;
}
/**
 * Build the key used to detect duplicate listings across pages and keywords.
 *
 * Listings without an article id come back with an empty `jobId`; keying on
 * that alone would collapse every id-less listing into one, so fall back to
 * the job URL and finally to the visible title/company/location text.
 */
function buildJobDedupeKey(job) {
  if (job.jobId) {
    return `id:${job.jobId}`;
  }
  if (job.jobUrl) {
    return `url:${job.jobUrl}`;
  }
  return `text:${job.title}|${job.company}|${job.location}`;
}

/**
 * SkipTheDrive parsing strategy function.
 *
 * For each keyword this navigates the shared "skipthedrive-main" page to the
 * search results, walks up to `maxPages` result pages, de-duplicates the
 * extracted jobs (across pages and keywords) and splits them into accepted
 * and location-rejected lists.
 *
 * A failure while processing one keyword is logged and the next keyword is
 * tried; failures outside the per-keyword handling are logged and rethrown.
 *
 * @param {object} coreParser - Must provide `createPage`, `navigateTo` and
 *   `waitForSelector`.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Search terms, processed in order.
 * @param {*} [options.locationFilter=null] - Passed through to
 *   `validateLocationAgainstFilters`; when falsy no location filtering is done.
 * @param {number} [options.maxPages=5] - Maximum result pages per keyword.
 * @param {string[]} [options.jobTypes=[]] - Forwarded to `buildSearchUrl`.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 */
async function skipthedriveStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer", "programmer"],
    locationFilter = null,
    maxPages = 5,
    jobTypes = [],
  } = options;
  const results = [];
  const rejectedResults = [];
  // Dedupe keys (see buildJobDedupeKey) of every job seen so far.
  const seenJobs = new Set();
  try {
    // Create main page
    const page = await coreParser.createPage("skipthedrive-main");
    logger.info("🚀 Starting SkipTheDrive parser...");
    logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(`📄 Max Pages: ${maxPages}`);
    // Search for each keyword
    for (const keyword of keywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);
      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
      try {
        // Navigate to search results
        await coreParser.navigateTo(searchUrl, {
          pageId: "skipthedrive-main",
          retries: 2,
          timeout: 30000,
        });
        // Wait for job listings to load; a rejection is treated as
        // "no results for this keyword" rather than as an error.
        const hasResults = await coreParser
          .waitForSelector(
            "#loops-wrapper",
            {
              timeout: 5000,
            },
            "skipthedrive-main"
          )
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
            return false;
          });
        if (!hasResults) {
          continue;
        }
        // Process multiple pages
        let currentPage = 1;
        let hasNextPage = true;
        while (hasNextPage && currentPage <= maxPages) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
          // Extract jobs from current page
          const pageJobs = await extractJobsFromPage(
            page,
            keyword,
            locationFilter
          );
          for (const job of pageJobs) {
            // Skip duplicates. The key falls back to URL/text so that
            // listings with an empty jobId are not all treated as one job.
            const dedupeKey = buildJobDedupeKey(job);
            if (seenJobs.has(dedupeKey)) continue;
            seenJobs.add(dedupeKey);
            // Validate location if filtering enabled
            if (locationFilter) {
              const locationValid = validateLocationAgainstFilters(
                job.location,
                locationFilter
              );
              if (!locationValid) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: "Location filter mismatch",
                });
                continue;
              }
            }
            results.push(job);
          }
          // Check for next page
          hasNextPage = await hasNextPageAvailable(page);
          if (hasNextPage && currentPage < maxPages) {
            await navigateToNextPage(page, currentPage + 1);
            currentPage++;
            // Wait for new page to load
            await page.waitForTimeout(2000);
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        // One failing keyword must not abort the remaining keywords.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      }
    }
    logger.info(
      `🎯 SkipTheDrive parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
    );
    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "skipthedrive",
      },
    };
  } catch (error) {
    logger.error(`❌ SkipTheDrive parsing failed: ${error.message}`);
    throw error;
  }
}
/**
 * Collect every job listing found on the page that is currently loaded.
 *
 * Each `article.job_listing` element is handed to `extractJobData` in
 * document order. A listing that fails to parse is logged as a warning and
 * skipped; a failure of the page query itself is logged as an error and the
 * jobs gathered so far (possibly none) are returned.
 *
 * @param {object} page - Page handle exposing `$$(selector)`.
 * @param {string} keyword - Search keyword forwarded to `extractJobData`.
 * @param {*} locationFilter - Accepted for signature compatibility; not used here.
 * @returns {Promise<object[]>} Extracted jobs; never rejects.
 */
async function extractJobsFromPage(page, keyword, locationFilter) {
  const collected = [];
  try {
    const listingNodes = await page.$$("article.job_listing");
    for (const node of listingNodes) {
      let parsed = null;
      try {
        parsed = await extractJobData(node, keyword);
      } catch (err) {
        logger.warning(`Failed to extract job data: ${err.message}`);
        continue;
      }
      // extractJobData signals "nothing usable" with a falsy value.
      if (!parsed) {
        continue;
      }
      collected.push(parsed);
    }
  } catch (err) {
    logger.error(`Failed to extract jobs from page: ${err.message}`);
  }
  return collected;
}
/**
 * Read the cleaned text of the first descendant matching `selector`.
 * Returns "" when no such descendant exists.
 */
async function readCleanText(jobElement, selector) {
  const node = await jobElement.$(selector);
  return node ? cleanText(await node.textContent()) : "";
}

/**
 * Turn a relative date such as "3 days ago" into a day count and a
 * YYYY-MM-DD date. Both values are null when the text has no such phrase.
 */
function parsePostedDate(dateText) {
  const match = dateText ? dateText.match(/(\d+)\s+days?\s+ago/i) : null;
  if (!match) {
    return { daysAgo: null, datePosted: null };
  }
  const daysAgo = Number.parseInt(match[1], 10);
  const date = new Date();
  // toISOString() formats in UTC, so do the day arithmetic in UTC as well;
  // mixing local-time setDate() with UTC formatting can shift the result.
  date.setUTCDate(date.getUTCDate() - daysAgo);
  return { daysAgo, datePosted: date.toISOString().split("T")[0] };
}

/**
 * Extract data from individual job element.
 *
 * Missing sub-elements and attributes yield empty strings (`jobUrl` included)
 * so every text field of the result is always a string. `datePosted` and
 * `daysAgo` are null unless the date text contains "<n> day(s) ago".
 *
 * @param {object} jobElement - Element handle exposing `getAttribute(name)`
 *   and `$(selector)`; matched children must expose `textContent()` and
 *   `getAttribute(name)`.
 * @param {string} keyword - Search keyword recorded on the returned job.
 * @returns {Promise<object|null>} The job record, or null when extraction
 *   throws (the error is logged as a warning).
 */
async function extractJobData(jobElement, keyword) {
  try {
    // Extract job ID: the article id with its "post-" marker removed.
    const articleId = (await jobElement.getAttribute("id")) || "";
    const jobId = articleId ? articleId.replace("post-", "") : "";
    // Extract title and link from the same anchor.
    const titleElement = await jobElement.$(".job_listing-title a");
    const title = titleElement
      ? cleanText(await titleElement.textContent())
      : "";
    // getAttribute may yield null for a missing href; normalise to "".
    const jobUrl = titleElement
      ? (await titleElement.getAttribute("href")) || ""
      : "";
    const company = await readCleanText(jobElement, ".company");
    const location = await readCleanText(jobElement, ".location");
    const dateText = await readCleanText(jobElement, ".job-date");
    const description = await readCleanText(
      jobElement,
      ".job_listing-description"
    );
    // Check if featured
    const featuredElement = await jobElement.$(".featured");
    const isFeatured = featuredElement !== null;
    // Parse date
    const { datePosted, daysAgo } = parsePostedDate(dateText);
    return {
      jobId,
      title,
      company,
      location,
      jobUrl,
      datePosted,
      dateText,
      daysAgo,
      description,
      isFeatured,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "skipthedrive",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Report whether the current page shows a ".next-page" control.
 *
 * @param {object} page - Page handle exposing `$(selector)`.
 * @returns {Promise<boolean>} true when the query yields anything other than
 *   null; false when it yields null or the query itself throws.
 */
async function hasNextPageAvailable(page) {
  let nextLink;
  try {
    nextLink = await page.$(".next-page");
  } catch {
    // A failed lookup is treated the same as "no further pages".
    return false;
  }
  return nextLink !== null;
}
/**
 * Click the ".next-page" control on the current page, if there is one.
 *
 * Does nothing when the control is absent. Any error from the lookup or the
 * click is logged as a warning and not propagated.
 *
 * @param {object} page - Page handle exposing `$(selector)`.
 * @param {number} pageNumber - Target page number; used only in the warning.
 * @returns {Promise<void>}
 */
async function navigateToNextPage(page, pageNumber) {
  try {
    const nextLink = await page.$(".next-page");
    if (!nextLink) {
      return;
    }
    await nextLink.click();
  } catch (err) {
    const reason = err.message;
    logger.warning(`Failed to navigate to page ${pageNumber}: ${reason}`);
  }
}
// Public API of this strategy module: the strategy entry point, the URL
// builder and the two extraction helpers. The pagination helpers
// (hasNextPageAvailable, navigateToNextPage) are not exported.
module.exports = {
skipthedriveStrategy,
buildSearchUrl,
extractJobsFromPage,
extractJobData,
};