/*
 * Change summary:
 * - Implemented grouped AND/OR logic for keyword searches, allowing for more flexible job matching criteria.
 * - Added a minimum date filter to restrict job results to postings after a specified date.
 * - Enhanced job detail extraction to include role duties and job requirements from job descriptions.
 * - Updated README with new command line options and examples for using date filters and keyword logic.
 * - Improved logging to provide clearer insights into keyword matching logic and job search parameters.
 *
 * File info: 494 lines, 14 KiB, JavaScript
 */

/**
 * SkipTheDrive Parsing Strategy
 *
 * Uses core-parser for browser management and ai-analyzer for utilities
 */

// Utilities provided by the ai-analyzer package and used throughout this file.
const {
  logger,
  cleanText,
  containsAnyKeyword,
  containsAllKeywords,
  matchesKeywordGroups,
  validateLocationAgainstFilters,
} = require("ai-analyzer");

/**
 * SkipTheDrive URL builder
 *
 * @param {string} keyword - Search phrase, sent as the `s` query parameter.
 * @param {string} [orderBy="date"] - Value of the `orderby` query parameter.
 * @param {string[]|string} [jobTypes=[]] - Job type(s). When non-empty they are
 *   joined with commas and sent as the `job_type` query parameter.
 * @returns {string} Search URL on https://www.skipthedrive.com/ with a
 *   URL-encoded query string.
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  const baseUrl = "https://www.skipthedrive.com/";
  const params = new URLSearchParams({
    s: keyword,
    orderby: orderBy,
  });

  // Accept a single job type string as well as an array; a bare string used to
  // throw because strings have no `join` method.
  let types = [];
  if (Array.isArray(jobTypes)) {
    types = jobTypes;
  } else if (jobTypes) {
    types = [jobTypes];
  }

  if (types.length > 0) {
    params.append("job_type", types.join(","));
  }

  return `${baseUrl}?${params.toString()}`;
}

/**
 * SkipTheDrive parsing strategy function
 *
 * Runs one search per search keyword, walks up to `maxPages` result pages for
 * each, de-duplicates the extracted jobs and splits them into accepted and
 * rejected lists according to the keyword and location filters.
 *
 * @param {object} coreParser - Browser manager. This function calls
 *   `createPage(pageId)` and `navigateTo(url, options)` on it; the page it
 *   returns must offer `waitForSelector` and `waitForTimeout`.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Keywords for simple OR/AND searches.
 * @param {string[][]|null} [options.keywordGroups] - Groups for grouped logic:
 *   every group must match (AND), any keyword inside a group may match (OR).
 * @param {*} [options.locationFilter] - Passed to
 *   `validateLocationAgainstFilters`; falsy disables location filtering.
 * @param {number} [options.maxPages=5] - Maximum result pages per keyword.
 * @param {string[]} [options.jobTypes] - Forwarded to `buildSearchUrl`.
 * @param {boolean} [options.useAndLogic=false] - Require all keywords to match.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 * @throws Re-throws any error raised outside the per-keyword handling
 *   (for example a failure to create the page) after logging it.
 */
async function skipthedriveStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer", "programmer"],
    keywordGroups = null, // Array of keyword groups for grouped AND/OR logic
    locationFilter = null,
    maxPages = 5,
    jobTypes = [],
    useAndLogic = false, // Use AND logic instead of OR logic for keywords
  } = options;

  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set();

  // Identity used for de-duplication. Listings without an article id used to
  // share the empty-string key, so only the first of them survived; fall back
  // to the URL and then to the visible fields.
  const dedupeKey = (job) =>
    job.jobId || job.jobUrl || `${job.title}|${job.company}|${job.location}`;

  // Returns the reason a job is rejected, or null when it passes every filter.
  // Keyword checks run before the location check, so a job failing both is
  // reported with the keyword reason.
  const findRejectionReason = (job) => {
    if (keywordGroups) {
      // Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR)
      const fullText = `${job.title} ${job.description} ${job.company}`;
      if (!matchesKeywordGroups(fullText, keywordGroups)) {
        return "Job does not match all keyword groups";
      }
    } else if (useAndLogic) {
      // Simple AND logic: all keywords must match
      const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase();
      if (!containsAllKeywords(fullText, keywords)) {
        return "Not all keywords found in job listing";
      }
    }

    if (locationFilter && !validateLocationAgainstFilters(job.location, locationFilter)) {
      return "Location filter mismatch";
    }

    return null;
  };

  try {
    // Create main page
    const page = await coreParser.createPage("skipthedrive-main");

    logger.info("🚀 Starting SkipTheDrive parser...");
    logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
    if (keywordGroups) {
      logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
    } else {
      logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
    }
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(`📄 Max Pages: ${maxPages}`);

    // Determine search keywords based on logic type
    let searchKeywords;
    if (keywordGroups) {
      // Search every keyword of every group once; a keyword repeated across
      // groups would otherwise trigger the same search twice.
      searchKeywords = [...new Set(keywordGroups.flat())];
    } else if (useAndLogic) {
      // For simple AND logic, combine all keywords into a single search query
      searchKeywords = [keywords.join(" ")];
    } else {
      // For OR logic, search each keyword separately
      searchKeywords = keywords;
    }

    // Search for each keyword (or combined keyword for AND logic)
    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);

      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);

      try {
        // Navigate to search results
        await coreParser.navigateTo(searchUrl, {
          pageId: "skipthedrive-main",
          retries: 2,
          timeout: 30000,
        });

        // Wait for job listings to load; a timeout is treated as "no results"
        try {
          await page.waitForSelector("#loops-wrapper", { timeout: 5000 });
        } catch {
          logger.warning(`No results found for keyword: ${keyword}`);
          continue;
        }

        // Process multiple pages
        for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);

          // Extract jobs from current page
          const pageJobs = await extractJobsFromPage(
            page,
            keyword,
            locationFilter,
            keywords,
            keywordGroups,
            useAndLogic
          );

          for (const job of pageJobs) {
            // Skip duplicates
            const key = dedupeKey(job);
            if (seenJobs.has(key)) continue;
            seenJobs.add(key);

            const rejectionReason = findRejectionReason(job);
            if (rejectionReason) {
              rejectedResults.push({ ...job, rejectionReason });
            } else {
              results.push(job);
            }
          }

          // Stop on the last allowed page or when there is no next-page link
          if (currentPage >= maxPages || !(await hasNextPageAvailable(page))) {
            break;
          }

          await navigateToNextPage(page, currentPage + 1);

          // Wait for new page to load
          await page.waitForTimeout(2000);
        }
      } catch (error) {
        // A failure for one keyword must not abort the remaining searches
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      }
    }

    logger.info(
      `🎯 SkipTheDrive parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
    );

    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "skipthedrive",
      },
    };
  } catch (error) {
    logger.error(`❌ SkipTheDrive parsing failed: ${error.message}`);
    throw error;
  }
}

/**
 * Extract jobs from current page
 *
 * Queries every `article.job_listing` element on the page and converts each
 * one with `extractJobData`. Failures are logged and skipped so one bad
 * listing (or a failed query) never aborts the caller.
 *
 * @param {object} page - Page handle offering `$$(selector)`.
 * @param {string} keyword - Search keyword recorded on each extracted job.
 * @param {*} locationFilter - Accepted for interface compatibility; not used here.
 * @param {string[]} [allKeywords=[]] - Accepted for interface compatibility; not used here.
 * @param {string[][]|null} [keywordGroups=null] - Accepted for interface compatibility; not used here.
 * @param {boolean} [useAndLogic=false] - Accepted for interface compatibility; not used here.
 * @returns {Promise<object[]>} Extracted jobs; empty when nothing could be read.
 */
async function extractJobsFromPage(page, keyword, locationFilter, allKeywords = [], keywordGroups = null, useAndLogic = false) {
  const jobs = [];

  try {
    // Get all job article elements
    const jobElements = await page.$$("article.job_listing");

    for (const jobElement of jobElements) {
      try {
        const job = await extractJobData(jobElement, keyword);
        if (job) {
          jobs.push(job);
        }
      } catch (error) {
        logger.warning(`Failed to extract job data: ${error.message}`);
      }
    }
  } catch (error) {
    logger.error(`Failed to extract jobs from page: ${error.message}`);
  }

  return jobs;
}

// Common section headers that indicate duties/responsibilities
const DUTIES_HEADER_PATTERNS = [
  /responsibilities?:/i,
  /duties?:/i,
  /what you['\u2019]ll do/i,
  /key responsibilities/i,
  /your role/i,
  /position overview/i,
  /about the role/i,
  /role overview/i,
  /what we need/i,
  /you will:/i,
  /you['\u2019]ll be responsible/i,
];

// Common section headers that indicate requirements/qualifications
const REQUIREMENTS_HEADER_PATTERNS = [
  /requirements?:/i,
  /qualifications?:/i,
  /must have/i,
  /required:/i,
  /what you['\u2019]ll bring/i,
  /you have:/i,
  /skills required/i,
  /minimum requirements/i,
  /preferred qualifications/i,
  /education:/i,
  /experience:/i,
  /you must have/i,
  /we['\u2019]re looking for/i,
];

/**
 * Sort text chunks into duties and requirements.
 *
 * The current bucket starts as "duties" and is sticky: a chunk containing a
 * requirements header switches to "requirements", a chunk containing a duties
 * header switches back, and any other chunk stays in the current bucket.
 * Requirements headers are checked first, so they win when both kinds match.
 *
 * @param {string[]} chunks - Paragraphs or lines, in document order.
 * @param {string} separator - String used to join the chunks of one bucket.
 * @returns {{duties: string, requirements: string}}
 */
function classifyDescriptionChunks(chunks, separator) {
  const buckets = { duties: [], requirements: [] };
  let currentSection = "duties";

  for (const chunk of chunks) {
    if (REQUIREMENTS_HEADER_PATTERNS.some((pattern) => pattern.test(chunk))) {
      currentSection = "requirements";
    } else if (DUTIES_HEADER_PATTERNS.some((pattern) => pattern.test(chunk))) {
      currentSection = "duties";
    }
    buckets[currentSection].push(chunk.trim());
  }

  return {
    duties: buckets.duties.join(separator),
    requirements: buckets.requirements.join(separator),
  };
}

/**
 * Parse job description to separate role duties from job requirements
 *
 * Paragraphs (separated by blank lines) are classified as a whole. When the
 * description is a single paragraph spanning several lines, classification
 * runs line by line instead. Previously that line-level pass could never
 * execute, so such a description ended up entirely in one bucket. A
 * description that is a single line is assigned whole to one bucket.
 *
 * @param {string} description - Raw job description text.
 * @returns {{duties: string, requirements: string}} Both strings are empty
 *   when the description is missing, blank, or not a string.
 */
function parseDutiesAndRequirements(description) {
  if (typeof description !== "string" || description.trim().length === 0) {
    return { duties: "", requirements: "" };
  }

  const isNotBlank = (text) => text.trim().length > 0;

  // Split description into sections (by common delimiters)
  const paragraphs = description.split(/\n\s*\n|\r\n\s*\r\n/).filter(isNotBlank);
  if (paragraphs.length > 1) {
    return classifyDescriptionChunks(paragraphs, "\n\n");
  }

  // No paragraph breaks: fall back to line granularity so headers such as
  // "Requirements:" on their own line still split the text.
  const lines = description.split(/\r?\n/).filter(isNotBlank);
  if (lines.length > 1) {
    return classifyDescriptionChunks(lines, "\n");
  }

  return classifyDescriptionChunks(paragraphs, "\n\n");
}

/**
 * Convert a relative posting date into a whole number of days.
 *
 * Recognizes "today", "yesterday" and "N minutes/hours/days/weeks ago"
 * (case-insensitive, abbreviated "mins"/"hrs" included). Minutes and hours
 * count as 0 days; weeks count as 7 days each.
 *
 * @param {string} dateText - Text such as "3 days ago".
 * @returns {number|null} Days since posting, or null when unrecognized.
 */
function parseDaysAgo(dateText) {
  if (!dateText) return null;

  const match = dateText.match(/(\d+)\s+(min(?:ute)?|h(?:ou)?r|day|week)s?\s+ago/i);
  if (match) {
    const amount = Number.parseInt(match[1], 10);
    const unit = match[2].toLowerCase();
    if (unit === "day") return amount;
    if (unit === "week") return amount * 7;
    return 0; // minutes or hours: posted today
  }

  if (/\byesterday\b/i.test(dateText)) return 1;
  if (/\btoday\b/i.test(dateText)) return 0;

  return null;
}

/**
 * Read the cleaned text of the first descendant matching `selector`.
 *
 * @param {object} parent - Element handle offering `$(selector)`.
 * @param {string} selector - CSS selector.
 * @returns {Promise<string>} `cleanText` of the element's text content, or ""
 *   when no element matches.
 */
async function readElementText(parent, selector) {
  const element = await parent.$(selector);
  return element ? cleanText(await element.textContent()) : "";
}

/**
 * Extract data from individual job element
 *
 * @param {object} jobElement - Element handle of one job listing; must offer
 *   `getAttribute(name)` and `$(selector)`.
 * @param {string} keyword - Search keyword that surfaced this listing; copied
 *   into the result.
 * @returns {Promise<object|null>} Job record, or null (after logging a
 *   warning) when extraction throws.
 */
async function extractJobData(jobElement, keyword) {
  try {
    // Extract job ID
    const articleId = (await jobElement.getAttribute("id")) || "";
    const jobId = articleId ? articleId.replace("post-", "") : "";

    // Extract title and link (both come from the same anchor)
    const titleElement = await jobElement.$(".job_listing-title a");
    const title = titleElement
      ? cleanText(await titleElement.textContent())
      : "";
    const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";

    const company = await readElementText(jobElement, ".company");
    const location = await readElementText(jobElement, ".location");
    const dateText = await readElementText(jobElement, ".job-date");
    const description = await readElementText(jobElement, ".job_listing-description");

    // Check if featured
    const featuredElement = await jobElement.$(".featured");
    const isFeatured = featuredElement !== null;

    // Parse date
    const daysAgo = parseDaysAgo(dateText);
    let datePosted = null;
    if (daysAgo !== null) {
      // Do the arithmetic in UTC because the result is formatted from
      // toISOString(), which is always UTC.
      const posted = new Date();
      posted.setUTCDate(posted.getUTCDate() - daysAgo);
      datePosted = posted.toISOString().split("T")[0];
    }

    // Parse duties and requirements from description if available
    const parsed = parseDutiesAndRequirements(description);

    return {
      jobId,
      title,
      company,
      location,
      jobUrl,
      datePosted,
      dateText,
      daysAgo,
      description,
      roleDuties: parsed.duties,
      jobRequirements: parsed.requirements,
      isFeatured,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "skipthedrive",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}

/**
 * Check if next page is available
 *
 * @param {object} page - Page handle offering `$(selector)`.
 * @returns {Promise<boolean>} True when a `.next-page` element is found;
 *   false when none is found or the lookup throws.
 */
async function hasNextPageAvailable(page) {
  try {
    const nextButton = await page.$(".next-page");
    return nextButton !== null;
  } catch (error) {
    // Treat a failed lookup as "no next page", but leave a trace in the log
    // instead of swallowing the error silently.
    logger.warning(`Failed to check for next page: ${error.message}`);
    return false;
  }
}

/**
 * Navigate to next page
 *
 * Clicks the `.next-page` element when one exists. Errors are logged as
 * warnings and not re-thrown; this function does not itself wait for the new
 * page to load.
 *
 * @param {object} page - Page handle offering `$(selector)`.
 * @param {number} pageNumber - Target page number; used only in the warning message.
 * @returns {Promise<void>}
 */
async function navigateToNextPage(page, pageNumber) {
  try {
    const nextButton = await page.$(".next-page");
    if (nextButton) {
      await nextButton.click();
    }
  } catch (error) {
    logger.warning(
      `Failed to navigate to page ${pageNumber}: ${error.message}`
    );
  }
}

module.exports = {
|
|
skipthedriveStrategy,
|
|
buildSearchUrl,
|
|
extractJobsFromPage,
|
|
extractJobData,
|
|
};
|