linkedout/job-search-parser/strategies/skipthedrive-strategy.js
tanyar09 47cdc03fb8 Enhance job search parser with advanced keyword filtering and job detail extraction
- Implemented grouped AND/OR logic for keyword searches, allowing for more flexible job matching criteria.
- Added a minimum date filter to restrict job results to postings after a specified date.
- Enhanced job detail extraction to include role duties and job requirements from job descriptions.
- Updated README with new command line options and examples for using date filters and keyword logic.
- Improved logging to provide clearer insights into keyword matching logic and job search parameters.
2025-12-18 13:33:19 -05:00

494 lines
14 KiB
JavaScript

/**
* SkipTheDrive Parsing Strategy
*
* Uses core-parser for browser management and ai-analyzer for utilities
*/
const {
logger,
cleanText,
containsAnyKeyword,
containsAllKeywords,
matchesKeywordGroups,
validateLocationAgainstFilters,
} = require("ai-analyzer");
/**
 * Build the SkipTheDrive search URL for one query.
 *
 * @param {string} keyword - Search phrase, sent as the `s` query parameter.
 * @param {string} [orderBy="date"] - Value of the `orderby` query parameter.
 * @param {string[]} [jobTypes=[]] - Optional job types; when non-empty they are
 *   comma-joined into a single `job_type` query parameter.
 * @returns {string} Absolute search URL with a URL-encoded query string.
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  const query = new URLSearchParams();
  query.set("s", keyword);
  query.set("orderby", orderBy);

  // Only add the filter when there is at least one job type to send.
  const wantsJobTypeFilter = Boolean(jobTypes) && jobTypes.length > 0;
  if (wantsJobTypeFilter) {
    query.append("job_type", jobTypes.join(","));
  }

  const origin = "https://www.skipthedrive.com/";
  return `${origin}?${query.toString()}`;
}
/**
 * Build the key used to de-duplicate jobs across keywords and pages.
 *
 * The extracted `jobId` is an empty string when the listing has no `id`
 * attribute; keying on it alone would make every id-less job look like a
 * duplicate of the first one. Fall back to the job URL, then to a
 * title/company/location composite.
 *
 * @param {object} job - Job record produced by the page extractor.
 * @returns {string} De-duplication key.
 */
function getJobDedupeKey(job) {
  return job.jobId || job.jobUrl || `${job.title}|${job.company}|${job.location}`;
}

/**
 * SkipTheDrive parsing strategy function
 *
 * Runs one search per keyword, walks up to `maxPages` result pages for each,
 * de-duplicates the extracted jobs and sorts them into accepted and rejected
 * lists according to the keyword and location filters.
 *
 * @param {object} coreParser - Browser manager. This function calls
 *   `createPage(pageId)` and `navigateTo(url, { pageId, retries, timeout })`
 *   on it; the returned page must provide `waitForSelector` and
 *   `waitForTimeout`.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Keywords to search for.
 * @param {string[][]|null} [options.keywordGroups] - Grouped logic: every group
 *   must match (AND), any keyword inside a group may match (OR). Takes
 *   precedence over `useAndLogic`.
 * @param {*} [options.locationFilter] - When truthy, jobs are validated with
 *   `validateLocationAgainstFilters(job.location, locationFilter)`.
 * @param {number} [options.maxPages=5] - Maximum result pages per keyword.
 * @param {string[]} [options.jobTypes] - Passed through to `buildSearchUrl`.
 * @param {boolean} [options.useAndLogic=false] - Require all keywords to match.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 * @throws Rethrows any error raised outside the per-keyword loop (for example
 *   from `createPage`). Errors while processing a single keyword are logged
 *   and the next keyword is tried.
 */
async function skipthedriveStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer", "programmer"],
    keywordGroups = null, // Array of keyword groups for grouped AND/OR logic
    locationFilter = null,
    maxPages = 5,
    jobTypes = [],
    useAndLogic = false, // Use AND logic instead of OR logic for keywords
  } = options;

  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set();

  try {
    // Create main page
    const page = await coreParser.createPage("skipthedrive-main");

    logger.info("🚀 Starting SkipTheDrive parser...");
    logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
    if (keywordGroups) {
      logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
    } else {
      logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
    }
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(`📄 Max Pages: ${maxPages}`);

    // Determine search keywords based on logic type
    let searchKeywords;
    if (keywordGroups) {
      // Grouped logic: search every keyword of every group once. A keyword that
      // appears in several groups would otherwise trigger identical searches.
      searchKeywords = [...new Set(keywordGroups.flat())];
    } else if (useAndLogic) {
      // For simple AND logic, combine all keywords into a single search query
      searchKeywords = [keywords.join(" ")];
    } else {
      // For OR logic, search each keyword separately
      searchKeywords = keywords;
    }

    // Search for each keyword (or combined keyword for AND logic)
    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);
      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);

      try {
        // Navigate to search results
        await coreParser.navigateTo(searchUrl, {
          pageId: "skipthedrive-main",
          retries: 2,
          timeout: 30000,
        });

        // Wait for job listings to load; a timeout is treated as "no results"
        const hasResults = await page
          .waitForSelector("#loops-wrapper", {
            timeout: 5000,
          })
          .then(() => true)
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
            return false;
          });
        if (!hasResults) {
          continue;
        }

        // Process multiple pages
        let currentPage = 1;
        let hasNextPage = true;
        while (hasNextPage && currentPage <= maxPages) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);

          // Extract jobs from current page
          const pageJobs = await extractJobsFromPage(
            page,
            keyword,
            locationFilter,
            keywords,
            keywordGroups,
            useAndLogic
          );

          for (const job of pageJobs) {
            // Skip duplicates (same job found by another keyword or page)
            const dedupeKey = getJobDedupeKey(job);
            if (seenJobs.has(dedupeKey)) continue;
            seenJobs.add(dedupeKey);

            // Validate keywords based on logic type
            if (keywordGroups) {
              // Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR)
              const fullText = `${job.title} ${job.description} ${job.company}`;
              if (!matchesKeywordGroups(fullText, keywordGroups)) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: "Job does not match all keyword groups",
                });
                continue;
              }
            } else if (useAndLogic) {
              // Simple AND logic: all keywords must match
              const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase();
              if (!containsAllKeywords(fullText, keywords)) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: "Not all keywords found in job listing",
                });
                continue;
              }
            }

            // Validate location if filtering enabled
            if (locationFilter) {
              const locationValid = validateLocationAgainstFilters(
                job.location,
                locationFilter
              );
              if (!locationValid) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: "Location filter mismatch",
                });
                continue;
              }
            }

            results.push(job);
          }

          // Check for next page
          hasNextPage = await hasNextPageAvailable(page);
          if (hasNextPage && currentPage < maxPages) {
            await navigateToNextPage(page, currentPage + 1);
            currentPage++;
            // Wait for new page to load
            await page.waitForTimeout(2000);
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        // One failing keyword must not abort the remaining searches
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      }
    }

    logger.info(
      `🎯 SkipTheDrive parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
    );

    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "skipthedrive",
      },
    };
  } catch (error) {
    logger.error(`❌ SkipTheDrive parsing failed: ${error.message}`);
    throw error;
  }
}
/**
 * Collect every job listed on the page that is currently loaded.
 *
 * Each `article.job_listing` element is handed to `extractJobData`; listings
 * that yield a falsy result or throw are skipped. A failure while querying
 * the page is logged and whatever was collected so far is returned.
 *
 * @param {object} page - Page handle exposing `$$(selector)`.
 * @param {string} keyword - Search keyword forwarded to `extractJobData`.
 * @param {*} locationFilter - Accepted but not read by this function.
 * @param {string[]} [allKeywords=[]] - Accepted but not read by this function.
 * @param {string[][]|null} [keywordGroups=null] - Accepted but not read by this function.
 * @param {boolean} [useAndLogic=false] - Accepted but not read by this function.
 * @returns {Promise<object[]>} Extracted job records, in page order.
 */
async function extractJobsFromPage(page, keyword, locationFilter, allKeywords = [], keywordGroups = null, useAndLogic = false) {
  const collected = [];

  try {
    const articles = await page.$$("article.job_listing");

    for (const article of articles) {
      let record = null;
      try {
        record = await extractJobData(article, keyword);
      } catch (error) {
        logger.warning(`Failed to extract job data: ${error.message}`);
        continue;
      }

      if (!record) {
        continue;
      }
      collected.push(record);
    }
  } catch (error) {
    logger.error(`Failed to extract jobs from page: ${error.message}`);
  }

  return collected;
}
/**
 * Parse job description to separate role duties from job requirements
 *
 * Heuristic: the description is cut into chunks and each chunk is assigned to
 * the "duties" or "requirements" bucket. A chunk containing a requirements
 * header switches the current bucket to requirements, a chunk containing a
 * duties header switches it back to duties, and a chunk without any header
 * stays in the current bucket (initially duties). When a chunk matches both
 * kinds of header, requirements wins.
 *
 * Chunks are blank-line separated paragraphs when the text has more than one
 * paragraph; otherwise the text is processed line by line so that header
 * lines inside a single paragraph still separate the two parts.
 *
 * @param {string} description - Raw job description text.
 * @returns {{duties: string, requirements: string}} Both parts; either may be
 *   an empty string. Non-string or blank input yields two empty strings.
 */
function parseDutiesAndRequirements(description) {
  if (typeof description !== "string" || description.trim().length === 0) {
    return { duties: "", requirements: "" };
  }

  // Common section headers that indicate duties/responsibilities
  const dutiesKeywords = [
    /responsibilities?:/i,
    /duties?:/i,
    /what you['\u2019]ll do/i,
    /key responsibilities/i,
    /your role/i,
    /position overview/i,
    /about the role/i,
    /role overview/i,
    /what we need/i,
    /you will:/i,
    /you['\u2019]ll be responsible/i,
  ];

  // Common section headers that indicate requirements/qualifications
  const requirementsKeywords = [
    /requirements?:/i,
    /qualifications?:/i,
    /must have/i,
    /required:/i,
    /what you['\u2019]ll bring/i,
    /you have:/i,
    /skills required/i,
    /minimum requirements/i,
    /preferred qualifications/i,
    /education:/i,
    /experience:/i,
    /you must have/i,
    /we['\u2019]re looking for/i,
  ];

  const dropBlank = (chunk) => chunk.length > 0;

  // Split description into paragraphs (separated by blank lines)
  const paragraphs = description
    .split(/\n\s*\n|\r\n\s*\r\n/)
    .map((paragraph) => paragraph.trim())
    .filter(dropBlank);

  // A single paragraph cannot be separated at paragraph level, so fall back
  // to individual lines in that case.
  const hasParagraphs = paragraphs.length > 1;
  const chunks = hasParagraphs
    ? paragraphs
    : description
        .split(/\n/)
        .map((line) => line.trim())
        .filter(dropBlank);
  const separator = hasParagraphs ? "\n\n" : "\n";

  const buckets = { duties: [], requirements: [] };
  let currentSection = "duties"; // Default to duties

  for (const chunk of chunks) {
    // The patterns carry no `g`/`y` flag, so `test` is stateless here.
    if (requirementsKeywords.some((pattern) => pattern.test(chunk))) {
      currentSection = "requirements";
    } else if (dutiesKeywords.some((pattern) => pattern.test(chunk))) {
      currentSection = "duties";
    }
    buckets[currentSection].push(chunk);
  }

  return {
    duties: buckets.duties.join(separator),
    requirements: buckets.requirements.join(separator),
  };
}
/**
 * Extract data from individual job element
 *
 * Reads id, title, URL, company, location, posting date and description from
 * one job listing element and derives `datePosted`/`daysAgo` from relative
 * date text of the form "N day(s) ago" (case-insensitive).
 *
 * @param {object} jobElement - Element handle exposing `getAttribute(name)`
 *   and `$(selector)`; matched children must expose `textContent()`.
 * @param {string} keyword - Search keyword that produced this listing; stored
 *   on the returned record.
 * @returns {Promise<object|null>} Job record, or `null` when extraction threw
 *   (the error is logged as a warning). Text fields are `""` when the
 *   corresponding child element is missing; `datePosted` (YYYY-MM-DD, UTC)
 *   and `daysAgo` are `null` when the date text is not recognised.
 */
async function extractJobData(jobElement, keyword) {
  // Cleaned text of the first child matching `selector`, or "" when absent.
  const readText = async (selector) => {
    const element = await jobElement.$(selector);
    return element ? cleanText(await element.textContent()) : "";
  };

  try {
    // Extract job ID (the element id with its "post-" prefix removed)
    const articleId = (await jobElement.getAttribute("id")) || "";
    const jobId = articleId ? articleId.replace("post-", "") : "";

    // Extract title and URL from the same anchor
    const titleElement = await jobElement.$(".job_listing-title a");
    const title = titleElement
      ? cleanText(await titleElement.textContent())
      : "";
    const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";

    const company = await readText(".company");
    const location = await readText(".location");
    const dateText = await readText(".job-date");
    const description = await readText(".job_listing-description");

    // Check if featured
    const featuredElement = await jobElement.$(".featured");
    const isFeatured = featuredElement !== null;

    // Parse date
    let datePosted = null;
    let daysAgo = null;
    if (dateText) {
      const match = dateText.match(/(\d+)\s+days?\s+ago/i);
      if (match) {
        daysAgo = Number.parseInt(match[1], 10);
        // Subtract in UTC because the result is formatted from the UTC ISO
        // string; mixing local-time arithmetic with UTC formatting can shift
        // the calendar day.
        const posted = new Date();
        posted.setUTCDate(posted.getUTCDate() - daysAgo);
        datePosted = posted.toISOString().split("T")[0];
      }
    }

    // Parse duties and requirements from description if available
    const parsed = parseDutiesAndRequirements(description);

    return {
      jobId,
      title,
      company,
      location,
      jobUrl,
      datePosted,
      dateText,
      daysAgo,
      description,
      roleDuties: parsed.duties,
      jobRequirements: parsed.requirements,
      isFeatured,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "skipthedrive",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Tell whether the current results page offers a "next page" control.
 *
 * @param {object} page - Page handle exposing `$(selector)`.
 * @returns {Promise<boolean>} `true` when `.next-page` resolves to anything
 *   other than `null`; `false` when it is absent or the lookup throws.
 */
async function hasNextPageAvailable(page) {
  let nextLink;
  try {
    nextLink = await page.$(".next-page");
  } catch {
    // A failed lookup is treated the same as "no further page".
    return false;
  }
  return nextLink !== null;
}
/**
 * Click the "next page" control of the current results page, if present.
 *
 * Best effort: a missing control is a no-op, and any error raised while
 * locating or clicking it is logged as a warning instead of being thrown.
 *
 * @param {object} page - Page handle exposing `$(selector)`.
 * @param {number} pageNumber - Target page number; only used in the warning text.
 * @returns {Promise<void>}
 */
async function navigateToNextPage(page, pageNumber) {
  try {
    const nextLink = await page.$(".next-page");
    if (!nextLink) {
      return;
    }
    await nextLink.click();
  } catch (error) {
    logger.warning(
      `Failed to navigate to page ${pageNumber}: ${error.message}`
    );
  }
}
// Public API of this module: the strategy entry point plus the URL builder and
// the two extraction helpers. parseDutiesAndRequirements, hasNextPageAvailable
// and navigateToNextPage are not exported.
module.exports = {
skipthedriveStrategy,
buildSearchUrl,
extractJobsFromPage,
extractJobData,
};