tanyar09 47cdc03fb8 Enhance job search parser with advanced keyword filtering and job detail extraction
- Implemented grouped AND/OR logic for keyword searches, allowing for more flexible job matching criteria.
- Added a minimum date filter to restrict job results to postings after a specified date.
- Enhanced job detail extraction to include role duties and job requirements from job descriptions.
- Updated README with new command line options and examples for using date filters and keyword logic.
- Improved logging to provide clearer insights into keyword matching logic and job search parameters.
2025-12-18 13:33:19 -05:00

346 lines
11 KiB
JavaScript

/**
* SkipTheDrive Job Parser
*
* Parses remote job listings from SkipTheDrive.com
* Supports keyword search, job type filters, and pagination
*/
const { chromium } = require("playwright");
const path = require("path");
// Import from ai-analyzer core package
const {
logger,
cleanText,
containsAnyKeyword,
containsAllKeywords,
parseLocationFilters,
validateLocationAgainstFilters,
extractLocationFromProfile,
analyzeBatch,
checkOllamaStatus,
} = require("../../ai-analyzer");
/**
 * Build search URL for SkipTheDrive
 *
 * @param {string} keyword - Search keyword (URL-encoded into the `s` query param)
 * @param {string} [orderBy="date"] - Sort order (date, relevance); falsy values omit the param
 * @param {Array<string>} [jobTypes=[]] - Job types to filter (part time, full time, contract);
 *   each entry is appended as a repeated `jobtype` query param
 * @returns {string} - Formatted search URL
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  let url = `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`;
  if (orderBy) {
    // Encode like the other params: a no-op for the expected "date"/"relevance"
    // values, but keeps the URL well-formed for any other caller-supplied value.
    url += `&orderby=${encodeURIComponent(orderBy)}`;
  }
  // Add job type filters (one repeated `jobtype` param per type)
  jobTypes.forEach((type) => {
    url += `&jobtype=${encodeURIComponent(type)}`;
  });
  return url;
}
/**
 * Extract job data from a single job listing element.
 *
 * Reads the title, URL, posting date, company, relative age, excerpt and
 * sponsored flag from one search-result article node.
 *
 * @param {Element} article - Job listing DOM element (element handle)
 * @returns {Object|null} - Extracted job data, or null if extraction failed
 */
async function extractJobData(article) {
  // Tiny readers so each field below is a one-liner.
  const textOf = async (selector) => {
    const el = await article.$(selector);
    return el ? await el.textContent() : "";
  };
  const attrOf = async (selector, attr) => {
    const el = await article.$(selector);
    return el ? await el.getAttribute(attr) : "";
  };
  // Drops the leading icon glyph (first whitespace-delimited token) that the
  // site prepends to the company/date fields.
  const stripIcon = (text) => text.replace(/^\s*[^\s]+\s*/, "").trim();

  try {
    const title = await textOf("h2.post-title a");
    const jobUrl = await attrOf("h2.post-title a", "href");
    const datePosted = await attrOf("time.post-date", "datetime");
    const dateText = await textOf("time.post-date");
    const company = stripIcon(
      await textOf(".custom_fields_company_name_display_search_results")
    );
    const daysAgo = stripIcon(
      await textOf(".custom_fields_job_date_display_search_results")
    );
    const description = await textOf(".excerpt_part");
    // Sponsored/featured listings carry a dedicated marker element.
    const isFeatured = !!(await article.$(".custom_fields_sponsored_job"));
    // The numeric job ID lives in the article's DOM id ("post-12345").
    const articleId = await article.getAttribute("id");
    const jobId = articleId ? articleId.replace("post-", "") : "";

    return {
      jobId,
      title: cleanText(title),
      company: cleanText(company),
      jobUrl,
      datePosted,
      dateText: cleanText(dateText),
      daysAgo: cleanText(daysAgo),
      description: cleanText(description),
      isFeatured,
      source: "skipthedrive",
      timestamp: new Date().toISOString(),
    };
  } catch (error) {
    logger.error(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Parse SkipTheDrive job listings.
 *
 * Launches Chromium, runs one search per keyword (or a single combined
 * search when `useAndLogic` is set), paginates through results up to
 * `maxPages`, filters each listing by keyword and optional location, and
 * optionally runs an AI relevance pass (via Ollama) over accepted jobs.
 *
 * @param {Object} options - Parser options; each falls back to an env var
 * @param {Array<string>} [options.keywords] - Search terms (SEARCH_KEYWORDS, comma-separated)
 * @param {Array<string>} [options.jobTypes] - Job type filters (JOB_TYPES, comma-separated)
 * @param {string} [options.locationFilter] - Allowed locations (LOCATION_FILTER)
 * @param {number} [options.maxPages] - Page cap per keyword (MAX_PAGES, default 5)
 * @param {boolean} [options.headless] - Headless browser (any HEADLESS value except "false")
 * @param {boolean} [options.enableAI] - Run AI analysis (ENABLE_AI_ANALYSIS === "true")
 * @param {string} [options.aiContext] - Prompt context for the AI pass (AI_CONTEXT)
 * @param {boolean} [options.useAndLogic] - Require ALL keywords to match instead of ANY
 * @returns {Promise<Object>} - { results, rejectedResults, metadata }
 * @throws Re-throws any fatal error after logging; the browser is always closed
 */
async function parseSkipTheDrive(options = {}) {
  const {
    keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [
      "software engineer",
      "developer",
    ],
    jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [],
    locationFilter = process.env.LOCATION_FILTER || "",
    maxPages = parseInt(process.env.MAX_PAGES) || 5,
    headless = process.env.HEADLESS !== "false",
    enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
    aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
    useAndLogic = false, // Use AND logic instead of OR logic for keywords
  } = options;
  logger.step("Starting SkipTheDrive parser...");
  logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
  logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
  logger.info(
    `📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
  );
  logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
  logger.info(`📄 Max Pages: ${maxPages}`);
  // Sandbox/shm flags let the browser run inside containers and CI runners.
  const browser = await chromium.launch({
    headless,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
    ],
  });
  // Fixed desktop user agent so the site serves its standard markup.
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  });
  const results = []; // jobs that passed every filter
  const rejectedResults = []; // jobs dropped by keyword/location checks (with a reason)
  const seenJobs = new Set(); // jobIds already handled; dedupes across keywords and pages
  try {
    // For AND logic, combine all keywords into a single search query
    // For OR logic, search each keyword separately
    const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords;
    // Search for each keyword (or combined keyword for AND logic)
    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);
      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
      // Fresh page per keyword; closed in the finally below.
      const page = await context.newPage();
      try {
        logger.info(
          `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}`
        );
        await page.goto(searchUrl, {
          waitUntil: "domcontentloaded",
          timeout: 30000,
        });
        logger.info(
          `Navigation completed successfully at ${new Date().toISOString()}`
        );
        // Wait for job listings to load. A timeout here just means zero hits
        // for this keyword: log a warning and fall through — the page loop
        // below will simply find no articles.
        logger.info("Waiting for selector #loops-wrapper");
        await page
          .waitForSelector("#loops-wrapper", { timeout: 5000 })
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
          });
        logger.info("Selector wait completed");
        let currentPage = 1;
        let hasNextPage = true;
        // Paginate until there is no "next" link or maxPages is reached.
        while (hasNextPage && currentPage <= maxPages) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
          // Extract all job articles on current page
          const jobArticles = await page.$$("article[id^='post-']");
          logger.info(
            `Found ${jobArticles.length} job listings on page ${currentPage}`
          );
          for (const article of jobArticles) {
            const jobData = await extractJobData(article);
            // Skip extraction failures and duplicates.
            if (!jobData || seenJobs.has(jobData.jobId)) {
              continue;
            }
            seenJobs.add(jobData.jobId);
            // Add keyword that found this job
            jobData.searchKeyword = keyword;
            // Validate job against keywords: the site's search can return
            // loosely-related listings, so re-check the original keyword list
            // against title + description + company.
            const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
            const keywordMatch = useAndLogic
              ? containsAllKeywords(fullText, keywords)
              : containsAnyKeyword(fullText, keywords);
            if (!keywordMatch) {
              rejectedResults.push({
                ...jobData,
                rejected: true,
                reason: useAndLogic
                  ? "Not all keywords found in job listing"
                  : "Keywords not found in job listing",
              });
              continue;
            }
            // Location validation (if enabled)
            if (locationFilter) {
              const locationFilters = parseLocationFilters(locationFilter);
              // For SkipTheDrive, most jobs are remote, but we can check the title/description.
              // Accept if the text mentions "remote" OR matches any configured filter.
              const locationValid =
                fullText.toLowerCase().includes("remote") ||
                locationFilters.some((filter) =>
                  fullText.toLowerCase().includes(filter.toLowerCase())
                );
              if (!locationValid) {
                rejectedResults.push({
                  ...jobData,
                  rejected: true,
                  reason: "Location requirements not met",
                });
                continue;
              }
              jobData.locationValid = locationValid;
            }
            logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`);
            results.push(jobData);
          }
          // Check for next page
          const nextPageLink = await page.$("a.nextp");
          if (nextPageLink && currentPage < maxPages) {
            logger.info("📄 Moving to next page...");
            // NOTE(review): relies on domcontentloaded plus a fixed 2s grace
            // period for the next page's listings to render — confirm this is
            // sufficient against the live site.
            await nextPageLink.click();
            await page.waitForLoadState("domcontentloaded");
            await page.waitForTimeout(2000); // Wait for content to load
            currentPage++;
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        // One keyword failing must not abort the others.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      } finally {
        await page.close();
      }
    }
    logger.success(`\n✅ Parsing complete!`);
    logger.info(`📊 Total jobs found: ${results.length}`);
    logger.info(`❌ Rejected jobs: ${rejectedResults.length}`);
    // Run AI analysis if enabled (best-effort: skipped when Ollama is down)
    let aiAnalysis = null;
    if (enableAI && results.length > 0) {
      logger.step("Running AI analysis on job listings...");
      const aiAvailable = await checkOllamaStatus();
      if (aiAvailable) {
        const analysisData = results.map((job) => ({
          text: `${job.title} at ${job.company}. ${job.description}`,
          metadata: {
            jobId: job.jobId,
            company: job.company,
            daysAgo: job.daysAgo,
          },
        }));
        aiAnalysis = await analyzeBatch(analysisData, aiContext);
        // Merge AI analysis with results. NOTE(review): assumes analyzeBatch
        // returns results in the same order as its input — verify in
        // ai-analyzer.
        results.forEach((job, index) => {
          if (aiAnalysis && aiAnalysis[index]) {
            job.aiAnalysis = {
              isRelevant: aiAnalysis[index].isRelevant,
              confidence: aiAnalysis[index].confidence,
              reasoning: aiAnalysis[index].reasoning,
            };
          }
        });
        logger.success("✅ AI analysis completed");
      } else {
        logger.warning("⚠️ AI not available - skipping analysis");
      }
    }
    return {
      results,
      rejectedResults,
      metadata: {
        source: "skipthedrive",
        totalJobs: results.length,
        rejectedJobs: rejectedResults.length,
        keywords: keywords,
        jobTypes: jobTypes,
        locationFilter: locationFilter,
        aiAnalysisEnabled: enableAI,
        aiAnalysisCompleted: !!aiAnalysis,
        timestamp: new Date().toISOString(),
      },
    };
  } catch (error) {
    logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
    throw error;
  } finally {
    // Browser is closed on success and failure alike.
    await browser.close();
  }
}
// Public API: the main parser entry point, plus the URL builder and
// per-article extractor exported separately so they can be tested in isolation.
module.exports = {
  parseSkipTheDrive,
  buildSearchUrl,
  extractJobData,
};