/**
 * SkipTheDrive Job Parser
 *
 * Parses remote job listings from SkipTheDrive.com.
 * Supports keyword search, job type filters, and pagination.
 */
|
|
|
|
const { chromium } = require("playwright");
|
|
const path = require("path");
|
|
|
|
// Import from ai-analyzer core package
|
|
const {
|
|
logger,
|
|
cleanText,
|
|
containsAnyKeyword,
|
|
parseLocationFilters,
|
|
validateLocationAgainstFilters,
|
|
extractLocationFromProfile,
|
|
analyzeBatch,
|
|
checkOllamaStatus,
|
|
} = require("../../ai-analyzer");
|
|
|
|
/**
 * Build a SkipTheDrive search URL for one keyword.
 *
 * @param {string} keyword - Search keyword (URL-encoded into the `s` query param)
 * @param {string} orderBy - Sort order (e.g. "date", "relevance"); falsy values omit the param
 * @param {Array<string>} jobTypes - Job types to filter; each becomes its own `jobtype` param
 * @returns {string} - Formatted search URL
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  // Assemble query segments first, then join — same output as incremental
  // concatenation, but each segment is visible at a glance.
  const segments = [
    `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`,
  ];

  if (orderBy) {
    segments.push(`orderby=${orderBy}`);
  }

  // One repeated `jobtype` parameter per requested job type
  for (const jobType of jobTypes) {
    segments.push(`jobtype=${encodeURIComponent(jobType)}`);
  }

  return segments.join("&");
}
|
|
|
|
/**
 * Extract job data from a single job listing element.
 *
 * Playwright's textContent()/getAttribute() resolve to null (not "") when a
 * node is empty or an attribute is missing; previously a null from any of
 * these reads made the subsequent string method throw inside the try block,
 * silently dropping the whole listing. Every read is now coalesced to "".
 *
 * @param {import('playwright').ElementHandle} article - Job listing DOM element handle
 * @returns {Promise<Object|null>} - Extracted job data, or null if extraction failed
 */
async function extractJobData(article) {
  try {
    // Extract job title and URL
    const titleElement = await article.$("h2.post-title a");
    const title = titleElement ? (await titleElement.textContent()) ?? "" : "";
    const jobUrl = titleElement
      ? (await titleElement.getAttribute("href")) ?? ""
      : "";

    // Extract date: machine-readable datetime attribute plus display text
    const dateElement = await article.$("time.post-date");
    const datePosted = dateElement
      ? (await dateElement.getAttribute("datetime")) ?? ""
      : "";
    const dateText = dateElement ? (await dateElement.textContent()) ?? "" : "";

    // Extract company name
    const companyElement = await article.$(
      ".custom_fields_company_name_display_search_results"
    );
    let company = companyElement
      ? (await companyElement.textContent()) ?? ""
      : "";
    // The element's text starts with an icon token; strip the first
    // whitespace-delimited chunk. NOTE(review): if a listing ever renders
    // without the icon, this also drops the first word of the name — confirm.
    company = company.replace(/^\s*[^\s]+\s*/, "").trim();

    // Extract "days ago" text (same leading-icon layout as the company field)
    const daysAgoElement = await article.$(
      ".custom_fields_job_date_display_search_results"
    );
    let daysAgo = daysAgoElement
      ? (await daysAgoElement.textContent()) ?? ""
      : "";
    daysAgo = daysAgo.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon

    // Extract job description excerpt
    const excerptElement = await article.$(".excerpt_part");
    const description = excerptElement
      ? (await excerptElement.textContent()) ?? ""
      : "";

    // Featured/sponsored listings carry a dedicated marker element
    const featuredElement = await article.$(".custom_fields_sponsored_job");
    const isFeatured = !!featuredElement;

    // Job ID comes from the article's DOM id, e.g. "post-12345" -> "12345"
    const articleId = await article.getAttribute("id");
    const jobId = articleId ? articleId.replace("post-", "") : "";

    return {
      jobId,
      title: cleanText(title),
      company: cleanText(company),
      jobUrl,
      datePosted,
      dateText: cleanText(dateText),
      daysAgo: cleanText(daysAgo),
      description: cleanText(description),
      isFeatured,
      source: "skipthedrive",
      timestamp: new Date().toISOString(),
    };
  } catch (error) {
    // Log and skip this listing rather than aborting the whole page scrape
    logger.error(`Error extracting job data: ${error.message}`);
    return null;
  }
}
|
|
|
|
/**
 * Parse SkipTheDrive job listings.
 *
 * Launches a headless Chromium instance, runs one search per keyword,
 * paginates through result pages, filters listings by keyword (and
 * optionally location), and optionally runs Ollama-backed AI analysis
 * on the accepted results.
 *
 * @param {Object} options - Parser options; each field falls back to an
 *   environment variable, then to a hard-coded default:
 *   - keywords {Array<string>}: search terms (SEARCH_KEYWORDS, comma-separated)
 *   - jobTypes {Array<string>}: job type filters (JOB_TYPES, comma-separated)
 *   - locationFilter {string}: location filter spec (LOCATION_FILTER)
 *   - maxPages {number}: pagination cap per keyword (MAX_PAGES, default 5)
 *   - headless {boolean}: headless browser unless HEADLESS === "false"
 *   - enableAI {boolean}: AI analysis only when ENABLE_AI_ANALYSIS === "true"
 *   - aiContext {string}: prompt context passed to analyzeBatch (AI_CONTEXT)
 * @returns {Promise<Object>} - { results, rejectedResults, metadata }
 * @throws Rethrows any fatal error after logging; the browser is always
 *   closed via the outer finally.
 */
async function parseSkipTheDrive(options = {}) {
  const {
    keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [
      "software engineer",
      "developer",
    ],
    jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [],
    locationFilter = process.env.LOCATION_FILTER || "",
    // NOTE(review): parseInt without an explicit radix 10 — fine for plain
    // decimal env values, but worth pinning down.
    maxPages = parseInt(process.env.MAX_PAGES) || 5,
    headless = process.env.HEADLESS !== "false",
    enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
    aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
  } = options;

  // Announce the effective configuration up front for easier log triage
  logger.step("Starting SkipTheDrive parser...");
  logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
  logger.info(
    `📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
  );
  logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
  logger.info(`📄 Max Pages: ${maxPages}`);

  // Sandbox flags allow running inside containers/CI
  const browser = await chromium.launch({
    headless,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
    ],
  });

  // Fixed desktop Chrome UA to look like a regular browser session
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  });

  const results = [];
  const rejectedResults = [];
  // Dedupe across keywords: the same posting can match several searches
  const seenJobs = new Set();

  try {
    // Search for each keyword
    for (const keyword of keywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);

      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
      // Fresh page per keyword; closed in the inner finally below
      const page = await context.newPage();

      try {
        logger.info(
          `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}`
        );
        await page.goto(searchUrl, {
          waitUntil: "domcontentloaded",
          timeout: 30000,
        });
        logger.info(
          `Navigation completed successfully at ${new Date().toISOString()}`
        );

        // Wait for job listings to load. The timeout is swallowed on
        // purpose: a missing results container just means zero listings,
        // and the page loop below handles an empty article set.
        logger.info("Waiting for selector #loops-wrapper");
        await page
          .waitForSelector("#loops-wrapper", { timeout: 5000 })
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
          });
        logger.info("Selector wait completed");

        let currentPage = 1;
        let hasNextPage = true;

        while (hasNextPage && currentPage <= maxPages) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);

          // Extract all job articles on current page (ids look like "post-12345")
          const jobArticles = await page.$$("article[id^='post-']");
          logger.info(
            `Found ${jobArticles.length} job listings on page ${currentPage}`
          );

          for (const article of jobArticles) {
            const jobData = await extractJobData(article);

            // Skip failed extractions and postings already seen under
            // another keyword
            if (!jobData || seenJobs.has(jobData.jobId)) {
              continue;
            }

            seenJobs.add(jobData.jobId);

            // Add keyword that found this job
            jobData.searchKeyword = keyword;

            // Validate job against keywords: the site's search can return
            // loosely-related posts, so require at least one configured
            // keyword in title + description + company
            const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
            if (!containsAnyKeyword(fullText, keywords)) {
              rejectedResults.push({
                ...jobData,
                rejected: true,
                reason: "Keywords not found in job listing",
              });
              continue;
            }

            // Location validation (if enabled)
            if (locationFilter) {
              const locationFilters = parseLocationFilters(locationFilter);
              // For SkipTheDrive, most jobs are remote, but we can check the title/description
              const locationValid =
                fullText.toLowerCase().includes("remote") ||
                locationFilters.some((filter) =>
                  fullText.toLowerCase().includes(filter.toLowerCase())
                );

              if (!locationValid) {
                rejectedResults.push({
                  ...jobData,
                  rejected: true,
                  reason: "Location requirements not met",
                });
                continue;
              }

              // Always true when reached; recorded so consumers can tell a
              // passed check apart from "filter disabled" (field absent)
              jobData.locationValid = locationValid;
            }

            logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`);
            results.push(jobData);
          }

          // Check for next page; stop early once maxPages is reached
          const nextPageLink = await page.$("a.nextp");
          if (nextPageLink && currentPage < maxPages) {
            logger.info("📄 Moving to next page...");
            await nextPageLink.click();
            await page.waitForLoadState("domcontentloaded");
            await page.waitForTimeout(2000); // Wait for content to load
            currentPage++;
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        // Per-keyword errors are logged and swallowed so the remaining
        // keywords still get processed
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      } finally {
        await page.close();
      }
    }

    logger.success(`\n✅ Parsing complete!`);
    logger.info(`📊 Total jobs found: ${results.length}`);
    logger.info(`❌ Rejected jobs: ${rejectedResults.length}`);

    // Run AI analysis if enabled (skipped entirely when nothing was accepted)
    let aiAnalysis = null;
    if (enableAI && results.length > 0) {
      logger.step("Running AI analysis on job listings...");

      // Probe Ollama first so an offline model degrades to a warning
      const aiAvailable = await checkOllamaStatus();
      if (aiAvailable) {
        const analysisData = results.map((job) => ({
          text: `${job.title} at ${job.company}. ${job.description}`,
          metadata: {
            jobId: job.jobId,
            company: job.company,
            daysAgo: job.daysAgo,
          },
        }));

        aiAnalysis = await analyzeBatch(analysisData, aiContext);

        // Merge AI analysis with results — assumes analyzeBatch returns
        // entries index-aligned with its input. TODO confirm.
        results.forEach((job, index) => {
          if (aiAnalysis && aiAnalysis[index]) {
            job.aiAnalysis = {
              isRelevant: aiAnalysis[index].isRelevant,
              confidence: aiAnalysis[index].confidence,
              reasoning: aiAnalysis[index].reasoning,
            };
          }
        });

        logger.success("✅ AI analysis completed");
      } else {
        logger.warning("⚠️ AI not available - skipping analysis");
      }
    }

    return {
      results,
      rejectedResults,
      metadata: {
        source: "skipthedrive",
        totalJobs: results.length,
        rejectedJobs: rejectedResults.length,
        keywords: keywords,
        jobTypes: jobTypes,
        locationFilter: locationFilter,
        aiAnalysisEnabled: enableAI,
        aiAnalysisCompleted: !!aiAnalysis,
        timestamp: new Date().toISOString(),
      },
    };
  } catch (error) {
    logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
    throw error;
  } finally {
    // Always release the browser, on success and failure alike
    await browser.close();
  }
}
|
|
|
|
// Public API: the main parser entry point, plus the URL-builder and
// per-listing extractor exported for reuse and unit testing.
module.exports = {
  parseSkipTheDrive,
  buildSearchUrl,
  extractJobData,
};
|