/** * SkipTheDrive Job Parser * * Parses remote job listings from SkipTheDrive.com * Supports keyword search, job type filters, and pagination */ const { chromium } = require("playwright"); const path = require("path"); // Import from ai-analyzer core package const { logger, cleanText, containsAnyKeyword, parseLocationFilters, validateLocationAgainstFilters, extractLocationFromProfile, analyzeBatch, checkOllamaStatus, } = require("../../ai-analyzer"); /** * Build search URL for SkipTheDrive * @param {string} keyword - Search keyword * @param {string} orderBy - Sort order (date, relevance) * @param {Array} jobTypes - Job types to filter (part time, full time, contract) * @returns {string} - Formatted search URL */ function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) { let url = `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`; if (orderBy) { url += `&orderby=${orderBy}`; } // Add job type filters jobTypes.forEach((type) => { url += `&jobtype=${encodeURIComponent(type)}`; }); return url; } /** * Extract job data from a single job listing element * @param {Element} article - Job listing DOM element * @returns {Object} - Extracted job data */ async function extractJobData(article) { try { // Extract job title and URL const titleElement = await article.$("h2.post-title a"); const title = titleElement ? await titleElement.textContent() : ""; const jobUrl = titleElement ? await titleElement.getAttribute("href") : ""; // Extract date const dateElement = await article.$("time.post-date"); const datePosted = dateElement ? await dateElement.getAttribute("datetime") : ""; const dateText = dateElement ? await dateElement.textContent() : ""; // Extract company name const companyElement = await article.$( ".custom_fields_company_name_display_search_results" ); let company = companyElement ? await companyElement.textContent() : ""; company = company.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon // Extract days ago const daysAgoElement = await article.$( ".custom_fields_job_date_display_search_results" ); let daysAgo = daysAgoElement ? await daysAgoElement.textContent() : ""; daysAgo = daysAgo.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon // Extract job description excerpt const excerptElement = await article.$(".excerpt_part"); const description = excerptElement ? await excerptElement.textContent() : ""; // Check if featured/sponsored const featuredElement = await article.$(".custom_fields_sponsored_job"); const isFeatured = !!featuredElement; // Extract job ID from article ID const articleId = await article.getAttribute("id"); const jobId = articleId ? articleId.replace("post-", "") : ""; return { jobId, title: cleanText(title), company: cleanText(company), jobUrl, datePosted, dateText: cleanText(dateText), daysAgo: cleanText(daysAgo), description: cleanText(description), isFeatured, source: "skipthedrive", timestamp: new Date().toISOString(), }; } catch (error) { logger.error(`Error extracting job data: ${error.message}`); return null; } } /** * Parse SkipTheDrive job listings * @param {Object} options - Parser options * @returns {Promise} - Array of parsed job listings */ async function parseSkipTheDrive(options = {}) { const { keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [ "software engineer", "developer", ], jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [], locationFilter = process.env.LOCATION_FILTER || "", maxPages = parseInt(process.env.MAX_PAGES) || 5, headless = process.env.HEADLESS !== "false", enableAI = process.env.ENABLE_AI_ANALYSIS === "true", aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis", } = options; logger.step("Starting SkipTheDrive parser..."); logger.info(`šŸ” Keywords: ${keywords.join(", ")}`); logger.info( `šŸ“‹ Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}` ); logger.info(`šŸ“ Location Filter: ${locationFilter || "None"}`); logger.info(`šŸ“„ Max Pages: ${maxPages}`); const browser = await chromium.launch({ headless, args: [ "--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage", ], }); const context = await browser.newContext({ userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", }); const results = []; const rejectedResults = []; const seenJobs = new Set(); try { // Search for each keyword for (const keyword of keywords) { logger.info(`\nšŸ” Searching for: ${keyword}`); const searchUrl = buildSearchUrl(keyword, "date", jobTypes); const page = await context.newPage(); try { logger.info( `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}` ); await page.goto(searchUrl, { waitUntil: "domcontentloaded", timeout: 30000, }); logger.info( `Navigation completed successfully at ${new Date().toISOString()}` ); // Wait for job listings to load logger.info("Waiting for selector #loops-wrapper"); await page .waitForSelector("#loops-wrapper", { timeout: 5000 }) .catch(() => { logger.warning(`No results found for keyword: ${keyword}`); }); logger.info("Selector wait completed"); let currentPage = 1; let hasNextPage = true; while (hasNextPage && currentPage <= maxPages) { logger.info(`šŸ“„ Processing page ${currentPage} for "${keyword}"`); // Extract all job articles on current page const jobArticles = await page.$$("article[id^='post-']"); logger.info( `Found ${jobArticles.length} job listings on page ${currentPage}` ); for (const article of jobArticles) { const jobData = await extractJobData(article); if (!jobData || seenJobs.has(jobData.jobId)) { continue; } seenJobs.add(jobData.jobId); // Add keyword that found this job jobData.searchKeyword = keyword; // Validate job against keywords const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`; if (!containsAnyKeyword(fullText, keywords)) { rejectedResults.push({ ...jobData, rejected: true, reason: "Keywords not found in job listing", }); continue; } // Location validation (if enabled) if (locationFilter) { const locationFilters = parseLocationFilters(locationFilter); // For SkipTheDrive, most jobs are remote, but we can check the title/description const locationValid = fullText.toLowerCase().includes("remote") || locationFilters.some((filter) => fullText.toLowerCase().includes(filter.toLowerCase()) ); if (!locationValid) { rejectedResults.push({ ...jobData, rejected: true, reason: "Location requirements not met", }); continue; } jobData.locationValid = locationValid; } logger.success(`āœ… Found: ${jobData.title} at ${jobData.company}`); results.push(jobData); } // Check for next page const nextPageLink = await page.$("a.nextp"); if (nextPageLink && currentPage < maxPages) { logger.info("šŸ“„ Moving to next page..."); await nextPageLink.click(); await page.waitForLoadState("domcontentloaded"); await page.waitForTimeout(2000); // Wait for content to load currentPage++; } else { hasNextPage = false; } } } catch (error) { logger.error(`Error processing keyword "${keyword}": ${error.message}`); } finally { await page.close(); } } logger.success(`\nāœ… Parsing complete!`); logger.info(`šŸ“Š Total jobs found: ${results.length}`); logger.info(`āŒ Rejected jobs: ${rejectedResults.length}`); // Run AI analysis if enabled let aiAnalysis = null; if (enableAI && results.length > 0) { logger.step("Running AI analysis on job listings..."); const aiAvailable = await checkOllamaStatus(); if (aiAvailable) { const analysisData = results.map((job) => ({ text: `${job.title} at ${job.company}. ${job.description}`, metadata: { jobId: job.jobId, company: job.company, daysAgo: job.daysAgo, }, })); aiAnalysis = await analyzeBatch(analysisData, aiContext); // Merge AI analysis with results results.forEach((job, index) => { if (aiAnalysis && aiAnalysis[index]) { job.aiAnalysis = { isRelevant: aiAnalysis[index].isRelevant, confidence: aiAnalysis[index].confidence, reasoning: aiAnalysis[index].reasoning, }; } }); logger.success("āœ… AI analysis completed"); } else { logger.warning("āš ļø AI not available - skipping analysis"); } } return { results, rejectedResults, metadata: { source: "skipthedrive", totalJobs: results.length, rejectedJobs: rejectedResults.length, keywords: keywords, jobTypes: jobTypes, locationFilter: locationFilter, aiAnalysisEnabled: enableAI, aiAnalysisCompleted: !!aiAnalysis, timestamp: new Date().toISOString(), }, }; } catch (error) { logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`); throw error; } finally { await browser.close(); } } // Export the parser module.exports = { parseSkipTheDrive, buildSearchUrl, extractJobData, };