/** * SkipTheDrive Parsing Strategy * * Uses core-parser for browser management and ai-analyzer for utilities */ const { logger, cleanText, containsAnyKeyword, containsAllKeywords, matchesKeywordGroups, validateLocationAgainstFilters, } = require("ai-analyzer"); /** * SkipTheDrive URL builder */ function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) { const baseUrl = "https://www.skipthedrive.com/"; const params = new URLSearchParams({ s: keyword, orderby: orderBy, }); if (jobTypes && jobTypes.length > 0) { params.append("job_type", jobTypes.join(",")); } return `${baseUrl}?${params.toString()}`; } /** * SkipTheDrive parsing strategy function */ async function skipthedriveStrategy(coreParser, options = {}) { const { keywords = ["software engineer", "developer", "programmer"], keywordGroups = null, // Array of keyword groups for grouped AND/OR logic locationFilter = null, maxPages = 5, jobTypes = [], useAndLogic = false, // Use AND logic instead of OR logic for keywords } = options; const results = []; const rejectedResults = []; const seenJobs = new Set(); try { // Create main page const page = await coreParser.createPage("skipthedrive-main"); logger.info("šŸš€ Starting SkipTheDrive parser..."); logger.info(`šŸ” Keywords: ${keywords.join(", ")}`); if (keywordGroups) { logger.info(`šŸ”— Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`); } else { logger.info(`šŸ”— Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`); } logger.info(`šŸ“ Location Filter: ${locationFilter || "None"}`); logger.info(`šŸ“„ Max Pages: ${maxPages}`); // Determine search keywords based on logic type let searchKeywords; if (keywordGroups) { // For grouped AND/OR logic, search each keyword in each group (OR within groups) searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups } else if (useAndLogic) { // For simple AND logic, combine all keywords into a single search query searchKeywords = [keywords.join(" ")]; } else { // For OR logic, search each keyword separately searchKeywords = keywords; } // Search for each keyword (or combined keyword for AND logic) for (const keyword of searchKeywords) { logger.info(`\nšŸ” Searching for: ${keyword}`); const searchUrl = buildSearchUrl(keyword, "date", jobTypes); try { // Navigate to search results await coreParser.navigateTo(searchUrl, { pageId: "skipthedrive-main", retries: 2, timeout: 30000, }); // Wait for job listings to load const hasResults = await page .waitForSelector("#loops-wrapper", { timeout: 5000, }) .then(() => true) .catch(() => { logger.warning(`No results found for keyword: ${keyword}`); return false; }); if (!hasResults) { continue; } // Process multiple pages let currentPage = 1; let hasNextPage = true; while (hasNextPage && currentPage <= maxPages) { logger.info(`šŸ“„ Processing page ${currentPage} for "${keyword}"`); // Extract jobs from current page const pageJobs = await extractJobsFromPage( page, keyword, locationFilter, keywords, keywordGroups, useAndLogic ); for (const job of pageJobs) { // Skip duplicates if (seenJobs.has(job.jobId)) continue; seenJobs.add(job.jobId); // Validate keywords based on logic type if (keywordGroups) { // Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR) const fullText = `${job.title} ${job.description} ${job.company}`; if (!matchesKeywordGroups(fullText, keywordGroups)) { rejectedResults.push({ ...job, rejectionReason: "Job does not match all keyword groups", }); continue; } } else if (useAndLogic) { // Simple AND logic: all keywords must match const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase(); if (!containsAllKeywords(fullText, keywords)) { rejectedResults.push({ ...job, rejectionReason: "Not all keywords found in job listing", }); continue; } } // Validate location if filtering enabled if (locationFilter) { const locationValid = validateLocationAgainstFilters( job.location, locationFilter ); if (!locationValid) { rejectedResults.push({ ...job, rejectionReason: "Location filter mismatch", }); continue; } } results.push(job); } // Check for next page hasNextPage = await hasNextPageAvailable(page); if (hasNextPage && currentPage < maxPages) { await navigateToNextPage(page, currentPage + 1); currentPage++; // Wait for new page to load await page.waitForTimeout(2000); } else { hasNextPage = false; } } } catch (error) { logger.error(`Error processing keyword "${keyword}": ${error.message}`); } } logger.info( `šŸŽÆ SkipTheDrive parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected` ); return { results, rejectedResults, summary: { totalJobs: results.length, totalRejected: rejectedResults.length, keywords: keywords.join(", "), locationFilter, source: "skipthedrive", }, }; } catch (error) { logger.error(`āŒ SkipTheDrive parsing failed: ${error.message}`); throw error; } } /** * Extract jobs from current page */ async function extractJobsFromPage(page, keyword, locationFilter, allKeywords = [], keywordGroups = null, useAndLogic = false) { const jobs = []; try { // Get all job article elements const jobElements = await page.$$("article.job_listing"); for (const jobElement of jobElements) { try { const job = await extractJobData(jobElement, keyword); if (job) { jobs.push(job); } } catch (error) { logger.warning(`Failed to extract job data: ${error.message}`); } } } catch (error) { logger.error(`Failed to extract jobs from page: ${error.message}`); } return jobs; } /** * Parse job description to separate role duties from job requirements */ function parseDutiesAndRequirements(description) { if (!description || description.trim().length === 0) { return { duties: "", requirements: "" }; } // Common section headers that indicate duties/responsibilities const dutiesKeywords = [ /responsibilities?:/i, /duties?:/i, /what you['\u2019]ll do/i, /key responsibilities/i, /your role/i, /position overview/i, /about the role/i, /role overview/i, /what we need/i, /you will:/i, /you['\u2019]ll be responsible/i, ]; // Common section headers that indicate requirements/qualifications const requirementsKeywords = [ /requirements?:/i, /qualifications?:/i, /must have/i, /required:/i, /what you['\u2019]ll bring/i, /you have:/i, /skills required/i, /minimum requirements/i, /preferred qualifications/i, /education:/i, /experience:/i, /you must have/i, /we['\u2019]re looking for/i, ]; // Split description into sections (by common delimiters) const sections = description.split(/\n\s*\n|\r\n\s*\r\n/).filter(s => s.trim().length > 0); let currentSection = "duties"; // Default to duties let dutiesText = ""; let requirementsText = ""; for (const section of sections) { const sectionLower = section.toLowerCase(); // Check if this section is about requirements let isRequirementsSection = false; for (const keyword of requirementsKeywords) { if (keyword.test(section)) { isRequirementsSection = true; currentSection = "requirements"; break; } } // Check if this section is about duties/responsibilities if (!isRequirementsSection) { for (const keyword of dutiesKeywords) { if (keyword.test(section)) { currentSection = "duties"; break; } } } // Add to appropriate section if (currentSection === "requirements") { requirementsText += (requirementsText ? "\n\n" : "") + section.trim(); } else { dutiesText += (dutiesText ? "\n\n" : "") + section.trim(); } } // If we couldn't split by sections, try to find bullet points or numbered lists if (!dutiesText && !requirementsText) { const lines = description.split(/\n/); let foundRequirementsHeader = false; for (let i = 0; i < lines.length; i++) { const line = lines[i].trim(); if (line.length === 0) continue; // Check if this line is a requirements header for (const keyword of requirementsKeywords) { if (keyword.test(line)) { foundRequirementsHeader = true; break; } } if (foundRequirementsHeader) { requirementsText += (requirementsText ? "\n" : "") + line; } else { // Check if it's a duties header let isDutiesHeader = false; for (const keyword of dutiesKeywords) { if (keyword.test(line)) { isDutiesHeader = true; break; } } if (!isDutiesHeader) { // Add to duties if we haven't found requirements header yet if (!foundRequirementsHeader) { dutiesText += (dutiesText ? "\n" : "") + line; } else { requirementsText += (requirementsText ? "\n" : "") + line; } } else { dutiesText += (dutiesText ? "\n" : "") + line; } } } } // Fallback: if we still have nothing separated, put first 60% in duties, rest in requirements if (!dutiesText && !requirementsText && description) { const midPoint = Math.floor(description.length * 0.6); const lastRequirementsKeyword = description.toLowerCase().lastIndexOf("requirement"); const lastQualificationsKeyword = description.toLowerCase().lastIndexOf("qualification"); const splitPoint = Math.max( lastRequirementsKeyword > 0 ? lastRequirementsKeyword : midPoint, lastQualificationsKeyword > 0 ? lastQualificationsKeyword : midPoint ); dutiesText = description.substring(0, splitPoint).trim(); requirementsText = description.substring(splitPoint).trim(); } return { duties: dutiesText.trim(), requirements: requirementsText.trim(), }; } /** * Extract data from individual job element */ async function extractJobData(jobElement, keyword) { try { // Extract job ID const articleId = (await jobElement.getAttribute("id")) || ""; const jobId = articleId ? articleId.replace("post-", "") : ""; // Extract title const titleElement = await jobElement.$(".job_listing-title a"); const title = titleElement ? cleanText(await titleElement.textContent()) : ""; const jobUrl = titleElement ? await titleElement.getAttribute("href") : ""; // Extract company const companyElement = await jobElement.$(".company"); const company = companyElement ? cleanText(await companyElement.textContent()) : ""; // Extract location const locationElement = await jobElement.$(".location"); const location = locationElement ? cleanText(await locationElement.textContent()) : ""; // Extract date posted const dateElement = await jobElement.$(".job-date"); const dateText = dateElement ? cleanText(await dateElement.textContent()) : ""; // Extract description const descElement = await jobElement.$(".job_listing-description"); const description = descElement ? cleanText(await descElement.textContent()) : ""; // Check if featured const featuredElement = await jobElement.$(".featured"); const isFeatured = featuredElement !== null; // Parse date let datePosted = null; let daysAgo = null; if (dateText) { const match = dateText.match(/(\d+)\s+days?\s+ago/); if (match) { daysAgo = parseInt(match[1]); const date = new Date(); date.setDate(date.getDate() - daysAgo); datePosted = date.toISOString().split("T")[0]; } } // Parse duties and requirements from description if available const parsed = parseDutiesAndRequirements(description); return { jobId, title, company, location, jobUrl, datePosted, dateText, daysAgo, description, roleDuties: parsed.duties, jobRequirements: parsed.requirements, isFeatured, keyword, extractedAt: new Date().toISOString(), source: "skipthedrive", }; } catch (error) { logger.warning(`Error extracting job data: ${error.message}`); return null; } } /** * Check if next page is available */ async function hasNextPageAvailable(page) { try { const nextButton = await page.$(".next-page"); return nextButton !== null; } catch { return false; } } /** * Navigate to next page */ async function navigateToNextPage(page, pageNumber) { try { const nextButton = await page.$(".next-page"); if (nextButton) { await nextButton.click(); } } catch (error) { logger.warning( `Failed to navigate to page ${pageNumber}: ${error.message}` ); } } module.exports = { skipthedriveStrategy, buildSearchUrl, extractJobsFromPage, extractJobData, };