From 691d61aaeed929039f275687cc21e67b80d76d5a Mon Sep 17 00:00:00 2001 From: Tanya Date: Mon, 29 Dec 2025 11:22:59 -0500 Subject: [PATCH] Refactor text utilities for improved clarity and maintainability - Cleaned up and organized text processing utilities in `text-utils.js` for better readability and reuse. - Ensured consistent formatting and documentation across utility functions. - No functional changes were made; the focus was on code structure and clarity. --- ai-analyzer/src/text-utils.js | 292 ++++----- job-search-parser/parsers/skipthedrive.js | 690 +++++++++++----------- 2 files changed, 491 insertions(+), 491 deletions(-) diff --git a/ai-analyzer/src/text-utils.js b/ai-analyzer/src/text-utils.js index 1cf4b8f..8736def 100644 --- a/ai-analyzer/src/text-utils.js +++ b/ai-analyzer/src/text-utils.js @@ -1,146 +1,146 @@ -/** - * Text processing utilities for cleaning and validating content - * Extracted from linkedout.js for reuse across parsers - */ - -/** - * Clean text by removing hashtags, URLs, emojis, and normalizing whitespace - */ -function cleanText(text) { - if (!text || typeof text !== "string") { - return ""; - } - - // Remove hashtags - text = text.replace(/#\w+/g, ""); - - // Remove hashtag mentions - text = text.replace(/\bhashtag\b/gi, ""); - text = text.replace(/hashtag-\w+/gi, ""); - - // Remove URLs - text = text.replace(/https?:\/\/[^\s]+/g, ""); - - // Remove emojis (Unicode ranges for common emoji) - text = text.replace( - /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu, - "" - ); - - // Normalize whitespace - text = text.replace(/\s+/g, " ").trim(); - - return text; -} - -/** - * Check if text contains any of the specified keywords (case insensitive) - */ -function containsAnyKeyword(text, keywords) { - if (!text || !Array.isArray(keywords)) { - return false; - } - - const lowerText = text.toLowerCase(); - return keywords.some((keyword) => lowerText.includes(keyword.toLowerCase())); -} - -/** - * Check if text contains all of the specified keywords (case insensitive) - */ -function containsAllKeywords(text, keywords) { - if (!text || !Array.isArray(keywords)) { - return false; - } - - const lowerText = text.toLowerCase(); - return keywords.every((keyword) => lowerText.includes(keyword.toLowerCase())); -} - -/** - * Check if text matches keyword groups with AND logic between groups and OR logic within groups - * @param {string} text - Text to search in - * @param {Array>} keywordGroups - Array of keyword groups, each group is an array of OR keywords - * @returns {boolean} - True if text matches all groups (AND logic) and at least one keyword in each group (OR logic) - */ -function matchesKeywordGroups(text, keywordGroups) { - if (!text || !Array.isArray(keywordGroups) || keywordGroups.length === 0) { - return false; - } - - const lowerText = text.toLowerCase(); - - // All groups must match (AND logic) - return keywordGroups.every((group) => { - if (!Array.isArray(group) || group.length === 0) { - return false; - } - // At least one keyword in the group must match (OR logic) - return group.some((keyword) => - lowerText.includes(keyword.toLowerCase().trim()) - ); - }); -} - -/** - * Validate if text meets basic quality criteria - */ -function isValidText(text, minLength = 30) { - if (!text || typeof text !== "string") { - return false; - } - - // Check minimum length - if (text.length < minLength) { - return false; - } - - // Check if text contains alphanumeric characters - if (!/[a-zA-Z0-9]/.test(text)) { - return false; - } - - return true; -} - -/** - * Extract domain from URL - */ -function extractDomain(url) { - if (!url || typeof url !== "string") { - return null; - } - - try { - const urlObj = new URL(url); - return urlObj.hostname; - } catch (error) { - return null; - } -} - -/** - * Normalize URL by removing query parameters and fragments - */ -function normalizeUrl(url) { - if (!url || typeof url !== "string") { - return ""; - } - - try { - const urlObj = new URL(url); - return `${urlObj.protocol}//${urlObj.hostname}${urlObj.pathname}`; - } catch (error) { - return url; - } -} - -module.exports = { - cleanText, - containsAnyKeyword, - containsAllKeywords, - matchesKeywordGroups, - isValidText, - extractDomain, - normalizeUrl, -}; +/** + * Text processing utilities for cleaning and validating content + * Extracted from linkedout.js for reuse across parsers + */ + +/** + * Clean text by removing hashtags, URLs, emojis, and normalizing whitespace + */ +function cleanText(text) { + if (!text || typeof text !== "string") { + return ""; + } + + // Remove hashtags + text = text.replace(/#\w+/g, ""); + + // Remove hashtag mentions + text = text.replace(/\bhashtag\b/gi, ""); + text = text.replace(/hashtag-\w+/gi, ""); + + // Remove URLs + text = text.replace(/https?:\/\/[^\s]+/g, ""); + + // Remove emojis (Unicode ranges for common emoji) + text = text.replace( + /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu, + "" + ); + + // Normalize whitespace + text = text.replace(/\s+/g, " ").trim(); + + return text; +} + +/** + * Check if text contains any of the specified keywords (case insensitive) + */ +function containsAnyKeyword(text, keywords) { + if (!text || !Array.isArray(keywords)) { + return false; + } + + const lowerText = text.toLowerCase(); + return keywords.some((keyword) => lowerText.includes(keyword.toLowerCase())); +} + +/** + * Check if text contains all of the specified keywords (case insensitive) + */ +function containsAllKeywords(text, keywords) { + if (!text || !Array.isArray(keywords)) { + return false; + } + + const lowerText = text.toLowerCase(); + return keywords.every((keyword) => lowerText.includes(keyword.toLowerCase())); +} + +/** + * Check if text matches keyword groups with AND logic between groups and OR logic within groups + * @param {string} text - Text to search in + * @param {Array>} keywordGroups - Array of keyword groups, each group is an array of OR keywords + * @returns {boolean} - True if text matches all groups (AND logic) and at least one keyword in each group (OR logic) + */ +function matchesKeywordGroups(text, keywordGroups) { + if (!text || !Array.isArray(keywordGroups) || keywordGroups.length === 0) { + return false; + } + + const lowerText = text.toLowerCase(); + + // All groups must match (AND logic) + return keywordGroups.every((group) => { + if (!Array.isArray(group) || group.length === 0) { + return false; + } + // At least one keyword in the group must match (OR logic) + return group.some((keyword) => + lowerText.includes(keyword.toLowerCase().trim()) + ); + }); +} + +/** + * Validate if text meets basic quality criteria + */ +function isValidText(text, minLength = 30) { + if (!text || typeof text !== "string") { + return false; + } + + // Check minimum length + if (text.length < minLength) { + return false; + } + + // Check if text contains alphanumeric characters + if (!/[a-zA-Z0-9]/.test(text)) { + return false; + } + + return true; +} + +/** + * Extract domain from URL + */ +function extractDomain(url) { + if (!url || typeof url !== "string") { + return null; + } + + try { + const urlObj = new URL(url); + return urlObj.hostname; + } catch (error) { + return null; + } +} + +/** + * Normalize URL by removing query parameters and fragments + */ +function normalizeUrl(url) { + if (!url || typeof url !== "string") { + return ""; + } + + try { + const urlObj = new URL(url); + return `${urlObj.protocol}//${urlObj.hostname}${urlObj.pathname}`; + } catch (error) { + return url; + } +} + +module.exports = { + cleanText, + containsAnyKeyword, + containsAllKeywords, + matchesKeywordGroups, + isValidText, + extractDomain, + normalizeUrl, +}; diff --git a/job-search-parser/parsers/skipthedrive.js b/job-search-parser/parsers/skipthedrive.js index e3913e9..797f50c 100644 --- a/job-search-parser/parsers/skipthedrive.js +++ b/job-search-parser/parsers/skipthedrive.js @@ -1,345 +1,345 @@ -/** - * SkipTheDrive Job Parser - * - * Parses remote job listings from SkipTheDrive.com - * Supports keyword search, job type filters, and pagination - */ - -const { chromium } = require("playwright"); -const path = require("path"); - -// Import from ai-analyzer core package -const { - logger, - cleanText, - containsAnyKeyword, - containsAllKeywords, - parseLocationFilters, - validateLocationAgainstFilters, - extractLocationFromProfile, - analyzeBatch, - checkOllamaStatus, -} = require("../../ai-analyzer"); - -/** - * Build search URL for SkipTheDrive - * @param {string} keyword - Search keyword - * @param {string} orderBy - Sort order (date, relevance) - * @param {Array} jobTypes - Job types to filter (part time, full time, contract) - * @returns {string} - Formatted search URL - */ -function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) { - let url = `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`; - - if (orderBy) { - url += `&orderby=${orderBy}`; - } - - // Add job type filters - jobTypes.forEach((type) => { - url += `&jobtype=${encodeURIComponent(type)}`; - }); - - return url; -} - -/** - * Extract job data from a single job listing element - * @param {Element} article - Job listing DOM element - * @returns {Object} - Extracted job data - */ -async function extractJobData(article) { - try { - // Extract job title and URL - const titleElement = await article.$("h2.post-title a"); - const title = titleElement ? await titleElement.textContent() : ""; - const jobUrl = titleElement ? await titleElement.getAttribute("href") : ""; - - // Extract date - const dateElement = await article.$("time.post-date"); - const datePosted = dateElement - ? await dateElement.getAttribute("datetime") - : ""; - const dateText = dateElement ? await dateElement.textContent() : ""; - - // Extract company name - const companyElement = await article.$( - ".custom_fields_company_name_display_search_results" - ); - let company = companyElement ? await companyElement.textContent() : ""; - company = company.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon - - // Extract days ago - const daysAgoElement = await article.$( - ".custom_fields_job_date_display_search_results" - ); - let daysAgo = daysAgoElement ? await daysAgoElement.textContent() : ""; - daysAgo = daysAgo.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon - - // Extract job description excerpt - const excerptElement = await article.$(".excerpt_part"); - const description = excerptElement - ? await excerptElement.textContent() - : ""; - - // Check if featured/sponsored - const featuredElement = await article.$(".custom_fields_sponsored_job"); - const isFeatured = !!featuredElement; - - // Extract job ID from article ID - const articleId = await article.getAttribute("id"); - const jobId = articleId ? articleId.replace("post-", "") : ""; - - return { - jobId, - title: cleanText(title), - company: cleanText(company), - jobUrl, - datePosted, - dateText: cleanText(dateText), - daysAgo: cleanText(daysAgo), - description: cleanText(description), - isFeatured, - source: "skipthedrive", - timestamp: new Date().toISOString(), - }; - } catch (error) { - logger.error(`Error extracting job data: ${error.message}`); - return null; - } -} - -/** - * Parse SkipTheDrive job listings - * @param {Object} options - Parser options - * @returns {Promise} - Array of parsed job listings - */ -async function parseSkipTheDrive(options = {}) { - const { - keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [ - "software engineer", - "developer", - ], - jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [], - locationFilter = process.env.LOCATION_FILTER || "", - maxPages = parseInt(process.env.MAX_PAGES) || 5, - headless = process.env.HEADLESS !== "false", - enableAI = process.env.ENABLE_AI_ANALYSIS === "true", - aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis", - useAndLogic = false, // Use AND logic instead of OR logic for keywords - } = options; - - logger.step("Starting SkipTheDrive parser..."); - logger.info(`šŸ” Keywords: ${keywords.join(", ")}`); - logger.info(`šŸ”— Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`); - logger.info( - `šŸ“‹ Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}` - ); - logger.info(`šŸ“ Location Filter: ${locationFilter || "None"}`); - logger.info(`šŸ“„ Max Pages: ${maxPages}`); - - const browser = await chromium.launch({ - headless, - args: [ - "--no-sandbox", - "--disable-setuid-sandbox", - "--disable-dev-shm-usage", - ], - }); - - const context = await browser.newContext({ - userAgent: - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - }); - - const results = []; - const rejectedResults = []; - const seenJobs = new Set(); - - try { - // For AND logic, combine all keywords into a single search query - // For OR logic, search each keyword separately - const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords; - - // Search for each keyword (or combined keyword for AND logic) - for (const keyword of searchKeywords) { - logger.info(`\nšŸ” Searching for: ${keyword}`); - - const searchUrl = buildSearchUrl(keyword, "date", jobTypes); - const page = await context.newPage(); - - try { - logger.info( - `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}` - ); - await page.goto(searchUrl, { - waitUntil: "domcontentloaded", - timeout: 30000, - }); - logger.info( - `Navigation completed successfully at ${new Date().toISOString()}` - ); - - // Wait for job listings to load - logger.info("Waiting for selector #loops-wrapper"); - await page - .waitForSelector("#loops-wrapper", { timeout: 5000 }) - .catch(() => { - logger.warning(`No results found for keyword: ${keyword}`); - }); - logger.info("Selector wait completed"); - - let currentPage = 1; - let hasNextPage = true; - - while (hasNextPage && currentPage <= maxPages) { - logger.info(`šŸ“„ Processing page ${currentPage} for "${keyword}"`); - - // Extract all job articles on current page - const jobArticles = await page.$$("article[id^='post-']"); - logger.info( - `Found ${jobArticles.length} job listings on page ${currentPage}` - ); - - for (const article of jobArticles) { - const jobData = await extractJobData(article); - - if (!jobData || seenJobs.has(jobData.jobId)) { - continue; - } - - seenJobs.add(jobData.jobId); - - // Add keyword that found this job - jobData.searchKeyword = keyword; - - // Validate job against keywords - const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`; - const keywordMatch = useAndLogic - ? containsAllKeywords(fullText, keywords) - : containsAnyKeyword(fullText, keywords); - - if (!keywordMatch) { - rejectedResults.push({ - ...jobData, - rejected: true, - reason: useAndLogic - ? "Not all keywords found in job listing" - : "Keywords not found in job listing", - }); - continue; - } - - // Location validation (if enabled) - if (locationFilter) { - const locationFilters = parseLocationFilters(locationFilter); - // For SkipTheDrive, most jobs are remote, but we can check the title/description - const locationValid = - fullText.toLowerCase().includes("remote") || - locationFilters.some((filter) => - fullText.toLowerCase().includes(filter.toLowerCase()) - ); - - if (!locationValid) { - rejectedResults.push({ - ...jobData, - rejected: true, - reason: "Location requirements not met", - }); - continue; - } - - jobData.locationValid = locationValid; - } - - logger.success(`āœ… Found: ${jobData.title} at ${jobData.company}`); - results.push(jobData); - } - - // Check for next page - const nextPageLink = await page.$("a.nextp"); - if (nextPageLink && currentPage < maxPages) { - logger.info("šŸ“„ Moving to next page..."); - await nextPageLink.click(); - await page.waitForLoadState("domcontentloaded"); - await page.waitForTimeout(2000); // Wait for content to load - currentPage++; - } else { - hasNextPage = false; - } - } - } catch (error) { - logger.error(`Error processing keyword "${keyword}": ${error.message}`); - } finally { - await page.close(); - } - } - - logger.success(`\nāœ… Parsing complete!`); - logger.info(`šŸ“Š Total jobs found: ${results.length}`); - logger.info(`āŒ Rejected jobs: ${rejectedResults.length}`); - - // Run AI analysis if enabled - let aiAnalysis = null; - if (enableAI && results.length > 0) { - logger.step("Running AI analysis on job listings..."); - - const aiAvailable = await checkOllamaStatus(); - if (aiAvailable) { - const analysisData = results.map((job) => ({ - text: `${job.title} at ${job.company}. ${job.description}`, - metadata: { - jobId: job.jobId, - company: job.company, - daysAgo: job.daysAgo, - }, - })); - - aiAnalysis = await analyzeBatch(analysisData, aiContext); - - // Merge AI analysis with results - results.forEach((job, index) => { - if (aiAnalysis && aiAnalysis[index]) { - job.aiAnalysis = { - isRelevant: aiAnalysis[index].isRelevant, - confidence: aiAnalysis[index].confidence, - reasoning: aiAnalysis[index].reasoning, - }; - } - }); - - logger.success("āœ… AI analysis completed"); - } else { - logger.warning("āš ļø AI not available - skipping analysis"); - } - } - - return { - results, - rejectedResults, - metadata: { - source: "skipthedrive", - totalJobs: results.length, - rejectedJobs: rejectedResults.length, - keywords: keywords, - jobTypes: jobTypes, - locationFilter: locationFilter, - aiAnalysisEnabled: enableAI, - aiAnalysisCompleted: !!aiAnalysis, - timestamp: new Date().toISOString(), - }, - }; - } catch (error) { - logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`); - throw error; - } finally { - await browser.close(); - } -} - -// Export the parser -module.exports = { - parseSkipTheDrive, - buildSearchUrl, - extractJobData, -}; +/** + * SkipTheDrive Job Parser + * + * Parses remote job listings from SkipTheDrive.com + * Supports keyword search, job type filters, and pagination + */ + +const { chromium } = require("playwright"); +const path = require("path"); + +// Import from ai-analyzer core package +const { + logger, + cleanText, + containsAnyKeyword, + containsAllKeywords, + parseLocationFilters, + validateLocationAgainstFilters, + extractLocationFromProfile, + analyzeBatch, + checkOllamaStatus, +} = require("../../ai-analyzer"); + +/** + * Build search URL for SkipTheDrive + * @param {string} keyword - Search keyword + * @param {string} orderBy - Sort order (date, relevance) + * @param {Array} jobTypes - Job types to filter (part time, full time, contract) + * @returns {string} - Formatted search URL + */ +function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) { + let url = `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`; + + if (orderBy) { + url += `&orderby=${orderBy}`; + } + + // Add job type filters + jobTypes.forEach((type) => { + url += `&jobtype=${encodeURIComponent(type)}`; + }); + + return url; +} + +/** + * Extract job data from a single job listing element + * @param {Element} article - Job listing DOM element + * @returns {Object} - Extracted job data + */ +async function extractJobData(article) { + try { + // Extract job title and URL + const titleElement = await article.$("h2.post-title a"); + const title = titleElement ? await titleElement.textContent() : ""; + const jobUrl = titleElement ? await titleElement.getAttribute("href") : ""; + + // Extract date + const dateElement = await article.$("time.post-date"); + const datePosted = dateElement + ? await dateElement.getAttribute("datetime") + : ""; + const dateText = dateElement ? await dateElement.textContent() : ""; + + // Extract company name + const companyElement = await article.$( + ".custom_fields_company_name_display_search_results" + ); + let company = companyElement ? await companyElement.textContent() : ""; + company = company.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon + + // Extract days ago + const daysAgoElement = await article.$( + ".custom_fields_job_date_display_search_results" + ); + let daysAgo = daysAgoElement ? await daysAgoElement.textContent() : ""; + daysAgo = daysAgo.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon + + // Extract job description excerpt + const excerptElement = await article.$(".excerpt_part"); + const description = excerptElement + ? await excerptElement.textContent() + : ""; + + // Check if featured/sponsored + const featuredElement = await article.$(".custom_fields_sponsored_job"); + const isFeatured = !!featuredElement; + + // Extract job ID from article ID + const articleId = await article.getAttribute("id"); + const jobId = articleId ? articleId.replace("post-", "") : ""; + + return { + jobId, + title: cleanText(title), + company: cleanText(company), + jobUrl, + datePosted, + dateText: cleanText(dateText), + daysAgo: cleanText(daysAgo), + description: cleanText(description), + isFeatured, + source: "skipthedrive", + timestamp: new Date().toISOString(), + }; + } catch (error) { + logger.error(`Error extracting job data: ${error.message}`); + return null; + } +} + +/** + * Parse SkipTheDrive job listings + * @param {Object} options - Parser options + * @returns {Promise} - Array of parsed job listings + */ +async function parseSkipTheDrive(options = {}) { + const { + keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [ + "software engineer", + "developer", + ], + jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [], + locationFilter = process.env.LOCATION_FILTER || "", + maxPages = parseInt(process.env.MAX_PAGES) || 5, + headless = process.env.HEADLESS !== "false", + enableAI = process.env.ENABLE_AI_ANALYSIS === "true", + aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis", + useAndLogic = false, // Use AND logic instead of OR logic for keywords + } = options; + + logger.step("Starting SkipTheDrive parser..."); + logger.info(`šŸ” Keywords: ${keywords.join(", ")}`); + logger.info(`šŸ”— Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`); + logger.info( + `šŸ“‹ Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}` + ); + logger.info(`šŸ“ Location Filter: ${locationFilter || "None"}`); + logger.info(`šŸ“„ Max Pages: ${maxPages}`); + + const browser = await chromium.launch({ + headless, + args: [ + "--no-sandbox", + "--disable-setuid-sandbox", + "--disable-dev-shm-usage", + ], + }); + + const context = await browser.newContext({ + userAgent: + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + }); + + const results = []; + const rejectedResults = []; + const seenJobs = new Set(); + + try { + // For AND logic, combine all keywords into a single search query + // For OR logic, search each keyword separately + const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords; + + // Search for each keyword (or combined keyword for AND logic) + for (const keyword of searchKeywords) { + logger.info(`\nšŸ” Searching for: ${keyword}`); + + const searchUrl = buildSearchUrl(keyword, "date", jobTypes); + const page = await context.newPage(); + + try { + logger.info( + `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}` + ); + await page.goto(searchUrl, { + waitUntil: "domcontentloaded", + timeout: 30000, + }); + logger.info( + `Navigation completed successfully at ${new Date().toISOString()}` + ); + + // Wait for job listings to load + logger.info("Waiting for selector #loops-wrapper"); + await page + .waitForSelector("#loops-wrapper", { timeout: 5000 }) + .catch(() => { + logger.warning(`No results found for keyword: ${keyword}`); + }); + logger.info("Selector wait completed"); + + let currentPage = 1; + let hasNextPage = true; + + while (hasNextPage && currentPage <= maxPages) { + logger.info(`šŸ“„ Processing page ${currentPage} for "${keyword}"`); + + // Extract all job articles on current page + const jobArticles = await page.$$("article[id^='post-']"); + logger.info( + `Found ${jobArticles.length} job listings on page ${currentPage}` + ); + + for (const article of jobArticles) { + const jobData = await extractJobData(article); + + if (!jobData || seenJobs.has(jobData.jobId)) { + continue; + } + + seenJobs.add(jobData.jobId); + + // Add keyword that found this job + jobData.searchKeyword = keyword; + + // Validate job against keywords + const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`; + const keywordMatch = useAndLogic + ? containsAllKeywords(fullText, keywords) + : containsAnyKeyword(fullText, keywords); + + if (!keywordMatch) { + rejectedResults.push({ + ...jobData, + rejected: true, + reason: useAndLogic + ? "Not all keywords found in job listing" + : "Keywords not found in job listing", + }); + continue; + } + + // Location validation (if enabled) + if (locationFilter) { + const locationFilters = parseLocationFilters(locationFilter); + // For SkipTheDrive, most jobs are remote, but we can check the title/description + const locationValid = + fullText.toLowerCase().includes("remote") || + locationFilters.some((filter) => + fullText.toLowerCase().includes(filter.toLowerCase()) + ); + + if (!locationValid) { + rejectedResults.push({ + ...jobData, + rejected: true, + reason: "Location requirements not met", + }); + continue; + } + + jobData.locationValid = locationValid; + } + + logger.success(`āœ… Found: ${jobData.title} at ${jobData.company}`); + results.push(jobData); + } + + // Check for next page + const nextPageLink = await page.$("a.nextp"); + if (nextPageLink && currentPage < maxPages) { + logger.info("šŸ“„ Moving to next page..."); + await nextPageLink.click(); + await page.waitForLoadState("domcontentloaded"); + await page.waitForTimeout(2000); // Wait for content to load + currentPage++; + } else { + hasNextPage = false; + } + } + } catch (error) { + logger.error(`Error processing keyword "${keyword}": ${error.message}`); + } finally { + await page.close(); + } + } + + logger.success(`\nāœ… Parsing complete!`); + logger.info(`šŸ“Š Total jobs found: ${results.length}`); + logger.info(`āŒ Rejected jobs: ${rejectedResults.length}`); + + // Run AI analysis if enabled + let aiAnalysis = null; + if (enableAI && results.length > 0) { + logger.step("Running AI analysis on job listings..."); + + const aiAvailable = await checkOllamaStatus(); + if (aiAvailable) { + const analysisData = results.map((job) => ({ + text: `${job.title} at ${job.company}. ${job.description}`, + metadata: { + jobId: job.jobId, + company: job.company, + daysAgo: job.daysAgo, + }, + })); + + aiAnalysis = await analyzeBatch(analysisData, aiContext); + + // Merge AI analysis with results + results.forEach((job, index) => { + if (aiAnalysis && aiAnalysis[index]) { + job.aiAnalysis = { + isRelevant: aiAnalysis[index].isRelevant, + confidence: aiAnalysis[index].confidence, + reasoning: aiAnalysis[index].reasoning, + }; + } + }); + + logger.success("āœ… AI analysis completed"); + } else { + logger.warning("āš ļø AI not available - skipping analysis"); + } + } + + return { + results, + rejectedResults, + metadata: { + source: "skipthedrive", + totalJobs: results.length, + rejectedJobs: rejectedResults.length, + keywords: keywords, + jobTypes: jobTypes, + locationFilter: locationFilter, + aiAnalysisEnabled: enableAI, + aiAnalysisCompleted: !!aiAnalysis, + timestamp: new Date().toISOString(), + }, + }; + } catch (error) { + logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`); + throw error; + } finally { + await browser.close(); + } +} + +// Export the parser +module.exports = { + parseSkipTheDrive, + buildSearchUrl, + extractJobData, +};