diff --git a/core-parser/index.js b/core-parser/index.js index 4239594..e068b13 100644 --- a/core-parser/index.js +++ b/core-parser/index.js @@ -20,7 +20,26 @@ class CoreParser { this.browser = await playwright.chromium.launch({ headless: this.config.headless }); - this.context = await this.browser.newContext(); + + // Create context with user agent to appear more like a real browser + const contextOptions = { + userAgent: this.config.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + viewport: { width: 1920, height: 1080 }, + locale: 'en-US', + timezoneId: 'America/New_York', + }; + + // Add extra HTTP headers to appear more legitimate + contextOptions.extraHTTPHeaders = { + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + }; + + this.context = await this.browser.newContext(contextOptions); } async createPage(id) { diff --git a/job-search-parser/README.md b/job-search-parser/README.md index ce470ef..7182ef6 100644 --- a/job-search-parser/README.md +++ b/job-search-parser/README.md @@ -99,7 +99,7 @@ node index.js --sites=linkedin --keywords="co-op" --min-date="2025-12-01" node index.js --sites=linkedin --keywords="co-op" --location="Ontario" --min-date="2025-12-01" # Combine multiple sites -node index.js --sites=linkedin,skipthedrive --keywords="intern,co-op" +node index.js --sites=linkedin,skipthedrive,indeed --keywords="intern,co-op" # Use AND logic - jobs must match ALL keywords (e.g., "co-op" AND "summer 2026") node index.js --sites=linkedin --keywords="co-op,summer 2026" --and @@ -118,9 +118,61 @@ node index.js --sites=linkedin --keywords="co-op|intern,summer 2026,remote" --an - LinkedIn supports relative timeframes up to ~30 days - For dates older than 30 days, LinkedIn may limit results to the maximum supported timeframe -### 🚧 Planned Parsers +#### Indeed Parser -- **Indeed**: Comprehensive job aggregator +Comprehensive job aggregator with extensive job listings. + +**Features:** + +- Keyword-based job search +- Location filtering (both Indeed location and post-extraction filter) +- Multi-page result parsing with pagination +- Salary information extraction +- Date filtering (jobs posted within last 30 days) +- Automatic duplicate detection +- Job type and experience level support + +**Usage:** + +```bash +# Search Indeed jobs +node index.js --sites=indeed --keywords="software engineer,developer" + +# Search with location filter +node index.js --sites=indeed --keywords="co-op" --location="Ontario" + +# Search with date filter (jobs posted after specific date) +node index.js --sites=indeed --keywords="co-op" --min-date="2025-12-01" + +# Combine filters +node index.js --sites=indeed --keywords="co-op" --location="Ontario" --min-date="2025-12-01" + +# Combine multiple sites +node index.js --sites=indeed,linkedin --keywords="intern,co-op" + +# Use AND logic - jobs must match ALL keywords +node index.js --sites=indeed --keywords="co-op,summer 2026" --and + +# Use grouped AND/OR logic - (co-op OR intern) AND (summer 2026) +node index.js --sites=indeed --keywords="co-op|intern,summer 2026" --and +``` + +**Date Filter Notes:** +- The date filter converts to Indeed's `fromage` parameter (days ago) +- Format: `YYYY-MM-DD` (e.g., `2025-12-01`) +- Indeed supports up to 30 days for date filtering +- For dates older than 30 days, Indeed limits results to the maximum supported timeframe + +**CAPTCHA/Verification Handling:** +- Indeed may show CAPTCHA or human verification pages when detecting automated access +- If you encounter CAPTCHA errors, try: + 1. Run in non-headless mode: Set `HEADLESS=false` in `.env` file (you can manually solve CAPTCHA) + 2. Wait a few minutes between runs to avoid rate limiting + 3. Use a different IP address or VPN if available + 4. Reduce the number of pages or keywords per run +- The parser will automatically detect and report CAPTCHA pages with helpful error messages + +### 🚧 Planned Parsers - **Glassdoor**: Jobs with company reviews and salary data - **Monster**: Traditional job board - **SimplyHired**: Job aggregator with salary estimates @@ -195,7 +247,7 @@ USE_AND_LOGIC=true node index.js # Select sites to parse -node index.js --sites=linkedin,skipthedrive +node index.js --sites=linkedin,skipthedrive,indeed # Search keywords node index.js --keywords="software engineer,developer" @@ -229,7 +281,7 @@ node index.js --sites=linkedin --keywords="co-op|intern,summer 2026,remote" --an **Available Options:** -- `--sites="site1,site2"`: Job sites to parse (linkedin, skipthedrive) +- `--sites="site1,site2"`: Job sites to parse (linkedin, skipthedrive, indeed) - `--keywords="keyword1,keyword2"`: Search keywords - Use `|` (pipe) to separate OR keywords within a group: `"co-op|intern"` means "co-op" OR "intern" - Use `,` (comma) to separate AND groups when using `--and`: `"co-op|intern,summer 2026"` means (co-op OR intern) AND (summer 2026) diff --git a/job-search-parser/index.js b/job-search-parser/index.js index a39aecf..71596c1 100644 --- a/job-search-parser/index.js +++ b/job-search-parser/index.js @@ -11,6 +11,7 @@ const fs = require("fs"); const CoreParser = require("../core-parser"); const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy"); const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy"); +const { indeedStrategy } = require("./strategies/indeed-strategy"); const { logger, analyzeBatch, checkOllamaStatus, DEFAULT_MODEL } = require("ai-analyzer"); const { convertResultsToCsv } = require("./src/csv-utils"); @@ -35,8 +36,8 @@ const USE_AND_LOGIC = process.env.USE_AND_LOGIC === "true"; // Use AND logic for const SITE_STRATEGIES = { skipthedrive: skipthedriveStrategy, linkedin: linkedinJobsStrategy, + indeed: indeedStrategy, // Add more site strategies here - // indeed: indeedStrategy, // glassdoor: glassdoorStrategy, }; diff --git a/job-search-parser/strategies/indeed-strategy.js b/job-search-parser/strategies/indeed-strategy.js new file mode 100644 index 0000000..a566d64 --- /dev/null +++ b/job-search-parser/strategies/indeed-strategy.js @@ -0,0 +1,947 @@ +/** + * Indeed Parsing Strategy + * + * Uses core-parser for browser management and ai-analyzer for utilities + */ + +const { + logger, + cleanText, + containsAnyKeyword, + containsAllKeywords, + matchesKeywordGroups, + validateLocationAgainstFilters, +} = require("ai-analyzer"); + +/** + * Indeed URL builder + */ +function buildSearchUrl(keyword, location = "", filters = {}) { + const baseUrl = "https://www.indeed.com/jobs"; + const params = new URLSearchParams({ + q: keyword, + sort: "date", // Sort by date (newest first) + }); + + if (location) { + params.append("l", location); + } + + // Add date filter if provided + if (filters.fromage) { + // fromage is in days (e.g., 1 = last 24 hours, 7 = last 7 days, 30 = last 30 days) + params.append("fromage", filters.fromage); + } + + // Add job type filter + if (filters.jobType) { + // jt=fulltime, parttime, contract, internship, temporary + params.append("jt", filters.jobType); + } + + // Add remote filter + if (filters.remote) { + params.append("remote", "true"); + } + + // Add experience level filter + if (filters.experienceLevel) { + // explvl=entry_level, mid_level, senior_level + params.append("explvl", filters.experienceLevel); + } + + return `${baseUrl}?${params.toString()}`; +} + +/** + * Indeed parsing strategy function + */ +async function indeedStrategy(coreParser, options = {}) { + const { + keywords = ["software engineer", "developer"], + keywordGroups = null, // Array of keyword groups for grouped AND/OR logic + locationFilter = null, + maxPages = 5, + location = "", // Indeed location search (e.g., "Toronto, ON", "Canada") + minDate = null, // Minimum posted date (format: YYYY-MM-DD) + useAndLogic = false, // Use AND logic instead of OR logic for keywords + } = options; + + const results = []; + const rejectedResults = []; + const seenJobs = new Set(); + + try { + // Create main page + const page = await coreParser.createPage("indeed-main"); + + logger.info("šŸš€ Starting Indeed parser..."); + logger.info(`šŸ” Keywords: ${keywords.join(", ")}`); + if (keywordGroups) { + logger.info(`šŸ”— Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`); + } else { + logger.info(`šŸ”— Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`); + } + logger.info(`šŸ“ Location Filter: ${locationFilter || "None"}`); + logger.info(`šŸŒ Indeed Location: ${location || "None"}`); + logger.info(`šŸ“„ Max Pages: ${maxPages}`); + + // Convert minDate to fromage (days ago) + let fromage = null; + if (minDate) { + try { + const minDateObj = new Date(minDate); + const now = new Date(); + const daysDiff = Math.floor((now - minDateObj) / (1000 * 60 * 60 * 24)); + if (daysDiff > 0 && daysDiff <= 30) { + fromage = daysDiff; + logger.info(`šŸ“… Min Date Filter: ${minDate} (${fromage} days ago)`); + } else if (daysDiff > 30) { + fromage = 30; // Indeed's maximum is typically 30 days + logger.info(`šŸ“… Min Date Filter: ${minDate} (limited to 30 days)`); + } + } catch (error) { + logger.warning(`āš ļø Invalid date format for minDate: ${minDate}. Expected format: YYYY-MM-DD`); + } + } + + // Determine search keywords based on logic type + let searchKeywords; + if (keywordGroups) { + // For grouped AND/OR logic, search each keyword in each group (OR within groups) + searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups + } else if (useAndLogic) { + // For simple AND logic, combine all keywords into a single search query + searchKeywords = [keywords.join(" ")]; + } else { + // For OR logic, search each keyword separately + searchKeywords = keywords; + } + + // Search for each keyword (or combined keyword for AND logic) + for (const keyword of searchKeywords) { + logger.info(`\nšŸ” Searching Indeed for: "${keyword}"`); + + const searchUrl = buildSearchUrl(keyword, location, { + fromage: fromage, + }); + logger.info(`šŸ”— Search URL: ${searchUrl}`); + + try { + // Navigate to job search results + // Use domcontentloaded instead of networkidle for faster loading + // Indeed can be slow to fully load, so we'll wait for DOM and then check for content + try { + await coreParser.navigateTo(searchUrl, { + pageId: "indeed-main", + retries: 2, + waitUntil: "domcontentloaded", + timeout: 60000, // Increase timeout to 60 seconds + }); + } catch (navError) { + // If navigation fails, try with load event instead + logger.warning(`āš ļø Initial navigation failed, trying with 'load' event: ${navError.message}`); + try { + await coreParser.navigateTo(searchUrl, { + pageId: "indeed-main", + retries: 1, + waitUntil: "load", + timeout: 60000, + }); + } catch (loadError) { + // Last resort: try direct page navigation + logger.warning(`āš ļø Load event failed, trying direct navigation: ${loadError.message}`); + await page.goto(searchUrl, { timeout: 60000, waitUntil: "domcontentloaded" }).catch(() => { + throw new Error(`Failed to navigate to Indeed after all attempts: ${loadError.message}`); + }); + } + } + + // Wait for page to load and let JavaScript execute + await new Promise((resolve) => setTimeout(resolve, 5000)); + + // Check if we're on the right page + const currentUrl = page.url(); + logger.info(`šŸ“ Current page URL: ${currentUrl}`); + + // Check if we were redirected or blocked (check URL first) + if (currentUrl.includes('captcha') || currentUrl.includes('blocked') || currentUrl.includes('access-denied') || currentUrl.includes('verify')) { + logger.error(`āŒ Indeed appears to be blocking access. URL: ${currentUrl}`); + throw new Error('Indeed is showing a CAPTCHA or verification page. Please try running in non-headless mode (set HEADLESS=false in .env) or wait and try again later.'); + } + + // Check page content for CAPTCHA/human verification indicators + try { + const pageContent = await page.evaluate(() => { + const bodyText = document.body?.textContent?.toLowerCase() || ''; + const title = document.title?.toLowerCase() || ''; + + // Check for common CAPTCHA/verification indicators + const captchaIndicators = [ + 'verify you\'re human', + 'verify you are human', + 'captcha', + 'prove you\'re not a robot', + 'unusual traffic', + 'automated queries', + 'please verify', + 'security check', + 'access denied', + 'blocked', + ]; + + const foundIndicators = captchaIndicators.filter(indicator => + bodyText.includes(indicator) || title.includes(indicator) + ); + + return { + hasCaptcha: foundIndicators.length > 0, + indicators: foundIndicators, + title: document.title, + bodyPreview: bodyText.substring(0, 500), + }; + }); + + if (pageContent.hasCaptcha) { + logger.error(`āŒ Indeed is showing a CAPTCHA/verification page.`); + logger.error(` Detected indicators: ${pageContent.indicators.join(', ')}`); + logger.error(` Page title: ${pageContent.title}`); + logger.error(`\nšŸ’” Solutions:`); + logger.error(` 1. Run in non-headless mode: Set HEADLESS=false in .env file`); + logger.error(` 2. Wait a few minutes and try again`); + logger.error(` 3. Use a different IP address or VPN`); + logger.error(` 4. Manually solve the CAPTCHA in a browser, then try again`); + throw new Error(`Indeed CAPTCHA detected: ${pageContent.indicators.join(', ')}. Please see suggestions above.`); + } + } catch (checkError) { + // If the check itself fails, log but don't throw (might be a different error) + if (checkError.message.includes('CAPTCHA')) { + throw checkError; // Re-throw CAPTCHA errors + } + logger.debug(`Could not check for CAPTCHA: ${checkError.message}`); + } + + // Check for results count + try { + const resultsText = await page.evaluate(() => { + const countElement = document.querySelector(".jobsearch-JobCountAndSortPane-jobCount"); + return countElement ? countElement.textContent : "No results count found"; + }); + logger.info(`šŸ“Š Indeed results info: ${resultsText}`); + } catch (e) { + logger.debug(`Could not get results count: ${e.message}`); + } + + // Wait for job listings container + let hasResults = false; + const possibleSelectors = [ + "#mosaic-provider-jobcards", + ".job_seen_beacon", + "[data-jk]", + ".jobsearch-SerpJobCard", + ".jobCard", + ]; + + for (const selector of possibleSelectors) { + try { + await page.waitForSelector(selector, { timeout: 5000 }); + const count = await page.$$(selector).then((elements) => elements.length); + if (count > 0) { + hasResults = true; + logger.info(`āœ… Found job results container with selector: ${selector} (${count} jobs)`); + break; + } + } catch (e) { + // Try next selector + continue; + } + } + + if (!hasResults) { + logger.warning(`āš ļø No job results container found for keyword: ${keyword}`); + continue; + } + + // Process multiple pages + let currentPage = 1; + const maxPagesToProcess = maxPages > 0 ? maxPages : 999; // 0 means unlimited + + logger.info(`šŸ“„ Processing pages (max: ${maxPagesToProcess === 999 ? 'unlimited' : maxPagesToProcess}) for "${keyword}"...`); + + while (currentPage <= maxPagesToProcess) { + logger.info(`šŸ“„ Processing page ${currentPage}...`); + + // Wait for page to fully load + await new Promise((resolve) => setTimeout(resolve, 2000)); + + // Extract jobs from current page + const pageJobs = await extractJobsFromPage(page, keyword, locationFilter); + logger.info(`šŸ“‹ Extracted ${pageJobs.length} jobs from page ${currentPage}`); + + if (pageJobs.length === 0) { + logger.warning(`āš ļø No jobs found on page ${currentPage}, stopping pagination`); + break; + } + + // Process each job + for (const job of pageJobs) { + // Skip duplicates + if (seenJobs.has(job.jobId)) { + continue; + } + seenJobs.add(job.jobId); + + // Validate keywords based on logic type + if (keywordGroups) { + // Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR) + const fullText = `${job.title} ${job.description} ${job.company}`; + if (!matchesKeywordGroups(fullText, keywordGroups)) { + rejectedResults.push({ + ...job, + rejectionReason: "Job does not match all keyword groups", + }); + continue; + } + } else if (useAndLogic) { + // Simple AND logic: all keywords must match + const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase(); + if (!containsAllKeywords(fullText, keywords)) { + rejectedResults.push({ + ...job, + rejectionReason: "Not all keywords found in job listing", + }); + continue; + } + } + + // Validate location if filtering enabled + if (locationFilter) { + const locationValid = validateLocationAgainstFilters( + job.location, + locationFilter + ); + + if (!locationValid.isValid) { + rejectedResults.push({ + ...job, + rejectionReason: locationValid.reasoning || "Location filter mismatch", + }); + continue; + } + } + + results.push(job); + } + + // Check if there's a next page + const hasNext = await hasNextPageAvailable(page); + if (!hasNext) { + logger.info(`āœ… No more pages available. Total jobs extracted: ${results.length}`); + break; + } + + // Navigate to next page if we haven't reached maxPages + if (currentPage < maxPagesToProcess) { + logger.info(`āž”ļø Navigating to page ${currentPage + 1}...`); + const navigationSuccess = await navigateToNextPage(page); + + if (!navigationSuccess) { + logger.warning(`āš ļø Failed to navigate to next page, stopping pagination`); + break; + } + + currentPage++; + } else { + logger.info(`šŸ“Š Reached max pages limit (${maxPagesToProcess}). Total jobs extracted: ${results.length}`); + break; + } + } + + const totalExtracted = results.length + rejectedResults.length; + logger.info(`šŸ“‹ Extracted ${results.length} accepted jobs, ${rejectedResults.length} rejected jobs (${totalExtracted} total) across ${currentPage} page(s) for "${keyword}"`); + } catch (error) { + logger.error(`Error processing keyword "${keyword}": ${error.message}`); + logger.error(`Stack: ${error.stack}`); + } + } + + logger.info( + `šŸŽÆ Indeed parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected` + ); + + return { + results, + rejectedResults, + summary: { + totalJobs: results.length, + totalRejected: rejectedResults.length, + keywords: keywords.join(", "), + locationFilter, + source: "indeed", + }, + }; + } catch (error) { + logger.error(`āŒ Indeed parsing failed: ${error.message}`); + logger.error(`Stack: ${error.stack}`); + return { + results, + rejectedResults, + summary: { + totalJobs: results.length, + totalRejected: rejectedResults.length, + keywords: keywords.join(", "), + locationFilter, + source: "indeed", + error: error.message, + }, + }; + } +} + +/** + * Extract jobs from current page + */ +async function extractJobsFromPage(page, keyword, locationFilter) { + const jobs = []; + + try { + // Indeed job listings are typically in divs with data-jk attribute (job key) + const jobSelectors = [ + "[data-jk]", + ".job_seen_beacon", + ".jobsearch-SerpJobCard", + ".jobCard", + "div[data-testid='job-card']", + ]; + + let jobElements = []; + for (const selector of jobSelectors) { + try { + await page.waitForSelector(selector, { timeout: 3000 }).catch(() => {}); + const elements = await page.$$(selector); + if (elements.length > 0) { + jobElements = elements; + logger.info(`āœ… Found ${jobElements.length} job elements using selector: ${selector}`); + break; + } + } catch (e) { + // Try next selector + continue; + } + } + + if (jobElements.length === 0) { + logger.warning(`āš ļø No job elements found with any selector`); + return jobs; + } + + for (const jobElement of jobElements) { + try { + // Try to scroll job into view, but don't fail if it times out + // Some elements might be in hidden containers or lazy-loaded + try { + await Promise.race([ + jobElement.scrollIntoViewIfNeeded(), + new Promise((_, reject) => + setTimeout(() => reject(new Error('Scroll timeout')), 2000) + ) + ]); + await new Promise((resolve) => setTimeout(resolve, 100)); + } catch (scrollError) { + // If scrolling fails, try a simpler scroll approach + try { + await jobElement.evaluate((el) => { + el.scrollIntoView({ behavior: 'auto', block: 'center' }); + }); + await new Promise((resolve) => setTimeout(resolve, 100)); + } catch (simpleScrollError) { + // If even simple scroll fails, continue anyway - we can still extract data + logger.debug(`Could not scroll element into view, continuing anyway: ${simpleScrollError.message}`); + } + } + + const job = await extractJobData(jobElement, keyword); + if (job && (job.title || job.jobId)) { + jobs.push(job); + } + } catch (error) { + logger.warning(`Failed to extract job data: ${error.message}`); + } + } + } catch (error) { + logger.error(`Failed to extract jobs from page: ${error.message}`); + } + + return jobs; +} + +/** + * Extract data from individual job element + */ +async function extractJobData(jobElement, keyword) { + try { + const jobData = await jobElement.evaluate((el) => { + const data = { + jobId: "", + title: "", + company: "", + location: "", + jobUrl: "", + postedDate: "", + description: "", + salary: "", + jobType: "", + }; + + // Extract job ID from data-jk attribute + data.jobId = el.getAttribute("data-jk") || ""; + + // Extract title and URL + const titleSelectors = [ + "h2.jobTitle a", + "h2.jobTitle", + "a[data-jk]", + "h2 a", + ".jobTitle a", + "[class*='jobTitle'] a", + ]; + + for (const selector of titleSelectors) { + const titleElement = el.querySelector(selector); + if (titleElement) { + data.title = titleElement.textContent?.trim() || titleElement.innerText?.trim() || ""; + if (titleElement.tagName === "A") { + data.jobUrl = titleElement.getAttribute("href") || ""; + } else { + const link = titleElement.querySelector("a"); + if (link) { + data.jobUrl = link.getAttribute("href") || ""; + } + } + if (data.title) break; + } + } + + // Extract company name + const companySelectors = [ + "[data-testid='company-name']", + ".companyName", + "[class*='companyName']", + "span.companyName", + "a[data-testid='company-name']", + ]; + + for (const selector of companySelectors) { + const companyElement = el.querySelector(selector); + if (companyElement) { + const text = companyElement.textContent?.trim() || companyElement.innerText?.trim() || ""; + if (text && text.length > 0) { + data.company = text; + break; + } + } + } + + // Extract location + const locationSelectors = [ + "[data-testid='job-location']", + ".companyLocation", + "[class*='companyLocation']", + "[class*='location']", + ]; + + for (const selector of locationSelectors) { + const locationElement = el.querySelector(selector); + if (locationElement) { + const text = locationElement.textContent?.trim() || locationElement.innerText?.trim() || ""; + if (text && text.length > 0) { + data.location = text; + break; + } + } + } + + // Extract salary + const salarySelectors = [ + "[data-testid='attribute_snippet_testid']", + ".salary-snippet", + "[class*='salary']", + ".salaryText", + ]; + + for (const selector of salarySelectors) { + const salaryElement = el.querySelector(selector); + if (salaryElement) { + const text = salaryElement.textContent?.trim() || salaryElement.innerText?.trim() || ""; + if (text && text.includes("$") || text.match(/\d+/)) { + data.salary = text; + break; + } + } + } + + // Extract posted date + const dateSelectors = [ + "[data-testid='myJobsStateDate']", + ".date", + "[class*='date']", + "span.date", + ]; + + for (const selector of dateSelectors) { + const dateElement = el.querySelector(selector); + if (dateElement) { + const text = dateElement.textContent?.trim() || dateElement.innerText?.trim() || ""; + if (text) { + // Parse relative dates like "2 days ago", "Just posted", etc. + const now = new Date(); + if (text.match(/just posted|today/i)) { + data.postedDate = now.toISOString().split("T")[0]; + } else if (text.match(/\d+\s*(day|days)/i)) { + const match = text.match(/(\d+)\s*day/i); + if (match) { + const daysAgo = parseInt(match[1]); + const date = new Date(now); + date.setDate(date.getDate() - daysAgo); + data.postedDate = date.toISOString().split("T")[0]; + } + } else { + data.postedDate = text; + } + break; + } + } + } + + // Extract description snippet + const descSelectors = [ + ".job-snippet", + "[class*='job-snippet']", + "[class*='summary']", + ".summary", + ]; + + for (const selector of descSelectors) { + const descElement = el.querySelector(selector); + if (descElement) { + const text = descElement.textContent?.trim() || descElement.innerText?.trim() || ""; + if (text && text.length > 20) { + data.description = text.substring(0, 500); // Limit description length + break; + } + } + } + + return data; + }); + + // Clean and format + const title = cleanText(jobData.title); + let jobUrl = jobData.jobUrl || ""; + + // Make URL absolute if relative + if (jobUrl && !jobUrl.startsWith("http")) { + if (jobUrl.startsWith("/")) { + jobUrl = `https://www.indeed.com${jobUrl}`; + } else { + jobUrl = `https://www.indeed.com/viewjob?jk=${jobData.jobId}`; + } + } else if (!jobUrl && jobData.jobId) { + jobUrl = `https://www.indeed.com/viewjob?jk=${jobData.jobId}`; + } + + // Generate job ID if not found + const jobId = jobData.jobId || `indeed-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`; + + if (!jobId && !title) { + return null; + } + + return { + jobId, + title, + company: cleanText(jobData.company), + location: cleanText(jobData.location), + jobUrl, + postedDate: jobData.postedDate, + description: cleanText(jobData.description), + salary: cleanText(jobData.salary), + jobType: jobData.jobType, + keyword, + extractedAt: new Date().toISOString(), + source: "indeed", + }; + } catch (error) { + logger.warning(`Error extracting job data: ${error.message}`); + return null; + } +} + +/** + * Parse job description to separate role duties from job requirements + */ +function parseDutiesAndRequirements(description) { + if (!description || description.trim().length === 0) { + return { duties: "", requirements: "" }; + } + + // Common section headers that indicate duties/responsibilities + const dutiesKeywords = [ + /responsibilities?:/i, + /duties?:/i, + /what you['\u2019]ll do/i, + /key responsibilities/i, + /your role/i, + /position overview/i, + /about the role/i, + ]; + + // Common section headers that indicate requirements/qualifications + const requirementsKeywords = [ + /requirements?:/i, + /qualifications?:/i, + /must have/i, + /required:/i, + /what you['\u2019]ll bring/i, + /you have:/i, + /skills required/i, + /minimum requirements/i, + ]; + + // Split description into sections + const sections = description.split(/\n\s*\n|\r\n\s*\r\n/).filter(s => s.trim().length > 0); + + let currentSection = "duties"; + let dutiesText = ""; + let requirementsText = ""; + + for (const section of sections) { + let isRequirementsSection = false; + for (const keyword of requirementsKeywords) { + if (keyword.test(section)) { + isRequirementsSection = true; + currentSection = "requirements"; + break; + } + } + + if (!isRequirementsSection) { + for (const keyword of dutiesKeywords) { + if (keyword.test(section)) { + currentSection = "duties"; + break; + } + } + } + + if (currentSection === "requirements") { + requirementsText += (requirementsText ? "\n\n" : "") + section.trim(); + } else { + dutiesText += (dutiesText ? "\n\n" : "") + section.trim(); + } + } + + // Fallback: if we still have nothing separated, put first 60% in duties, rest in requirements + if (!dutiesText && !requirementsText && description) { + const midPoint = Math.floor(description.length * 0.6); + dutiesText = description.substring(0, midPoint).trim(); + requirementsText = description.substring(midPoint).trim(); + } + + return { + duties: dutiesText.trim(), + requirements: requirementsText.trim(), + }; +} + +/** + * Check if next page is available + */ +async function hasNextPageAvailable(page) { + try { + const nextButtonSelectors = [ + "a[aria-label='Next']", + "a[aria-label='Next Page']", + "a[data-testid='pagination-page-next']", + "[data-testid='pagination-page-next']", + "a[aria-label*='Next']", + ]; + + for (const selector of nextButtonSelectors) { + try { + const nextButton = await page.$(selector); + if (nextButton) { + const isDisabled = await nextButton.evaluate((el) => { + return el.hasAttribute("disabled") || + el.getAttribute("aria-disabled") === "true" || + el.classList.contains("disabled"); + }).catch(() => false); + + if (!isDisabled) { + return true; + } + } + } catch (e) { + continue; + } + } + + return false; + } catch (error) { + logger.debug(`Error checking for next page: ${error.message}`); + return false; + } +} + +/** + * Navigate to next page + */ +async function navigateToNextPage(page) { + try { + const nextButtonSelectors = [ + "a[aria-label='Next']", + "a[aria-label='Next Page']", + "a[data-testid='pagination-page-next']", + "[data-testid='pagination-page-next']", + "a[aria-label*='Next']", + ]; + + for (const selector of nextButtonSelectors) { + try { + const nextButton = await page.$(selector); + if (nextButton) { + const isDisabled = await nextButton.evaluate((el) => { + return el.hasAttribute("disabled") || + el.getAttribute("aria-disabled") === "true" || + el.classList.contains("disabled"); + }).catch(() => false); + + if (!isDisabled) { + // Get current URL before navigation + const urlBefore = page.url(); + + await nextButton.scrollIntoViewIfNeeded().catch(() => {}); + await new Promise((resolve) => setTimeout(resolve, 500)); + + await nextButton.click(); + logger.info(`āœ… Clicked next page button`); + + // Wait for navigation to complete (URL change or content load) + // Indeed might use AJAX, so wait for either URL change or content update + let navigationComplete = false; + const maxWaitTime = 10000; // 10 seconds max wait + const startTime = Date.now(); + + while (!navigationComplete && (Date.now() - startTime) < maxWaitTime) { + await new Promise((resolve) => setTimeout(resolve, 500)); + + // Check if URL changed (full page navigation) + const currentUrl = page.url(); + if (currentUrl !== urlBefore) { + logger.info(`šŸ“ URL changed to: ${currentUrl}`); + navigationComplete = true; + break; + } + + // Check if job elements appeared (AJAX navigation) + const jobCount = await page.$$eval( + "[data-jk], .job_seen_beacon, .jobsearch-SerpJobCard, .jobCard", + (elements) => elements.length + ).catch(() => 0); + + if (jobCount > 0) { + logger.info(`āœ… Found ${jobCount} job elements (AJAX navigation)`); + navigationComplete = true; + break; + } + } + + // Additional wait for content to stabilize + await new Promise((resolve) => setTimeout(resolve, 2000)); + + // Check for CAPTCHA after navigation + const currentUrl = page.url(); + if (currentUrl.includes('captcha') || currentUrl.includes('verify') || currentUrl.includes('blocked')) { + logger.error(`āŒ CAPTCHA detected after navigation to page. URL: ${currentUrl}`); + throw new Error('Indeed is showing a CAPTCHA. Please run in non-headless mode (HEADLESS=false) or wait and try again.'); + } + + // Check page content for CAPTCHA + try { + const hasCaptcha = await page.evaluate(() => { + const bodyText = document.body?.textContent?.toLowerCase() || ''; + const indicators = ['verify you\'re human', 'captcha', 'unusual traffic', 'automated queries']; + return indicators.some(ind => bodyText.includes(ind)); + }); + + if (hasCaptcha) { + logger.error(`āŒ CAPTCHA detected on page content after navigation`); + throw new Error('Indeed CAPTCHA detected. Please run in non-headless mode (HEADLESS=false) to solve it manually.'); + } + } catch (captchaError) { + if (captchaError.message.includes('CAPTCHA')) { + throw captchaError; + } + } + + // Scroll page to trigger any lazy loading + try { + await page.evaluate(() => { + window.scrollTo(0, 300); + }); + await new Promise((resolve) => setTimeout(resolve, 1000)); + } catch (e) { + // Ignore scroll errors + } + + // Final check for job elements with multiple selectors + const finalJobCount = await page.$$eval( + "[data-jk], .job_seen_beacon, .jobsearch-SerpJobCard, .jobCard, div[data-testid='job-card']", + (elements) => elements.length + ).catch(() => 0); + + if (finalJobCount > 0) { + logger.info(`āœ… Navigation successful, found ${finalJobCount} job elements`); + return true; + } else { + logger.warning(`āš ļø No job elements found after navigation (waited ${maxWaitTime}ms)`); + // Debug: check what's on the page + try { + const pageTitle = await page.title(); + const pageUrl = page.url(); + logger.debug(`Page title: ${pageTitle}, URL: ${pageUrl}`); + + // Check if it's a CAPTCHA page + const bodyText = await page.evaluate(() => document.body?.textContent?.toLowerCase() || ''); + if (bodyText.includes('captcha') || bodyText.includes('verify')) { + logger.error(`āŒ Page appears to be a CAPTCHA page`); + throw new Error('Indeed is showing a CAPTCHA. Please run in non-headless mode (HEADLESS=false).'); + } + } catch (e) { + if (e.message.includes('CAPTCHA')) { + throw e; + } + // Ignore other debug errors + } + return false; + } + } + } + } catch (e) { + continue; + } + } + + logger.warning(`āš ļø Could not find or click next page button`); + return false; + } catch (error) { + logger.warning(`Failed to navigate to next page: ${error.message}`); + return false; + } +} + +module.exports = { + indeedStrategy, + buildSearchUrl, +}; +