Add Indeed parsing strategy and enhance job search parser
- Introduced a new Indeed parsing strategy to support job extraction from Indeed, including advanced filtering options.
- Updated the job search parser to include Indeed in the site strategies, allowing combined searches with other job sites.
- Enhanced the README documentation with detailed usage instructions for the Indeed parser, including examples for keyword and location filtering.
- Improved logging for Indeed parsing to provide insight into the job extraction process and potential CAPTCHA handling.
This commit is contained in:
parent
47cdc03fb8
commit
673f84d388
@ -20,7 +20,26 @@ class CoreParser {
|
||||
this.browser = await playwright.chromium.launch({
|
||||
headless: this.config.headless
|
||||
});
|
||||
this.context = await this.browser.newContext();
|
||||
|
||||
// Create context with user agent to appear more like a real browser
|
||||
const contextOptions = {
|
||||
userAgent: this.config.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
viewport: { width: 1920, height: 1080 },
|
||||
locale: 'en-US',
|
||||
timezoneId: 'America/New_York',
|
||||
};
|
||||
|
||||
// Add extra HTTP headers to appear more legitimate
|
||||
contextOptions.extraHTTPHeaders = {
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'DNT': '1',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
};
|
||||
|
||||
this.context = await this.browser.newContext(contextOptions);
|
||||
}
|
||||
|
||||
async createPage(id) {
|
||||
|
||||
@ -99,7 +99,7 @@ node index.js --sites=linkedin --keywords="co-op" --min-date="2025-12-01"
|
||||
node index.js --sites=linkedin --keywords="co-op" --location="Ontario" --min-date="2025-12-01"
|
||||
|
||||
# Combine multiple sites
|
||||
node index.js --sites=linkedin,skipthedrive --keywords="intern,co-op"
|
||||
node index.js --sites=linkedin,skipthedrive,indeed --keywords="intern,co-op"
|
||||
|
||||
# Use AND logic - jobs must match ALL keywords (e.g., "co-op" AND "summer 2026")
|
||||
node index.js --sites=linkedin --keywords="co-op,summer 2026" --and
|
||||
@ -118,9 +118,61 @@ node index.js --sites=linkedin --keywords="co-op|intern,summer 2026,remote" --an
|
||||
- LinkedIn supports relative timeframes up to ~30 days
|
||||
- For dates older than 30 days, LinkedIn may limit results to the maximum supported timeframe
|
||||
|
||||
### 🚧 Planned Parsers
|
||||
#### Indeed Parser
|
||||
|
||||
- **Indeed**: Comprehensive job aggregator
|
||||
Comprehensive job aggregator with extensive job listings.
|
||||
|
||||
**Features:**
|
||||
|
||||
- Keyword-based job search
|
||||
- Location filtering (both Indeed location and post-extraction filter)
|
||||
- Multi-page result parsing with pagination
|
||||
- Salary information extraction
|
||||
- Date filtering (jobs posted within the last 30 days)
|
||||
- Automatic duplicate detection
|
||||
- Job type and experience level support
|
||||
|
||||
**Usage:**
|
||||
|
||||
```bash
|
||||
# Search Indeed jobs
|
||||
node index.js --sites=indeed --keywords="software engineer,developer"
|
||||
|
||||
# Search with location filter
|
||||
node index.js --sites=indeed --keywords="co-op" --location="Ontario"
|
||||
|
||||
# Search with date filter (jobs posted after specific date)
|
||||
node index.js --sites=indeed --keywords="co-op" --min-date="2025-12-01"
|
||||
|
||||
# Combine filters
|
||||
node index.js --sites=indeed --keywords="co-op" --location="Ontario" --min-date="2025-12-01"
|
||||
|
||||
# Combine multiple sites
|
||||
node index.js --sites=indeed,linkedin --keywords="intern,co-op"
|
||||
|
||||
# Use AND logic - jobs must match ALL keywords
|
||||
node index.js --sites=indeed --keywords="co-op,summer 2026" --and
|
||||
|
||||
# Use grouped AND/OR logic - (co-op OR intern) AND (summer 2026)
|
||||
node index.js --sites=indeed --keywords="co-op|intern,summer 2026" --and
|
||||
```
|
||||
|
||||
**Date Filter Notes:**
|
||||
- The date filter converts to Indeed's `fromage` parameter (days ago)
|
||||
- Format: `YYYY-MM-DD` (e.g., `2025-12-01`)
|
||||
- Indeed supports up to 30 days for date filtering
|
||||
- For dates older than 30 days, Indeed limits results to the maximum supported timeframe
|
||||
|
||||
**CAPTCHA/Verification Handling:**
|
||||
- Indeed may show CAPTCHA or human verification pages when detecting automated access
|
||||
- If you encounter CAPTCHA errors, try:
|
||||
1. Run in non-headless mode: Set `HEADLESS=false` in `.env` file (you can manually solve CAPTCHA)
|
||||
2. Wait a few minutes between runs to avoid rate limiting
|
||||
3. Use a different IP address or VPN if available
|
||||
4. Reduce the number of pages or keywords per run
|
||||
- The parser will automatically detect and report CAPTCHA pages with helpful error messages
|
||||
|
||||
### 🚧 Planned Parsers
|
||||
- **Glassdoor**: Jobs with company reviews and salary data
|
||||
- **Monster**: Traditional job board
|
||||
- **SimplyHired**: Job aggregator with salary estimates
|
||||
@ -195,7 +247,7 @@ USE_AND_LOGIC=true
|
||||
node index.js
|
||||
|
||||
# Select sites to parse
|
||||
node index.js --sites=linkedin,skipthedrive
|
||||
node index.js --sites=linkedin,skipthedrive,indeed
|
||||
|
||||
# Search keywords
|
||||
node index.js --keywords="software engineer,developer"
|
||||
@ -229,7 +281,7 @@ node index.js --sites=linkedin --keywords="co-op|intern,summer 2026,remote" --an
|
||||
|
||||
**Available Options:**
|
||||
|
||||
- `--sites="site1,site2"`: Job sites to parse (linkedin, skipthedrive)
|
||||
- `--sites="site1,site2"`: Job sites to parse (linkedin, skipthedrive, indeed)
|
||||
- `--keywords="keyword1,keyword2"`: Search keywords
|
||||
- Use `|` (pipe) to separate OR keywords within a group: `"co-op|intern"` means "co-op" OR "intern"
|
||||
- Use `,` (comma) to separate AND groups when using `--and`: `"co-op|intern,summer 2026"` means (co-op OR intern) AND (summer 2026)
|
||||
|
||||
@ -11,6 +11,7 @@ const fs = require("fs");
|
||||
const CoreParser = require("../core-parser");
|
||||
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
|
||||
const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
|
||||
const { indeedStrategy } = require("./strategies/indeed-strategy");
|
||||
const { logger, analyzeBatch, checkOllamaStatus, DEFAULT_MODEL } = require("ai-analyzer");
|
||||
const { convertResultsToCsv } = require("./src/csv-utils");
|
||||
|
||||
@ -35,8 +36,8 @@ const USE_AND_LOGIC = process.env.USE_AND_LOGIC === "true"; // Use AND logic for
|
||||
// Registry of available site parsers: each key is a site name and each value
// is the strategy function that parses that site.
// NOTE(review): keys presumably match the names accepted by `--sites` — confirm
// against the argument-parsing code.
const SITE_STRATEGIES = {
  skipthedrive: skipthedriveStrategy,
  linkedin: linkedinJobsStrategy,
  indeed: indeedStrategy,
  // Add more site strategies here
  // glassdoor: glassdoorStrategy,
};
|
||||
|
||||
|
||||
947
job-search-parser/strategies/indeed-strategy.js
Normal file
947
job-search-parser/strategies/indeed-strategy.js
Normal file
@ -0,0 +1,947 @@
|
||||
/**
|
||||
* Indeed Parsing Strategy
|
||||
*
|
||||
* Uses core-parser for browser management and ai-analyzer for utilities
|
||||
*/
|
||||
|
||||
const {
|
||||
logger,
|
||||
cleanText,
|
||||
containsAnyKeyword,
|
||||
containsAllKeywords,
|
||||
matchesKeywordGroups,
|
||||
validateLocationAgainstFilters,
|
||||
} = require("ai-analyzer");
|
||||
|
||||
/**
 * Build an Indeed job-search URL, always sorted newest-first.
 *
 * @param {string} keyword - Search query, sent as the `q` parameter.
 * @param {string} [location=""] - Location, sent as `l` when non-empty.
 * @param {object|null} [filters={}] - Optional filters; falsy values are omitted.
 * @param {number|string} [filters.fromage] - Maximum posting age in days.
 * @param {string} [filters.jobType] - Sent as `jt` (e.g. fulltime, parttime, contract, internship, temporary).
 * @param {boolean} [filters.remote] - When truthy, adds `remote=true`.
 * @param {string} [filters.experienceLevel] - Sent as `explvl` (e.g. entry_level, mid_level, senior_level).
 * @returns {string} The absolute search URL.
 */
function buildSearchUrl(keyword, location = "", filters = {}) {
  const baseUrl = "https://www.indeed.com/jobs";

  // Default parameters only cover `undefined`; tolerate an explicit `null` too.
  const { fromage, jobType, remote, experienceLevel } = filters ?? {};

  const params = new URLSearchParams({
    // Avoid sending the literal string "undefined"/"null" as the query.
    q: String(keyword ?? ""),
    sort: "date", // Sort by date (newest first)
  });

  if (location) {
    params.append("l", location);
  }

  if (fromage) {
    // fromage is in days (e.g., 1 = last 24 hours, 7 = last 7 days, 30 = last 30 days)
    params.append("fromage", String(fromage));
  }

  if (jobType) {
    params.append("jt", jobType);
  }

  if (remote) {
    params.append("remote", "true");
  }

  if (experienceLevel) {
    params.append("explvl", experienceLevel);
  }

  return `${baseUrl}?${params.toString()}`;
}
|
||||
|
||||
/**
 * Convert a minimum posting date into Indeed's `fromage` value (days ago).
 *
 * `new Date()` never throws on bad input (it yields an Invalid Date), so the
 * validity check is explicit. The age is rounded UP so the window always
 * reaches back to `minDate`, and is capped at 30 days.
 *
 * @param {string|null} minDate - Date string, expected as YYYY-MM-DD.
 * @returns {number|null} Days-ago value in [1, 30], or null for no date filter.
 */
function resolveIndeedFromage(minDate) {
  if (!minDate) {
    return null;
  }

  const minTime = new Date(minDate).getTime();
  if (Number.isNaN(minTime)) {
    logger.warning(`⚠️ Invalid date format for minDate: ${minDate}. Expected format: YYYY-MM-DD`);
    return null;
  }

  const elapsedMs = Date.now() - minTime;
  if (elapsedMs <= 0) {
    // A date in the future cannot be expressed as "days ago".
    logger.warning(`⚠️ minDate ${minDate} is in the future; date filter not applied`);
    return null;
  }

  const daysDiff = Math.ceil(elapsedMs / (1000 * 60 * 60 * 24));
  if (daysDiff > 30) {
    logger.info(`📅 Min Date Filter: ${minDate} (limited to 30 days)`);
    return 30; // Indeed's maximum is typically 30 days
  }

  logger.info(`📅 Min Date Filter: ${minDate} (${daysDiff} days ago)`);
  return daysDiff;
}

/**
 * Open the search URL, falling back from `domcontentloaded` to `load` to a
 * direct `page.goto` when earlier attempts fail.
 */
async function openIndeedSearch(coreParser, page, searchUrl) {
  try {
    // domcontentloaded is used instead of networkidle because Indeed can be slow to fully load
    await coreParser.navigateTo(searchUrl, {
      pageId: "indeed-main",
      retries: 2,
      waitUntil: "domcontentloaded",
      timeout: 60000,
    });
    return;
  } catch (navError) {
    logger.warning(`⚠️ Initial navigation failed, trying with 'load' event: ${navError.message}`);
  }

  try {
    await coreParser.navigateTo(searchUrl, {
      pageId: "indeed-main",
      retries: 1,
      waitUntil: "load",
      timeout: 60000,
    });
    return;
  } catch (loadError) {
    logger.warning(`⚠️ Load event failed, trying direct navigation: ${loadError.message}`);
  }

  try {
    // Last resort: direct page navigation
    await page.goto(searchUrl, { timeout: 60000, waitUntil: "domcontentloaded" });
  } catch (gotoError) {
    throw new Error(`Failed to navigate to Indeed after all attempts: ${gotoError.message}`, {
      cause: gotoError,
    });
  }
}

/**
 * Throw when the current page looks like a CAPTCHA / verification wall,
 * judged first by URL and then by page text.
 */
async function assertNoIndeedCaptcha(page) {
  const currentUrl = page.url();
  logger.info(`📍 Current page URL: ${currentUrl}`);

  const blockedUrlMarkers = ["captcha", "blocked", "access-denied", "verify"];
  if (blockedUrlMarkers.some((marker) => currentUrl.includes(marker))) {
    logger.error(`❌ Indeed appears to be blocking access. URL: ${currentUrl}`);
    throw new Error('Indeed is showing a CAPTCHA or verification page. Please try running in non-headless mode (set HEADLESS=false in .env) or wait and try again later.');
  }

  let pageContent;
  try {
    // This callback runs in the browser, so it must not reference outer variables.
    pageContent = await page.evaluate(() => {
      const bodyText = document.body?.textContent?.toLowerCase() || '';
      const title = document.title?.toLowerCase() || '';

      const captchaIndicators = [
        'verify you\'re human',
        'verify you are human',
        'captcha',
        'prove you\'re not a robot',
        'unusual traffic',
        'automated queries',
        'please verify',
        'security check',
        'access denied',
        'blocked',
      ];

      const foundIndicators = captchaIndicators.filter(
        (indicator) => bodyText.includes(indicator) || title.includes(indicator)
      );

      // Words such as "blocked" also occur in ordinary job snippets; a page
      // that already shows job cards is treated as a real results page.
      const hasJobCards =
        document.querySelector("#mosaic-provider-jobcards, .job_seen_beacon, [data-jk]") !== null;

      return {
        hasCaptcha: foundIndicators.length > 0 && !hasJobCards,
        indicators: foundIndicators,
        title: document.title,
      };
    });
  } catch (checkError) {
    // A failed check is not proof of a CAPTCHA; log and carry on.
    logger.debug(`Could not check for CAPTCHA: ${checkError.message}`);
    return;
  }

  if (pageContent.hasCaptcha) {
    logger.error(`❌ Indeed is showing a CAPTCHA/verification page.`);
    logger.error(`   Detected indicators: ${pageContent.indicators.join(', ')}`);
    logger.error(`   Page title: ${pageContent.title}`);
    logger.error(`\n💡 Solutions:`);
    logger.error(`   1. Run in non-headless mode: Set HEADLESS=false in .env file`);
    logger.error(`   2. Wait a few minutes and try again`);
    logger.error(`   3. Use a different IP address or VPN`);
    logger.error(`   4. Manually solve the CAPTCHA in a browser, then try again`);
    throw new Error(`Indeed CAPTCHA detected: ${pageContent.indicators.join(', ')}. Please see suggestions above.`);
  }
}

/**
 * Wait for any known job-list selector to appear.
 *
 * @returns {Promise<boolean>} true when at least one matching element exists.
 */
async function waitForIndeedResults(page) {
  const possibleSelectors = [
    "#mosaic-provider-jobcards",
    ".job_seen_beacon",
    "[data-jk]",
    ".jobsearch-SerpJobCard",
    ".jobCard",
  ];

  for (const selector of possibleSelectors) {
    try {
      await page.waitForSelector(selector, { timeout: 5000 });
      const elements = await page.$$(selector);
      if (elements.length > 0) {
        logger.info(`✅ Found job results container with selector: ${selector} (${elements.length} jobs)`);
        return true;
      }
    } catch (e) {
      // Selector did not appear in time; try the next one.
    }
  }

  return false;
}

/**
 * Decide whether a job fails the keyword or location criteria.
 *
 * @returns {string|null} The rejection reason, or null when the job is accepted.
 */
function getIndeedRejectionReason(job, { keywords, keywordGroups, useAndLogic, locationFilter }) {
  const fullText = `${job.title} ${job.description} ${job.company}`;

  if (keywordGroups) {
    // Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR)
    if (!matchesKeywordGroups(fullText, keywordGroups)) {
      return "Job does not match all keyword groups";
    }
  } else if (useAndLogic && !containsAllKeywords(fullText.toLowerCase(), keywords)) {
    // Simple AND logic: all keywords must match
    return "Not all keywords found in job listing";
  }

  if (locationFilter) {
    const locationValid = validateLocationAgainstFilters(job.location, locationFilter);
    if (!locationValid.isValid) {
      return locationValid.reasoning || "Location filter mismatch";
    }
  }

  return null;
}

/**
 * Indeed parsing strategy.
 *
 * Runs one Indeed search per search keyword, walks the result pages, and
 * sorts the de-duplicated jobs into accepted and rejected lists.
 *
 * @param {object} coreParser - Browser manager; `createPage` and `navigateTo` are used.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Search keywords.
 * @param {string[][]|null} [options.keywordGroups] - Keyword groups for grouped AND/OR logic.
 * @param {*} [options.locationFilter] - Post-extraction location filter, passed to `validateLocationAgainstFilters`.
 * @param {number} [options.maxPages=5] - Pages per keyword; 0 (or less) means unlimited (capped at 999).
 * @param {string} [options.location] - Indeed location search (e.g., "Toronto, ON", "Canada").
 * @param {string|null} [options.minDate] - Minimum posted date (format: YYYY-MM-DD).
 * @param {boolean} [options.useAndLogic=false] - Require all keywords instead of any.
 * @param {string|null} [options.jobType] - Forwarded to the search URL as the job type filter.
 * @param {boolean} [options.remote=false] - Forwarded to the search URL as the remote filter.
 * @param {string|null} [options.experienceLevel] - Forwarded to the search URL as the experience filter.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 *   Never rejects for parsing failures; a top-level failure is reported in `summary.error`.
 */
async function indeedStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer"],
    keywordGroups = null,
    locationFilter = null,
    maxPages = 5,
    location = "",
    minDate = null,
    useAndLogic = false,
    jobType = null,
    remote = false,
    experienceLevel = null,
  } = options;

  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set();

  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Shared shape for both the success and the failure return value.
  const buildOutcome = (extraSummary = {}) => ({
    results,
    rejectedResults,
    summary: {
      totalJobs: results.length,
      totalRejected: rejectedResults.length,
      keywords: keywords.join(", "),
      locationFilter,
      source: "indeed",
      ...extraSummary,
    },
  });

  try {
    const page = await coreParser.createPage("indeed-main");

    logger.info("🚀 Starting Indeed parser...");
    logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
    if (keywordGroups) {
      logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
    } else {
      logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
    }
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(`🌍 Indeed Location: ${location || "None"}`);
    logger.info(`📄 Max Pages: ${maxPages}`);

    const searchFilters = {
      fromage: resolveIndeedFromage(minDate),
      jobType,
      remote,
      experienceLevel,
    };

    // Determine search keywords based on logic type
    let searchKeywords;
    if (keywordGroups) {
      // Grouped logic: search every keyword of every group, filter afterwards
      searchKeywords = keywordGroups.flat();
    } else if (useAndLogic) {
      // Simple AND logic: one combined query
      searchKeywords = [keywords.join(" ")];
    } else {
      // OR logic: one query per keyword
      searchKeywords = keywords;
    }

    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching Indeed for: "${keyword}"`);

      const searchUrl = buildSearchUrl(keyword, location, searchFilters);
      logger.info(`🔗 Search URL: ${searchUrl}`);

      // Remember the totals so the per-keyword summary is not cumulative.
      const acceptedBefore = results.length;
      const rejectedBefore = rejectedResults.length;

      try {
        await openIndeedSearch(coreParser, page, searchUrl);

        // Wait for page to load and let JavaScript execute
        await pause(5000);

        await assertNoIndeedCaptcha(page);

        // Results count is informational only
        try {
          const resultsText = await page.evaluate(() => {
            const countElement = document.querySelector(".jobsearch-JobCountAndSortPane-jobCount");
            return countElement ? countElement.textContent : "No results count found";
          });
          logger.info(`📊 Indeed results info: ${resultsText}`);
        } catch (e) {
          logger.debug(`Could not get results count: ${e.message}`);
        }

        const hasResults = await waitForIndeedResults(page);
        if (!hasResults) {
          logger.warning(`⚠️ No job results container found for keyword: ${keyword}`);
          continue;
        }

        let currentPage = 1;
        const maxPagesToProcess = maxPages > 0 ? maxPages : 999; // 0 means unlimited

        logger.info(`📄 Processing pages (max: ${maxPagesToProcess === 999 ? 'unlimited' : maxPagesToProcess}) for "${keyword}"...`);

        while (currentPage <= maxPagesToProcess) {
          logger.info(`📄 Processing page ${currentPage}...`);

          // Wait for page to fully load
          await pause(2000);

          const pageJobs = await extractJobsFromPage(page, keyword, locationFilter);
          logger.info(`📋 Extracted ${pageJobs.length} jobs from page ${currentPage}`);

          if (pageJobs.length === 0) {
            logger.warning(`⚠️ No jobs found on page ${currentPage}, stopping pagination`);
            break;
          }

          for (const job of pageJobs) {
            // Skip duplicates (across pages and across keywords)
            if (seenJobs.has(job.jobId)) {
              continue;
            }
            seenJobs.add(job.jobId);

            const rejectionReason = getIndeedRejectionReason(job, {
              keywords,
              keywordGroups,
              useAndLogic,
              locationFilter,
            });

            if (rejectionReason) {
              rejectedResults.push({ ...job, rejectionReason });
            } else {
              results.push(job);
            }
          }

          const hasNext = await hasNextPageAvailable(page);
          if (!hasNext) {
            logger.info(`✅ No more pages available. Total jobs extracted: ${results.length}`);
            break;
          }

          if (currentPage >= maxPagesToProcess) {
            logger.info(`📊 Reached max pages limit (${maxPagesToProcess}). Total jobs extracted: ${results.length}`);
            break;
          }

          logger.info(`➡️ Navigating to page ${currentPage + 1}...`);
          const navigationSuccess = await navigateToNextPage(page);
          if (!navigationSuccess) {
            logger.warning(`⚠️ Failed to navigate to next page, stopping pagination`);
            break;
          }

          currentPage++;
        }

        const accepted = results.length - acceptedBefore;
        const rejected = rejectedResults.length - rejectedBefore;
        logger.info(`📋 Extracted ${accepted} accepted jobs, ${rejected} rejected jobs (${accepted + rejected} total) across ${currentPage} page(s) for "${keyword}"`);
      } catch (error) {
        // One failing keyword must not abort the remaining searches.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
        logger.error(`Stack: ${error.stack}`);
      }
    }

    logger.info(
      `🎯 Indeed parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
    );

    return buildOutcome();
  } catch (error) {
    logger.error(`❌ Indeed parsing failed: ${error.message}`);
    logger.error(`Stack: ${error.stack}`);
    return buildOutcome({ error: error.message });
  }
}
|
||||
|
||||
/**
 * Extract all job cards from the page currently loaded in `page`.
 *
 * Tries a list of known card selectors in order and uses the first one that
 * matches any element. Each card is scrolled into view on a best-effort basis
 * and then handed to `extractJobData`.
 *
 * @param {object} page - Playwright page showing Indeed search results.
 * @param {string} keyword - Search keyword, recorded on each extracted job.
 * @param {*} locationFilter - Unused here; kept for call-site compatibility.
 * @returns {Promise<object[]>} Extracted jobs; empty when nothing was found or extraction failed.
 */
async function extractJobsFromPage(page, keyword, locationFilter) {
  const jobs = [];

  try {
    // Indeed job listings are typically in elements with a data-jk attribute (job key)
    const jobSelectors = [
      "[data-jk]",
      ".job_seen_beacon",
      ".jobsearch-SerpJobCard",
      ".jobCard",
      "div[data-testid='job-card']",
    ];

    let jobElements = [];
    for (const selector of jobSelectors) {
      try {
        // A timeout here only means this selector is absent; the count below decides.
        await page.waitForSelector(selector, { timeout: 3000 }).catch(() => {});
        const elements = await page.$$(selector);
        if (elements.length > 0) {
          jobElements = elements;
          logger.info(`✅ Found ${jobElements.length} job elements using selector: ${selector}`);
          break;
        }
      } catch (e) {
        logger.debug(`Selector ${selector} could not be queried: ${e.message}`);
      }
    }

    if (jobElements.length === 0) {
      logger.warning(`⚠️ No job elements found with any selector`);
      return jobs;
    }

    const settle = () => new Promise((resolve) => setTimeout(resolve, 100));

    for (const jobElement of jobElements) {
      try {
        // Scrolling is best-effort: cards may sit in hidden or lazy-loaded containers.
        try {
          // Use Playwright's own timeout instead of racing against a manual
          // timer, which would otherwise stay pending after a successful scroll.
          await jobElement.scrollIntoViewIfNeeded({ timeout: 2000 });
          await settle();
        } catch (scrollError) {
          try {
            await jobElement.evaluate((el) => {
              el.scrollIntoView({ behavior: 'auto', block: 'center' });
            });
            await settle();
          } catch (simpleScrollError) {
            // Data can still be read from an element that is not in view.
            logger.debug(`Could not scroll element into view, continuing anyway: ${simpleScrollError.message}`);
          }
        }

        const job = await extractJobData(jobElement, keyword);
        if (job && (job.title || job.jobId)) {
          jobs.push(job);
        }
      } catch (error) {
        logger.warning(`Failed to extract job data: ${error.message}`);
      }
    }
  } catch (error) {
    logger.error(`Failed to extract jobs from page: ${error.message}`);
  }

  return jobs;
}
|
||||
|
||||
/**
 * Extract the fields of a single Indeed job card.
 *
 * The card is read inside the browser via `jobElement.evaluate`; the raw
 * values are then cleaned and normalised in Node.
 *
 * @param {object} jobElement - Playwright element handle of one job card.
 * @param {string} keyword - Search keyword that produced this card.
 * @returns {Promise<object|null>} The job record, or null when the card yields
 *   neither a job key nor a title, or when extraction throws.
 */
async function extractJobData(jobElement, keyword) {
  try {
    // This callback runs in the browser, so it must not reference outer variables.
    const jobData = await jobElement.evaluate((el) => {
      const textOf = (node) => node.textContent?.trim() || node.innerText?.trim() || "";

      // Text of the first selector whose element exists and whose text is accepted.
      const firstText = (selectors, accept = (text) => text.length > 0) => {
        for (const selector of selectors) {
          const node = el.querySelector(selector);
          if (!node) continue;
          const text = textOf(node);
          if (accept(text)) return text;
        }
        return "";
      };

      const data = {
        jobId: el.getAttribute("data-jk") || "",
        title: "",
        company: "",
        location: "",
        jobUrl: "",
        postedDate: "",
        description: "",
        salary: "",
        jobType: "",
      };

      // Title and URL: stop at the first selector that yields a non-empty title.
      const titleSelectors = [
        "h2.jobTitle a",
        "h2.jobTitle",
        "a[data-jk]",
        "h2 a",
        ".jobTitle a",
        "[class*='jobTitle'] a",
      ];
      for (const selector of titleSelectors) {
        const titleElement = el.querySelector(selector);
        if (!titleElement) continue;
        data.title = textOf(titleElement);
        const link = titleElement.tagName === "A" ? titleElement : titleElement.querySelector("a");
        if (link) {
          data.jobUrl = link.getAttribute("href") || "";
        }
        if (data.title) break;
      }

      data.company = firstText([
        "[data-testid='company-name']",
        ".companyName",
        "[class*='companyName']",
        "span.companyName",
        "a[data-testid='company-name']",
      ]);

      data.location = firstText([
        "[data-testid='job-location']",
        ".companyLocation",
        "[class*='companyLocation']",
        "[class*='location']",
      ]);

      // Only text that contains a "$" or a digit is taken as a salary.
      data.salary = firstText(
        [
          "[data-testid='attribute_snippet_testid']",
          ".salary-snippet",
          "[class*='salary']",
          ".salaryText",
        ],
        (text) => text.includes("$") || /\d/.test(text)
      );

      const dateText = firstText([
        "[data-testid='myJobsStateDate']",
        ".date",
        "[class*='date']",
        "span.date",
      ]);
      if (dateText) {
        // Parse relative dates like "2 days ago", "Just posted", etc.
        const now = new Date();
        const dayMatch = dateText.match(/(\d+)\s*day/i);
        if (/just posted|today/i.test(dateText)) {
          data.postedDate = now.toISOString().split("T")[0];
        } else if (dayMatch) {
          const posted = new Date(now);
          posted.setDate(posted.getDate() - Number.parseInt(dayMatch[1], 10));
          data.postedDate = posted.toISOString().split("T")[0];
        } else {
          // Unrecognised format: keep the raw text
          data.postedDate = dateText;
        }
      }

      // Description snippet; very short matches are ignored.
      data.description = firstText(
        [".job-snippet", "[class*='job-snippet']", "[class*='summary']", ".summary"],
        (text) => text.length > 20
      ).substring(0, 500);

      return data;
    });

    const title = cleanText(jobData.title);

    // A card with neither job key nor title carries no usable data. This must
    // be checked before a fallback id is generated, otherwise it never triggers.
    if (!jobData.jobId && !title) {
      return null;
    }

    // Canonical view URL; only available when the card exposes a job key.
    const canonicalUrl = jobData.jobId
      ? `https://www.indeed.com/viewjob?jk=${jobData.jobId}`
      : "";

    let jobUrl = jobData.jobUrl || "";
    if (!jobUrl) {
      jobUrl = canonicalUrl;
    } else if (!jobUrl.startsWith("http")) {
      // Root-relative links are made absolute; anything else (e.g. "#") is
      // replaced by the canonical URL, or left empty when there is no job key.
      jobUrl = jobUrl.startsWith("/") ? `https://www.indeed.com${jobUrl}` : canonicalUrl;
    }

    // Generate a job ID if the card did not expose one
    const jobId =
      jobData.jobId || `indeed-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;

    return {
      jobId,
      title,
      company: cleanText(jobData.company),
      location: cleanText(jobData.location),
      jobUrl,
      postedDate: jobData.postedDate,
      description: cleanText(jobData.description),
      salary: cleanText(jobData.salary),
      jobType: jobData.jobType,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "indeed",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Split a job description into role duties and job requirements.
 *
 * The description is cut into paragraphs at blank lines. A paragraph that
 * contains a requirements header switches the current bucket to
 * "requirements"; otherwise one that contains a duties header switches it
 * back to "duties". Paragraphs without any header stay in the current bucket,
 * which starts as "duties" — so a description without headers ends up
 * entirely in `duties`.
 *
 * @param {string} description - Raw job description text.
 * @returns {{duties: string, requirements: string}} Both empty for blank or
 *   non-string input.
 */
function parseDutiesAndRequirements(description) {
  if (typeof description !== "string" || description.trim().length === 0) {
    return { duties: "", requirements: "" };
  }

  // Common section headers that indicate duties/responsibilities
  const dutiesKeywords = [
    /responsibilities?:/i,
    /duties?:/i,
    /what you['\u2019]ll do/i,
    /key responsibilities/i,
    /your role/i,
    /position overview/i,
    /about the role/i,
  ];

  // Common section headers that indicate requirements/qualifications
  const requirementsKeywords = [
    /requirements?:/i,
    /qualifications?:/i,
    /must have/i,
    /required:/i,
    /what you['\u2019]ll bring/i,
    /you have:/i,
    /skills required/i,
    /minimum requirements/i,
  ];

  const matchesAny = (patterns, text) => patterns.some((pattern) => pattern.test(text));

  const buckets = { duties: [], requirements: [] };
  let currentSection = "duties";

  const paragraphs = description
    .split(/\n\s*\n|\r\n\s*\r\n/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);

  for (const paragraph of paragraphs) {
    // Requirements headers take precedence when a paragraph contains both kinds.
    if (matchesAny(requirementsKeywords, paragraph)) {
      currentSection = "requirements";
    } else if (matchesAny(dutiesKeywords, paragraph)) {
      currentSection = "duties";
    }
    buckets[currentSection].push(paragraph);
  }

  // No further fallback is needed: a non-blank description always yields at
  // least one paragraph, so at least one bucket is non-empty here.
  return {
    duties: buckets.duties.join("\n\n"),
    requirements: buckets.requirements.join("\n\n"),
  };
}
|
||||
|
||||
/**
 * Check whether the current results page has a usable "next page" control.
 *
 * Each known pagination selector is tried in order. The first matching
 * element that is not marked disabled (a `disabled` attribute,
 * `aria-disabled="true"`, or a `disabled` CSS class) makes the function
 * return `true`. If the disabled check itself cannot be evaluated, the
 * element is treated as enabled.
 *
 * @param {object} page - Page handle exposing `$(selector)`; matched elements
 *   must expose `evaluate(fn)`. Presumably a Playwright page - confirm
 *   against the caller.
 * @returns {Promise<boolean>} `true` if an enabled next-page control exists,
 *   otherwise `false`. Lookup errors are logged at debug level and never
 *   propagate.
 */
async function hasNextPageAvailable(page) {
  const nextButtonSelectors = [
    "a[aria-label='Next']",
    "a[aria-label='Next Page']",
    "a[data-testid='pagination-page-next']",
    "[data-testid='pagination-page-next']",
    "a[aria-label*='Next']",
  ];

  for (const selector of nextButtonSelectors) {
    try {
      const nextButton = await page.$(selector);
      if (!nextButton) {
        continue;
      }

      const isDisabled = await nextButton
        .evaluate((el) => {
          return (
            el.hasAttribute("disabled") ||
            el.getAttribute("aria-disabled") === "true" ||
            el.classList.contains("disabled")
          );
        })
        .catch(() => false);

      if (!isDisabled) {
        return true;
      }
    } catch (error) {
      // A failing selector must not abort the scan, but leave a trace so a
      // closed page or broken selector can be diagnosed.
      logger.debug(`Error checking for next page (${selector}): ${error.message}`);
    }
  }

  return false;
}
|
||||
|
||||
/**
 * Click the "next page" control and wait for the next batch of job results.
 *
 * Steps:
 *  1. Try each known pagination selector until an enabled element can be
 *     clicked. Only lookup/click failures fall through to the next selector.
 *  2. Poll (up to 10 seconds) until either the URL changes or the rendered
 *     job cards differ from the ones seen before the click.
 *  3. Check the URL and the page text for CAPTCHA indicators.
 *  4. Scroll slightly to trigger lazy loading and count job elements.
 *
 * Once a click has succeeded, no further selectors are tried: a CAPTCHA or
 * any other post-click failure ends the attempt, so "next" is never clicked
 * twice in one call.
 *
 * @param {object} page - Page handle exposing `$`, `$$eval`, `evaluate`,
 *   `url` and `title`. Presumably a Playwright page - confirm against the
 *   caller.
 * @returns {Promise<boolean>} `true` when job elements are present after the
 *   navigation; `false` when no enabled button could be clicked, no job
 *   elements were found, or a CAPTCHA/other error interrupted the attempt.
 *   Errors are logged and never propagate to the caller.
 */
async function navigateToNextPage(page) {
  const nextButtonSelectors = [
    "a[aria-label='Next']",
    "a[aria-label='Next Page']",
    "a[data-testid='pagination-page-next']",
    "[data-testid='pagination-page-next']",
    "a[aria-label*='Next']",
  ];
  const jobCardSelector = "[data-jk], .job_seen_beacon, .jobsearch-SerpJobCard, .jobCard";
  const maxWaitTime = 10000; // 10 seconds max wait for the post-click poll
  const pollInterval = 500;

  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // `data-jk` values of the job cards currently rendered ("" for cards
  // without one). Used to tell fresh results from the previous page's cards.
  const readJobKeys = () =>
    page
      .$$eval(jobCardSelector, (elements) =>
        elements.map((el) => el.getAttribute("data-jk") || "")
      )
      .catch(() => []);

  // Poll until the URL changes (full navigation) or a different set of job
  // cards is rendered (AJAX navigation). Gives up quietly after maxWaitTime.
  const waitForNewResults = async (urlBefore, jobKeysBefore) => {
    const startTime = Date.now();
    while (Date.now() - startTime < maxWaitTime) {
      await sleep(pollInterval);

      const currentUrl = page.url();
      if (currentUrl !== urlBefore) {
        logger.info(`📍 URL changed to: ${currentUrl}`);
        return;
      }

      // The previous page's cards are still in the DOM right after the
      // click, so a bare "count > 0" check would succeed immediately.
      const jobKeys = await readJobKeys();
      if (jobKeys.length > 0 && jobKeys.join(",") !== jobKeysBefore) {
        logger.info(`✅ Found ${jobKeys.length} job elements (AJAX navigation)`);
        return;
      }
    }
  };

  // Throw when the URL or the page text looks like a CAPTCHA/block page.
  const assertNoCaptcha = async () => {
    const currentUrl = page.url();
    if (currentUrl.includes('captcha') || currentUrl.includes('verify') || currentUrl.includes('blocked')) {
      logger.error(`❌ CAPTCHA detected after navigation to page. URL: ${currentUrl}`);
      throw new Error('Indeed is showing a CAPTCHA. Please run in non-headless mode (HEADLESS=false) or wait and try again.');
    }

    let hasCaptcha = false;
    try {
      hasCaptcha = await page.evaluate(() => {
        const bodyText = document.body?.textContent?.toLowerCase() || '';
        const indicators = ["verify you're human", 'captcha', 'unusual traffic', 'automated queries'];
        return indicators.some((ind) => bodyText.includes(ind));
      });
    } catch (e) {
      // Best effort: an unreadable page is not treated as a CAPTCHA.
      logger.debug(`Could not inspect page content for CAPTCHA: ${e.message}`);
    }

    if (hasCaptcha) {
      logger.error(`❌ CAPTCHA detected on page content after navigation`);
      throw new Error('Indeed CAPTCHA detected. Please run in non-headless mode (HEADLESS=false) to solve it manually.');
    }
  };

  // Log what the page looks like when no jobs were found; throw if it
  // appears to be a CAPTCHA page.
  const diagnoseEmptyPage = async () => {
    let bodyText = '';
    try {
      const pageTitle = await page.title();
      const pageUrl = page.url();
      logger.debug(`Page title: ${pageTitle}, URL: ${pageUrl}`);
      bodyText = await page.evaluate(() => document.body?.textContent?.toLowerCase() || '');
    } catch (e) {
      // Diagnostics only; failing to collect them is not an error.
      logger.debug(`Could not inspect page after navigation: ${e.message}`);
      return;
    }

    if (bodyText.includes('captcha') || bodyText.includes('verify')) {
      logger.error(`❌ Page appears to be a CAPTCHA page`);
      throw new Error('Indeed is showing a CAPTCHA. Please run in non-headless mode (HEADLESS=false).');
    }
  };

  try {
    for (const selector of nextButtonSelectors) {
      let urlBefore;
      let jobKeysBefore;

      // Phase 1: locate and click. Failures here mean "try the next selector".
      try {
        const nextButton = await page.$(selector);
        if (!nextButton) {
          continue;
        }

        const isDisabled = await nextButton
          .evaluate((el) => {
            return (
              el.hasAttribute("disabled") ||
              el.getAttribute("aria-disabled") === "true" ||
              el.classList.contains("disabled")
            );
          })
          .catch(() => false);
        if (isDisabled) {
          continue;
        }

        // Remember the pre-click state so the poll can detect a change.
        urlBefore = page.url();
        jobKeysBefore = (await readJobKeys()).join(",");

        await nextButton.scrollIntoViewIfNeeded().catch(() => {});
        await sleep(500);
        await nextButton.click();
      } catch (e) {
        logger.debug(`Next page selector "${selector}" failed: ${e.message}`);
        continue;
      }

      logger.info(`✅ Clicked next page button`);

      // Phase 2: the click happened. Errors from here on go to the outer
      // handler; trying another selector would click "next" a second time.
      await waitForNewResults(urlBefore, jobKeysBefore);

      // Additional wait for content to stabilize
      await sleep(2000);

      await assertNoCaptcha();

      // Scroll page to trigger any lazy loading
      try {
        await page.evaluate(() => {
          window.scrollTo(0, 300);
        });
        await sleep(1000);
      } catch (e) {
        // Ignore scroll errors
      }

      // Final check for job elements with multiple selectors
      const finalJobCount = await page
        .$$eval(`${jobCardSelector}, div[data-testid='job-card']`, (elements) => elements.length)
        .catch(() => 0);

      if (finalJobCount > 0) {
        logger.info(`✅ Navigation successful, found ${finalJobCount} job elements`);
        return true;
      }

      logger.warning(`⚠️ No job elements found after navigation (waited ${maxWaitTime}ms)`);
      await diagnoseEmptyPage();
      return false;
    }

    logger.warning(`⚠️ Could not find or click next page button`);
    return false;
  } catch (error) {
    logger.warning(`Failed to navigate to next page: ${error.message}`);
    return false;
  }
}
|
||||
|
||||
// Public API of this module: the Indeed parsing strategy and its search-URL builder.
module.exports = { indeedStrategy, buildSearchUrl };
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user