linkedout/job-search-parser/strategies/linkedin-jobs-strategy.js
tanyar09 47cdc03fb8 Enhance job search parser with advanced keyword filtering and job detail extraction
- Implemented grouped AND/OR logic for keyword searches, allowing for more flexible job matching criteria.
- Added a minimum date filter to restrict job results to postings after a specified date.
- Enhanced job detail extraction to include role duties and job requirements from job descriptions.
- Updated README with new command line options and examples for using date filters and keyword logic.
- Improved logging to provide clearer insights into keyword matching logic and job search parameters.
2025-12-18 13:33:19 -05:00

1683 lines
63 KiB
JavaScript

/**
* LinkedIn Jobs Parsing Strategy
*
* Uses core-parser for browser management and ai-analyzer for utilities
*/
const {
logger,
cleanText,
validateLocationAgainstFilters,
parseLocationFilters,
containsAnyKeyword,
containsAllKeywords,
matchesKeywordGroups,
} = require("ai-analyzer");
/**
 * Build a LinkedIn job-search URL for a single keyword phrase.
 *
 * The keyword is always wrapped in double quotes so LinkedIn matches the exact
 * phrase instead of treating each word separately (e.g. "co-op" would
 * otherwise be searched as "co" OR "op"). Results are sorted newest first.
 *
 * @param {string} keyword - Keyword or phrase to search for.
 * @param {string} [location=""] - LinkedIn location string; omitted when empty.
 * @param {object} [filters={}] - Optional filters.
 * @param {string|Date} [filters.minDate] - Earliest posting date (YYYY-MM-DD).
 *   Converted to a relative `f_TPR=r<seconds>` range, capped at 30 days.
 *   Future dates add no filter; unparseable dates log a warning and add no filter.
 * @param {string} [filters.experienceLevel] - Value for the `f_E` parameter.
 * @param {string} [filters.jobType] - Value for `f_JT`
 *   (F=Full-time, P=Part-time, C=Contract, T=Temporary, I=Internship).
 * @param {boolean} [filters.remote] - When truthy, adds `f_WT=2` (remote).
 * @returns {string} The complete search URL.
 */
function buildJobSearchUrl(keyword, location = "", filters = {}) {
  const baseUrl = "https://www.linkedin.com/jobs/search/";

  // URLSearchParams takes care of encoding the surrounding quotes.
  const searchKeyword = `"${keyword}"`;
  const params = new URLSearchParams({
    keywords: searchKeyword,
    sortBy: "DD", // Date posted (newest first)
  });

  if (location) {
    params.append("location", location);
  }

  if (filters.minDate) {
    const minDate = new Date(filters.minDate);

    // `new Date()` never throws on a bad string; it yields an Invalid Date
    // whose timestamp is NaN, so validity has to be checked explicitly.
    if (Number.isNaN(minDate.getTime())) {
      logger.warning(`⚠️ Invalid date format for minDate: ${filters.minDate}. Expected format: YYYY-MM-DD`);
    } else {
      const secondsDiff = Math.floor((Date.now() - minDate.getTime()) / 1000);

      // A date in the future (or "now") cannot be expressed as a look-back range.
      if (secondsDiff > 0) {
        // LinkedIn typically supports up to ~30 days; older dates are clamped.
        const maxSeconds = 2592000; // 30 days
        const timeRange = Math.min(secondsDiff, maxSeconds);
        params.append("f_TPR", `r${timeRange}`);
      }
    }
  }

  if (filters.experienceLevel) {
    params.append("f_E", filters.experienceLevel);
  }
  if (filters.jobType) {
    params.append("f_JT", filters.jobType);
  }
  if (filters.remote) {
    params.append("f_WT", "2"); // 2 = Remote
  }

  return `${baseUrl}?${params.toString()}`;
}
/**
* LinkedIn Jobs parsing strategy function
*
* Authenticates through coreParser, runs one LinkedIn job search per search
* keyword, walks the result pages, de-duplicates jobs by jobId and applies the
* keyword-logic and location filters.
*
* @param {object} coreParser - Browser manager; this function calls its
*   createPage(), authenticate() and navigateTo() methods.
* @param {object} [options={}]
* @param {string[]} [options.keywords] - Keywords to search (default: two generic ones).
* @param {string[][]|null} [options.keywordGroups] - Groups for AND-between / OR-within matching.
* @param {string|Array|null} [options.locationFilter] - Filter string (parsed with
*   parseLocationFilters) or an already-parsed value.
* @param {number} [options.maxPages=5] - Pages per keyword; values <= 0 fall back to 999.
* @param {object} [options.credentials] - Passed to coreParser.authenticate().
* @param {string} [options.location] - LinkedIn location search string.
* @param {string|null} [options.minDate] - Minimum posted date (YYYY-MM-DD).
* @param {boolean} [options.useAndLogic=false] - Require all keywords instead of any.
* @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
*   Never rejects from the visible code paths: a fatal error is logged and the
*   jobs collected so far are returned with `summary.error` set.
*/
async function linkedinJobsStrategy(coreParser, options = {}) {
const {
keywords = ["software engineer", "developer"],
keywordGroups = null, // Array of keyword groups for grouped AND/OR logic
locationFilter = null,
maxPages = 5,
credentials = {},
location = "", // LinkedIn location search (e.g., "Canada", "Toronto, Ontario, Canada")
minDate = null, // Minimum posted date (format: YYYY-MM-DD)
useAndLogic = false, // Use AND logic instead of OR logic for keywords
} = options;
const results = [];
const rejectedResults = [];
// Job IDs already processed, shared across all keywords
const seenJobs = new Set();
// Create a backup to track results in case of issues
let resultsBackup = [];
let rejectedResultsBackup = [];
try {
// Create main page
const page = await coreParser.createPage("linkedin-jobs-main");
// Authenticate to LinkedIn
logger.info("🔐 Authenticating to LinkedIn...");
await coreParser.authenticate("linkedin", credentials, "linkedin-jobs-main");
logger.info("✅ LinkedIn authentication successful");
logger.info("🚀 Starting LinkedIn Jobs parser...");
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
if (keywordGroups) {
logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
} else {
logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
}
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
logger.info(`🌍 LinkedIn Location: ${location || "None"}`);
logger.info(`📄 Max Pages: ${maxPages}`);
if (minDate) {
logger.info(`📅 Min Date Filter: ${minDate} (jobs posted after this date)`);
}
// Determine search keywords based on logic type
let searchKeywords;
if (keywordGroups) {
// For grouped AND/OR logic, search each keyword in each group (OR within groups)
// We'll combine results and filter to ensure all groups match (AND between groups)
searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups
} else if (useAndLogic) {
// For simple AND logic, combine all keywords into a single search query
searchKeywords = [keywords.join(" ")];
} else {
// For OR logic, search each keyword separately
searchKeywords = keywords;
}
// Search for each keyword (or combined keyword for AND logic)
for (const keyword of searchKeywords) {
logger.info(`\n🔍 Searching LinkedIn Jobs for: "${keyword}"`);
// Only minDate is forwarded as a URL filter here
const searchUrl = buildJobSearchUrl(keyword, location, {
minDate: minDate,
});
logger.info(`🔗 Search URL: ${searchUrl}`);
// Check if page is still valid before proceeding
// (a failing evaluate is taken to mean the page or browser is gone)
try {
await page.evaluate(() => document.readyState).catch(() => {
throw new Error("Page is no longer valid - browser may have closed");
});
} catch (pageError) {
logger.error(`❌ Page is no longer accessible: ${pageError.message}`);
logger.info(`⚠️ Preserving ${results.length} jobs found so far`);
break; // Exit keyword loop if page is invalid
}
// Everything below is per-keyword best effort: an error is logged and the
// loop moves on to the next keyword
try {
// Navigate to job search results
await coreParser.navigateTo(searchUrl, {
pageId: "linkedin-jobs-main",
retries: 2,
waitUntil: "networkidle",
});
// Wait for page to load - reduced delay, use networkidle from navigation
await new Promise((resolve) => setTimeout(resolve, 2000));
// Verify we're on the right page and check what LinkedIn shows
const currentUrl = page.url();
logger.info(`📍 Current page URL: ${currentUrl}`);
// Check if LinkedIn shows any results count
try {
const resultsText = await page.evaluate(() => {
// Look for result count text like "Showing X results" or "X jobs"
const possibleTexts = [
document.querySelector("h1")?.textContent,
document.querySelector(".results-context-header__job-count")?.textContent,
document.querySelector("[class*='results-count']")?.textContent,
document.querySelector("[class*='job-count']")?.textContent,
].filter(Boolean);
return possibleTexts.join(" | ") || "No results count found";
});
logger.info(`📊 LinkedIn results info: ${resultsText}`);
} catch (e) {
logger.debug(`Could not get results count: ${e.message}`);
}
// Scroll to trigger lazy loading - single scroll operation
try {
await page.evaluate(() => {
window.scrollTo(0, 500);
});
await new Promise((resolve) => setTimeout(resolve, 1000));
} catch (e) {
logger.debug(`Could not scroll page: ${e.message}`);
}
// Wait for job listings container - try multiple selectors
let hasResults = false;
const possibleSelectors = [
".jobs-search-results-list",
".jobs-search-results",
"[data-test-id='job-search-results-list']",
".scaffold-layout__list-container",
"ul.scaffold-layout__list-container",
".jobs-search__results-list",
"main .scaffold-layout__list",
];
// Try selectors in parallel with shorter timeout
const selectorPromises = possibleSelectors.map(async (selector) => {
try {
await page.waitForSelector(selector, { timeout: 3000 });
const count = await page.$$(selector).then((elements) => elements.length);
if (count > 0) {
return { selector, count, success: true };
}
} catch (e) {
// Selector failed
}
return { selector, success: false };
});
// Wait for every probe to settle, then use the first successful one in list order
const selectorResults = await Promise.allSettled(selectorPromises);
for (const result of selectorResults) {
if (result.status === 'fulfilled' && result.value.success) {
hasResults = true;
logger.info(`✅ Found job results container with selector: ${result.value.selector}`);
break;
}
}
if (!hasResults) {
logger.warning(`⚠️ No job results container found for keyword: ${keyword}`);
// Debug: Check what's actually on the page
try {
const pageTitle = await page.title();
const pageUrl = page.url();
logger.info(`📄 Page title: ${pageTitle}`);
logger.info(`🔗 Page URL: ${pageUrl}`);
// Check for common LinkedIn elements
const hasMain = await page.$("main").then(el => el !== null).catch(() => false);
const hasJobsSection = await page.$("[class*='job']").then(el => el !== null).catch(() => false);
logger.info(`🔍 Debug - Has main: ${hasMain}, Has jobs section: ${hasJobsSection}`);
// Take screenshot for debugging
const screenshotPath = `debug-linkedin-jobs-${keyword.replace(/\s+/g, '-')}-${Date.now()}.png`;
await page.screenshot({ path: screenshotPath, fullPage: true });
logger.info(`📸 Debug screenshot saved: ${screenshotPath}`);
} catch (e) {
logger.warning(`Could not capture debug info: ${e.message}`);
}
// Skip to the next keyword
continue;
}
// LinkedIn uses pagination with a "Next" button
// Extract jobs from each page and navigate to next page
const allJobs = [];
let currentPage = 1;
// 0 (or any non-positive value) is treated as "unlimited", implemented as a 999-page cap
const maxPagesToProcess = maxPages > 0 ? maxPages : 999; // 0 means unlimited
logger.info(`📄 Processing pages (max: ${maxPagesToProcess === 999 ? 'unlimited' : maxPagesToProcess}) for "${keyword}"...`);
while (currentPage <= maxPagesToProcess) {
logger.info(`📄 Processing page ${currentPage}...`);
// Wait for page to fully load
await new Promise((resolve) => setTimeout(resolve, 2000));
// Extract jobs from current page
const pageJobs = await extractJobsFromPage(page, keyword, locationFilter, coreParser);
logger.info(`📋 Extracted ${pageJobs.length} jobs from page ${currentPage}`);
if (pageJobs.length === 0) {
logger.warning(`⚠️ No jobs found on page ${currentPage}, stopping pagination`);
break;
}
allJobs.push(...pageJobs);
// Check if there's a next page
const hasNext = await hasNextPageAvailable(page);
if (!hasNext) {
logger.info(`✅ No more pages available. Total jobs extracted: ${allJobs.length}`);
break;
}
// Navigate to next page if we haven't reached maxPages
if (currentPage < maxPagesToProcess) {
logger.info(`➡️ Navigating to page ${currentPage + 1}...`);
const navigationSuccess = await navigateToNextPage(page);
if (!navigationSuccess) {
logger.warning(`⚠️ Failed to navigate to next page, stopping pagination`);
break;
}
currentPage++;
// Quick verification that job elements are present (navigateToNextPage already waited for them)
const jobCount = await page.$$eval(
"li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item",
(elements) => elements.length
).catch(() => 0);
if (jobCount === 0) {
logger.warning(`⚠️ No job elements found on page ${currentPage} after navigation, stopping pagination`);
break;
}
logger.debug(`✅ Page ${currentPage} loaded with ${jobCount} job elements`);
} else {
logger.info(`📊 Reached max pages limit (${maxPagesToProcess}). Total jobs extracted: ${allJobs.length}`);
break;
}
}
logger.info(`📋 Extracted ${allJobs.length} total jobs across ${currentPage} page(s)`);
// Verify page is still valid after extraction
// (informational only: a failure is logged and processing continues)
try {
await page.evaluate(() => document.readyState);
} catch (pageError) {
logger.warning(`⚠️ Page became invalid after extraction, but we have ${allJobs.length} jobs extracted`);
}
// Log sample job data for debugging
if (allJobs.length > 0 && process.env.DEBUG === "true") {
const sampleJob = allJobs[0];
logger.debug(`📝 Sample job: ID=${sampleJob.jobId}, Title=${sampleJob.title}, Location=${sampleJob.location || 'N/A'}, Company=${sampleJob.company || 'N/A'}`);
}
// Per-keyword counters used only for the summary log below
let duplicateCount = 0;
let locationRejectedCount = 0;
let addedCount = 0;
let noJobIdCount = 0;
for (const job of allJobs) {
// Handle jobs without jobId - use URL as fallback identifier
if (!job.jobId || job.jobId === "") {
noJobIdCount++;
// Use job URL as identifier if no jobId
if (job.jobUrl) {
const urlMatch = job.jobUrl.match(/\/jobs\/view\/(\d+)/);
if (urlMatch) {
job.jobId = urlMatch[1];
} else {
// Generate a unique ID from URL
job.jobId = `linkedin-${job.jobUrl.replace(/[^a-zA-Z0-9]/g, '-')}`;
}
} else {
// No jobId and no URL - skip this job
logger.warning(`⚠️ Job has no jobId or URL, skipping: ${job.title || 'Unknown'}`);
continue;
}
}
// Skip duplicates
if (seenJobs.has(job.jobId)) {
duplicateCount++;
if (process.env.DEBUG === "true") {
logger.debug(`⏭️ Skipping duplicate job: ${job.jobId} - ${job.title}`);
}
continue;
}
// Marked as seen before validation, so a job rejected below counts as a
// duplicate if a later keyword returns it again
seenJobs.add(job.jobId);
// Validate keywords based on logic type
if (keywordGroups) {
// Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR)
const fullText = `${job.title} ${job.description} ${job.company}`;
if (!matchesKeywordGroups(fullText, keywordGroups)) {
rejectedResults.push({
...job,
rejectionReason: "Job does not match all keyword groups",
});
if (process.env.DEBUG === "true") {
logger.debug(`🔍 Rejected (grouped logic): "${job.title}" - does not match all groups`);
}
continue;
}
} else if (useAndLogic) {
// Simple AND logic: all keywords must match
const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase();
if (!containsAllKeywords(fullText, keywords)) {
rejectedResults.push({
...job,
rejectionReason: "Not all keywords found in job listing",
});
if (process.env.DEBUG === "true") {
logger.debug(`🔍 Rejected (AND logic): "${job.title}" - not all keywords found`);
}
continue;
}
}
// For OR logic, trust LinkedIn's search results (already filtered)
// Validate location if filtering enabled
if (locationFilter) {
// Parse locationFilter string into array if it's a string
const locationFiltersArray = typeof locationFilter === 'string'
? parseLocationFilters(locationFilter)
: locationFilter;
const locationValid = validateLocationAgainstFilters(
job.location,
locationFiltersArray
);
if (!locationValid.isValid) {
locationRejectedCount++;
rejectedResults.push({
...job,
rejectionReason: locationValid.reasoning || "Location filter mismatch",
});
if (process.env.DEBUG === "true") {
logger.debug(`📍 Rejected location: "${job.location}" - ${locationValid.reasoning || "Location filter mismatch"}`);
}
continue;
}
}
results.push(job);
addedCount++;
}
// Backup results after each keyword processing
resultsBackup = [...results];
rejectedResultsBackup = [...rejectedResults];
logger.info(`📊 Processing complete: ${addedCount} added, ${locationRejectedCount} location-rejected, ${duplicateCount} duplicates, ${noJobIdCount} had no jobId`);
logger.info(`📊 Current results count: ${results.length} jobs accumulated so far`);
logger.info(`📊 Backup results count: ${resultsBackup.length} jobs in backup`);
} catch (error) {
logger.error(`Error processing keyword "${keyword}": ${error.message}`);
logger.error(`Stack: ${error.stack}`);
// Preserve results even if there's an error
logger.info(`⚠️ Preserving ${results.length} jobs found before error`);
}
}
// Log results before returning
logger.info(`📊 Final results check: results.length=${results.length}, rejectedResults.length=${rejectedResults.length}`);
logger.info(`📊 Backup check: resultsBackup.length=${resultsBackup.length}, rejectedResultsBackup.length=${rejectedResultsBackup.length}`);
// If results array is empty but backup has data, use backup (defensive programming)
const finalResults = results.length > 0 ? results : resultsBackup;
const finalRejectedResults = rejectedResults.length > 0 ? rejectedResults : rejectedResultsBackup;
if (results.length === 0 && resultsBackup.length > 0) {
logger.warning(`⚠️ Results array was empty but backup has ${resultsBackup.length} jobs - using backup!`);
}
if (finalResults.length > 0) {
logger.info(`📝 First result sample: ${JSON.stringify(finalResults[0], null, 2).substring(0, 200)}...`);
}
logger.info(
`🎯 LinkedIn Jobs parsing completed: ${finalResults.length} jobs found, ${finalRejectedResults.length} rejected`
);
// Final verification - log if results seem wrong
if (finalResults.length === 0 && finalRejectedResults.length === 0) {
logger.warning(`⚠️ No jobs found or rejected - this might indicate an extraction issue`);
}
const returnValue = {
results: [...finalResults], // Create a copy to ensure we're returning the right data
rejectedResults: [...finalRejectedResults],
summary: {
totalJobs: finalResults.length,
totalRejected: finalRejectedResults.length,
keywords: keywords.join(", "),
locationFilter,
source: "linkedin-jobs",
},
};
logger.info(`📦 Returning: ${returnValue.results.length} results, ${returnValue.rejectedResults.length} rejected`);
return returnValue;
} catch (error) {
// Reached for failures outside the per-keyword try (e.g. page creation or authentication)
logger.error(`❌ LinkedIn Jobs parsing failed: ${error.message}`);
logger.error(`Stack: ${error.stack}`);
// Return whatever results we have, even if there was an error
logger.info(`⚠️ Returning ${results.length} jobs found before fatal error`);
return {
results,
rejectedResults,
summary: {
totalJobs: results.length,
totalRejected: rejectedResults.length,
keywords: keywords.join(", "),
locationFilter,
source: "linkedin-jobs",
error: error.message,
},
};
}
}
/**
 * Keep scrolling the results page until the number of job cards stops growing.
 *
 * Stops after 50 scrolls, or as soon as the card count has been unchanged for
 * three consecutive checks. Any error is logged as a warning and swallowed.
 *
 * @param {object} page - Browser page showing LinkedIn job search results.
 * @returns {Promise<void>}
 */
async function scrollToLoadJobs(page) {
  const jobCardSelector =
    "li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item";
  const scrollLimit = 50; // Generous cap for large result sets
  const stallLimit = 3; // Unchanged counts in a row before giving up

  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  // A failed count is treated as "zero cards" rather than an error.
  const countJobCards = () =>
    page.$$eval(jobCardSelector, (nodes) => nodes.length).catch(() => 0);

  try {
    logger.info(`📜 Starting to scroll and load jobs...`);

    let lastSeen = 0;
    let stalls = 0;
    let scrollsDone = 0;

    while (scrollsDone < scrollLimit) {
      const visible = await countJobCards();

      // The very first pass has nothing to compare against, so it never stalls.
      const stalled = scrollsDone > 0 && visible === lastSeen;
      stalls = stalled ? stalls + 1 : 0;
      if (stalls >= stallLimit) {
        logger.info(`📊 Loaded ${visible} jobs after ${scrollsDone} scrolls (no new jobs for ${stalls} attempts)`);
        break;
      }
      lastSeen = visible;

      // Smooth-scroll to the bottom to trigger lazy loading, then give it time.
      await page.evaluate(() => {
        window.scrollTo({
          top: document.body.scrollHeight,
          behavior: 'smooth'
        });
      });
      await pause(2500);

      // Every third pass, nudge further in a smaller increment as well.
      if (scrollsDone % 3 === 0) {
        await page.evaluate(() => {
          window.scrollBy(0, 1000);
        });
        await pause(1000);
      }

      scrollsDone += 1;

      // Progress report every five scrolls.
      if (scrollsDone % 5 === 0) {
        const progress = await countJobCards();
        logger.info(`📜 Scrolled ${scrollsDone} times, loaded ${progress} jobs so far...`);
      }
    }

    // One last jump to the bottom before taking the final count.
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight);
    });
    await pause(2000);

    const total = await countJobCards();
    logger.info(`✅ Finished scrolling. Total jobs loaded: ${total}`);
  } catch (error) {
    logger.warning(`Could not scroll page: ${error.message}`);
  }
}
/**
 * Extract jobs from the current search-results page.
 *
 * Probes a list of known job-card selectors in parallel and uses the first
 * (in list order) that matches anything. If none match, falls back to
 * locating `/jobs/view/` links and using their closest container elements.
 * Each element is scrolled into view and hovered before extractJobData() is
 * called on it.
 *
 * @param {object} page - Browser page showing LinkedIn job search results.
 * @param {string} keyword - Search keyword, forwarded to extractJobData().
 * @param {*} locationFilter - Not referenced in this function; kept so
 *   existing call sites keep working.
 * @param {object|null} [coreParser=null] - Forwarded to extractJobData().
 * @returns {Promise<object[]>} Jobs that have at least a title or a jobId.
 *   Errors are logged and result in whatever was collected so far.
 */
async function extractJobsFromPage(page, keyword, locationFilter, coreParser = null) {
  const jobs = [];
  try {
    // LinkedIn changes its markup regularly, so several selectors are tried.
    const jobSelectors = [
      "li.jobs-search-results__list-item",
      "li[data-occludable-job-id]",
      ".job-card-container",
      "[data-test-id='job-search-result']",
      ".scaffold-layout__list-item",
      "li.scaffold-layout__list-item",
      "ul.scaffold-layout__list-container > li",
      "main ul li",
      "[class*='job-card']",
      "[class*='job-search-result']",
    ];

    // Probe all selectors in parallel; each probe resolves to a result object
    // and never rejects.
    let jobElements = [];
    const selectorChecks = jobSelectors.map(async (selector) => {
      try {
        await page.waitForSelector(selector, { timeout: 2000 }).catch(() => {});
        const elements = await page.$$(selector);
        if (elements.length > 0) {
          return { selector, elements, success: true };
        }
      } catch (e) {
        // Selector failed
      }
      return { selector, elements: [], success: false };
    });
    const selectorResults = await Promise.allSettled(selectorChecks);
    for (const result of selectorResults) {
      if (result.status === 'fulfilled' && result.value.success) {
        jobElements = result.value.elements;
        logger.info(`✅ Found ${jobElements.length} job elements using selector: ${result.value.selector}`);
        break;
      }
    }

    if (jobElements.length === 0) {
      logger.warning(`⚠️ No job elements found with any selector`);

      // Fallback: find job links directly and use their parent containers.
      try {
        logger.info(`🔍 Trying fallback: searching for job links directly...`);
        const jobLinks = await page.$$("a[href*='/jobs/view/']");
        if (jobLinks.length > 0) {
          logger.info(`✅ Found ${jobLinks.length} job links using fallback method`);
          const seenUrls = new Set();
          const parentElements = [];
          for (const link of jobLinks) {
            try {
              // The same job is often linked more than once; keep one per href.
              const href = await link.getAttribute("href");
              if (!href || seenUrls.has(href)) continue;
              seenUrls.add(href);
              const parentHandle = await link.evaluateHandle((el) => {
                return el.closest("li") || el.closest("[class*='card']") || el.closest("div") || el.parentElement;
              });
              if (parentHandle) {
                parentElements.push(parentHandle);
              }
            } catch (e) {
              // Skip if we can't process this link
            }
          }
          if (parentElements.length > 0) {
            jobElements = parentElements;
            logger.info(`✅ Using ${jobElements.length} unique job elements from fallback`);
          }
        }
      } catch (e) {
        logger.warning(`Fallback method failed: ${e.message}`);
      }

      // Still nothing: log what the page does contain, then give up.
      if (jobElements.length === 0) {
        try {
          const allLis = await page.$$("li").then(elements => elements.length);
          const allDivs = await page.$$("div[class*='job']").then(elements => elements.length);
          const jobLinks = await page.$$("a[href*='/jobs/']").then(elements => elements.length);
          logger.info(`🔍 Debug - Found ${allLis} <li> elements, ${allDivs} job-related divs, ${jobLinks} job links`);
          const listContainers = await page.$$("ul, ol").then(elements => elements.length);
          logger.info(`🔍 Debug - Found ${listContainers} list containers`);
        } catch (e) {
          // Ignore debug errors
        }
        return jobs;
      }
    }

    let extractedCount = 0;
    let failedCount = 0;
    for (let i = 0; i < jobElements.length; i++) {
      const jobElement = jobElements[i];
      try {
        // Scroll the card into view and hover it to trigger lazy loading.
        try {
          await jobElement.scrollIntoViewIfNeeded();
          await new Promise((resolve) => setTimeout(resolve, 100)); // Small delay for content to load
          // If hover fails, retry the scroll. The retry promise is returned so
          // it is awaited here; a failure lands in the catch below instead of
          // becoming an unhandled rejection.
          await jobElement.hover().catch(() => jobElement.scrollIntoViewIfNeeded());
          await new Promise((resolve) => setTimeout(resolve, 200)); // Wait for content to load after hover
        } catch (scrollError) {
          // Best effort only: the card may still have extractable data.
          logger.debug(`Could not scroll/hover job element ${i}: ${scrollError.message}`);
        }

        const job = await extractJobData(jobElement, keyword, page, coreParser);
        if (job && (job.title || job.jobId)) {
          // Only keep cards with at least a title or jobId.
          jobs.push(job);
          extractedCount++;
        } else {
          failedCount++;
          if (process.env.DEBUG === "true") {
            logger.debug(`Job ${i} extraction returned empty: jobId=${job?.jobId || 'none'}, title=${job?.title || 'none'}`);
          }
        }
      } catch (error) {
        logger.warning(`Failed to extract job data for element ${i}: ${error.message}`);
        failedCount++;
      }
    }

    // jobElements is known to be non-empty here.
    logger.info(`📊 Extraction summary: ${extractedCount} successful, ${failedCount} failed out of ${jobElements.length} job elements`);
  } catch (error) {
    logger.error(`Failed to extract jobs from page: ${error.message}`);
  }
  return jobs;
}
/**
 * Extract the full job description from a job detail page.
 *
 * Opens the job URL in a separate page (so the search-results page is not
 * disturbed), reads the description text, splits it with
 * parseDutiesAndRequirements(), and always closes the detail page afterwards.
 *
 * @param {object} coreParser - Browser manager; its createPage() is used.
 * @param {string} jobUrl - URL of the job detail page.
 * @returns {Promise<{fullDescription: string, roleDuties: string, jobRequirements: string}>}
 *   All fields are empty strings when jobUrl is falsy or extraction fails.
 */
async function extractFullJobDescription(coreParser, jobUrl) {
  try {
    if (!jobUrl) {
      return { fullDescription: "", roleDuties: "", jobRequirements: "" };
    }

    const detailPage = await coreParser.createPage(`linkedin-job-detail-${Date.now()}`);
    try {
      // Navigation is best effort: on failure or timeout we still try to read
      // whatever the page contains. "networkidle" matches the value used for
      // the search navigation elsewhere in this file; the previous
      // "networkidle2" is a Puppeteer-only value, and the error it caused was
      // silently swallowed.
      try {
        await detailPage.goto(jobUrl, { waitUntil: "networkidle", timeout: 30000 });
      } catch (navigationError) {
        logger.debug(`Navigation to ${jobUrl} did not complete cleanly: ${navigationError.message}`);
      }
      await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait for content to load

      const jobDetails = await detailPage.evaluate(() => {
        const details = {
          fullDescription: "",
          roleDuties: "",
          jobRequirements: "",
        };

        // Try multiple selectors for the job description container.
        const descriptionSelectors = [
          ".description__text",
          ".show-more-less-html__markup",
          "[class*='description__text']",
          "[class*='job-description']",
          ".jobs-description__text",
          ".jobs-box__html-content",
          "[data-test-id='job-description']",
          ".jobs-details__main-content",
          ".jobs-description-content__text",
        ];
        let descriptionElement = null;
        for (const selector of descriptionSelectors) {
          descriptionElement = document.querySelector(selector);
          if (descriptionElement) {
            break;
          }
        }
        if (descriptionElement) {
          details.fullDescription = descriptionElement.textContent?.trim() ||
            descriptionElement.innerText?.trim() || "";
        }

        // Nothing found: fall back to the main content area.
        if (!details.fullDescription) {
          const mainContent = document.querySelector("main") ||
            document.querySelector("[class*='jobs-details']") ||
            document.querySelector("[class*='job-details']");
          if (mainContent) {
            details.fullDescription = mainContent.textContent?.trim() ||
              mainContent.innerText?.trim() || "";
          }
        }
        return details;
      });

      // Split the text into duties and requirements.
      const parsed = parseDutiesAndRequirements(jobDetails.fullDescription);
      return {
        fullDescription: jobDetails.fullDescription,
        roleDuties: parsed.duties,
        jobRequirements: parsed.requirements,
      };
    } finally {
      // Close the detail page to free resources.
      try {
        await detailPage.close();
      } catch (closeError) {
        // Ignore close errors
      }
    }
  } catch (error) {
    logger.warning(`Failed to extract full job description from ${jobUrl}: ${error.message}`);
    return { fullDescription: "", roleDuties: "", jobRequirements: "" };
  }
}
/**
 * Split a job description into role duties and job requirements.
 *
 * The text is cut into chunks: paragraphs when it contains blank-line
 * separators, otherwise individual lines. Chunks are read in order. A chunk
 * containing a requirements header switches the destination to
 * "requirements"; otherwise a chunk containing a duties header switches it
 * back to "duties"; chunks without a header go wherever the previous one went.
 * Text before any header counts as duties.
 *
 * @param {string} description - Raw job description text.
 * @returns {{duties: string, requirements: string}} Both empty for blank input.
 */
function parseDutiesAndRequirements(description) {
  if (!description || description.trim().length === 0) {
    return { duties: "", requirements: "" };
  }

  // Common section headers that indicate duties/responsibilities.
  const dutiesKeywords = [
    /responsibilities?:/i,
    /duties?:/i,
    /what you['\u2019]ll do/i,
    /key responsibilities/i,
    /your role/i,
    /position overview/i,
    /about the role/i,
    /role overview/i,
    /what we need/i,
    /you will:/i,
    /you['\u2019]ll be responsible/i,
  ];
  // Common section headers that indicate requirements/qualifications.
  const requirementsKeywords = [
    /requirements?:/i,
    /qualifications?:/i,
    /must have/i,
    /required:/i,
    /what you['\u2019]ll bring/i,
    /you have:/i,
    /skills required/i,
    /minimum requirements/i,
    /preferred qualifications/i,
    /education:/i,
    /experience:/i,
    /you must have/i,
    /we['\u2019]re looking for/i,
  ];
  const matchesAny = (patterns, text) => patterns.some((pattern) => pattern.test(text));
  const nonEmptyTrimmed = (parts) =>
    parts.map((part) => part.trim()).filter((part) => part.length > 0);

  // Preferred granularity: paragraphs separated by blank lines.
  let chunks = nonEmptyTrimmed(description.split(/\n\s*\n|\r\n\s*\r\n/));
  let separator = "\n\n";

  // Without blank lines the whole text is a single paragraph, which would put
  // everything into one bucket. Fall back to classifying line by line.
  if (chunks.length === 1) {
    const lines = nonEmptyTrimmed(chunks[0].split(/\n/));
    if (lines.length > 1) {
      chunks = lines;
      separator = "\n";
    }
  }

  const buckets = { duties: [], requirements: [] };
  let currentSection = "duties"; // Text before any header counts as duties
  for (const chunk of chunks) {
    // A requirements header wins when a chunk matches both kinds.
    if (matchesAny(requirementsKeywords, chunk)) {
      currentSection = "requirements";
    } else if (matchesAny(dutiesKeywords, chunk)) {
      currentSection = "duties";
    }
    buckets[currentSection].push(chunk);
  }

  return {
    duties: buckets.duties.join(separator),
    requirements: buckets.requirements.join(separator),
  };
}
/**
 * Extract data from individual job element
 *
 * Scrapes one job card in two stages:
 *  1. A selector-driven pass inside `jobElement.evaluate` collects id, title,
 *     URL, company, location, posted date, job type, experience level and a
 *     short description snippet, each with several fallbacks.
 *  2. If neither a job id nor a title was found, a second, text-only pass
 *     guesses title/company/location from the card's visible lines.
 *
 * When `coreParser` and a job URL are available, the full description is
 * fetched through `extractFullJobDescription` and replaces the snippet.
 *
 * @param {object} jobElement - Element handle exposing `evaluate(fn)`; `fn` is called with the card's DOM element.
 * @param {string} keyword - Search keyword; copied verbatim onto the result.
 * @param {object|null} [page=null] - Not referenced by this function; kept so existing callers keep working.
 * @param {object|null} [coreParser=null] - Forwarded to `extractFullJobDescription` when a job URL is known.
 * @returns {Promise<object|null>} Job record (`jobId`, `title`, `company`, `location`, `jobUrl`,
 *   `postedDate`, `description`, `roleDuties`, `jobRequirements`, `jobType`, `experienceLevel`,
 *   `keyword`, `extractedAt`, `source`), or `null` when nothing usable could be extracted or an error occurred.
 */
async function extractJobData(jobElement, keyword, page = null, coreParser = null) {
  try {
    const jobData = await jobElement.evaluate((el) => {
      // This callback is handed to jobElement.evaluate, so every helper it
      // needs is defined in here rather than in the enclosing module scope.
      const textOf = (node) => node.textContent?.trim() || node.innerText?.trim() || "";
      const startsWithRelativeAge = (text) =>
        /^\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i.test(text);
      const containsRelativeAge = (text) =>
        /\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i.test(text);
      const isJobTypeLabel = (text) =>
        /^(Full-time|Part-time|Contract|Internship|Temporary)$/i.test(text);
      const parseRelativeAge = (text) =>
        text.match(/(\d+)\s*(minute|hour|day|week|month|year|h|d|w)/i);
      const toIsoDay = (date) => date.toISOString().split("T")[0];

      const data = {
        jobId: "",
        title: "",
        company: "",
        location: "",
        jobUrl: "",
        postedDate: "",
        description: "",
        jobType: "",
        experienceLevel: "",
      };

      // Extract job ID from data-job-id or link
      const jobIdAttr =
        el.getAttribute("data-job-id") ||
        el.getAttribute("data-occludable-job-id") ||
        el.querySelector("[data-job-id]")?.getAttribute("data-job-id");
      if (jobIdAttr) {
        data.jobId = jobIdAttr.toString();
      }

      // Extract title and URL - try multiple selectors (updated for LinkedIn's current structure)
      const titleSelectors = [
        "a.job-card-list__title",
        ".job-card-list__title-link",
        "a[data-test-id='job-title']",
        ".base-search-card__title a",
        "h3 a",
        ".job-card-container__link",
        "a[href*='/jobs/view/']",
        ".job-card-list__title a",
        ".base-search-card__title",
        "h3.base-search-card__title a",
        "[class*='job-title'] a",
        "[class*='job-card'] a[href*='/jobs/']",
        "a[href*='/jobs/view/'] span", // LinkedIn sometimes wraps title in span
        "h3[class*='title'] a",
        "h4[class*='title'] a",
        ".job-search-card__title a",
        ".jobs-search-results__list-item a[href*='/jobs/view/']",
      ];
      for (const selector of titleSelectors) {
        const link = el.querySelector(selector);
        if (!link) continue;

        // Some selectors match non-link elements (headings, spans) that have
        // no href; keep a URL found earlier instead of blanking it out.
        const href = link.getAttribute("href") || "";
        if (href) {
          data.jobUrl = href;
        }

        // Extract job ID from URL if not found
        if (!data.jobId && data.jobUrl) {
          const match = data.jobUrl.match(/\/jobs\/view\/(\d+)/);
          if (match) {
            data.jobId = match[1];
          }
        }

        // Try to get text from link or its children
        data.title = textOf(link);
        if (!data.title) {
          const child = link.querySelector("span, div");
          if (child) {
            data.title = textOf(child);
          }
        }
        if (data.title) break;
      }

      // Fallback: Get title from any link with job URL pattern
      if (!data.title) {
        const allLinks = el.querySelectorAll("a[href*='/jobs/view/']");
        for (const link of allLinks) {
          const href = link.getAttribute("href") || "";
          if (!href.includes("/jobs/view/")) continue;

          data.jobUrl = href;
          const match = href.match(/\/jobs\/view\/(\d+)/);
          if (match) {
            data.jobId = match[1];
          }

          // Get text from link or any visible child
          data.title = textOf(link);
          if (!data.title) {
            const visibleChild = Array.from(link.querySelectorAll("*")).find(
              (child) => child.textContent?.trim() && child.offsetParent !== null
            );
            if (visibleChild) {
              data.title = visibleChild.textContent?.trim() || "";
            }
          }
          if (data.title) break;
        }
      }

      // Last resort: Extract from aria-label or title attribute
      if (!data.title) {
        const linkWithAria = el.querySelector("a[aria-label], a[title]");
        if (linkWithAria) {
          data.title =
            linkWithAria.getAttribute("aria-label") ||
            linkWithAria.getAttribute("title") ||
            "";
          if (linkWithAria.getAttribute("href")?.includes("/jobs/view/")) {
            data.jobUrl = linkWithAria.getAttribute("href");
          }
        }
      }

      // A company candidate is short text that is not a relative date, not a
      // job-type label and has no comma (commas usually indicate a location).
      const looksLikeCompany = (text) =>
        Boolean(text) &&
        text.length > 1 &&
        text.length < 100 &&
        !startsWithRelativeAge(text) &&
        !isJobTypeLabel(text) &&
        !text.includes(",");

      // Extract company name - try multiple selectors and patterns
      const companySelectors = [
        ".job-card-container__company-name",
        ".job-card-container__primary-description",
        "a[data-test-id='job-company-name']",
        ".base-search-card__subtitle",
        ".job-card-container__company-name-link",
        "[class*='company-name']",
        "[class*='job-card-container__company']",
        ".base-search-card__subtitle-link",
        "a[href*='/company/']",
        "[class*='subtitle']",
        "[class*='primary-description']",
      ];
      for (const selector of companySelectors) {
        const companyElement = el.querySelector(selector);
        if (!companyElement) continue;
        const text = textOf(companyElement);
        if (looksLikeCompany(text)) {
          data.company = text;
          break;
        }
      }

      // Fallback: Look for company link and get its text
      if (!data.company) {
        const companyLink = el.querySelector("a[href*='/company/']");
        if (companyLink) {
          const linkText = textOf(companyLink);
          if (linkText && linkText.length > 1 && linkText.length < 100) {
            data.company = linkText;
          }
        }
      }

      // Fallback: Look for text that appears after the title but before location/metadata
      if (!data.company) {
        const titleElement = el.querySelector("a[href*='/jobs/view/']");
        const parent = titleElement ? titleElement.parentElement : null;
        if (parent) {
          const siblings = Array.from(parent.children);
          const titleIndex = siblings.indexOf(titleElement);
          // Check the next few siblings only
          const end = Math.min(titleIndex + 4, siblings.length);
          for (let i = titleIndex + 1; i < end; i++) {
            const text = textOf(siblings[i]);
            if (looksLikeCompany(text)) {
              data.company = text;
              break;
            }
          }
        }
      }

      // Extract location - try multiple selectors and patterns
      const locationSelectors = [
        ".job-card-container__metadata-item",
        ".job-card-container__metadata-wrapper .job-card-container__metadata-item",
        "[data-test-id='job-location']",
        ".base-search-card__metadata",
        ".job-card-container__metadata",
        "[class*='metadata']",
        "[class*='location']",
      ];
      for (const selector of locationSelectors) {
        const locationElements = el.querySelectorAll(selector);
        for (const locationElement of locationElements) {
          const text = textOf(locationElement);
          // Check if it looks like a location (not a date or job type).
          // A comma covers the "City, Region" shape.
          if (
            text &&
            !containsRelativeAge(text) &&
            !isJobTypeLabel(text) &&
            (text.includes(",") ||
              /(ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|toronto|vancouver|calgary|ottawa|montreal|canada|united states|usa)/i.test(text))
          ) {
            data.location = text;
            break;
          }
        }
        if (data.location) break;
      }

      // Fallback: Look for location link
      if (!data.location) {
        const locationLink = el.querySelector("a[href*='/location/']");
        if (locationLink) {
          const linkText = textOf(locationLink);
          if (linkText && linkText.length > 2) {
            data.location = linkText;
          }
        }
      }

      // Fallback: Look for text patterns that look like locations
      if (!data.location) {
        const allText = el.innerText || el.textContent || "";
        const lines = allText
          .split("\n")
          .map((l) => l.trim())
          .filter((l) => l.length > 0);
        for (const line of lines) {
          // Skip if it's the title, company, a date or a job type
          if (
            line === data.title ||
            line === data.company ||
            startsWithRelativeAge(line) ||
            isJobTypeLabel(line)
          ) {
            continue;
          }
          if (
            line.includes(",") ||
            /(ontario|alberta|british columbia|quebec|manitoba|saskatchewan|toronto|vancouver|calgary|ottawa|montreal|canada)/i.test(line)
          ) {
            data.location = line;
            break;
          }
        }
      }

      // Extract posted date
      const dateSelectors = [
        "time",
        ".job-card-container__metadata-item time",
        "[data-test-id='job-posted-date']",
        "time[datetime]",
        "[class*='date']",
        "[class*='posted']",
      ];
      for (const selector of dateSelectors) {
        const dateElement = el.querySelector(selector);
        if (!dateElement) continue;

        const datetime = dateElement.getAttribute("datetime");
        const titleAttr = dateElement.getAttribute("title");
        const text = textOf(dateElement);

        if (datetime) {
          data.postedDate = datetime;
          break;
        }
        if (titleAttr && titleAttr.match(/\d{4}-\d{2}-\d{2}/)) {
          data.postedDate = titleAttr;
          break;
        }
        if (text && containsRelativeAge(text)) {
          // Parse relative dates like "2 days ago"
          const match = parseRelativeAge(text);
          if (match) {
            const amount = parseInt(match[1], 10);
            const unit = match[2].toLowerCase();
            const date = new Date();
            if (unit === "minute") {
              date.setMinutes(date.getMinutes() - amount);
            } else if (unit === "hour" || unit === "h") {
              // "h" is the hour abbreviation; it must not be treated as minutes.
              date.setHours(date.getHours() - amount);
            } else if (unit === "day" || unit === "d") {
              date.setDate(date.getDate() - amount);
            } else if (unit === "week" || unit === "w") {
              date.setDate(date.getDate() - amount * 7);
            } else if (unit === "month") {
              date.setMonth(date.getMonth() - amount);
            } else if (unit === "year") {
              date.setFullYear(date.getFullYear() - amount);
            }
            data.postedDate = toIsoDay(date);
          } else {
            // Matched only the bare "ago" form; keep the raw text
            data.postedDate = text;
          }
          break;
        }
      }

      // Fallback: Look for date patterns in metadata text.
      // Only day/week units are accepted here; other units are skipped.
      if (!data.postedDate) {
        const metadataItems = el.querySelectorAll("[class*='metadata']");
        for (const item of metadataItems) {
          const text = textOf(item);
          if (!text || !containsRelativeAge(text)) continue;
          const match = parseRelativeAge(text);
          if (!match) continue;

          const amount = parseInt(match[1], 10);
          const unit = match[2].toLowerCase();
          const date = new Date();
          if (unit === "day" || unit === "d") {
            date.setDate(date.getDate() - amount);
          } else if (unit === "week" || unit === "w") {
            date.setDate(date.getDate() - amount * 7);
          } else {
            continue;
          }
          data.postedDate = toIsoDay(date);
          break;
        }
      }

      // Extract job type and experience level from metadata
      const metadataSelectors = [
        ".job-card-container__metadata-item",
        "[class*='metadata']",
        "[class*='job-type']",
        "[class*='experience']",
      ];
      for (const selector of metadataSelectors) {
        const metadataItems = el.querySelectorAll(selector);
        for (const item of metadataItems) {
          const text = textOf(item);
          if (
            !data.jobType &&
            text.match(/^(Full-time|Part-time|Contract|Internship|Temporary|Freelance)$/i)
          ) {
            data.jobType = text;
          }
          if (
            !data.experienceLevel &&
            text.match(/(Entry level|Mid-Senior|Associate|Executive|Internship|Senior|Junior|Mid-level)/i)
          ) {
            data.experienceLevel = text;
          }
          if (data.jobType && data.experienceLevel) break;
        }
        if (data.jobType && data.experienceLevel) break;
      }

      // Fallback: Look in all text for job type and experience patterns
      if (!data.jobType || !data.experienceLevel) {
        const allText = el.innerText || el.textContent || "";
        const words = allText.split(/\s+/);
        if (!data.jobType) {
          for (const word of words) {
            if (word.match(/^(Full-time|Part-time|Contract|Internship|Temporary|Freelance)$/i)) {
              data.jobType = word;
              break;
            }
          }
        }
        if (!data.experienceLevel) {
          // Slide a two-word window over the text
          for (let i = 0; i < words.length - 1; i++) {
            const phrase = words.slice(i, i + 2).join(" ");
            if (phrase.match(/(Entry level|Mid-Senior|Associate|Executive|Internship|Senior level|Junior level|Mid level)/i)) {
              data.experienceLevel = phrase;
              break;
            }
          }
        }
      }

      // Try to get description snippet
      const descSelectors = [
        ".job-card-list__description",
        ".job-card-container__description",
        "[data-test-id='job-description']",
        ".base-search-card__snippet",
        "[class*='description']",
        "[class*='snippet']",
        "[class*='summary']",
      ];
      for (const selector of descSelectors) {
        const descElement = el.querySelector(selector);
        if (!descElement) continue;
        const text = textOf(descElement);
        // Only use if it's substantial (more than just a few words)
        if (text && text.length > 20) {
          data.description = text.substring(0, 500); // Limit description length
          break;
        }
      }

      // Fallback: Extract description from any paragraph or div that's not title/company/location
      if (!data.description) {
        const allElements = el.querySelectorAll("p, div, span");
        for (const elem of allElements) {
          const text = textOf(elem);
          if (
            text &&
            text.length > 30 &&
            text !== data.title &&
            text !== data.company &&
            text !== data.location &&
            !startsWithRelativeAge(text) &&
            !isJobTypeLabel(text)
          ) {
            data.description = text.substring(0, 500);
            break;
          }
        }
      }

      return data;
    });

    // `title` is reassigned by the text fallback below, so it must not be const.
    let title = cleanText(jobData.title);

    // If we have no title and no jobId, try one more aggressive extraction
    if (!jobData.jobId && !title) {
      try {
        const fallback = await jobElement.evaluate((el) => {
          // Get all text, split by newlines
          const text = el.innerText || el.textContent || "";
          const lines = text
            .split("\n")
            .map((l) => l.trim())
            .filter((l) => l.length > 0);

          // Find job link
          const jobLink = el.querySelector("a[href*='/jobs/view/']");
          const jobUrl = jobLink?.getAttribute("href") || "";
          const jobIdMatch = jobUrl.match(/\/jobs\/view\/(\d+)/);
          const jobId = jobIdMatch ? jobIdMatch[1] : "";

          // First non-empty line is usually the title
          const title = lines[0] || "";

          // Look for company (usually second line or contains company pattern)
          let company = "";
          for (const line of lines.slice(1, 5)) {
            if (
              line &&
              line.length < 100 &&
              !line.includes(",") &&
              !line.match(/^\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i) &&
              !line.match(/^(Full-time|Part-time|Contract|Internship|Temporary)$/i)
            ) {
              company = line;
              break;
            }
          }

          // Look for location (usually has comma or location keywords)
          let location = "";
          for (const line of lines) {
            if (
              line.includes(",") ||
              /(ontario|alberta|british columbia|quebec|manitoba|saskatchewan|toronto|vancouver|calgary|ottawa|montreal|canada|united states|usa)/i.test(line)
            ) {
              location = line;
              break;
            }
          }

          return { jobId, jobUrl, title, company, location };
        });

        if (fallback.jobId || fallback.title) {
          if (fallback.jobId) jobData.jobId = fallback.jobId;
          if (fallback.jobUrl) jobData.jobUrl = fallback.jobUrl;
          if (fallback.title) title = cleanText(fallback.title);
          if (fallback.company && !jobData.company) jobData.company = fallback.company;
          if (fallback.location && !jobData.location) jobData.location = fallback.location;
        }
      } catch (e) {
        // Best effort only: the checks below decide whether the job is usable
        logger.debug(`Fallback text extraction failed: ${e.message}`);
      }
    }

    // Normalise the URL only after the fallback, which may have supplied it
    let jobUrl = jobData.jobUrl || "";
    if (jobUrl && !jobUrl.startsWith("http")) {
      jobUrl = `https://www.linkedin.com${jobUrl}`;
    }

    // If we still have no title and no jobId, this extraction failed
    if (!jobData.jobId && !title) {
      logger.warning(`⚠️ Job extraction failed: no jobId or title found`);
      return null;
    }

    // Filter out jobs that have jobId but no meaningful data (title, company, or location)
    // These are likely jobs that haven't loaded their content yet
    if (jobData.jobId && !title && !jobData.company && !jobData.location) {
      logger.debug(`⚠️ Job ${jobData.jobId} has no extractable data (title, company, or location) - skipping`);
      return null;
    }

    // Log if we're missing critical fields (only in debug mode to reduce noise)
    if (process.env.DEBUG === "true") {
      if (!title) {
        logger.warning(`⚠️ Job ${jobData.jobId} missing title`);
      }
      if (!jobData.company) {
        logger.debug(`⚠️ Job ${jobData.jobId || title} missing company`);
      }
      if (!jobData.location) {
        logger.debug(`⚠️ Job ${jobData.jobId || title} missing location`);
      }
    }

    // Generate job ID if not found
    const jobId =
      jobData.jobId || `linkedin-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;

    // Extract full job details if coreParser and jobUrl are provided
    let fullDetails = { fullDescription: "", roleDuties: "", jobRequirements: "" };
    if (coreParser && jobUrl) {
      try {
        fullDetails = await extractFullJobDescription(coreParser, jobUrl);
        // If we got full description, update the description field
        if (fullDetails.fullDescription) {
          jobData.description = fullDetails.fullDescription;
        }
      } catch (error) {
        logger.debug(`Could not extract full job details for ${jobUrl}: ${error.message}`);
      }
    }

    return {
      jobId,
      title,
      company: cleanText(jobData.company),
      location: cleanText(jobData.location),
      jobUrl,
      postedDate: jobData.postedDate,
      description: cleanText(fullDetails.fullDescription || jobData.description),
      roleDuties: cleanText(fullDetails.roleDuties),
      jobRequirements: cleanText(fullDetails.jobRequirements),
      jobType: jobData.jobType,
      experienceLevel: jobData.experienceLevel,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "linkedin-jobs",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Check if next page is available
 *
 * Probes a list of known "next page" selectors in order and reports whether
 * any of them resolves to a button that is both enabled and visible.
 *
 * @param {object} page - Page object exposing `$(selector)`; the returned handles
 *   must expose `evaluate(fn)` and `isVisible()`.
 * @returns {Promise<boolean>} `true` for the first enabled, visible match; `false`
 *   when none qualifies or an unexpected error occurs.
 */
async function hasNextPageAvailable(page) {
  try {
    // LinkedIn uses pagination buttons - try multiple selectors
    const candidates = [
      "button[aria-label='Next']",
      "button[aria-label='Next page']",
      "button[aria-label*='Next']",
      ".artdeco-pagination__button--next",
      "button[data-test-id='pagination-next-button']",
      "button.pagination__button--next",
      "button[class*='pagination'][class*='next']",
      "li[class*='pagination'][class*='next'] button",
      "a[aria-label='Next']",
      "a[aria-label='Next page']",
    ];

    // Evaluated against the button element itself
    const looksDisabled = (el) =>
      el.hasAttribute("disabled") ||
      el.getAttribute("aria-disabled") === "true" ||
      el.classList.contains("disabled") ||
      el.classList.contains("artdeco-button--disabled");

    for (let i = 0; i < candidates.length; i += 1) {
      const selector = candidates[i];
      try {
        const handle = await page.$(selector);
        if (!handle) continue;

        // A failed probe counts as "not disabled" / "not visible" respectively
        const disabled = await handle.evaluate(looksDisabled).catch(() => false);
        const visible = await handle.isVisible().catch(() => false);
        if (disabled || !visible) continue;

        logger.debug(`✅ Found next page button with selector: ${selector}`);
        return true;
      } catch (e) {
        // This selector could not be probed; move on to the next one
      }
    }

    logger.debug(`❌ No next page button found`);
    return false;
  } catch (error) {
    logger.debug(`Error checking for next page: ${error.message}`);
    return false;
  }
}
/**
 * Navigate to next page
 *
 * Finds the first "next page" button that is enabled and visible, clicks it,
 * then waits for job list items to be present.
 *
 * @param {object} page - Page object exposing `$(selector)`, `waitForSelector(selector, options)`
 *   and `$$eval(selector, fn)`; button handles must expose `evaluate`, `isVisible`,
 *   `scrollIntoViewIfNeeded` and `click`.
 * @returns {Promise<boolean>} `true` when a button was clicked and job elements were found
 *   afterwards; `false` when no usable button exists, no job elements appear, or an error occurs.
 */
async function navigateToNextPage(page) {
  try {
    const nextButtonSelectors = [
      "button[aria-label='Next']",
      "button[aria-label='Next page']",
      "button[aria-label*='Next']",
      ".artdeco-pagination__button--next",
      "button[data-test-id='pagination-next-button']",
      "button.pagination__button--next",
      "button[class*='pagination'][class*='next']",
      "li[class*='pagination'][class*='next'] button",
      "a[aria-label='Next']",
      "a[aria-label='Next page']",
    ];

    for (const selector of nextButtonSelectors) {
      try {
        const nextButton = await page.$(selector);
        if (!nextButton) continue;

        // Check if button is disabled
        const isDisabled = await nextButton
          .evaluate((el) => {
            return (
              el.hasAttribute("disabled") ||
              el.getAttribute("aria-disabled") === "true" ||
              el.classList.contains("disabled") ||
              el.classList.contains("artdeco-button--disabled")
            );
          })
          .catch(() => false);

        // Apply the same visibility rule as hasNextPageAvailable so hidden
        // matches are skipped instead of being scrolled to and clicked.
        const isVisible = await nextButton.isVisible().catch(() => false);
        if (isDisabled || !isVisible) continue;

        // Scroll button into view before clicking (minimal delay)
        await nextButton.scrollIntoViewIfNeeded();
        await new Promise((resolve) => setTimeout(resolve, 100));

        // Log only once the click has actually gone through
        await nextButton.click();
        logger.info(`✅ Clicked next page button (selector: ${selector})`);

        // Wait for job elements to appear (this is the key indicator that page loaded)
        // NOTE(review): these selectors may also match cards from the previous
        // page, in which case the wait resolves immediately - confirm.
        try {
          await Promise.race([
            page.waitForSelector("li[data-occludable-job-id]", { timeout: 6000 }),
            page.waitForSelector("li.jobs-search-results__list-item", { timeout: 6000 }),
            page.waitForSelector(".scaffold-layout__list-item", { timeout: 6000 }),
          ]);
          // Small buffer for content to fully render
          await new Promise((resolve) => setTimeout(resolve, 300));
          return true;
        } catch (e) {
          // If elements don't appear quickly, wait a bit more and check
          logger.debug(`⚠️ Job elements not detected immediately, waiting...`);
          await new Promise((resolve) => setTimeout(resolve, 1500));

          // Verify elements exist now
          const jobCount = await page
            .$$eval(
              "li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item",
              (elements) => elements.length
            )
            .catch(() => 0);
          if (jobCount > 0) {
            return true;
          }
          logger.warning(`⚠️ No job elements found after navigation`);
          return false;
        }
      } catch (e) {
        // Try next selector, but leave a trace of why this one failed
        logger.debug(`Next page selector "${selector}" failed: ${e.message}`);
        continue;
      }
    }

    logger.warning(`⚠️ Could not find or click next page button`);
    return false;
  } catch (error) {
    logger.warning(`Failed to navigate to next page: ${error.message}`);
    return false;
  }
}
// Public API of this module: the strategy entry point and the search URL builder.
// extractJobData, hasNextPageAvailable and navigateToNextPage are not exported.
module.exports = {
linkedinJobsStrategy,
buildJobSearchUrl,
};