- Implemented grouped AND/OR logic for keyword searches, allowing for more flexible job matching criteria. - Added a minimum date filter to restrict job results to postings after a specified date. - Enhanced job detail extraction to include role duties and job requirements from job descriptions. - Updated README with new command line options and examples for using date filters and keyword logic. - Improved logging to provide clearer insights into keyword matching logic and job search parameters.
1683 lines
63 KiB
JavaScript
1683 lines
63 KiB
JavaScript
/**
|
|
* LinkedIn Jobs Parsing Strategy
|
|
*
|
|
* Uses core-parser for browser management and ai-analyzer for utilities
|
|
*/
|
|
|
|
const {
|
|
logger,
|
|
cleanText,
|
|
validateLocationAgainstFilters,
|
|
parseLocationFilters,
|
|
containsAnyKeyword,
|
|
containsAllKeywords,
|
|
matchesKeywordGroups,
|
|
} = require("ai-analyzer");
|
|
|
|
/**
 * Build a LinkedIn Jobs search URL.
 *
 * @param {string} keyword - Search phrase. It is sent wrapped in double quotes.
 * @param {string} [location=""] - Value for the `location` query parameter; omitted when empty.
 * @param {Object} [filters={}] - Optional extra filters.
 * @param {string|Date} [filters.minDate] - Oldest acceptable posting date (e.g. "YYYY-MM-DD").
 *   Converted to a relative `f_TPR=r<seconds>` range, capped at 30 days. Future dates add no
 *   filter; unparseable dates add no filter and log a warning.
 * @param {string} [filters.experienceLevel] - Value for `f_E`.
 * @param {string} [filters.jobType] - Value for `f_JT`
 *   (F=Full-time, P=Part-time, C=Contract, T=Temporary, I=Internship).
 * @param {boolean} [filters.remote] - When truthy, adds `f_WT=2` (remote).
 * @returns {string} The full search URL, sorted by date posted (newest first).
 */
function buildJobSearchUrl(keyword, location = "", filters = {}) {
  const baseUrl = "https://www.linkedin.com/jobs/search/";
  const maxTimeRangeSeconds = 2592000; // 30 days; LinkedIn may limit results beyond this

  // Always wrap keywords in quotes to ensure exact phrase matching.
  // LinkedIn's search treats unquoted keywords as individual words (OR logic),
  // e.g. "co-op" becomes "co" OR "op". URLSearchParams encodes the quotes.
  const params = new URLSearchParams({
    keywords: `"${keyword}"`,
    sortBy: "DD", // Date posted (newest first)
  });

  if (location) {
    params.append("location", location);
  }

  if (filters.minDate) {
    const warnInvalidDate = () =>
      logger.warning(`⚠️ Invalid date format for minDate: ${filters.minDate}. Expected format: YYYY-MM-DD`);

    try {
      const minDateMs = new Date(filters.minDate).getTime();

      if (Number.isNaN(minDateMs)) {
        // `new Date()` does not throw on unparseable input; it yields an Invalid Date
        // whose timestamp is NaN, so the invalid case has to be detected explicitly.
        warnInvalidDate();
      } else {
        const secondsDiff = Math.floor((Date.now() - minDateMs) / 1000);

        // A date in the future cannot be expressed as a "posted within" range.
        if (secondsDiff > 0) {
          params.append("f_TPR", `r${Math.min(secondsDiff, maxTimeRangeSeconds)}`);
        }
      }
    } catch (error) {
      // Kept for inputs whose conversion itself throws (e.g. objects with a throwing valueOf).
      warnInvalidDate();
    }
  }

  if (filters.experienceLevel) {
    params.append("f_E", filters.experienceLevel);
  }
  if (filters.jobType) {
    params.append("f_JT", filters.jobType);
  }
  if (filters.remote) {
    params.append("f_WT", "2"); // 2 = Remote
  }

  return `${baseUrl}?${params.toString()}`;
}
|
|
|
|
/**
 * Probe the known job-results container selectors in parallel.
 *
 * @param {Object} page - Browser page handle exposing `waitForSelector` and `$$`.
 * @returns {Promise<string|null>} The first selector (in list order) that matched, or null.
 */
async function findJobResultsContainerSelector(page) {
  const candidateSelectors = [
    ".jobs-search-results-list",
    ".jobs-search-results",
    "[data-test-id='job-search-results-list']",
    ".scaffold-layout__list-container",
    "ul.scaffold-layout__list-container",
    ".jobs-search__results-list",
    "main .scaffold-layout__list",
  ];

  // Each probe resolves to its selector or null and never rejects.
  const probes = candidateSelectors.map(async (selector) => {
    try {
      await page.waitForSelector(selector, { timeout: 3000 });
      const matches = await page.$$(selector);
      return matches.length > 0 ? selector : null;
    } catch (e) {
      // Selector not present within the timeout; another candidate may still match.
      return null;
    }
  });

  const outcomes = await Promise.all(probes);
  return outcomes.find((selector) => selector !== null) ?? null;
}

/**
 * Log page diagnostics and save a screenshot when no results container was found.
 * Failures here are reported as warnings and never propagate.
 *
 * @param {Object} page - Browser page handle.
 * @param {string} keyword - Keyword being searched; used in the screenshot filename.
 * @returns {Promise<void>}
 */
async function captureMissingResultsDebugInfo(page, keyword) {
  try {
    const pageTitle = await page.title();
    const pageUrl = page.url();
    logger.info(`📄 Page title: ${pageTitle}`);
    logger.info(`🔗 Page URL: ${pageUrl}`);

    const hasMain = await page.$("main").then((el) => el !== null).catch(() => false);
    const hasJobsSection = await page.$("[class*='job']").then((el) => el !== null).catch(() => false);
    logger.info(`🔍 Debug - Has main: ${hasMain}, Has jobs section: ${hasJobsSection}`);

    // Replace whitespace plus path separators and characters that are reserved in
    // filenames, so keywords such as "ci/cd" or quoted phrases yield a plain filename.
    const safeKeyword = String(keyword).replace(/[\s\\/:*?"<>|]+/g, "-");
    const screenshotPath = `debug-linkedin-jobs-${safeKeyword}-${Date.now()}.png`;
    await page.screenshot({ path: screenshotPath, fullPage: true });
    logger.info(`📸 Debug screenshot saved: ${screenshotPath}`);
  } catch (e) {
    logger.warning(`Could not capture debug info: ${e.message}`);
  }
}

/**
 * Extract jobs page by page, following the "Next" pagination until a stop condition:
 * an empty page, no next page, a failed navigation, or the page limit.
 *
 * @param {Object} page - Browser page handle showing the first results page.
 * @param {string} keyword - Keyword being searched (forwarded to extraction).
 * @param {*} locationFilter - Forwarded to `extractJobsFromPage`.
 * @param {Object} coreParser - Forwarded to `extractJobsFromPage`.
 * @param {number} maxPagesToProcess - Page limit; `Infinity` for unlimited.
 * @returns {Promise<{allJobs: Object[], pagesProcessed: number}>}
 */
async function collectJobsAcrossPages(page, keyword, locationFilter, coreParser, maxPagesToProcess) {
  const allJobs = [];
  let currentPage = 1;

  while (currentPage <= maxPagesToProcess) {
    logger.info(`📄 Processing page ${currentPage}...`);

    // Give the page time to finish rendering before extraction.
    await new Promise((resolve) => setTimeout(resolve, 2000));

    const pageJobs = await extractJobsFromPage(page, keyword, locationFilter, coreParser);
    logger.info(`📋 Extracted ${pageJobs.length} jobs from page ${currentPage}`);

    if (pageJobs.length === 0) {
      logger.warning(`⚠️ No jobs found on page ${currentPage}, stopping pagination`);
      break;
    }

    allJobs.push(...pageJobs);

    const hasNext = await hasNextPageAvailable(page);
    if (!hasNext) {
      logger.info(`✅ No more pages available. Total jobs extracted: ${allJobs.length}`);
      break;
    }

    if (currentPage >= maxPagesToProcess) {
      logger.info(`📊 Reached max pages limit (${maxPagesToProcess}). Total jobs extracted: ${allJobs.length}`);
      break;
    }

    logger.info(`➡️ Navigating to page ${currentPage + 1}...`);
    const navigationSuccess = await navigateToNextPage(page);
    if (!navigationSuccess) {
      logger.warning(`⚠️ Failed to navigate to next page, stopping pagination`);
      break;
    }

    currentPage++;

    // Quick verification that job elements are present on the new page.
    const jobCount = await page.$$eval(
      "li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item",
      (elements) => elements.length
    ).catch(() => 0);

    if (jobCount === 0) {
      logger.warning(`⚠️ No job elements found on page ${currentPage} after navigation, stopping pagination`);
      break;
    }

    logger.debug(`✅ Page ${currentPage} loaded with ${jobCount} job elements`);
  }

  return { allJobs, pagesProcessed: currentPage };
}

/**
 * Derive a job identifier from a job URL: the numeric id of a `/jobs/view/<id>` URL when
 * present, otherwise a slug of the whole URL prefixed with `linkedin-`.
 *
 * @param {string} jobUrl
 * @returns {string}
 */
function deriveJobIdFromUrl(jobUrl) {
  const urlMatch = jobUrl.match(/\/jobs\/view\/(\d+)/);
  return urlMatch ? urlMatch[1] : `linkedin-${jobUrl.replace(/[^a-zA-Z0-9]/g, '-')}`;
}

/**
 * LinkedIn Jobs parsing strategy function.
 *
 * Authenticates, runs one search per keyword, paginates through the results, then
 * de-duplicates jobs and filters them by keyword logic and location.
 *
 * @param {Object} coreParser - Browser manager; this function calls `createPage`,
 *   `authenticate` and `navigateTo` on it.
 * @param {Object} [options={}]
 * @param {string[]} [options.keywords] - Keywords to search.
 * @param {string[][]|null} [options.keywordGroups] - Groups for AND-between / OR-within matching.
 *   When set, every keyword of every group is searched and jobs must match all groups.
 * @param {string|string[]|null} [options.locationFilter] - Location filter; a string is parsed
 *   with `parseLocationFilters`.
 * @param {number} [options.maxPages=5] - Pages per keyword; 0 (or less) means unlimited.
 * @param {Object} [options.credentials] - Passed to `coreParser.authenticate`.
 * @param {string} [options.location] - LinkedIn location search (e.g. "Canada").
 * @param {string|null} [options.minDate] - Minimum posted date (format: YYYY-MM-DD).
 * @param {boolean} [options.useAndLogic=false] - Require all keywords instead of any.
 * @returns {Promise<{results: Object[], rejectedResults: Object[], summary: Object}>}
 *   Never rejects: on a fatal error the jobs collected so far are returned and
 *   `summary.error` carries the message.
 */
async function linkedinJobsStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer"],
    keywordGroups = null, // Array of keyword groups for grouped AND/OR logic
    locationFilter = null,
    maxPages = 5,
    credentials = {},
    location = "", // LinkedIn location search (e.g., "Canada", "Toronto, Ontario, Canada")
    minDate = null, // Minimum posted date (format: YYYY-MM-DD)
    useAndLogic = false, // Use AND logic instead of OR logic for keywords
  } = options;

  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set();

  // Snapshots refreshed after each keyword; only used if the live arrays end up empty.
  let resultsBackup = [];
  let rejectedResultsBackup = [];

  const debugEnabled = process.env.DEBUG === "true";

  try {
    const page = await coreParser.createPage("linkedin-jobs-main");

    logger.info("🔐 Authenticating to LinkedIn...");
    await coreParser.authenticate("linkedin", credentials, "linkedin-jobs-main");
    logger.info("✅ LinkedIn authentication successful");

    logger.info("🚀 Starting LinkedIn Jobs parser...");
    logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
    if (keywordGroups) {
      logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
    } else {
      logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
    }
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(`🌍 LinkedIn Location: ${location || "None"}`);
    logger.info(`📄 Max Pages: ${maxPages}`);
    if (minDate) {
      logger.info(`📅 Min Date Filter: ${minDate} (jobs posted after this date)`);
    }

    // Determine which search queries to run for the selected keyword logic.
    let searchKeywords;
    if (keywordGroups) {
      // Search every keyword of every group (OR within groups); the AND between groups
      // is enforced when filtering. De-duplicated so a keyword shared by several groups
      // is not searched twice.
      searchKeywords = [...new Set(keywordGroups.flat())];
    } else if (useAndLogic) {
      // Simple AND logic: one combined query.
      searchKeywords = [keywords.join(" ")];
    } else {
      // OR logic: one query per keyword.
      searchKeywords = keywords;
    }

    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching LinkedIn Jobs for: "${keyword}"`);

      const searchUrl = buildJobSearchUrl(keyword, location, { minDate });
      logger.info(`🔗 Search URL: ${searchUrl}`);

      // Stop the whole run if the page can no longer be evaluated.
      try {
        await page.evaluate(() => document.readyState).catch(() => {
          throw new Error("Page is no longer valid - browser may have closed");
        });
      } catch (pageError) {
        logger.error(`❌ Page is no longer accessible: ${pageError.message}`);
        logger.info(`⚠️ Preserving ${results.length} jobs found so far`);
        break;
      }

      try {
        await coreParser.navigateTo(searchUrl, {
          pageId: "linkedin-jobs-main",
          retries: 2,
          waitUntil: "networkidle",
        });

        // Short settle delay on top of the navigation wait.
        await new Promise((resolve) => setTimeout(resolve, 2000));

        const currentUrl = page.url();
        logger.info(`📍 Current page URL: ${currentUrl}`);

        // Log whatever result-count text the page exposes (diagnostic only).
        try {
          const resultsText = await page.evaluate(() => {
            const possibleTexts = [
              document.querySelector("h1")?.textContent,
              document.querySelector(".results-context-header__job-count")?.textContent,
              document.querySelector("[class*='results-count']")?.textContent,
              document.querySelector("[class*='job-count']")?.textContent,
            ].filter(Boolean);
            return possibleTexts.join(" | ") || "No results count found";
          });
          logger.info(`📊 LinkedIn results info: ${resultsText}`);
        } catch (e) {
          logger.debug(`Could not get results count: ${e.message}`);
        }

        // Single scroll to trigger lazy loading.
        try {
          await page.evaluate(() => {
            window.scrollTo(0, 500);
          });
          await new Promise((resolve) => setTimeout(resolve, 1000));
        } catch (e) {
          logger.debug(`Could not scroll page: ${e.message}`);
        }

        const containerSelector = await findJobResultsContainerSelector(page);
        if (!containerSelector) {
          logger.warning(`⚠️ No job results container found for keyword: ${keyword}`);
          await captureMissingResultsDebugInfo(page, keyword);
          continue;
        }
        logger.info(`✅ Found job results container with selector: ${containerSelector}`);

        // 0 (or less) means unlimited; Infinity avoids a magic page cap.
        const maxPagesToProcess = maxPages > 0 ? maxPages : Infinity;
        logger.info(`📄 Processing pages (max: ${maxPagesToProcess === Infinity ? 'unlimited' : maxPagesToProcess}) for "${keyword}"...`);

        const { allJobs, pagesProcessed } = await collectJobsAcrossPages(
          page,
          keyword,
          locationFilter,
          coreParser,
          maxPagesToProcess
        );
        logger.info(`📋 Extracted ${allJobs.length} total jobs across ${pagesProcessed} page(s)`);

        // Extracted jobs are still processed even if the page died in the meantime.
        try {
          await page.evaluate(() => document.readyState);
        } catch (pageError) {
          logger.warning(`⚠️ Page became invalid after extraction, but we have ${allJobs.length} jobs extracted`);
        }

        if (allJobs.length > 0 && debugEnabled) {
          const sampleJob = allJobs[0];
          logger.debug(`📝 Sample job: ID=${sampleJob.jobId}, Title=${sampleJob.title}, Location=${sampleJob.location || 'N/A'}, Company=${sampleJob.company || 'N/A'}`);
        }

        let duplicateCount = 0;
        let locationRejectedCount = 0;
        let addedCount = 0;
        let noJobIdCount = 0;

        // Parse the location filter once per keyword rather than once per job.
        let locationFiltersArray = null;
        if (locationFilter) {
          locationFiltersArray = typeof locationFilter === 'string'
            ? parseLocationFilters(locationFilter)
            : locationFilter;
        }

        for (const job of allJobs) {
          // Jobs without an id fall back to an id derived from their URL.
          if (!job.jobId) {
            noJobIdCount++;
            if (!job.jobUrl) {
              logger.warning(`⚠️ Job has no jobId or URL, skipping: ${job.title || 'Unknown'}`);
              continue;
            }
            job.jobId = deriveJobIdFromUrl(job.jobUrl);
          }

          // Skip jobs already seen under this or an earlier keyword.
          if (seenJobs.has(job.jobId)) {
            duplicateCount++;
            if (debugEnabled) {
              logger.debug(`⏭️ Skipping duplicate job: ${job.jobId} - ${job.title}`);
            }
            continue;
          }
          seenJobs.add(job.jobId);

          if (keywordGroups) {
            // Grouped logic: every group must match, via at least one of its keywords.
            const fullText = `${job.title} ${job.description} ${job.company}`;
            if (!matchesKeywordGroups(fullText, keywordGroups)) {
              rejectedResults.push({
                ...job,
                rejectionReason: "Job does not match all keyword groups",
              });
              if (debugEnabled) {
                logger.debug(`🔍 Rejected (grouped logic): "${job.title}" - does not match all groups`);
              }
              continue;
            }
          } else if (useAndLogic) {
            // Simple AND logic: all keywords must match.
            const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase();
            if (!containsAllKeywords(fullText, keywords)) {
              rejectedResults.push({
                ...job,
                rejectionReason: "Not all keywords found in job listing",
              });
              if (debugEnabled) {
                logger.debug(`🔍 Rejected (AND logic): "${job.title}" - not all keywords found`);
              }
              continue;
            }
          }
          // For OR logic, LinkedIn's own search result is trusted.

          if (locationFilter) {
            const locationValid = validateLocationAgainstFilters(job.location, locationFiltersArray);

            if (!locationValid.isValid) {
              locationRejectedCount++;
              rejectedResults.push({
                ...job,
                rejectionReason: locationValid.reasoning || "Location filter mismatch",
              });
              if (debugEnabled) {
                logger.debug(`📍 Rejected location: "${job.location}" - ${locationValid.reasoning || "Location filter mismatch"}`);
              }
              continue;
            }
          }

          results.push(job);
          addedCount++;
        }

        // Refresh the snapshots after each keyword.
        resultsBackup = [...results];
        rejectedResultsBackup = [...rejectedResults];

        logger.info(`📊 Processing complete: ${addedCount} added, ${locationRejectedCount} location-rejected, ${duplicateCount} duplicates, ${noJobIdCount} had no jobId`);
        logger.info(`📊 Current results count: ${results.length} jobs accumulated so far`);
        logger.info(`📊 Backup results count: ${resultsBackup.length} jobs in backup`);
      } catch (error) {
        // A failure on one keyword must not lose jobs from earlier keywords.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
        logger.error(`Stack: ${error.stack}`);
        logger.info(`⚠️ Preserving ${results.length} jobs found before error`);
      }
    }

    logger.info(`📊 Final results check: results.length=${results.length}, rejectedResults.length=${rejectedResults.length}`);
    logger.info(`📊 Backup check: resultsBackup.length=${resultsBackup.length}, rejectedResultsBackup.length=${rejectedResultsBackup.length}`);

    // Defensive: fall back to the snapshot if a live array is unexpectedly empty.
    const finalResults = results.length > 0 ? results : resultsBackup;
    const finalRejectedResults = rejectedResults.length > 0 ? rejectedResults : rejectedResultsBackup;

    if (results.length === 0 && resultsBackup.length > 0) {
      logger.warning(`⚠️ Results array was empty but backup has ${resultsBackup.length} jobs - using backup!`);
    }

    if (finalResults.length > 0) {
      logger.info(`📝 First result sample: ${JSON.stringify(finalResults[0], null, 2).substring(0, 200)}...`);
    }

    logger.info(
      `🎯 LinkedIn Jobs parsing completed: ${finalResults.length} jobs found, ${finalRejectedResults.length} rejected`
    );

    if (finalResults.length === 0 && finalRejectedResults.length === 0) {
      logger.warning(`⚠️ No jobs found or rejected - this might indicate an extraction issue`);
    }

    const returnValue = {
      results: [...finalResults], // copies, so callers do not share the working arrays
      rejectedResults: [...finalRejectedResults],
      summary: {
        totalJobs: finalResults.length,
        totalRejected: finalRejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "linkedin-jobs",
      },
    };

    logger.info(`📦 Returning: ${returnValue.results.length} results, ${returnValue.rejectedResults.length} rejected`);
    return returnValue;
  } catch (error) {
    logger.error(`❌ LinkedIn Jobs parsing failed: ${error.message}`);
    logger.error(`Stack: ${error.stack}`);
    // Return whatever was collected, even after a fatal error.
    logger.info(`⚠️ Returning ${results.length} jobs found before fatal error`);
    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "linkedin-jobs",
        error: error.message,
      },
    };
  }
}
|
|
|
|
/**
 * Scroll the results page to the bottom repeatedly so that lazily loaded job items appear.
 *
 * Stops after 50 scrolls, or earlier once the number of job items has stayed the same for
 * three consecutive checks. Any error is logged as a warning and not rethrown.
 *
 * @param {Object} page - Browser page handle exposing `$$eval` and `evaluate`.
 * @returns {Promise<void>}
 */
async function scrollToLoadJobs(page) {
  const jobItemSelector =
    "li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item";
  const scrollLimit = 50; // generous ceiling for large result sets
  const stallLimit = 3; // consecutive unchanged counts that end the loop

  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  // A failed count is treated as zero items.
  const countJobItems = () =>
    page.$$eval(jobItemSelector, (elements) => elements.length).catch(() => 0);

  try {
    logger.info(`📜 Starting to scroll and load jobs...`);

    let lastSeen = 0;
    let stalledRounds = 0;

    for (let done = 0; done < scrollLimit; done += 1) {
      const seenNow = await countJobItems();

      // The very first check has nothing to compare against.
      if (done > 0 && seenNow === lastSeen) {
        stalledRounds += 1;
        if (stalledRounds >= stallLimit) {
          logger.info(`📊 Loaded ${seenNow} jobs after ${done} scrolls (no new jobs for ${stalledRounds} attempts)`);
          break;
        }
      } else {
        stalledRounds = 0;
      }
      lastSeen = seenNow;

      // Smooth scroll to the bottom, then give new content time to arrive.
      await page.evaluate(() => {
        window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
      });
      await pause(2500);

      // Every third round, nudge further in a smaller step as well.
      if (done % 3 === 0) {
        await page.evaluate(() => {
          window.scrollBy(0, 1000);
        });
        await pause(1000);
      }

      // Progress report after every fifth completed scroll.
      const completed = done + 1;
      if (completed % 5 === 0) {
        const progressCount = await countJobItems();
        logger.info(`📜 Scrolled ${completed} times, loaded ${progressCount} jobs so far...`);
      }
    }

    // One last jump to the bottom before the final count.
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight);
    });
    await pause(2000);

    const finalCount = await countJobItems();
    logger.info(`✅ Finished scrolling. Total jobs loaded: ${finalCount}`);
  } catch (error) {
    logger.warning(`Could not scroll page: ${error.message}`);
  }
}
|
|
|
|
/**
 * Find job card elements using the known list-item selectors, probed in parallel.
 *
 * @param {Object} page - Browser page handle exposing `waitForSelector` and `$$`.
 * @returns {Promise<Object[]>} Elements of the first selector (in list order) that matched,
 *   or an empty array.
 */
async function findJobCardElements(page) {
  // Several selectors are tried because LinkedIn changes its markup.
  const jobSelectors = [
    "li.jobs-search-results__list-item",
    "li[data-occludable-job-id]",
    ".job-card-container",
    "[data-test-id='job-search-result']",
    ".scaffold-layout__list-item",
    "li.scaffold-layout__list-item",
    "ul.scaffold-layout__list-container > li",
    "main ul li",
    "[class*='job-card']",
    "[class*='job-search-result']",
  ];

  const probes = jobSelectors.map(async (selector) => {
    try {
      // A wait timeout is not fatal; the element query below decides.
      await page.waitForSelector(selector, { timeout: 2000 }).catch(() => {});
      const elements = await page.$$(selector);
      if (elements.length > 0) {
        return { selector, elements };
      }
    } catch (e) {
      // Selector failed; fall through to the next candidate.
    }
    return null;
  });

  const outcomes = await Promise.all(probes);
  const hit = outcomes.find((outcome) => outcome !== null);
  if (!hit) {
    return [];
  }

  logger.info(`✅ Found ${hit.elements.length} job elements using selector: ${hit.selector}`);
  return hit.elements;
}

/**
 * Fallback lookup: find links to `/jobs/view/` and use each link's closest container
 * (`li`, card-like element, `div`, or parent) as the job element. Links with a
 * repeated `href` are skipped.
 *
 * @param {Object} page - Browser page handle.
 * @returns {Promise<Object[]>} Container handles, or an empty array.
 */
async function findJobCardElementsViaLinks(page) {
  try {
    logger.info(`🔍 Trying fallback: searching for job links directly...`);
    const jobLinks = await page.$$("a[href*='/jobs/view/']");
    if (jobLinks.length === 0) {
      return [];
    }
    logger.info(`✅ Found ${jobLinks.length} job links using fallback method`);

    const seenUrls = new Set();
    const parentElements = [];

    for (const link of jobLinks) {
      try {
        const href = await link.getAttribute("href");
        if (!href || seenUrls.has(href)) continue;
        seenUrls.add(href);

        const parentHandle = await link.evaluateHandle((el) => {
          return el.closest("li") || el.closest("[class*='card']") || el.closest("div") || el.parentElement;
        });

        if (parentHandle) {
          parentElements.push(parentHandle);
        }
      } catch (e) {
        // Skip links that cannot be processed.
      }
    }

    if (parentElements.length > 0) {
      logger.info(`✅ Using ${parentElements.length} unique job elements from fallback`);
    }
    return parentElements;
  } catch (e) {
    logger.warning(`Fallback method failed: ${e.message}`);
    return [];
  }
}

/**
 * Log rough element counts to help diagnose a page where no job elements were found.
 * Errors are ignored; this is diagnostic output only.
 *
 * @param {Object} page - Browser page handle.
 * @returns {Promise<void>}
 */
async function logJobListDebugCounts(page) {
  try {
    const allLis = await page.$$("li").then((elements) => elements.length);
    const allDivs = await page.$$("div[class*='job']").then((elements) => elements.length);
    const jobLinks = await page.$$("a[href*='/jobs/']").then((elements) => elements.length);
    logger.info(`🔍 Debug - Found ${allLis} <li> elements, ${allDivs} job-related divs, ${jobLinks} job links`);

    const listContainers = await page.$$("ul, ol").then((elements) => elements.length);
    logger.info(`🔍 Debug - Found ${listContainers} list containers`);
  } catch (e) {
    // Ignore debug errors.
  }
}

/**
 * Extract jobs from the current results page.
 *
 * Each job element is scrolled into view and hovered before `extractJobData` is called on
 * it. A job is kept when it has at least a title or a jobId.
 *
 * @param {Object} page - Browser page handle.
 * @param {string} keyword - Search keyword, forwarded to `extractJobData`.
 * @param {*} locationFilter - Accepted for interface compatibility; not used here.
 * @param {Object|null} [coreParser=null] - Forwarded to `extractJobData`.
 * @returns {Promise<Object[]>} Extracted jobs; empty on failure (errors are logged, not thrown).
 */
async function extractJobsFromPage(page, keyword, locationFilter, coreParser = null) {
  const jobs = [];
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  try {
    let jobElements = await findJobCardElements(page);

    if (jobElements.length === 0) {
      logger.warning(`⚠️ No job elements found with any selector`);

      jobElements = await findJobCardElementsViaLinks(page);
      if (jobElements.length === 0) {
        await logJobListDebugCounts(page);
        return jobs;
      }
    }

    let extractedCount = 0;
    let failedCount = 0;

    for (let i = 0; i < jobElements.length; i++) {
      const jobElement = jobElements[i];
      try {
        // Scroll into view and hover to trigger lazy loading of the card's content.
        try {
          await jobElement.scrollIntoViewIfNeeded();
          await pause(100);

          // If hovering fails, retry the scroll. The retry promise is returned so that a
          // rejection is handled by this try/catch instead of becoming an unhandled
          // rejection (which terminates Node by default).
          await jobElement.hover().catch(() => jobElement.scrollIntoViewIfNeeded());
          await pause(200);
        } catch (scrollError) {
          // Continue anyway; the element may still hold usable data.
          logger.debug(`Could not scroll/hover job element ${i}: ${scrollError.message}`);
        }

        const job = await extractJobData(jobElement, keyword, page, coreParser);
        if (job && (job.title || job.jobId)) {
          jobs.push(job);
          extractedCount++;
        } else {
          failedCount++;
          if (process.env.DEBUG === "true") {
            logger.debug(`Job ${i} extraction returned empty: jobId=${job?.jobId || 'none'}, title=${job?.title || 'none'}`);
          }
        }
      } catch (error) {
        logger.warning(`Failed to extract job data for element ${i}: ${error.message}`);
        failedCount++;
      }
    }

    if (jobElements.length > 0) {
      logger.info(`📊 Extraction summary: ${extractedCount} successful, ${failedCount} failed out of ${jobElements.length} job elements`);
    }
  } catch (error) {
    logger.error(`Failed to extract jobs from page: ${error.message}`);
  }

  return jobs;
}
|
|
|
|
/**
 * Extract the full job description from a job detail page.
 *
 * Opens the job URL in a separate page (so the search results page is not disturbed),
 * reads the description text, splits it with `parseDutiesAndRequirements`, and always
 * closes the detail page afterwards.
 *
 * @param {Object} coreParser - Browser manager; `createPage` is called on it.
 * @param {string} jobUrl - URL of the job detail page.
 * @returns {Promise<{fullDescription: string, roleDuties: string, jobRequirements: string}>}
 *   All fields are empty strings when `jobUrl` is missing or extraction fails.
 */
async function extractFullJobDescription(coreParser, jobUrl) {
  const emptyDetails = () => ({ fullDescription: "", roleDuties: "", jobRequirements: "" });

  try {
    if (!jobUrl) {
      return emptyDetails();
    }

    const detailPage = await coreParser.createPage(`linkedin-job-detail-${Date.now()}`);

    try {
      // Navigation is best-effort: on failure (e.g. a timeout) extraction still runs
      // against whatever has loaded, but the failure is logged instead of hidden.
      // "networkidle" matches the value passed to coreParser.navigateTo elsewhere in this
      // file. NOTE(review): assumes the page is a Playwright page (the file uses
      // Playwright-style handle methods); Playwright rejects "networkidle2" - confirm.
      try {
        await detailPage.goto(jobUrl, { waitUntil: "networkidle", timeout: 30000 });
      } catch (navigationError) {
        logger.debug(`Navigation to job detail page did not complete for ${jobUrl}: ${navigationError.message}`);
      }
      await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait for content to load

      const jobDetails = await detailPage.evaluate(() => {
        const details = {
          fullDescription: "",
          roleDuties: "",
          jobRequirements: "",
        };

        // Candidate containers for the description; the first one present wins.
        const descriptionSelectors = [
          ".description__text",
          ".show-more-less-html__markup",
          "[class*='description__text']",
          "[class*='job-description']",
          ".jobs-description__text",
          ".jobs-box__html-content",
          "[data-test-id='job-description']",
          ".jobs-details__main-content",
          ".jobs-description-content__text",
        ];

        let descriptionElement = null;
        for (const selector of descriptionSelectors) {
          descriptionElement = document.querySelector(selector);
          if (descriptionElement) {
            break;
          }
        }

        if (descriptionElement) {
          details.fullDescription = descriptionElement.textContent?.trim() ||
            descriptionElement.innerText?.trim() || "";
        }

        // Nothing found: fall back to the text of the main content area.
        if (!details.fullDescription) {
          const mainContent = document.querySelector("main") ||
            document.querySelector("[class*='jobs-details']") ||
            document.querySelector("[class*='job-details']");
          if (mainContent) {
            details.fullDescription = mainContent.textContent?.trim() ||
              mainContent.innerText?.trim() || "";
          }
        }

        return details;
      });

      const parsed = parseDutiesAndRequirements(jobDetails.fullDescription);

      return {
        fullDescription: jobDetails.fullDescription,
        roleDuties: parsed.duties,
        jobRequirements: parsed.requirements,
      };
    } finally {
      // Close the detail page to free resources; a failing close is not an error.
      try {
        await detailPage.close();
      } catch (closeError) {
        // Ignore close errors.
      }
    }
  } catch (error) {
    logger.warning(`Failed to extract full job description from ${jobUrl}: ${error.message}`);
    return emptyDetails();
  }
}
|
|
|
|
/**
 * Parse job description to separate role duties from job requirements
 *
 * The text is classified chunk by chunk using header heuristics: a chunk that
 * contains a requirements-style header switches the current bucket to
 * "requirements", a chunk that contains a duties-style header switches it back
 * to "duties", and any other chunk stays in whichever bucket is current
 * (starting with "duties").
 *
 * Chunks are paragraphs (separated by blank lines) when the description has
 * more than one paragraph. When there are no blank lines at all, the
 * description is classified line by line instead, so that a description using
 * only single newlines is still split at its headers rather than being placed
 * wholesale into one bucket.
 *
 * @param {string} description - Raw job description text.
 * @returns {{duties: string, requirements: string}} The two parts; paragraphs
 *   are re-joined with a blank line, individual lines with a single newline.
 *   Both are empty strings for empty or non-string input.
 */
function parseDutiesAndRequirements(description) {
  if (typeof description !== "string" || description.trim().length === 0) {
    return { duties: "", requirements: "" };
  }

  // Common section headers that indicate duties/responsibilities
  const dutiesKeywords = [
    /responsibilities?:/i,
    /duties?:/i,
    /what you['\u2019]ll do/i,
    /key responsibilities/i,
    /your role/i,
    /position overview/i,
    /about the role/i,
    /role overview/i,
    /what we need/i,
    /you will:/i,
    /you['\u2019]ll be responsible/i,
  ];

  // Common section headers that indicate requirements/qualifications
  const requirementsKeywords = [
    /requirements?:/i,
    /qualifications?:/i,
    /must have/i,
    /required:/i,
    /what you['\u2019]ll bring/i,
    /you have:/i,
    /skills required/i,
    /minimum requirements/i,
    /preferred qualifications/i,
    /education:/i,
    /experience:/i,
    /you must have/i,
    /we['\u2019]re looking for/i,
  ];

  const matchesAny = (patterns, text) => patterns.some((pattern) => pattern.test(text));
  const splitNonEmpty = (text, delimiter) =>
    text
      .split(delimiter)
      .map((part) => part.trim())
      .filter((part) => part.length > 0);

  const paragraphs = splitNonEmpty(description, /\n\s*\n|\r\n\s*\r\n/);

  // Without paragraph breaks the whole text would be a single chunk, so fall
  // back to line granularity to still be able to split at header lines.
  const useLines = paragraphs.length === 1;
  const chunks = useLines ? splitNonEmpty(paragraphs[0], /\r?\n/) : paragraphs;
  const separator = useLines ? "\n" : "\n\n";

  const buckets = { duties: [], requirements: [] };
  let currentSection = "duties"; // Text before any header counts as duties

  for (const chunk of chunks) {
    // Requirements headers take precedence when a chunk contains both kinds.
    if (matchesAny(requirementsKeywords, chunk)) {
      currentSection = "requirements";
    } else if (matchesAny(dutiesKeywords, chunk)) {
      currentSection = "duties";
    }
    buckets[currentSection].push(chunk);
  }

  return {
    duties: buckets.duties.join(separator),
    requirements: buckets.requirements.join(separator),
  };
}
|
|
|
|
/**
 * Extract data from individual job element
 *
 * Reads a single job card through `jobElement.evaluate()` using several layers
 * of selector and text heuristics, normalizes the result, and optionally
 * fetches the full job description via extractFullJobDescription().
 *
 * @param {object} jobElement - Handle for one job card; must expose
 *   `evaluate(fn)`, which invokes `fn` with the card's DOM element.
 * @param {string} keyword - Search keyword that produced this card; copied
 *   verbatim into the returned record.
 * @param {object|null} [page=null] - Not used by this function; kept so the
 *   signature stays compatible with existing callers.
 * @param {object|null} [coreParser=null] - When provided and a job URL was
 *   found, passed to extractFullJobDescription() to load the full details.
 * @returns {Promise<object|null>} The job record, or null when neither a job
 *   id nor a title could be extracted, when a card with an id has no title,
 *   company or location, or when extraction throws.
 */
async function extractJobData(jobElement, keyword, page = null, coreParser = null) {
  try {
    // NOTE(review): the evaluate() callback is presumably executed in the page
    // by the browser automation library, so it must not close over variables
    // from this module — every helper it needs is defined inside it.
    const jobData = await jobElement.evaluate((el) => {
      const RELATIVE_DATE = /\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i;
      const LEADING_RELATIVE_DATE = /^\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i;
      const JOB_TYPE = /^(Full-time|Part-time|Contract|Internship|Temporary)$/i;
      const JOB_TYPE_OR_FREELANCE = /^(Full-time|Part-time|Contract|Internship|Temporary|Freelance)$/i;
      // One shared place list (previously three slightly different copies).
      // Word boundaries keep "usa" from matching inside words like "thousand".
      const KNOWN_PLACE = /\b(ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|toronto|vancouver|calgary|ottawa|montreal|canada|united states|usa)\b/i;
      const JOB_ID_IN_URL = /\/jobs\/view\/(\d+)/;
      const JOB_LINK_SELECTOR = "a[href*='/jobs/view/']";
      const MINUTE_MS = 60 * 1000;
      const HOUR_MS = 60 * MINUTE_MS;
      const DAY_MS = 24 * HOUR_MS;

      const textOf = (node) => node.textContent?.trim() || node.innerText?.trim() || "";
      const isDateOrJobType = (text) => LEADING_RELATIVE_DATE.test(text) || JOB_TYPE.test(text);
      // Location usually has a comma, company usually doesn't.
      const looksLikeCompany = (text) =>
        text.length > 1 && text.length < 100 && !isDateOrJobType(text) && !text.includes(",");
      const looksLikeLocation = (text) => text.includes(",") || KNOWN_PLACE.test(text);

      // Convert relative text such as "2 days ago" or "3h" to YYYY-MM-DD.
      // Returns "" when the text has no recognizable amount/unit pair.
      const parseRelativeDate = (text) => {
        const match = text.match(/(\d+)\s*(minute|hour|day|week|month|year|h|d|w)/i);
        if (!match) return "";

        const amount = Number.parseInt(match[1], 10);
        const unit = match[2].toLowerCase();
        const date = new Date();

        if (unit === "minute") {
          date.setTime(date.getTime() - amount * MINUTE_MS);
        } else if (unit === "hour" || unit === "h") {
          date.setTime(date.getTime() - amount * HOUR_MS);
        } else if (unit === "day" || unit === "d") {
          date.setTime(date.getTime() - amount * DAY_MS);
        } else if (unit === "week" || unit === "w") {
          date.setTime(date.getTime() - amount * 7 * DAY_MS);
        } else if (unit === "month") {
          date.setMonth(date.getMonth() - amount);
        } else {
          date.setFullYear(date.getFullYear() - amount);
        }

        // An absurdly large amount yields an invalid Date; toISOString() would throw.
        return Number.isNaN(date.getTime()) ? "" : date.toISOString().split("T")[0];
      };

      const data = {
        jobId: "",
        title: "",
        company: "",
        location: "",
        jobUrl: "",
        postedDate: "",
        description: "",
        jobType: "",
        experienceLevel: "",
      };

      // Record a job URL and, if still unknown, the job id embedded in it.
      const rememberJobUrl = (href) => {
        data.jobUrl = href;
        if (!data.jobId) {
          const idMatch = href.match(JOB_ID_IN_URL);
          if (idMatch) {
            data.jobId = idMatch[1];
          }
        }
      };

      // Extract job ID from data-job-id or link
      const jobIdAttr =
        el.getAttribute("data-job-id") ||
        el.getAttribute("data-occludable-job-id") ||
        el.querySelector("[data-job-id]")?.getAttribute("data-job-id");
      if (jobIdAttr) {
        data.jobId = String(jobIdAttr);
      }

      // Extract title and URL - try multiple selectors (updated for LinkedIn's current structure)
      const titleSelectors = [
        "a.job-card-list__title",
        ".job-card-list__title-link",
        "a[data-test-id='job-title']",
        ".base-search-card__title a",
        "h3 a",
        ".job-card-container__link",
        "a[href*='/jobs/view/']",
        ".job-card-list__title a",
        ".base-search-card__title",
        "h3.base-search-card__title a",
        "[class*='job-title'] a",
        "[class*='job-card'] a[href*='/jobs/']",
        "a[href*='/jobs/view/'] span", // LinkedIn sometimes wraps title in span
        "h3[class*='title'] a",
        "h4[class*='title'] a",
        ".job-search-card__title a",
        ".jobs-search-results__list-item a[href*='/jobs/view/']",
      ];

      for (const selector of titleSelectors) {
        const node = el.querySelector(selector);
        if (!node) continue;

        // Some selectors match non-anchor elements with no href; do not let
        // those erase a URL that an earlier selector already found.
        const href = node.getAttribute("href") || "";
        if (href) {
          rememberJobUrl(href);
        }

        data.title = textOf(node);
        if (!data.title) {
          const child = node.querySelector("span, div");
          if (child) {
            data.title = textOf(child);
          }
        }
        if (data.title) break;
      }

      // Fallback: Get title from any link with job URL pattern
      if (!data.title) {
        for (const link of el.querySelectorAll(JOB_LINK_SELECTOR)) {
          const href = link.getAttribute("href") || "";
          if (!href) continue;

          rememberJobUrl(href);
          data.title = textOf(link);
          if (!data.title) {
            const visibleChild = Array.from(link.querySelectorAll("*")).find(
              (child) => child.textContent?.trim() && child.offsetParent !== null
            );
            if (visibleChild) {
              data.title = visibleChild.textContent?.trim() || "";
            }
          }
          if (data.title) break;
        }
      }

      // Last resort: Extract from aria-label or title attribute
      if (!data.title) {
        const labelledLink = el.querySelector("a[aria-label], a[title]");
        if (labelledLink) {
          data.title =
            labelledLink.getAttribute("aria-label") || labelledLink.getAttribute("title") || "";
          const href = labelledLink.getAttribute("href");
          if (href?.includes("/jobs/view/")) {
            rememberJobUrl(href);
          }
        }
      }

      // The title may have come from an element without an href; still try to
      // find the card's job link so the record carries a usable URL.
      if (!data.jobUrl) {
        const href = el.querySelector(JOB_LINK_SELECTOR)?.getAttribute("href");
        if (href) {
          rememberJobUrl(href);
        }
      }

      // Extract company name - try multiple selectors and patterns
      const companySelectors = [
        ".job-card-container__company-name",
        ".job-card-container__primary-description",
        "a[data-test-id='job-company-name']",
        ".base-search-card__subtitle",
        ".job-card-container__company-name-link",
        "[class*='company-name']",
        "[class*='job-card-container__company']",
        ".base-search-card__subtitle-link",
        "a[href*='/company/']",
        "[class*='subtitle']",
        "[class*='primary-description']",
      ];

      for (const selector of companySelectors) {
        const node = el.querySelector(selector);
        if (!node) continue;
        const text = textOf(node);
        if (looksLikeCompany(text)) {
          data.company = text;
          break;
        }
      }

      // Fallback: Look for company link and get its text
      if (!data.company) {
        const companyLink = el.querySelector("a[href*='/company/']");
        if (companyLink) {
          const linkText = textOf(companyLink);
          if (linkText.length > 1 && linkText.length < 100) {
            data.company = linkText;
          }
        }
      }

      // Fallback: Look for text that appears after the title but before location/metadata
      if (!data.company) {
        const titleAnchor = el.querySelector(JOB_LINK_SELECTOR);
        const container = titleAnchor?.parentElement;
        if (container) {
          const siblings = Array.from(container.children);
          const firstCandidate = siblings.indexOf(titleAnchor) + 1;
          // Only the three elements right after the title link are considered.
          for (const sibling of siblings.slice(firstCandidate, firstCandidate + 3)) {
            const text = textOf(sibling);
            if (looksLikeCompany(text)) {
              data.company = text;
              break;
            }
          }
        }
      }

      // Extract location - try multiple selectors and patterns
      const locationSelectors = [
        ".job-card-container__metadata-item",
        ".job-card-container__metadata-wrapper .job-card-container__metadata-item",
        "[data-test-id='job-location']",
        ".base-search-card__metadata",
        ".job-card-container__metadata",
        "[class*='metadata']",
        "[class*='location']",
      ];

      for (const selector of locationSelectors) {
        for (const node of el.querySelectorAll(selector)) {
          const text = textOf(node);
          // Check if it looks like a location (not a date or job type)
          if (text && !RELATIVE_DATE.test(text) && !JOB_TYPE.test(text) && looksLikeLocation(text)) {
            data.location = text;
            break;
          }
        }
        if (data.location) break;
      }

      // Fallback: Look for location link
      if (!data.location) {
        const locationLink = el.querySelector("a[href*='/location/']");
        if (locationLink) {
          const linkText = textOf(locationLink);
          if (linkText.length > 2) {
            data.location = linkText;
          }
        }
      }

      // Fallback: Look for text patterns that look like locations
      if (!data.location) {
        const cardLines = (el.innerText || el.textContent || "")
          .split("\n")
          .map((line) => line.trim())
          .filter((line) => line.length > 0);

        for (const line of cardLines) {
          // Skip if it's the title, company, or a date
          if (line === data.title || line === data.company || isDateOrJobType(line)) {
            continue;
          }
          if (looksLikeLocation(line)) {
            data.location = line;
            break;
          }
        }
      }

      // Extract posted date
      const dateSelectors = [
        "time",
        ".job-card-container__metadata-item time",
        "[data-test-id='job-posted-date']",
        "time[datetime]",
        "[class*='date']",
        "[class*='posted']",
      ];

      for (const selector of dateSelectors) {
        const node = el.querySelector(selector);
        if (!node) continue;

        const datetime = node.getAttribute("datetime");
        const titleAttr = node.getAttribute("title");
        const text = textOf(node);

        if (datetime) {
          data.postedDate = datetime;
          break;
        }
        if (titleAttr && /\d{4}-\d{2}-\d{2}/.test(titleAttr)) {
          data.postedDate = titleAttr;
          break;
        }
        if (text && RELATIVE_DATE.test(text)) {
          // Parse relative dates like "2 days ago"; keep the raw text if the
          // amount/unit cannot be parsed.
          data.postedDate = parseRelativeDate(text) || text;
          break;
        }
      }

      // Fallback: Look for date patterns in metadata text
      if (!data.postedDate) {
        for (const item of el.querySelectorAll("[class*='metadata']")) {
          const parsed = parseRelativeDate(textOf(item));
          if (parsed) {
            data.postedDate = parsed;
            break;
          }
        }
      }

      // Extract job type and experience level from metadata
      const metadataSelectors = [
        ".job-card-container__metadata-item",
        "[class*='metadata']",
        "[class*='job-type']",
        "[class*='experience']",
      ];
      const EXPERIENCE_IN_METADATA =
        /(Entry level|Mid-Senior|Associate|Executive|Internship|Senior|Junior|Mid-level)/i;

      for (const selector of metadataSelectors) {
        for (const item of el.querySelectorAll(selector)) {
          const text = textOf(item);

          if (!data.jobType && JOB_TYPE_OR_FREELANCE.test(text)) {
            data.jobType = text;
          }
          if (!data.experienceLevel && EXPERIENCE_IN_METADATA.test(text)) {
            data.experienceLevel = text;
          }
          if (data.jobType && data.experienceLevel) break;
        }
        if (data.jobType && data.experienceLevel) break;
      }

      // Fallback: Look in all text for job type and experience patterns
      if (!data.jobType || !data.experienceLevel) {
        const words = (el.innerText || el.textContent || "").split(/\s+/);

        if (!data.jobType) {
          data.jobType = words.find((word) => JOB_TYPE_OR_FREELANCE.test(word)) || "";
        }

        if (!data.experienceLevel) {
          const EXPERIENCE_PHRASE =
            /(Entry level|Mid-Senior|Associate|Executive|Internship|Senior level|Junior level|Mid level)/i;
          // Slide a two-word window over the card text.
          for (let i = 0; i < words.length - 1; i++) {
            const phrase = `${words[i]} ${words[i + 1]}`;
            if (EXPERIENCE_PHRASE.test(phrase)) {
              data.experienceLevel = phrase;
              break;
            }
          }
        }
      }

      // Try to get description snippet
      const descSelectors = [
        ".job-card-list__description",
        ".job-card-container__description",
        "[data-test-id='job-description']",
        ".base-search-card__snippet",
        "[class*='description']",
        "[class*='snippet']",
        "[class*='summary']",
      ];

      for (const selector of descSelectors) {
        const node = el.querySelector(selector);
        if (!node) continue;
        const text = textOf(node);
        // Only use if it's substantial (more than just a few words)
        if (text.length > 20) {
          data.description = text.substring(0, 500); // Limit description length
          break;
        }
      }

      // Fallback: Extract description from any paragraph or div that's not title/company/location
      if (!data.description) {
        for (const node of el.querySelectorAll("p, div, span")) {
          const text = textOf(node);
          if (
            text.length > 30 &&
            text !== data.title &&
            text !== data.company &&
            text !== data.location &&
            !isDateOrJobType(text)
          ) {
            data.description = text.substring(0, 500);
            break;
          }
        }
      }

      return data;
    });

    // Clean and format. `title` is reassigned by the fallback below, so it
    // must not be a const.
    let title = cleanText(jobData.title);

    // If we have no title and no jobId, try one more aggressive extraction
    if (!jobData.jobId && !title) {
      try {
        const fallback = await jobElement.evaluate((el) => {
          const LEADING_RELATIVE_DATE = /^\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i;
          const JOB_TYPE = /^(Full-time|Part-time|Contract|Internship|Temporary)$/i;
          const KNOWN_PLACE = /\b(ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|toronto|vancouver|calgary|ottawa|montreal|canada|united states|usa)\b/i;

          const lines = (el.innerText || el.textContent || "")
            .split("\n")
            .map((line) => line.trim())
            .filter((line) => line.length > 0);

          // Find job link
          const jobUrl = el.querySelector("a[href*='/jobs/view/']")?.getAttribute("href") || "";
          const jobId = jobUrl.match(/\/jobs\/view\/(\d+)/)?.[1] || "";

          // First non-empty line is usually the title
          const title = lines[0] || "";

          // Look for company (usually one of the next few lines)
          const company =
            lines
              .slice(1, 5)
              .find(
                (line) =>
                  line.length < 100 &&
                  !line.includes(",") &&
                  !LEADING_RELATIVE_DATE.test(line) &&
                  !JOB_TYPE.test(line)
              ) || "";

          // Look for location (usually has comma or location keywords)
          const location = lines.find((line) => line.includes(",") || KNOWN_PLACE.test(line)) || "";

          return { jobId, jobUrl, title, company, location };
        });

        if (fallback.jobId || fallback.title) {
          if (fallback.jobId) jobData.jobId = fallback.jobId;
          if (fallback.jobUrl) jobData.jobUrl = fallback.jobUrl;
          if (fallback.title) title = cleanText(fallback.title);
          if (fallback.company && !jobData.company) jobData.company = fallback.company;
          if (fallback.location && !jobData.location) jobData.location = fallback.location;
        }
      } catch (fallbackError) {
        // Best effort only: the checks below decide whether the card is usable.
        logger.debug(`Fallback job extraction failed: ${fallbackError.message}`);
      }
    }

    // Resolve the URL only after the fallback, which may have supplied it.
    let jobUrl = jobData.jobUrl || "";
    if (jobUrl && !jobUrl.startsWith("http")) {
      jobUrl = `https://www.linkedin.com${jobUrl}`;
    }

    // If we still have no title and no jobId, this extraction failed
    if (!jobData.jobId && !title) {
      logger.warning(`⚠️ Job extraction failed: no jobId or title found`);
      return null;
    }

    // Filter out jobs that have jobId but no meaningful data (title, company, or location)
    // These are likely jobs that haven't loaded their content yet
    if (jobData.jobId && !title && !jobData.company && !jobData.location) {
      logger.debug(`⚠️ Job ${jobData.jobId} has no extractable data (title, company, or location) - skipping`);
      return null;
    }

    // Log if we're missing critical fields (only in debug mode to reduce noise)
    if (process.env.DEBUG === "true") {
      if (!title) {
        logger.warning(`⚠️ Job ${jobData.jobId} missing title`);
      }
      if (!jobData.company) {
        logger.debug(`⚠️ Job ${jobData.jobId || title} missing company`);
      }
      if (!jobData.location) {
        logger.debug(`⚠️ Job ${jobData.jobId || title} missing location`);
      }
    }

    // Generate job ID if not found
    const jobId =
      jobData.jobId || `linkedin-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;

    // Extract full job details if coreParser and jobUrl are provided
    let fullDetails = { fullDescription: "", roleDuties: "", jobRequirements: "" };
    if (coreParser && jobUrl) {
      try {
        fullDetails = await extractFullJobDescription(coreParser, jobUrl);
      } catch (error) {
        logger.debug(`Could not extract full job details for ${jobUrl}: ${error.message}`);
      }
    }

    return {
      jobId,
      title,
      company: cleanText(jobData.company),
      location: cleanText(jobData.location),
      jobUrl,
      postedDate: jobData.postedDate,
      // Prefer the full description; fall back to the card snippet.
      description: cleanText(fullDetails.fullDescription || jobData.description),
      roleDuties: cleanText(fullDetails.roleDuties),
      jobRequirements: cleanText(fullDetails.jobRequirements),
      jobType: jobData.jobType,
      experienceLevel: jobData.experienceLevel,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "linkedin-jobs",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}
|
|
|
|
/**
 * Selectors tried, in order, to locate the "next page" pagination control.
 * Shared by hasNextPageAvailable() and navigateToNextPage() so both always
 * look for the same elements.
 */
const NEXT_PAGE_BUTTON_SELECTORS = [
  "button[aria-label='Next']",
  "button[aria-label='Next page']",
  "button[aria-label*='Next']",
  ".artdeco-pagination__button--next",
  "button[data-test-id='pagination-next-button']",
  "button.pagination__button--next",
  "button[class*='pagination'][class*='next']",
  "li[class*='pagination'][class*='next'] button",
  "a[aria-label='Next']",
  "a[aria-label='Next page']",
];

/**
 * Decide whether a candidate next-page button can be used: it must not be
 * marked disabled (attribute, aria-disabled, or a disabled CSS class) and it
 * must be visible. A failing disabled check counts as "not disabled"; a
 * failing visibility check counts as "not visible".
 *
 * @param {object} button - Element handle exposing `evaluate()` and `isVisible()`.
 * @returns {Promise<boolean>} True when the button is enabled and visible.
 */
async function isUsableNextPageButton(button) {
  const isDisabled = await button
    .evaluate(
      (el) =>
        el.hasAttribute("disabled") ||
        el.getAttribute("aria-disabled") === "true" ||
        el.classList.contains("disabled") ||
        el.classList.contains("artdeco-button--disabled")
    )
    .catch(() => false);
  if (isDisabled) {
    return false;
  }
  return button.isVisible().catch(() => false);
}

/**
 * Check if next page is available
 *
 * @param {object} page - Page object exposing `$(selector)`.
 * @returns {Promise<boolean>} True when an enabled, visible next-page button
 *   exists; false otherwise (including on errors).
 */
async function hasNextPageAvailable(page) {
  try {
    // LinkedIn uses pagination buttons - try multiple selectors
    for (const selector of NEXT_PAGE_BUTTON_SELECTORS) {
      try {
        const nextButton = await page.$(selector);
        if (nextButton && (await isUsableNextPageButton(nextButton))) {
          logger.debug(`✅ Found next page button with selector: ${selector}`);
          return true;
        }
      } catch (e) {
        // Try next selector
        continue;
      }
    }

    logger.debug(`❌ No next page button found`);
    return false;
  } catch (error) {
    logger.debug(`Error checking for next page: ${error.message}`);
    return false;
  }
}

/**
 * Navigate to next page
 *
 * Clicks the first usable next-page button and waits for job list items to be
 * present. If clicking a candidate fails, the remaining selectors are tried.
 *
 * @param {object} page - Page object exposing `$`, `waitForSelector` and `$$eval`.
 * @param {object} [options] - Optional timing overrides (defaults match the
 *   previously hard-coded values).
 * @param {number} [options.scrollDelayMs=100] - Pause after scrolling the button into view.
 * @param {number} [options.waitTimeoutMs=6000] - Timeout for each job-element wait.
 * @param {number} [options.renderDelayMs=300] - Buffer after job elements are detected.
 * @param {number} [options.retryDelayMs=1500] - Extra wait before re-checking when
 *   job elements were not detected in time.
 * @returns {Promise<boolean>} True when job elements are present after the
 *   click; false when no button could be clicked or no job elements were found.
 */
async function navigateToNextPage(page, options = {}) {
  const {
    scrollDelayMs = 100,
    waitTimeoutMs = 6000,
    renderDelayMs = 300,
    retryDelayMs = 1500,
  } = options || {};
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const jobItemSelectors = [
    "li[data-occludable-job-id]",
    "li.jobs-search-results__list-item",
    ".scaffold-layout__list-item",
  ];

  try {
    for (const selector of NEXT_PAGE_BUTTON_SELECTORS) {
      try {
        const nextButton = await page.$(selector);
        // Same enabled + visible requirement as hasNextPageAvailable().
        if (!nextButton || !(await isUsableNextPageButton(nextButton))) {
          continue;
        }

        // Scroll button into view before clicking (minimal delay)
        await nextButton.scrollIntoViewIfNeeded();
        await pause(scrollDelayMs);

        // Log only after the click has actually succeeded.
        await nextButton.click();
        logger.info(`✅ Clicked next page button (selector: ${selector})`);

        // Wait for job elements to appear (this is the key indicator that page loaded)
        // Use Promise.race to wait for any of the common job element selectors
        try {
          await Promise.race(
            jobItemSelectors.map((jobSelector) =>
              page.waitForSelector(jobSelector, { timeout: waitTimeoutMs })
            )
          );

          // Small buffer for content to fully render
          await pause(renderDelayMs);
          return true;
        } catch (e) {
          // If elements don't appear quickly, wait a bit more and check
          logger.debug(`⚠️ Job elements not detected immediately, waiting...`);
          await pause(retryDelayMs);

          // Verify elements exist now
          const jobCount = await page
            .$$eval(jobItemSelectors.join(", "), (elements) => elements.length)
            .catch(() => 0);

          if (jobCount > 0) {
            return true;
          }
          logger.warning(`⚠️ No job elements found after navigation`);
          return false;
        }
      } catch (e) {
        // Try next selector
        continue;
      }
    }

    logger.warning(`⚠️ Could not find or click next page button`);
    return false;
  } catch (error) {
    logger.warning(`Failed to navigate to next page: ${error.message}`);
    return false;
  }
}
|
|
|
|
module.exports = {
|
|
linkedinJobsStrategy,
|
|
buildJobSearchUrl,
|
|
};
|
|
|