/** * LinkedIn Jobs Parsing Strategy * * Uses core-parser for browser management and ai-analyzer for utilities */ const { logger, cleanText, validateLocationAgainstFilters, parseLocationFilters, containsAnyKeyword, containsAllKeywords, matchesKeywordGroups, } = require("ai-analyzer"); /** * LinkedIn Jobs URL builder */ function buildJobSearchUrl(keyword, location = "", filters = {}) { const baseUrl = "https://www.linkedin.com/jobs/search/"; // Always wrap keywords in quotes to ensure exact phrase matching // LinkedIn's search treats unquoted keywords as individual words (OR logic) // e.g., "co-op" becomes "co" OR "op", "software engineer" becomes "software" OR "engineer" // Wrapping in quotes forces LinkedIn to search for the exact phrase // URLSearchParams will properly encode the quotes const searchKeyword = `"${keyword}"`; const params = new URLSearchParams({ keywords: searchKeyword, sortBy: "DD", // Date posted (newest first) }); if (location) { params.append("location", location); } // Add date filter if provided (f_TPR parameter) // LinkedIn uses f_TPR=r where seconds is the time range if (filters.minDate) { try { const minDate = new Date(filters.minDate); const now = new Date(); const secondsDiff = Math.floor((now - minDate) / 1000); // LinkedIn supports relative timeframes (f_TPR parameter) // If date is in the future, don't add filter if (secondsDiff > 0) { // LinkedIn typically supports up to ~30 days (2592000 seconds) // For dates older than 30 days, we'll still add it but LinkedIn may limit results const maxSeconds = 2592000; // 30 days const timeRange = Math.min(secondsDiff, maxSeconds); params.append("f_TPR", `r${timeRange}`); } } catch (error) { logger.warning(`āš ļø Invalid date format for minDate: ${filters.minDate}. Expected format: YYYY-MM-DD`); } } // Add additional filters if (filters.experienceLevel) { params.append("f_E", filters.experienceLevel); } if (filters.jobType) { params.append("f_JT", filters.jobType); // F=Full-time, P=Part-time, C=Contract, T=Temporary, I=Internship } if (filters.remote) { params.append("f_WT", "2"); // 2 = Remote } return `${baseUrl}?${params.toString()}`; } /** * LinkedIn Jobs parsing strategy function */ async function linkedinJobsStrategy(coreParser, options = {}) { const { keywords = ["software engineer", "developer"], keywordGroups = null, // Array of keyword groups for grouped AND/OR logic locationFilter = null, maxPages = 5, credentials = {}, location = "", // LinkedIn location search (e.g., "Canada", "Toronto, Ontario, Canada") minDate = null, // Minimum posted date (format: YYYY-MM-DD) useAndLogic = false, // Use AND logic instead of OR logic for keywords } = options; const results = []; const rejectedResults = []; const seenJobs = new Set(); // Create a backup to track results in case of issues let resultsBackup = []; let rejectedResultsBackup = []; try { // Create main page const page = await coreParser.createPage("linkedin-jobs-main"); // Authenticate to LinkedIn logger.info("šŸ” Authenticating to LinkedIn..."); await coreParser.authenticate("linkedin", credentials, "linkedin-jobs-main"); logger.info("āœ… LinkedIn authentication successful"); logger.info("šŸš€ Starting LinkedIn Jobs parser..."); logger.info(`šŸ” Keywords: ${keywords.join(", ")}`); if (keywordGroups) { logger.info(`šŸ”— Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`); } else { logger.info(`šŸ”— Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`); } logger.info(`šŸ“ Location Filter: ${locationFilter || "None"}`); logger.info(`šŸŒ LinkedIn Location: ${location || "None"}`); logger.info(`šŸ“„ Max Pages: ${maxPages}`); if (minDate) { logger.info(`šŸ“… Min Date Filter: ${minDate} (jobs posted after this date)`); } // Determine search keywords based on logic type let searchKeywords; if (keywordGroups) { // For grouped AND/OR logic, search each keyword in each group (OR within groups) // We'll combine results and filter to ensure all groups match (AND between groups) searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups } else if (useAndLogic) { // For simple AND logic, combine all keywords into a single search query searchKeywords = [keywords.join(" ")]; } else { // For OR logic, search each keyword separately searchKeywords = keywords; } // Search for each keyword (or combined keyword for AND logic) for (const keyword of searchKeywords) { logger.info(`\nšŸ” Searching LinkedIn Jobs for: "${keyword}"`); const searchUrl = buildJobSearchUrl(keyword, location, { minDate: minDate, }); logger.info(`šŸ”— Search URL: ${searchUrl}`); // Check if page is still valid before proceeding try { await page.evaluate(() => document.readyState).catch(() => { throw new Error("Page is no longer valid - browser may have closed"); }); } catch (pageError) { logger.error(`āŒ Page is no longer accessible: ${pageError.message}`); logger.info(`āš ļø Preserving ${results.length} jobs found so far`); break; // Exit keyword loop if page is invalid } try { // Navigate to job search results await coreParser.navigateTo(searchUrl, { pageId: "linkedin-jobs-main", retries: 2, waitUntil: "networkidle", }); // Wait for page to load - reduced delay, use networkidle from navigation await new Promise((resolve) => setTimeout(resolve, 2000)); // Verify we're on the right page and check what LinkedIn shows const currentUrl = page.url(); logger.info(`šŸ“ Current page URL: ${currentUrl}`); // Check if LinkedIn shows any results count try { const resultsText = await page.evaluate(() => { // Look for result count text like "Showing X results" or "X jobs" const possibleTexts = [ document.querySelector("h1")?.textContent, document.querySelector(".results-context-header__job-count")?.textContent, document.querySelector("[class*='results-count']")?.textContent, document.querySelector("[class*='job-count']")?.textContent, ].filter(Boolean); return possibleTexts.join(" | ") || "No results count found"; }); logger.info(`šŸ“Š LinkedIn results info: ${resultsText}`); } catch (e) { logger.debug(`Could not get results count: ${e.message}`); } // Scroll to trigger lazy loading - single scroll operation try { await page.evaluate(() => { window.scrollTo(0, 500); }); await new Promise((resolve) => setTimeout(resolve, 1000)); } catch (e) { logger.debug(`Could not scroll page: ${e.message}`); } // Wait for job listings container - try multiple selectors let hasResults = false; const possibleSelectors = [ ".jobs-search-results-list", ".jobs-search-results", "[data-test-id='job-search-results-list']", ".scaffold-layout__list-container", "ul.scaffold-layout__list-container", ".jobs-search__results-list", "main .scaffold-layout__list", ]; // Try selectors in parallel with shorter timeout const selectorPromises = possibleSelectors.map(async (selector) => { try { await page.waitForSelector(selector, { timeout: 3000 }); const count = await page.$$(selector).then((elements) => elements.length); if (count > 0) { return { selector, count, success: true }; } } catch (e) { // Selector failed } return { selector, success: false }; }); // Wait for first successful selector const selectorResults = await Promise.allSettled(selectorPromises); for (const result of selectorResults) { if (result.status === 'fulfilled' && result.value.success) { hasResults = true; logger.info(`āœ… Found job results container with selector: ${result.value.selector}`); break; } } if (!hasResults) { logger.warning(`āš ļø No job results container found for keyword: ${keyword}`); // Debug: Check what's actually on the page try { const pageTitle = await page.title(); const pageUrl = page.url(); logger.info(`šŸ“„ Page title: ${pageTitle}`); logger.info(`šŸ”— Page URL: ${pageUrl}`); // Check for common LinkedIn elements const hasMain = await page.$("main").then(el => el !== null).catch(() => false); const hasJobsSection = await page.$("[class*='job']").then(el => el !== null).catch(() => false); logger.info(`šŸ” Debug - Has main: ${hasMain}, Has jobs section: ${hasJobsSection}`); // Take screenshot for debugging const screenshotPath = `debug-linkedin-jobs-${keyword.replace(/\s+/g, '-')}-${Date.now()}.png`; await page.screenshot({ path: screenshotPath, fullPage: true }); logger.info(`šŸ“ø Debug screenshot saved: ${screenshotPath}`); } catch (e) { logger.warning(`Could not capture debug info: ${e.message}`); } continue; } // LinkedIn uses pagination with a "Next" button // Extract jobs from each page and navigate to next page const allJobs = []; let currentPage = 1; const maxPagesToProcess = maxPages > 0 ? maxPages : 999; // 0 means unlimited logger.info(`šŸ“„ Processing pages (max: ${maxPagesToProcess === 999 ? 'unlimited' : maxPagesToProcess}) for "${keyword}"...`); while (currentPage <= maxPagesToProcess) { logger.info(`šŸ“„ Processing page ${currentPage}...`); // Wait for page to fully load await new Promise((resolve) => setTimeout(resolve, 2000)); // Extract jobs from current page const pageJobs = await extractJobsFromPage(page, keyword, locationFilter, coreParser); logger.info(`šŸ“‹ Extracted ${pageJobs.length} jobs from page ${currentPage}`); if (pageJobs.length === 0) { logger.warning(`āš ļø No jobs found on page ${currentPage}, stopping pagination`); break; } allJobs.push(...pageJobs); // Check if there's a next page const hasNext = await hasNextPageAvailable(page); if (!hasNext) { logger.info(`āœ… No more pages available. Total jobs extracted: ${allJobs.length}`); break; } // Navigate to next page if we haven't reached maxPages if (currentPage < maxPagesToProcess) { logger.info(`āž”ļø Navigating to page ${currentPage + 1}...`); const navigationSuccess = await navigateToNextPage(page); if (!navigationSuccess) { logger.warning(`āš ļø Failed to navigate to next page, stopping pagination`); break; } currentPage++; // Quick verification that job elements are present (navigateToNextPage already waited for them) const jobCount = await page.$$eval( "li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item", (elements) => elements.length ).catch(() => 0); if (jobCount === 0) { logger.warning(`āš ļø No job elements found on page ${currentPage} after navigation, stopping pagination`); break; } logger.debug(`āœ… Page ${currentPage} loaded with ${jobCount} job elements`); } else { logger.info(`šŸ“Š Reached max pages limit (${maxPagesToProcess}). Total jobs extracted: ${allJobs.length}`); break; } } logger.info(`šŸ“‹ Extracted ${allJobs.length} total jobs across ${currentPage} page(s)`); // Verify page is still valid after extraction try { await page.evaluate(() => document.readyState); } catch (pageError) { logger.warning(`āš ļø Page became invalid after extraction, but we have ${allJobs.length} jobs extracted`); } // Log sample job data for debugging if (allJobs.length > 0 && process.env.DEBUG === "true") { const sampleJob = allJobs[0]; logger.debug(`šŸ“ Sample job: ID=${sampleJob.jobId}, Title=${sampleJob.title}, Location=${sampleJob.location || 'N/A'}, Company=${sampleJob.company || 'N/A'}`); } let duplicateCount = 0; let locationRejectedCount = 0; let addedCount = 0; let noJobIdCount = 0; for (const job of allJobs) { // Handle jobs without jobId - use URL as fallback identifier if (!job.jobId || job.jobId === "") { noJobIdCount++; // Use job URL as identifier if no jobId if (job.jobUrl) { const urlMatch = job.jobUrl.match(/\/jobs\/view\/(\d+)/); if (urlMatch) { job.jobId = urlMatch[1]; } else { // Generate a unique ID from URL job.jobId = `linkedin-${job.jobUrl.replace(/[^a-zA-Z0-9]/g, '-')}`; } } else { // No jobId and no URL - skip this job logger.warning(`āš ļø Job has no jobId or URL, skipping: ${job.title || 'Unknown'}`); continue; } } // Skip duplicates if (seenJobs.has(job.jobId)) { duplicateCount++; if (process.env.DEBUG === "true") { logger.debug(`ā­ļø Skipping duplicate job: ${job.jobId} - ${job.title}`); } continue; } seenJobs.add(job.jobId); // Validate keywords based on logic type if (keywordGroups) { // Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR) const fullText = `${job.title} ${job.description} ${job.company}`; if (!matchesKeywordGroups(fullText, keywordGroups)) { rejectedResults.push({ ...job, rejectionReason: "Job does not match all keyword groups", }); if (process.env.DEBUG === "true") { logger.debug(`šŸ” Rejected (grouped logic): "${job.title}" - does not match all groups`); } continue; } } else if (useAndLogic) { // Simple AND logic: all keywords must match const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase(); if (!containsAllKeywords(fullText, keywords)) { rejectedResults.push({ ...job, rejectionReason: "Not all keywords found in job listing", }); if (process.env.DEBUG === "true") { logger.debug(`šŸ” Rejected (AND logic): "${job.title}" - not all keywords found`); } continue; } } // For OR logic, trust LinkedIn's search results (already filtered) // Validate location if filtering enabled if (locationFilter) { // Parse locationFilter string into array if it's a string const locationFiltersArray = typeof locationFilter === 'string' ? parseLocationFilters(locationFilter) : locationFilter; const locationValid = validateLocationAgainstFilters( job.location, locationFiltersArray ); if (!locationValid.isValid) { locationRejectedCount++; rejectedResults.push({ ...job, rejectionReason: locationValid.reasoning || "Location filter mismatch", }); if (process.env.DEBUG === "true") { logger.debug(`šŸ“ Rejected location: "${job.location}" - ${locationValid.reasoning || "Location filter mismatch"}`); } continue; } } results.push(job); addedCount++; } // Backup results after each keyword processing resultsBackup = [...results]; rejectedResultsBackup = [...rejectedResults]; logger.info(`šŸ“Š Processing complete: ${addedCount} added, ${locationRejectedCount} location-rejected, ${duplicateCount} duplicates, ${noJobIdCount} had no jobId`); logger.info(`šŸ“Š Current results count: ${results.length} jobs accumulated so far`); logger.info(`šŸ“Š Backup results count: ${resultsBackup.length} jobs in backup`); } catch (error) { logger.error(`Error processing keyword "${keyword}": ${error.message}`); logger.error(`Stack: ${error.stack}`); // Preserve results even if there's an error logger.info(`āš ļø Preserving ${results.length} jobs found before error`); } } // Log results before returning logger.info(`šŸ“Š Final results check: results.length=${results.length}, rejectedResults.length=${rejectedResults.length}`); logger.info(`šŸ“Š Backup check: resultsBackup.length=${resultsBackup.length}, rejectedResultsBackup.length=${rejectedResultsBackup.length}`); // If results array is empty but backup has data, use backup (defensive programming) const finalResults = results.length > 0 ? results : resultsBackup; const finalRejectedResults = rejectedResults.length > 0 ? rejectedResults : rejectedResultsBackup; if (results.length === 0 && resultsBackup.length > 0) { logger.warning(`āš ļø Results array was empty but backup has ${resultsBackup.length} jobs - using backup!`); } if (finalResults.length > 0) { logger.info(`šŸ“ First result sample: ${JSON.stringify(finalResults[0], null, 2).substring(0, 200)}...`); } logger.info( `šŸŽÆ LinkedIn Jobs parsing completed: ${finalResults.length} jobs found, ${finalRejectedResults.length} rejected` ); // Final verification - log if results seem wrong if (finalResults.length === 0 && finalRejectedResults.length === 0) { logger.warning(`āš ļø No jobs found or rejected - this might indicate an extraction issue`); } const returnValue = { results: [...finalResults], // Create a copy to ensure we're returning the right data rejectedResults: [...finalRejectedResults], summary: { totalJobs: finalResults.length, totalRejected: finalRejectedResults.length, keywords: keywords.join(", "), locationFilter, source: "linkedin-jobs", }, }; logger.info(`šŸ“¦ Returning: ${returnValue.results.length} results, ${returnValue.rejectedResults.length} rejected`); return returnValue; } catch (error) { logger.error(`āŒ LinkedIn Jobs parsing failed: ${error.message}`); logger.error(`Stack: ${error.stack}`); // Return whatever results we have, even if there was an error logger.info(`āš ļø Returning ${results.length} jobs found before fatal error`); return { results, rejectedResults, summary: { totalJobs: results.length, totalRejected: rejectedResults.length, keywords: keywords.join(", "), locationFilter, source: "linkedin-jobs", error: error.message, }, }; } } /** * Scroll to load more jobs (LinkedIn uses infinite scroll) - improved to load all jobs */ async function scrollToLoadJobs(page) { try { let previousJobCount = 0; let currentJobCount = 0; let scrollAttempts = 0; let noChangeCount = 0; // Count how many times count hasn't changed const maxScrollAttempts = 50; // Increased for large result sets const maxNoChangeAttempts = 3; // Stop if count doesn't change 3 times in a row logger.info(`šŸ“œ Starting to scroll and load jobs...`); // Keep scrolling until no more jobs load while (scrollAttempts < maxScrollAttempts) { // Count current jobs currentJobCount = await page.$$eval( "li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item", (elements) => elements.length ).catch(() => 0); // If no new jobs loaded, increment no-change counter if (currentJobCount === previousJobCount && scrollAttempts > 0) { noChangeCount++; // If count hasn't changed 3 times in a row, we're probably done if (noChangeCount >= maxNoChangeAttempts) { logger.info(`šŸ“Š Loaded ${currentJobCount} jobs after ${scrollAttempts} scrolls (no new jobs for ${noChangeCount} attempts)`); break; } } else { // Count changed, reset no-change counter noChangeCount = 0; } previousJobCount = currentJobCount; // Scroll down - use smooth scrolling to trigger lazy loading await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' }); }); // Wait for new content to load - LinkedIn sometimes needs more time await new Promise((resolve) => setTimeout(resolve, 2500)); // Also try scrolling in smaller increments to trigger lazy loading if (scrollAttempts % 3 === 0) { await page.evaluate(() => { window.scrollBy(0, 1000); }); await new Promise((resolve) => setTimeout(resolve, 1000)); } scrollAttempts++; // Log progress every 5 scrolls if (scrollAttempts % 5 === 0) { const newCount = await page.$$eval( "li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item", (elements) => elements.length ).catch(() => 0); logger.info(`šŸ“œ Scrolled ${scrollAttempts} times, loaded ${newCount} jobs so far...`); } } // Final scroll to ensure everything is loaded await page.evaluate(() => { window.scrollTo(0, document.body.scrollHeight); }); await new Promise((resolve) => setTimeout(resolve, 2000)); // Final count const finalCount = await page.$$eval( "li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item", (elements) => elements.length ).catch(() => 0); logger.info(`āœ… Finished scrolling. Total jobs loaded: ${finalCount}`); } catch (error) { logger.warning(`Could not scroll page: ${error.message}`); } } /** * Extract jobs from current page */ async function extractJobsFromPage(page, keyword, locationFilter, coreParser = null) { const jobs = []; try { // LinkedIn job listings are in