/** * LinkedIn Parsing Strategy * * Uses core-parser for browser management and ai-analyzer for utilities */ const { logger, cleanText, containsAnyKeyword, validateLocationAgainstFilters, extractLocationFromProfile, } = require("ai-analyzer"); /** * LinkedIn parsing strategy function */ async function linkedinStrategy(coreParser, options = {}) { const { keywords = ["layoff", "downsizing", "job cuts"], locationFilter = null, maxResults = 50, credentials = {}, } = options; const results = []; const rejectedResults = []; const seenPosts = new Set(); const seenProfiles = new Set(); try { // Create main page const page = await coreParser.createPage("linkedin-main"); // Authenticate to LinkedIn logger.info("🔐 Authenticating to LinkedIn..."); await coreParser.authenticate("linkedin", credentials, "linkedin-main"); logger.info("✅ LinkedIn authentication successful"); // Search for posts with each keyword for (const keyword of keywords) { logger.info(`🔍 Searching LinkedIn for: "${keyword}"`); const searchUrl = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent( keyword )}&sortBy=date_posted`; await coreParser.navigateTo(searchUrl, { pageId: "linkedin-main", retries: 2, }); // Wait for page to load - use delay utility instead of waitForTimeout await new Promise(resolve => setTimeout(resolve, 3000)); // Give LinkedIn time to render // Wait for search results - try multiple selectors let hasResults = false; const possibleSelectors = [ ".search-results-container", ".search-results__list", ".reusable-search__result-container", "[data-test-id='search-results']", ".feed-shared-update-v2", "article", ]; for (const selector of possibleSelectors) { try { await page.waitForSelector(selector, { timeout: 5000 }); hasResults = true; logger.info(`✅ Found results container with selector: ${selector}`); break; } catch (e) { // Try next selector } } if (!hasResults) { logger.warning(`âš ī¸ No search results container found for keyword: ${keyword}`); // Take screenshot for debugging try { const screenshotPath = `debug-${keyword.replace(/\s+/g, '-')}-${Date.now()}.png`; await page.screenshot({ path: screenshotPath, fullPage: true }); logger.info(`📸 Debug screenshot saved: ${screenshotPath}`); } catch (e) { logger.warning(`Could not take screenshot: ${e.message}`); } continue; } // Extract posts from current page const posts = await extractPostsFromPage(page, keyword); logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`); for (const post of posts) { // Skip duplicates if (seenPosts.has(post.postId)) continue; seenPosts.add(post.postId); // Validate location if filtering enabled if (locationFilter) { const postLocation = post.location || post.profileLocation || ""; const locationValid = validateLocationAgainstFilters( postLocation, locationFilter ); if (!locationValid) { logger.debug(`â­ī¸ Post rejected: location "${postLocation}" doesn't match filter "${locationFilter}"`); rejectedResults.push({ ...post, rejectionReason: `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`, }); continue; } else { logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}"`); } } results.push(post); if (results.length >= maxResults) { logger.info(`📊 Reached maximum results limit: ${maxResults}`); break; } } if (results.length >= maxResults) break; } logger.info( `đŸŽ¯ LinkedIn parsing completed: ${results.length} posts found, ${rejectedResults.length} rejected` ); return { results, rejectedResults, summary: { totalPosts: results.length, totalRejected: rejectedResults.length, keywords: keywords.join(", "), locationFilter, }, }; } catch (error) { logger.error(`❌ LinkedIn parsing failed: ${error.message}`); throw error; } } /** * Extract posts from current search results page */ async function extractPostsFromPage(page, keyword) { const posts = []; try { // Try multiple selectors for post elements (LinkedIn changes these frequently) const postSelectors = [ ".feed-shared-update-v2", "article.feed-shared-update-v2", "[data-urn*='urn:li:activity']", ".reusable-search__result-container", ".search-result__wrapper", "article", ]; let postElements = []; let usedSelector = null; for (const selector of postSelectors) { try { postElements = await page.$$(selector); if (postElements.length > 0) { usedSelector = selector; logger.info(`✅ Found ${postElements.length} post elements using selector: ${selector}`); break; } } catch (e) { // Try next selector } } if (postElements.length === 0) { logger.warning(`âš ī¸ No post elements found with any selector. Page might have different structure.`); // Log page title and URL for debugging try { const pageTitle = await page.title(); const pageUrl = page.url(); logger.info(`📄 Page title: ${pageTitle}`); logger.info(`🔗 Page URL: ${pageUrl}`); } catch (e) { // Ignore } return posts; } logger.info(`🔍 Processing ${postElements.length} post elements...`); for (let i = 0; i < postElements.length; i++) { try { const post = await extractPostData(postElements[i], keyword); if (post) { posts.push(post); logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}...`); } else { logger.debug(`â­ī¸ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`); } } catch (error) { logger.warning(`❌ Failed to extract post ${i + 1} data: ${error.message}`); } } logger.info(`✅ Successfully extracted ${posts.length} valid posts from ${postElements.length} elements`); } catch (error) { logger.error(`❌ Failed to extract posts from page: ${error.message}`); logger.error(`Stack trace: ${error.stack}`); } return posts; } /** * Extract data from individual post element */ async function extractPostData(postElement, keyword) { try { // Extract post ID const postId = (await postElement.getAttribute("data-urn")) || ""; // Extract author info const authorElement = await postElement.$(".feed-shared-actor__name"); const authorName = authorElement ? cleanText(await authorElement.textContent()) : ""; const authorLinkElement = await postElement.$(".feed-shared-actor__name a"); const authorUrl = authorLinkElement ? await authorLinkElement.getAttribute("href") : ""; // Extract post content const contentElement = await postElement.$(".feed-shared-text"); const content = contentElement ? cleanText(await contentElement.textContent()) : ""; // Extract timestamp const timeElement = await postElement.$( ".feed-shared-actor__sub-description time" ); const timestamp = timeElement ? await timeElement.getAttribute("datetime") : ""; // Extract location from profile (try multiple selectors) let location = ""; const locationSelectors = [ ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link", ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link--without-hover", ".feed-shared-actor__sub-description span[aria-label*='location']", ".feed-shared-actor__sub-description span[aria-label*='Location']", ]; for (const selector of locationSelectors) { try { const locationElement = await postElement.$(selector); if (locationElement) { const locationText = await locationElement.textContent(); if (locationText && locationText.trim()) { location = cleanText(locationText); break; } } } catch (e) { // Try next selector } } // If no location found in sub-description, try to extract from author link hover or profile if (!location) { try { // Try to get location from data attributes or other sources const subDescElement = await postElement.$(".feed-shared-actor__sub-description"); if (subDescElement) { const subDescText = await subDescElement.textContent(); // Look for location patterns (City, Province/State, Country) const locationMatch = subDescText.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/); if (locationMatch) { location = cleanText(locationMatch[0]); } } } catch (e) { // Location extraction failed, continue without it } } // Extract engagement metrics const likesElement = await postElement.$(".social-counts-reactions__count"); const likesText = likesElement ? cleanText(await likesElement.textContent()) : "0"; const commentsElement = await postElement.$( ".social-counts-comments__count" ); const commentsText = commentsElement ? cleanText(await commentsElement.textContent()) : "0"; // Note: LinkedIn search already filters by keyword semantically // We don't filter by content keyword match because: // 1. LinkedIn's search is semantic - it finds related posts, not just exact matches // 2. The keyword might be in comments, hashtags, or metadata, not visible text // 3. Posts might be about the topic without using the exact keyword // // Optional: Log if keyword appears in content (for debugging, but don't filter) const keywordLower = keyword.toLowerCase(); const contentLower = content.toLowerCase(); const hasKeywordInContent = contentLower.includes(keywordLower); if (!hasKeywordInContent && content.length > 50) { logger.debug(`â„šī¸ Post doesn't contain keyword "${keyword}" in visible content, but including it (LinkedIn search matched it)`); } // Validate we have minimum required data if (!postId && !content) { logger.debug(`â­ī¸ Post filtered: missing both postId and content`); return null; } return { postId: cleanText(postId), authorName, authorUrl, profileLink: authorUrl ? (authorUrl.startsWith("http") ? authorUrl : `https://www.linkedin.com${authorUrl}`) : "", text: content, content: content, location: location, profileLocation: location, // Alias for compatibility timestamp, keyword, likes: extractNumber(likesText), comments: extractNumber(commentsText), extractedAt: new Date().toISOString(), source: "linkedin", parser: "linkedout-parser", }; } catch (error) { logger.warning(`Error extracting post data: ${error.message}`); return null; } } /** * Extract numbers from text (e.g., "15 likes" -> 15) */ function extractNumber(text) { const match = text.match(/\d+/); return match ? parseInt(match[0]) : 0; } module.exports = { linkedinStrategy, extractPostsFromPage, extractPostData, };