/** * LinkedIn Parsing Strategy * * Uses core-parser for browser management and ai-analyzer for utilities */ const { logger, cleanText, containsAnyKeyword, validateLocationAgainstFilters, extractLocationFromProfile, parseLocationFilters, } = require("ai-analyzer"); /** * LinkedIn parsing strategy function */ async function linkedinStrategy(coreParser, options = {}) { const { keywords = ["layoff", "downsizing", "job cuts"], locationFilter = null, maxResults = 50, extractLocationFromProfile = false, credentials = {}, } = options; const results = []; const rejectedResults = []; const seenPosts = new Set(); const seenProfiles = new Set(); try { // Create main page const page = await coreParser.createPage("linkedin-main"); // Authenticate to LinkedIn logger.info("πŸ” Authenticating to LinkedIn..."); await coreParser.authenticate("linkedin", credentials, "linkedin-main"); logger.info("βœ… LinkedIn authentication successful"); // Search for posts with each keyword for (const keyword of keywords) { logger.info(`πŸ” Searching LinkedIn for: "${keyword}"`); const searchUrl = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent( keyword )}&sortBy=date_posted`; await coreParser.navigateTo(searchUrl, { pageId: "linkedin-main", retries: 2, waitUntil: "networkidle", // Wait for network to be idle }); // Wait for page to load and content to render await new Promise(resolve => setTimeout(resolve, 5000)); // Give LinkedIn time to render dynamic content // Scroll down a bit to trigger lazy loading try { await page.evaluate(() => { window.scrollTo(0, 500); }); await new Promise(resolve => setTimeout(resolve, 2000)); } catch (e) { logger.debug(`Could not scroll page: ${e.message}`); } // Wait for search results - try multiple selectors let hasResults = false; const possibleSelectors = [ ".feed-shared-update-v2", "article[data-urn*='urn:li:activity']", "article", ".search-results-container", ".search-results__list", ".reusable-search__result-container", "[data-test-id='search-results']", ]; for (const selector of possibleSelectors) { try { await page.waitForSelector(selector, { timeout: 10000 }); // Verify we actually have post elements const count = await page.$$(selector).then(elements => elements.length); if (count > 0) { hasResults = true; logger.info(`βœ… Found ${count} post elements with selector: ${selector}`); break; } } catch (e) { // Try next selector } } if (!hasResults) { logger.warning(`⚠️ No search results container found for keyword: ${keyword}`); // Take screenshot for debugging try { const screenshotPath = `debug-${keyword.replace(/\s+/g, '-')}-${Date.now()}.png`; await page.screenshot({ path: screenshotPath, fullPage: true }); logger.info(`πŸ“Έ Debug screenshot saved: ${screenshotPath}`); } catch (e) { logger.warning(`Could not take screenshot: ${e.message}`); } continue; } // Extract posts from current page const posts = await extractPostsFromPage(page, keyword, extractLocationFromProfile); logger.info(`πŸ“Š Found ${posts.length} posts for keyword "${keyword}"`); for (const post of posts) { // Skip duplicates if (seenPosts.has(post.postId)) continue; seenPosts.add(post.postId); // Validate location if filtering enabled if (locationFilter) { const postLocation = post.location || post.profileLocation || ""; // Parse locationFilter string into array if it's a string const locationFiltersArray = typeof locationFilter === 'string' ? parseLocationFilters(locationFilter) : locationFilter; const locationValid = validateLocationAgainstFilters( postLocation, locationFiltersArray ); if (!locationValid.isValid) { logger.debug(`⏭️ Post rejected: location "${postLocation}" doesn't match filter "${locationFilter}"`); rejectedResults.push({ ...post, rejectionReason: locationValid.reasoning || `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`, }); continue; } else { logger.debug(`βœ… Post location "${postLocation}" matches filter "${locationFilter}" (${locationValid.reasoning || 'matched'})`); } } results.push(post); if (results.length >= maxResults) { logger.info(`πŸ“Š Reached maximum results limit: ${maxResults}`); break; } } if (results.length >= maxResults) break; } logger.info( `🎯 LinkedIn parsing completed: ${results.length} posts found, ${rejectedResults.length} rejected` ); return { results, rejectedResults, summary: { totalPosts: results.length, totalRejected: rejectedResults.length, keywords: keywords.join(", "), locationFilter, }, }; } catch (error) { logger.error(`❌ LinkedIn parsing failed: ${error.message}`); throw error; } } /** * Extract posts from current search results page */ async function extractPostsFromPage(page, keyword, extractLocationFromProfile = false) { const posts = []; try { // Try multiple selectors for post elements (LinkedIn changes these frequently) // Prioritize selectors that are more specific to actual posts const postSelectors = [ "article[data-urn*='urn:li:activity']", // Most specific - posts with activity ID ".feed-shared-update-v2[data-urn*='urn:li:activity']", "article.feed-shared-update-v2", ".feed-shared-update-v2", "[data-urn*='urn:li:activity']", ".reusable-search__result-container", ".search-result__wrapper", "article", ]; let postElements = []; let usedSelector = null; for (const selector of postSelectors) { try { // Wait a bit for elements to be available await page.waitForSelector(selector, { timeout: 3000 }).catch(() => {}); postElements = await page.$$(selector); // Filter to only elements that have a data-urn attribute (actual posts) if (postElements.length > 0) { const validElements = []; for (const elem of postElements) { try { const dataUrn = await elem.getAttribute("data-urn"); if (dataUrn && dataUrn.includes("urn:li:activity")) { validElements.push(elem); } } catch (e) { // Element might have been detached, skip it } } if (validElements.length > 0) { postElements = validElements; usedSelector = selector; logger.info(`βœ… Found ${postElements.length} valid post elements using selector: ${selector}`); break; } } } catch (e) { // Try next selector } } if (postElements.length === 0) { logger.warning(`⚠️ No post elements found with any selector. Page might have different structure.`); // Log page title and URL for debugging try { const pageTitle = await page.title(); const pageUrl = page.url(); logger.info(`πŸ“„ Page title: ${pageTitle}`); logger.info(`πŸ”— Page URL: ${pageUrl}`); } catch (e) { // Ignore } return posts; } logger.info(`πŸ” Processing ${postElements.length} post elements...`); for (let i = 0; i < postElements.length; i++) { try { // Scroll element into view to ensure it's fully rendered try { await postElements[i].evaluate((el) => { el.scrollIntoView({ behavior: 'smooth', block: 'center' }); }); await new Promise(resolve => setTimeout(resolve, 500)); // Small delay for rendering } catch (e) { // Element might already be in view or detached, continue anyway } const post = await extractPostData(postElements[i], keyword); if (post) { // If location is missing and we're enabled to extract from profile, try to get it if (!post.location && extractLocationFromProfile && post.authorUrl) { try { logger.debug(`πŸ“ Location missing for post ${i + 1}, attempting to extract from profile...`); const profileLocation = await extractLocationFromProfilePage(page, post.authorUrl); if (profileLocation) { post.location = profileLocation; post.profileLocation = profileLocation; logger.debug(`βœ… Extracted location from profile: ${profileLocation}`); } } catch (error) { logger.debug(`⚠️ Could not extract location from profile: ${error.message}`); } } posts.push(post); const hasContent = post.content && post.content.length > 0; const hasAuthor = post.authorName && post.authorName.length > 0; const hasLocation = post.location && post.location.length > 0; logger.debug(`βœ… Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'}, location: ${hasLocation ? 'yes' : 'no'})`); } else { logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`); } } catch (error) { logger.warning(`❌ Failed to extract post ${i + 1} data: ${error.message}`); } } logger.info(`βœ… Successfully extracted ${posts.length} valid posts from ${postElements.length} elements`); } catch (error) { logger.error(`❌ Failed to extract posts from page: ${error.message}`); logger.error(`Stack trace: ${error.stack}`); } return posts; } /** * Extract data from individual post element * Uses evaluate() to extract data directly from DOM for better reliability */ async function extractPostData(postElement, keyword) { try { // Use evaluate to extract data directly from the DOM element // This is more reliable than using selectors which may not match const postData = await postElement.evaluate((el, keyword) => { const data = { postId: "", authorName: "", authorUrl: "", content: "", timestamp: "", location: "", likes: 0, comments: 0, }; // Extract post ID from data-urn attribute data.postId = el.getAttribute("data-urn") || el.getAttribute("data-activity-id") || el.querySelector("[data-urn]")?.getAttribute("data-urn") || ""; // Extract author name - try multiple selectors and approaches const authorSelectors = [ ".feed-shared-actor__name", ".feed-shared-actor__name-link", ".update-components-actor__name", ".feed-shared-actor__name a", "[data-test-id='actor-name']", "span[aria-label*='name']", "a[href*='/in/'] span", ".feed-shared-actor a span", ".feed-shared-actor span", ".feed-shared-actor__name-link span", ]; for (const selector of authorSelectors) { const elem = el.querySelector(selector); if (elem) { const text = elem.textContent?.trim() || elem.innerText?.trim(); if (text && text.length > 0 && text.length < 100) { // Reasonable name length data.authorName = text; // Try to get link from same element or parent const link = elem.closest("a") || elem.querySelector("a"); if (link) { data.authorUrl = link.getAttribute("href") || ""; } break; } } } // If author name found but no URL, try to find link separately if (data.authorName && !data.authorUrl) { const authorLink = el.querySelector(".feed-shared-actor__name-link, .feed-shared-actor__name a, a[href*='/in/']"); if (authorLink) { data.authorUrl = authorLink.getAttribute("href") || ""; } } // Fallback: Look for any link with /in/ pattern and get the name from nearby text if (!data.authorName) { const profileLinks = el.querySelectorAll("a[href*='/in/']"); for (const link of profileLinks) { // Skip if it's a company link if (link.getAttribute("href")?.includes("/company/")) continue; // Get text from the link or nearby const linkText = link.textContent?.trim() || link.innerText?.trim(); if (linkText && linkText.length > 0 && linkText.length < 100 && !linkText.includes("View")) { data.authorName = linkText; data.authorUrl = link.getAttribute("href") || ""; break; } // Try to get text from first child span const childSpan = link.querySelector("span"); if (childSpan) { const spanText = childSpan.textContent?.trim() || childSpan.innerText?.trim(); if (spanText && spanText.length > 0 && spanText.length < 100) { data.authorName = spanText; data.authorUrl = link.getAttribute("href") || ""; break; } } // Try to get text from parent const parentText = link.parentElement?.textContent?.trim(); if (parentText && parentText.length < 100 && !parentText.includes("View")) { // Extract just the name part (first line or first few words) const namePart = parentText.split("\n")[0].split("Β·")[0].trim(); if (namePart.length > 0 && namePart.length < 100) { data.authorName = namePart; data.authorUrl = link.getAttribute("href") || ""; break; } } } } // Last resort: Extract from actor section by looking at all text if (!data.authorName) { const actorSection = el.querySelector(".feed-shared-actor, .update-components-actor, [class*='actor']"); if (actorSection) { const actorText = actorSection.textContent || actorSection.innerText || ""; const lines = actorText.split("\n").map(l => l.trim()).filter(l => l.length > 0); // First non-empty line is often the name for (const line of lines) { if (line.length > 0 && line.length < 100 && !line.includes("Β·") && !line.includes("ago") && !line.match(/^\d+/) && !line.toLowerCase().includes("view")) { data.authorName = line; // Try to find associated link const link = actorSection.querySelector("a[href*='/in/']"); if (link) { data.authorUrl = link.getAttribute("href") || ""; } break; } } } } // Extract post content - try multiple selectors const contentSelectors = [ ".feed-shared-text", ".feed-shared-text__text-view", ".feed-shared-update-v2__description", ".update-components-text", "[data-test-id='post-text']", ".feed-shared-text span", ".feed-shared-update-v2__description-wrapper", ]; for (const selector of contentSelectors) { const elem = el.querySelector(selector); if (elem) { const text = elem.textContent?.trim() || elem.innerText?.trim(); if (text && text.length > 10) { // Only use if substantial content data.content = text; break; } } } // Extract timestamp const timeSelectors = [ ".feed-shared-actor__sub-description time", "time[datetime]", "[data-test-id='timestamp']", ".feed-shared-actor__sub-description time[datetime]", "time", ".feed-shared-actor__sub-description time", "span[aria-label*='time']", "span[aria-label*='ago']", ]; for (const selector of timeSelectors) { const elem = el.querySelector(selector); if (elem) { data.timestamp = elem.getAttribute("datetime") || elem.getAttribute("title") || elem.getAttribute("aria-label") || elem.textContent?.trim() || ""; if (data.timestamp) break; } } // Fallback: Look for time-like patterns in sub-description if (!data.timestamp) { const subDesc = el.querySelector(".feed-shared-actor__sub-description"); if (subDesc) { const subDescText = subDesc.textContent || subDesc.innerText || ""; // Look for patterns like "2h", "3d", "1w", "2 months ago", etc. const timePatterns = [ /\d+\s*(minute|hour|day|week|month|year)s?\s*ago/i, /\d+\s*(h|d|w|mo|yr)/i, /(just now|today|yesterday)/i, ]; for (const pattern of timePatterns) { const match = subDescText.match(pattern); if (match) { data.timestamp = match[0]; break; } } } } // Extract location - try multiple approaches const locationSelectors = [ ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link", ".feed-shared-actor__sub-description-link--without-hover", "span[aria-label*='location' i]", "span[aria-label*='Location']", ".feed-shared-actor__sub-description span", ".feed-shared-actor__sub-description a", "a[href*='/company/']", "a[href*='/location/']", ]; for (const selector of locationSelectors) { const elem = el.querySelector(selector); if (elem) { const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || elem.innerText?.trim() || ""; // Check if it looks like a location (contains comma or common location words) if (text && text.length > 2 && text.length < 100) { // More flexible location detection if (text.includes(",") || /(city|province|state|country|region|ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|prince edward island|yukon|northwest territories|nunavut)/i.test(text) || /^[A-Z][a-z]+,\s*[A-Z][a-z]+/i.test(text)) { data.location = text; break; } } } } // If no location found, try parsing from sub-description text if (!data.location) { const subDesc = el.querySelector(".feed-shared-actor__sub-description"); if (subDesc) { const subDescText = subDesc.textContent || subDesc.innerText || ""; // First, try to get all links in sub-description (location is often a link) const subDescLinks = subDesc.querySelectorAll("a"); for (const link of subDescLinks) { const linkText = link.textContent?.trim() || link.innerText?.trim() || ""; const linkHref = link.getAttribute("href") || ""; // Skip if it's a time/date link or company link if (linkHref.includes("/company/") || linkText.match(/\d+\s*(minute|hour|day|week|month|year|h|d|w)/i)) { continue; } // If link text looks like a location if (linkText && linkText.length > 2 && linkText.length < 100) { if (linkText.includes(",") || /(ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|prince edward island|yukon|northwest territories|nunavut|toronto|vancouver|calgary|ottawa|montreal|winnipeg|edmonton|halifax|victoria|regina|saskatoon|windsor|kitchener|hamilton|london|st\.?\s*catharines|oshawa|barrie|greater sudbury|sherbrooke|kelowna|abbotsford|trois-riviΓ¨res|guelph|cambridge|coquitlam|saanich|saint john|thunder bay|waterloo|delta|chatham|red deer|kamloops|brantford|whitehorse|yellowknife|iqaluit)/i.test(linkText)) { data.location = linkText; break; } } } // If still no location, try pattern matching on the full text if (!data.location && subDescText) { // Look for location patterns (City, Province/State, Country) const locationPatterns = [ // Full location: "City, Province, Country" /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/, // City, Province /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*,\s*([A-Z][a-z]+)/, // Just province/state names /\b(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut|ON|AB|BC|QC|MB|SK|NS|NB|NL|PE|YT|NT|NU)\b/i, // Major cities /\b(Toronto|Vancouver|Calgary|Ottawa|Montreal|Winnipeg|Edmonton|Halifax|Victoria|Regina|Saskatoon)\b/i, ]; for (const pattern of locationPatterns) { const match = subDescText.match(pattern); if (match) { // Get more context around the match const matchIndex = subDescText.indexOf(match[0]); const contextStart = Math.max(0, matchIndex - 30); const contextEnd = Math.min(subDescText.length, matchIndex + match[0].length + 30); const context = subDescText.substring(contextStart, contextEnd).trim(); // Extract just the location part (remove time/date info) let locationText = match[0].trim(); // If we have more context, try to get a better location string if (context.includes(",") && context.length < 100) { // Try to extract "City, Province" pattern from context const cityProvinceMatch = context.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+)/); if (cityProvinceMatch) { locationText = cityProvinceMatch[0].trim(); } } data.location = locationText; break; } } } // Last resort: extract any text that looks location-like from sub-description if (!data.location && subDescText) { // Split by common separators and look for location-like text const parts = subDescText.split(/[Β·β€’|]/).map(p => p.trim()).filter(p => p.length > 0); for (const part of parts) { // Skip if it looks like time/date if (part.match(/\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i)) { continue; } // Check if it looks like a location if (part.length > 2 && part.length < 100 && (part.includes(",") || /(ontario|alberta|british columbia|quebec|manitoba|toronto|vancouver|calgary|ottawa|montreal)/i.test(part))) { data.location = part; break; } } } } } // Final fallback: look anywhere in the actor section for location-like text if (!data.location) { const actorSection = el.querySelector(".feed-shared-actor, .update-components-actor"); if (actorSection) { const actorText = actorSection.textContent || actorSection.innerText || ""; // Look for province names const provinceMatch = actorText.match(/\b(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)\b/i); if (provinceMatch) { // Try to get city, province if available const cityProvinceMatch = actorText.match(/([A-Z][a-z]+),\s*(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)/i); if (cityProvinceMatch) { data.location = cityProvinceMatch[0].trim(); } else { data.location = provinceMatch[0].trim(); } } } } // Try to extract from any hover cards or mini profiles in the DOM if (!data.location) { // Look for mini profile cards or tooltips const miniProfileSelectors = [ "[data-control-name='hovercard']", ".artdeco-hoverable-trigger", ".feed-shared-actor__meta", ".pv-text-details__left-panel", ]; for (const selector of miniProfileSelectors) { const elem = el.querySelector(selector); if (elem) { const text = elem.textContent || elem.innerText || ""; // Look for location patterns const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)/i); if (locationMatch) { data.location = locationMatch[0].trim(); break; } } } } // Try to extract from data attributes or hidden elements if (!data.location) { // Check for data attributes that might contain location const actorSection = el.querySelector(".feed-shared-actor"); if (actorSection) { // Check all data attributes for (const attr of actorSection.attributes) { if (attr.name.startsWith("data-") && attr.value) { const value = attr.value.toLowerCase(); // Look for location-like patterns in data attributes if (/(ontario|alberta|british columbia|quebec|toronto|vancouver|calgary|ottawa|montreal)/i.test(value)) { // Try to extract the actual location text const locationMatch = attr.value.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/); if (locationMatch) { data.location = locationMatch[0]; break; } } } } // Check for hidden spans or divs with location info const hiddenElements = actorSection.querySelectorAll("span[style*='display: none'], div[style*='display: none'], [aria-hidden='true']"); for (const hiddenElem of hiddenElements) { const text = hiddenElem.textContent || hiddenElem.getAttribute("aria-label") || ""; if (text && /(ontario|alberta|british columbia|quebec|toronto|vancouver)/i.test(text)) { const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/); if (locationMatch) { data.location = locationMatch[0].trim(); break; } } } } } // Extract engagement metrics - try multiple approaches const likesSelectors = [ ".social-counts-reactions__count", "[data-test-id='reactions-count']", ".social-counts__reactions-count", ".feed-shared-social-action-bar__reactions-count", "button[aria-label*='reaction']", "button[aria-label*='like']", ".social-actions-button__reactions-count", "[data-test-id='social-actions__reactions-count']", ]; for (const selector of likesSelectors) { const elem = el.querySelector(selector); if (elem) { const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || ""; const match = text.match(/(\d+)/); if (match) { data.likes = parseInt(match[1], 10) || 0; break; } } } // Fallback: Look for any button or element with reaction/like text if (data.likes === 0) { const allButtons = el.querySelectorAll("button, span, div"); for (const btn of allButtons) { const text = btn.textContent?.trim() || btn.getAttribute("aria-label") || ""; if (/reaction|like/i.test(text)) { const match = text.match(/(\d+)/); if (match) { data.likes = parseInt(match[1], 10) || 0; break; } } } } const commentsSelectors = [ ".social-counts-comments__count", "[data-test-id='comments-count']", ".social-counts__comments-count", ".feed-shared-social-action-bar__comments-count", "button[aria-label*='comment']", ".social-actions-button__comments-count", "[data-test-id='social-actions__comments-count']", ]; for (const selector of commentsSelectors) { const elem = el.querySelector(selector); if (elem) { const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || ""; const match = text.match(/(\d+)/); if (match) { data.comments = parseInt(match[1], 10) || 0; break; } } } // Fallback: Look for any button or element with comment text if (data.comments === 0) { const allButtons = el.querySelectorAll("button, span, div"); for (const btn of allButtons) { const text = btn.textContent?.trim() || btn.getAttribute("aria-label") || ""; if (/comment/i.test(text)) { const match = text.match(/(\d+)/); if (match) { data.comments = parseInt(match[1], 10) || 0; break; } } } } return data; }, keyword); // Clean and format the extracted data const authorName = cleanText(postData.authorName); let authorUrl = postData.authorUrl || ""; if (authorUrl && !authorUrl.startsWith("http")) { authorUrl = `https://www.linkedin.com${authorUrl}`; } const content = cleanText(postData.content); const location = cleanText(postData.location); const timestamp = postData.timestamp || ""; // Validate we have minimum required data if (!postData.postId && !content) { logger.debug(`⏭️ Post filtered: missing both postId and content`); return null; } // Log extraction results for debugging const missingFields = []; if (!authorName) missingFields.push("authorName"); if (!authorUrl) missingFields.push("authorUrl"); if (!location) missingFields.push("location"); if (!timestamp) missingFields.push("timestamp"); if (postData.likes === 0 && postData.comments === 0) missingFields.push("engagement"); if (missingFields.length > 0 && postData.postId) { logger.debug(`⚠️ Post ${postData.postId.substring(0, 20)}... missing: ${missingFields.join(", ")}`); // If location is missing, log sub-description content for debugging if (!location && process.env.DEBUG_EXTRACTION === "true") { try { const subDescInfo = await postElement.evaluate((el) => { const subDesc = el.querySelector(".feed-shared-actor__sub-description"); if (subDesc) { return { text: subDesc.textContent || subDesc.innerText || "", html: subDesc.innerHTML.substring(0, 500), links: Array.from(subDesc.querySelectorAll("a")).map(a => ({ text: a.textContent?.trim(), href: a.getAttribute("href") })) }; } return null; }); if (subDescInfo) { logger.debug(`Sub-description text: "${subDescInfo.text}"`); logger.debug(`Sub-description links: ${JSON.stringify(subDescInfo.links)}`); } } catch (e) { // Ignore errors in debugging } } // Optionally log HTML structure for first failed extraction (to help debug) if (process.env.DEBUG_EXTRACTION === "true" && missingFields.length >= 3) { try { const htmlSnippet = await postElement.evaluate((el) => { // Get the outer HTML of the element (limited to first 2000 chars) const html = el.outerHTML || ""; return html.substring(0, 2000); }); logger.debug(`HTML structure (first 2000 chars):\n${htmlSnippet}`); } catch (e) { // Ignore errors in debugging } } } return { postId: cleanText(postData.postId), authorName, authorUrl, profileLink: authorUrl, text: content, content: content, location: location, profileLocation: location, // Alias for compatibility timestamp, keyword, likes: postData.likes || 0, comments: postData.comments || 0, extractedAt: new Date().toISOString(), source: "linkedin", parser: "linkedout-parser", }; } catch (error) { logger.warning(`Error extracting post data: ${error.message}`); logger.debug(`Stack trace: ${error.stack}`); return null; } } /** * Extract location from a LinkedIn profile page */ async function extractLocationFromProfilePage(page, profileUrl) { try { // Ensure URL is complete let fullUrl = profileUrl; if (!fullUrl.startsWith("http")) { fullUrl = `https://www.linkedin.com${fullUrl}`; } // Remove query parameters that might cause issues fullUrl = fullUrl.split("?")[0]; // Open profile in new tab const profilePage = await page.context().newPage(); try { await profilePage.goto(fullUrl, { waitUntil: "domcontentloaded", timeout: 15000, }); // Wait a bit for content to load await new Promise(resolve => setTimeout(resolve, 2000)); // Use the extractLocationFromProfile utility from ai-analyzer const location = await extractLocationFromProfile(profilePage); await profilePage.close(); return location; } catch (error) { await profilePage.close(); throw error; } } catch (error) { logger.debug(`Failed to extract location from profile ${profileUrl}: ${error.message}`); return ""; } } /** * Extract numbers from text (e.g., "15 likes" -> 15) */ function extractNumber(text) { const match = text.match(/\d+/); return match ? parseInt(match[0]) : 0; } module.exports = { linkedinStrategy, extractPostsFromPage, extractPostData, };