diff --git a/ai-analyzer/src/ai-utils.js b/ai-analyzer/src/ai-utils.js index 6d37a49..a94c9ba 100644 --- a/ai-analyzer/src/ai-utils.js +++ b/ai-analyzer/src/ai-utils.js @@ -1,305 +1,442 @@ -const { logger } = require("./logger"); - -/** - * AI Analysis utilities for post processing with Ollama - * Extracted from ai-analyzer-local.js for reuse across parsers - */ - -// Default model from environment variable or fallback to "mistral" -const DEFAULT_MODEL = process.env.OLLAMA_MODEL || "mistral"; - -/** - * Check if Ollama is running and the model is available - */ -async function checkOllamaStatus( - model = DEFAULT_MODEL, - ollamaHost = "http://localhost:11434" -) { - try { - // Check if Ollama is running - const response = await fetch(`${ollamaHost}/api/tags`); - if (!response.ok) { - throw new Error(`Ollama not running on ${ollamaHost}`); - } - - const data = await response.json(); - const availableModels = data.models.map((m) => m.name); - - logger.ai("Ollama is running"); - logger.info( - `📦 Available models: ${availableModels - .map((m) => m.split(":")[0]) - .join(", ")}` - ); - - // Check if requested model is available - const modelExists = availableModels.some((m) => m.startsWith(model)); - if (!modelExists) { - logger.error(`Model "${model}" not found`); - logger.error(`💡 Install it with: ollama pull ${model}`); - logger.error( - `💡 Or choose from: ${availableModels - .map((m) => m.split(":")[0]) - .join(", ")}` - ); - return false; - } - - logger.success(`Using model: ${model}`); - return true; - } catch (error) { - logger.error(`Error connecting to Ollama: ${error.message}`); - logger.error("💡 Make sure Ollama is installed and running:"); - logger.error(" 1. Install: https://ollama.ai/"); - logger.error(" 2. Start: ollama serve"); - logger.error(` 3. Install model: ollama pull ${model}`); - return false; - } -} - -/** - * Analyze multiple posts using local Ollama - */ -async function analyzeBatch( - posts, - context, - model = DEFAULT_MODEL, - ollamaHost = "http://localhost:11434" -) { - logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`); - - try { - const prompt = `You are an expert at analyzing LinkedIn posts for relevance to specific contexts. - -CONTEXT TO MATCH: "${context}" - -Analyze these ${ - posts.length - } LinkedIn posts and determine if each relates to the context above. - -POSTS: -${posts - .map( - (post, i) => ` -POST ${i + 1}: -"${post.text.substring(0, 400)}${post.text.length > 400 ? "..." : ""}" -` - ) - .join("")} - -For each post, provide: -- Is it relevant to "${context}"? (YES/NO) -- Confidence level (0.0 to 1.0) -- Brief reasoning - -Respond in this EXACT format for each post: -POST 1: YES/NO | 0.X | brief reason -POST 2: YES/NO | 0.X | brief reason -POST 3: YES/NO | 0.X | brief reason - -Examples: -- For layoff context: "laid off 50 employees" = YES | 0.9 | mentions layoffs -- For hiring context: "we're hiring developers" = YES | 0.8 | job posting -- Unrelated content = NO | 0.1 | not relevant to context`; - - const response = await fetch(`${ollamaHost}/api/generate`, { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model: model, - prompt: prompt, - stream: false, - options: { - temperature: 0.3, - top_p: 0.9, - }, - }), - }); - - if (!response.ok) { - throw new Error( - `Ollama API error: ${response.status} ${response.statusText}` - ); - } - - const data = await response.json(); - const aiResponse = data.response.trim(); - - // Parse the response - const analyses = []; - const lines = aiResponse.split("\n").filter((line) => line.trim()); - - for (let i = 0; i < posts.length; i++) { - let analysis = { - postIndex: i + 1, - isRelevant: false, - confidence: 0.5, - reasoning: "Could not parse AI response", - }; - - // Look for lines that match "POST X:" pattern - const postPattern = new RegExp(`POST\\s*${i + 1}:?\\s*(.+)`, "i"); - - for (const line of lines) { - const match = line.match(postPattern); - if (match) { - const content = match[1].trim(); - - // Parse: YES/NO | 0.X | reasoning - const parts = content.split("|").map((p) => p.trim()); - - if (parts.length >= 3) { - analysis.isRelevant = parts[0].toUpperCase().includes("YES"); - analysis.confidence = Math.max( - 0, - Math.min(1, parseFloat(parts[1]) || 0.5) - ); - analysis.reasoning = parts[2] || "No reasoning provided"; - } else { - // Fallback parsing - analysis.isRelevant = - content.toUpperCase().includes("YES") || - content.toLowerCase().includes("relevant"); - analysis.confidence = 0.6; - analysis.reasoning = content.substring(0, 100); - } - break; - } - } - - analyses.push(analysis); - } - - // If we didn't get enough analyses, fill in defaults - while (analyses.length < posts.length) { - analyses.push({ - postIndex: analyses.length + 1, - isRelevant: false, - confidence: 0.3, - reasoning: "AI response parsing failed", - }); - } - - return analyses; - } catch (error) { - logger.error(`Error in batch AI analysis: ${error.message}`); - - // Fallback: mark all as relevant with low confidence - return posts.map((_, i) => ({ - postIndex: i + 1, - isRelevant: true, - confidence: 0.3, - reasoning: `Analysis failed: ${error.message}`, - })); - } -} - -/** - * Analyze a single post using local Ollama (fallback) - */ -async function analyzeSinglePost( - text, - context, - model = DEFAULT_MODEL, - ollamaHost = "http://localhost:11434" -) { - const prompt = `Analyze this LinkedIn post for relevance to: "${context}" - -Post: "${text}" - -Is this post relevant to "${context}"? Provide: -1. YES or NO -2. Confidence (0.0 to 1.0) -3. Brief reason - -Format: YES/NO | 0.X | reason`; - - try { - const response = await fetch(`${ollamaHost}/api/generate`, { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model: model, - prompt: prompt, - stream: false, - options: { - temperature: 0.3, - }, - }), - }); - - if (!response.ok) { - throw new Error(`Ollama API error: ${response.status}`); - } - - const data = await response.json(); - const aiResponse = data.response.trim(); - - // Parse response - const parts = aiResponse.split("|").map((p) => p.trim()); - - if (parts.length >= 3) { - return { - isRelevant: parts[0].toUpperCase().includes("YES"), - confidence: Math.max(0, Math.min(1, parseFloat(parts[1]) || 0.5)), - reasoning: parts[2], - }; - } else { - // Fallback parsing - return { - isRelevant: - aiResponse.toLowerCase().includes("yes") || - aiResponse.toLowerCase().includes("relevant"), - confidence: 0.6, - reasoning: aiResponse.substring(0, 100), - }; - } - } catch (error) { - return { - isRelevant: true, // Default to include on error - confidence: 0.3, - reasoning: `Analysis failed: ${error.message}`, - }; - } -} - -/** - * Find the most recent results file if none specified - */ -function findLatestResultsFile(resultsDir = "results") { - const fs = require("fs"); - const path = require("path"); - - if (!fs.existsSync(resultsDir)) { - throw new Error("Results directory not found. Run the scraper first."); - } - - const files = fs - .readdirSync(resultsDir) - .filter( - (f) => - (f.startsWith("results-") || f.startsWith("linkedin-results-")) && - f.endsWith(".json") && - !f.includes("-ai-") - ) - .sort() - .reverse(); - - if (files.length === 0) { - throw new Error("No results files found. Run the scraper first."); - } - - return path.join(resultsDir, files[0]); -} - -module.exports = { - checkOllamaStatus, - analyzeBatch, - analyzeSinglePost, - findLatestResultsFile, - DEFAULT_MODEL, // Export so other modules can use it -}; +const { logger } = require("./logger"); + +/** + * AI Analysis utilities for post processing with Ollama + * Extracted from ai-analyzer-local.js for reuse across parsers + */ + +// Default model from environment variable or fallback to "mistral" +const DEFAULT_MODEL = process.env.OLLAMA_MODEL || "mistral"; + +/** + * Check if Ollama is running and the model is available + */ +async function checkOllamaStatus( + model = DEFAULT_MODEL, + ollamaHost = "http://localhost:11434" +) { + try { + // Check if Ollama is running + const response = await fetch(`${ollamaHost}/api/tags`); + if (!response.ok) { + throw new Error(`Ollama not running on ${ollamaHost}`); + } + + const data = await response.json(); + const availableModels = data.models.map((m) => m.name); + + logger.ai("Ollama is running"); + logger.info( + `📦 Available models: ${availableModels + .map((m) => m.split(":")[0]) + .join(", ")}` + ); + + // Check if requested model is available + const modelExists = availableModels.some((m) => m.startsWith(model)); + if (!modelExists) { + logger.error(`Model "${model}" not found`); + logger.error(`💡 Install it with: ollama pull ${model}`); + logger.error( + `💡 Or choose from: ${availableModels + .map((m) => m.split(":")[0]) + .join(", ")}` + ); + return false; + } + + logger.success(`Using model: ${model}`); + return true; + } catch (error) { + logger.error(`Error connecting to Ollama: ${error.message}`); + logger.error("💡 Make sure Ollama is installed and running:"); + logger.error(" 1. Install: https://ollama.ai/"); + logger.error(" 2. Start: ollama serve"); + logger.error(` 3. Install model: ollama pull ${model}`); + return false; + } +} + +/** + * Analyze multiple posts using local Ollama + */ +async function analyzeBatch( + posts, + context, + model = DEFAULT_MODEL, + ollamaHost = "http://localhost:11434" +) { + logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`); + + try { + const prompt = `Analyze ${posts.length} LinkedIn posts for relevance to: "${context}" + +POSTS: +${posts + .map( + (post, i) => ` +POST ${i + 1}: +"${post.text.substring(0, 400)}${post.text.length > 400 ? "..." : ""}" +` + ) + .join("")} + +REQUIRED FORMAT - Respond with EXACTLY ${posts.length} lines, one per post: +POST 1: YES | 0.8 | reason here +POST 2: NO | 0.2 | reason here +POST 3: YES | 0.9 | reason here + +RULES: +- Use YES or NO (uppercase) +- Use pipe character | as separator +- Confidence must be 0.0 to 1.0 (decimal number) +- Keep reasoning brief (one sentence) +- MUST include all ${posts.length} posts in order + +Examples: +POST 1: YES | 0.9 | mentions layoffs and job cuts +POST 2: NO | 0.1 | unrelated topic about vacation +POST 3: YES | 0.7 | discusses workforce reduction`; + + // Add timeout to prevent hanging (5 minutes max) + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), 5 * 60 * 1000); // 5 minutes + + try { + const response = await fetch(`${ollamaHost}/api/generate`, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model: model, + prompt: prompt, + stream: false, + options: { + temperature: 0.3, + top_p: 0.9, + }, + }), + signal: controller.signal, + }); + + clearTimeout(timeoutId); + + if (!response.ok) { + throw new Error( + `Ollama API error: ${response.status} ${response.statusText}` + ); + } + + const data = await response.json(); + const aiResponse = data.response.trim(); + + // Parse the response + const analyses = []; + const lines = aiResponse.split("\n").filter((line) => line.trim()); + + // Log the raw response for debugging + logger.debug(`AI Response length: ${aiResponse.length} chars`); + if (aiResponse.length > 0) { + logger.debug(`AI Response (first 1000 chars):\n${aiResponse.substring(0, 1000)}`); + } else { + logger.warning("⚠️ AI response is empty!"); + } + + for (let i = 0; i < posts.length; i++) { + let analysis = { + postIndex: i + 1, + isRelevant: false, + confidence: 0.5, + reasoning: "Could not parse AI response", + }; + + // Try multiple patterns to find the post analysis + // IMPORTANT: Try numbered patterns first, only use generic pattern as last resort + const numberedPatterns = [ + // Exact format: POST 1: YES | 0.8 | reason + new RegExp(`POST\\s*${i + 1}:?\\s*(.+)`, "i"), + // Numbered list: 1. YES | 0.8 | reason + new RegExp(`^\\s*${i + 1}[.)]\\s*(.+)`, "i"), + // Just the number: 1: YES | 0.8 | reason + new RegExp(`^\\s*${i + 1}:\\s*(.+)`, "i"), + ]; + + let found = false; + let matchedContent = null; + + // First, try to find a line with the specific post number + for (const line of lines) { + for (const pattern of numberedPatterns) { + const match = line.match(pattern); + if (match) { + matchedContent = match[1].trim(); + found = true; + break; + } + } + if (found) break; + } + + // If not found with numbered patterns, try position-based matching as fallback + if (!found && lines.length > i) { + const targetLine = lines[i]; + if (targetLine) { + // Try to parse the line even without post number + const genericMatch = targetLine.match(/^(?:POST\s*\d+:?\s*)?(.+)$/i); + if (genericMatch) { + matchedContent = genericMatch[1].trim(); + found = true; + } + } + } + + if (found && matchedContent) { + const content = matchedContent; + + // Try to parse: YES/NO | 0.X | reasoning + let parts = content.split("|").map((p) => p.trim()); + + // If no pipe separator, try other separators + if (parts.length < 2) { + // Try colon separator: YES: 0.8: reason + parts = content.split(":").map((p) => p.trim()); + } + if (parts.length < 2) { + // Try dash separator: YES - 0.8 - reason + parts = content.split("-").map((p) => p.trim()); + } + + // Extract YES/NO + const relevanceText = parts[0] || content; + analysis.isRelevant = + relevanceText.toUpperCase().includes("YES") || + relevanceText.toLowerCase().includes("relevant") || + relevanceText.toLowerCase().includes("yes"); + + // Extract confidence (look for number between 0 and 1) + if (parts.length >= 2) { + const confidenceMatch = parts[1].match(/(0?\.\d+|1\.0|0|1)/); + if (confidenceMatch) { + analysis.confidence = Math.max( + 0, + Math.min(1, parseFloat(confidenceMatch[1]) || 0.5) + ); + } + } else { + // Try to find confidence in the whole content + const confidenceMatch = content.match(/(0?\.\d+|1\.0|0|1)/); + if (confidenceMatch) { + analysis.confidence = Math.max( + 0, + Math.min(1, parseFloat(confidenceMatch[1]) || 0.5) + ); + } + } + + // Extract reasoning (everything after confidence, or whole content if no structure) + if (parts.length >= 3) { + analysis.reasoning = parts.slice(2).join(" ").trim() || parts[2] || "No reasoning provided"; + } else if (parts.length === 2) { + // If only 2 parts, second part might be reasoning + analysis.reasoning = parts[1].substring(0, 200); + } else { + // Use the whole content as reasoning, but remove YES/NO and confidence + let reasoning = content + .replace(/YES|NO/gi, "") + .replace(/0?\.\d+|1\.0/g, "") + .replace(/\|/g, "") + .trim(); + analysis.reasoning = reasoning || "Analysis provided but format unclear"; + } + } + + // If still not found, try to extract from the entire response by position + if (!found && lines.length > 0) { + // Try to get the line at position i (allowing for some variance) + const targetLine = lines[Math.min(i, lines.length - 1)]; + if (targetLine) { + // Extract any YES/NO indication + analysis.isRelevant = + targetLine.toUpperCase().includes("YES") || + targetLine.toLowerCase().includes("relevant"); + + // Extract confidence + const confidenceMatch = targetLine.match(/(0?\.\d+|1\.0|0|1)/); + if (confidenceMatch) { + analysis.confidence = Math.max( + 0, + Math.min(1, parseFloat(confidenceMatch[1]) || 0.5) + ); + } + + // Use the line as reasoning + analysis.reasoning = targetLine.substring(0, 200).trim() || "Parsed from unstructured response"; + found = true; + } + } + + // Last resort: if still not found, try to extract from the entire response text + if (!found && aiResponse.length > 0) { + // Look for any mention of relevance in the response + const responseLower = aiResponse.toLowerCase(); + const hasRelevant = responseLower.includes("relevant") || responseLower.includes("yes"); + analysis.isRelevant = hasRelevant; + + // Try to find any confidence number + const allConfidenceMatches = aiResponse.match(/(0?\.\d+|1\.0|0|1)/g); + if (allConfidenceMatches && allConfidenceMatches.length > i) { + analysis.confidence = Math.max( + 0, + Math.min(1, parseFloat(allConfidenceMatches[i]) || 0.5) + ); + } + + // Use a portion of the response as reasoning + const responseSnippet = aiResponse.substring(i * 100, (i + 1) * 200).trim(); + analysis.reasoning = responseSnippet || "Could not parse structured response, using fallback"; + + logger.warning(`⚠️ Post ${i + 1}: Using fallback parsing - AI response format unclear`); + } + + analyses.push(analysis); + } + + // If we didn't get enough analyses, fill in defaults + while (analyses.length < posts.length) { + analyses.push({ + postIndex: analyses.length + 1, + isRelevant: false, + confidence: 0.3, + reasoning: "AI response parsing failed", + }); + } + + return analyses; + } catch (error) { + clearTimeout(timeoutId); + if (error.name === 'AbortError') { + throw new Error('Request timeout: AI analysis took longer than 5 minutes'); + } + throw error; + } + } catch (error) { + logger.error(`Error in batch AI analysis: ${error.message}`); + + // Fallback: mark all as relevant with low confidence + return posts.map((_, i) => ({ + postIndex: i + 1, + isRelevant: true, + confidence: 0.3, + reasoning: `Analysis failed: ${error.message}`, + })); + } +} + +/** + * Analyze a single post using local Ollama (fallback) + */ +async function analyzeSinglePost( + text, + context, + model = DEFAULT_MODEL, + ollamaHost = "http://localhost:11434" +) { + const prompt = `Analyze this LinkedIn post for relevance to: "${context}" + +Post: "${text}" + +Is this post relevant to "${context}"? Provide: +1. YES or NO +2. Confidence (0.0 to 1.0) +3. Brief reason + +Format: YES/NO | 0.X | reason`; + + try { + const response = await fetch(`${ollamaHost}/api/generate`, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model: model, + prompt: prompt, + stream: false, + options: { + temperature: 0.3, + }, + }), + }); + + if (!response.ok) { + throw new Error(`Ollama API error: ${response.status}`); + } + + const data = await response.json(); + const aiResponse = data.response.trim(); + + // Parse response + const parts = aiResponse.split("|").map((p) => p.trim()); + + if (parts.length >= 3) { + return { + isRelevant: parts[0].toUpperCase().includes("YES"), + confidence: Math.max(0, Math.min(1, parseFloat(parts[1]) || 0.5)), + reasoning: parts[2], + }; + } else { + // Fallback parsing + return { + isRelevant: + aiResponse.toLowerCase().includes("yes") || + aiResponse.toLowerCase().includes("relevant"), + confidence: 0.6, + reasoning: aiResponse.substring(0, 100), + }; + } + } catch (error) { + return { + isRelevant: true, // Default to include on error + confidence: 0.3, + reasoning: `Analysis failed: ${error.message}`, + }; + } +} + +/** + * Find the most recent results file if none specified + */ +function findLatestResultsFile(resultsDir = "results") { + const fs = require("fs"); + const path = require("path"); + + if (!fs.existsSync(resultsDir)) { + throw new Error("Results directory not found. Run the scraper first."); + } + + const files = fs + .readdirSync(resultsDir) + .filter( + (f) => + (f.startsWith("results-") || f.startsWith("linkedin-results-")) && + f.endsWith(".json") && + !f.includes("-ai-") + ) + .sort() + .reverse(); + + if (files.length === 0) { + throw new Error("No results files found. Run the scraper first."); + } + + return path.join(resultsDir, files[0]); +} + +module.exports = { + checkOllamaStatus, + analyzeBatch, + analyzeSinglePost, + findLatestResultsFile, + DEFAULT_MODEL, // Export so other modules can use it +}; diff --git a/linkedin-parser/index.js b/linkedin-parser/index.js index 5a84bcf..b7d6719 100644 --- a/linkedin-parser/index.js +++ b/linkedin-parser/index.js @@ -31,7 +31,7 @@ const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME; const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD; const HEADLESS = process.env.HEADLESS !== "false"; const SEARCH_KEYWORDS = - process.env.SEARCH_KEYWORDS || "layoff,downsizing,job cuts"; + process.env.SEARCH_KEYWORDS || "layoff,downsizing";//,job cuts"; const LOCATION_FILTER = process.env.LOCATION_FILTER; const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false"; const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends"; diff --git a/linkedin-parser/strategies/linkedin-strategy.js b/linkedin-parser/strategies/linkedin-strategy.js index 7bf4e4b..dbc4da4 100644 --- a/linkedin-parser/strategies/linkedin-strategy.js +++ b/linkedin-parser/strategies/linkedin-strategy.js @@ -10,6 +10,7 @@ const { containsAnyKeyword, validateLocationAgainstFilters, extractLocationFromProfile, + parseLocationFilters, } = require("ai-analyzer"); /** @@ -48,28 +49,44 @@ async function linkedinStrategy(coreParser, options = {}) { await coreParser.navigateTo(searchUrl, { pageId: "linkedin-main", retries: 2, + waitUntil: "networkidle", // Wait for network to be idle }); - // Wait for page to load - use delay utility instead of waitForTimeout - await new Promise(resolve => setTimeout(resolve, 3000)); // Give LinkedIn time to render + // Wait for page to load and content to render + await new Promise(resolve => setTimeout(resolve, 5000)); // Give LinkedIn time to render dynamic content + + // Scroll down a bit to trigger lazy loading + try { + await page.evaluate(() => { + window.scrollTo(0, 500); + }); + await new Promise(resolve => setTimeout(resolve, 2000)); + } catch (e) { + logger.debug(`Could not scroll page: ${e.message}`); + } // Wait for search results - try multiple selectors let hasResults = false; const possibleSelectors = [ + ".feed-shared-update-v2", + "article[data-urn*='urn:li:activity']", + "article", ".search-results-container", ".search-results__list", ".reusable-search__result-container", "[data-test-id='search-results']", - ".feed-shared-update-v2", - "article", ]; for (const selector of possibleSelectors) { try { - await page.waitForSelector(selector, { timeout: 5000 }); - hasResults = true; - logger.info(`✅ Found results container with selector: ${selector}`); - break; + await page.waitForSelector(selector, { timeout: 10000 }); + // Verify we actually have post elements + const count = await page.$$(selector).then(elements => elements.length); + if (count > 0) { + hasResults = true; + logger.info(`✅ Found ${count} post elements with selector: ${selector}`); + break; + } } catch (e) { // Try next selector } @@ -100,20 +117,24 @@ async function linkedinStrategy(coreParser, options = {}) { // Validate location if filtering enabled if (locationFilter) { const postLocation = post.location || post.profileLocation || ""; + // Parse locationFilter string into array if it's a string + const locationFiltersArray = typeof locationFilter === 'string' + ? parseLocationFilters(locationFilter) + : locationFilter; const locationValid = validateLocationAgainstFilters( postLocation, - locationFilter + locationFiltersArray ); - if (!locationValid) { + if (!locationValid.isValid) { logger.debug(`⏭️ Post rejected: location "${postLocation}" doesn't match filter "${locationFilter}"`); rejectedResults.push({ ...post, - rejectionReason: `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`, + rejectionReason: locationValid.reasoning || `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`, }); continue; } else { - logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}"`); + logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}" (${locationValid.reasoning || 'matched'})`); } } @@ -156,9 +177,12 @@ async function extractPostsFromPage(page, keyword) { try { // Try multiple selectors for post elements (LinkedIn changes these frequently) + // Prioritize selectors that are more specific to actual posts const postSelectors = [ - ".feed-shared-update-v2", + "article[data-urn*='urn:li:activity']", // Most specific - posts with activity ID + ".feed-shared-update-v2[data-urn*='urn:li:activity']", "article.feed-shared-update-v2", + ".feed-shared-update-v2", "[data-urn*='urn:li:activity']", ".reusable-search__result-container", ".search-result__wrapper", @@ -170,11 +194,30 @@ async function extractPostsFromPage(page, keyword) { for (const selector of postSelectors) { try { + // Wait a bit for elements to be available + await page.waitForSelector(selector, { timeout: 3000 }).catch(() => {}); postElements = await page.$$(selector); + + // Filter to only elements that have a data-urn attribute (actual posts) if (postElements.length > 0) { - usedSelector = selector; - logger.info(`✅ Found ${postElements.length} post elements using selector: ${selector}`); - break; + const validElements = []; + for (const elem of postElements) { + try { + const dataUrn = await elem.getAttribute("data-urn"); + if (dataUrn && dataUrn.includes("urn:li:activity")) { + validElements.push(elem); + } + } catch (e) { + // Element might have been detached, skip it + } + } + + if (validElements.length > 0) { + postElements = validElements; + usedSelector = selector; + logger.info(`✅ Found ${postElements.length} valid post elements using selector: ${selector}`); + break; + } } } catch (e) { // Try next selector @@ -199,10 +242,22 @@ async function extractPostsFromPage(page, keyword) { for (let i = 0; i < postElements.length; i++) { try { + // Scroll element into view to ensure it's fully rendered + try { + await postElements[i].evaluate((el) => { + el.scrollIntoView({ behavior: 'smooth', block: 'center' }); + }); + await new Promise(resolve => setTimeout(resolve, 500)); // Small delay for rendering + } catch (e) { + // Element might already be in view or detached, continue anyway + } + const post = await extractPostData(postElements[i], keyword); if (post) { posts.push(post); - logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}...`); + const hasContent = post.content && post.content.length > 0; + const hasAuthor = post.authorName && post.authorName.length > 0; + logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'})`); } else { logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`); } @@ -222,131 +277,524 @@ async function extractPostsFromPage(page, keyword) { /** * Extract data from individual post element + * Uses evaluate() to extract data directly from DOM for better reliability */ async function extractPostData(postElement, keyword) { try { - // Extract post ID - const postId = (await postElement.getAttribute("data-urn")) || ""; + // Use evaluate to extract data directly from the DOM element + // This is more reliable than using selectors which may not match + const postData = await postElement.evaluate((el, keyword) => { + const data = { + postId: "", + authorName: "", + authorUrl: "", + content: "", + timestamp: "", + location: "", + likes: 0, + comments: 0, + }; - // Extract author info - const authorElement = await postElement.$(".feed-shared-actor__name"); - const authorName = authorElement - ? cleanText(await authorElement.textContent()) - : ""; + // Extract post ID from data-urn attribute + data.postId = el.getAttribute("data-urn") || + el.getAttribute("data-activity-id") || + el.querySelector("[data-urn]")?.getAttribute("data-urn") || ""; - const authorLinkElement = await postElement.$(".feed-shared-actor__name a"); - const authorUrl = authorLinkElement - ? await authorLinkElement.getAttribute("href") - : ""; + // Extract author name - try multiple selectors and approaches + const authorSelectors = [ + ".feed-shared-actor__name", + ".feed-shared-actor__name-link", + ".update-components-actor__name", + ".feed-shared-actor__name a", + "[data-test-id='actor-name']", + "span[aria-label*='name']", + "a[href*='/in/'] span", + ".feed-shared-actor a span", + ".feed-shared-actor span", + ".feed-shared-actor__name-link span", + ]; - // Extract post content - const contentElement = await postElement.$(".feed-shared-text"); - const content = contentElement - ? cleanText(await contentElement.textContent()) - : ""; - - // Extract timestamp - const timeElement = await postElement.$( - ".feed-shared-actor__sub-description time" - ); - const timestamp = timeElement - ? await timeElement.getAttribute("datetime") - : ""; - - // Extract location from profile (try multiple selectors) - let location = ""; - const locationSelectors = [ - ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link", - ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link--without-hover", - ".feed-shared-actor__sub-description span[aria-label*='location']", - ".feed-shared-actor__sub-description span[aria-label*='Location']", - ]; - - for (const selector of locationSelectors) { - try { - const locationElement = await postElement.$(selector); - if (locationElement) { - const locationText = await locationElement.textContent(); - if (locationText && locationText.trim()) { - location = cleanText(locationText); + for (const selector of authorSelectors) { + const elem = el.querySelector(selector); + if (elem) { + const text = elem.textContent?.trim() || elem.innerText?.trim(); + if (text && text.length > 0 && text.length < 100) { // Reasonable name length + data.authorName = text; + // Try to get link from same element or parent + const link = elem.closest("a") || elem.querySelector("a"); + if (link) { + data.authorUrl = link.getAttribute("href") || ""; + } break; } } - } catch (e) { - // Try next selector } - } - // If no location found in sub-description, try to extract from author link hover or profile - if (!location) { - try { - // Try to get location from data attributes or other sources - const subDescElement = await postElement.$(".feed-shared-actor__sub-description"); - if (subDescElement) { - const subDescText = await subDescElement.textContent(); - // Look for location patterns (City, Province/State, Country) - const locationMatch = subDescText.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/); - if (locationMatch) { - location = cleanText(locationMatch[0]); + // If author name found but no URL, try to find link separately + if (data.authorName && !data.authorUrl) { + const authorLink = el.querySelector(".feed-shared-actor__name-link, .feed-shared-actor__name a, a[href*='/in/']"); + if (authorLink) { + data.authorUrl = authorLink.getAttribute("href") || ""; + } + } + + // Fallback: Look for any link with /in/ pattern and get the name from nearby text + if (!data.authorName) { + const profileLinks = el.querySelectorAll("a[href*='/in/']"); + for (const link of profileLinks) { + // Skip if it's a company link + if (link.getAttribute("href")?.includes("/company/")) continue; + + // Get text from the link or nearby + const linkText = link.textContent?.trim() || link.innerText?.trim(); + if (linkText && linkText.length > 0 && linkText.length < 100 && !linkText.includes("View")) { + data.authorName = linkText; + data.authorUrl = link.getAttribute("href") || ""; + break; + } + // Try to get text from first child span + const childSpan = link.querySelector("span"); + if (childSpan) { + const spanText = childSpan.textContent?.trim() || childSpan.innerText?.trim(); + if (spanText && spanText.length > 0 && spanText.length < 100) { + data.authorName = spanText; + data.authorUrl = link.getAttribute("href") || ""; + break; + } + } + // Try to get text from parent + const parentText = link.parentElement?.textContent?.trim(); + if (parentText && parentText.length < 100 && !parentText.includes("View")) { + // Extract just the name part (first line or first few words) + const namePart = parentText.split("\n")[0].split("·")[0].trim(); + if (namePart.length > 0 && namePart.length < 100) { + data.authorName = namePart; + data.authorUrl = link.getAttribute("href") || ""; + break; + } } } - } catch (e) { - // Location extraction failed, continue without it } + + // Last resort: Extract from actor section by looking at all text + if (!data.authorName) { + const actorSection = el.querySelector(".feed-shared-actor, .update-components-actor, [class*='actor']"); + if (actorSection) { + const actorText = actorSection.textContent || actorSection.innerText || ""; + const lines = actorText.split("\n").map(l => l.trim()).filter(l => l.length > 0); + // First non-empty line is often the name + for (const line of lines) { + if (line.length > 0 && line.length < 100 && + !line.includes("·") && + !line.includes("ago") && + !line.match(/^\d+/) && + !line.toLowerCase().includes("view")) { + data.authorName = line; + // Try to find associated link + const link = actorSection.querySelector("a[href*='/in/']"); + if (link) { + data.authorUrl = link.getAttribute("href") || ""; + } + break; + } + } + } + } + + // Extract post content - try multiple selectors + const contentSelectors = [ + ".feed-shared-text", + ".feed-shared-text__text-view", + ".feed-shared-update-v2__description", + ".update-components-text", + "[data-test-id='post-text']", + ".feed-shared-text span", + ".feed-shared-update-v2__description-wrapper", + ]; + + for (const selector of contentSelectors) { + const elem = el.querySelector(selector); + if (elem) { + const text = elem.textContent?.trim() || elem.innerText?.trim(); + if (text && text.length > 10) { // Only use if substantial content + data.content = text; + break; + } + } + } + + // Extract timestamp + const timeSelectors = [ + ".feed-shared-actor__sub-description time", + "time[datetime]", + "[data-test-id='timestamp']", + ".feed-shared-actor__sub-description time[datetime]", + "time", + ".feed-shared-actor__sub-description time", + "span[aria-label*='time']", + "span[aria-label*='ago']", + ]; + + for (const selector of timeSelectors) { + const elem = el.querySelector(selector); + if (elem) { + data.timestamp = elem.getAttribute("datetime") || + elem.getAttribute("title") || + elem.getAttribute("aria-label") || + elem.textContent?.trim() || ""; + if (data.timestamp) break; + } + } + + // Fallback: Look for time-like patterns in sub-description + if (!data.timestamp) { + const subDesc = el.querySelector(".feed-shared-actor__sub-description"); + if (subDesc) { + const subDescText = subDesc.textContent || subDesc.innerText || ""; + // Look for patterns like "2h", "3d", "1w", "2 months ago", etc. + const timePatterns = [ + /\d+\s*(minute|hour|day|week|month|year)s?\s*ago/i, + /\d+\s*(h|d|w|mo|yr)/i, + /(just now|today|yesterday)/i, + ]; + for (const pattern of timePatterns) { + const match = subDescText.match(pattern); + if (match) { + data.timestamp = match[0]; + break; + } + } + } + } + + // Extract location - try multiple approaches + const locationSelectors = [ + ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link", + ".feed-shared-actor__sub-description-link--without-hover", + "span[aria-label*='location' i]", + "span[aria-label*='Location']", + ".feed-shared-actor__sub-description span", + ".feed-shared-actor__sub-description a", + "a[href*='/company/']", + "a[href*='/location/']", + ]; + + for (const selector of locationSelectors) { + const elem = el.querySelector(selector); + if (elem) { + const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || elem.innerText?.trim() || ""; + // Check if it looks like a location (contains comma or common location words) + if (text && text.length > 2 && text.length < 100) { + // More flexible location detection + if (text.includes(",") || + /(city|province|state|country|region|ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|prince edward island|yukon|northwest territories|nunavut)/i.test(text) || + /^[A-Z][a-z]+,\s*[A-Z][a-z]+/i.test(text)) { + data.location = text; + break; + } + } + } + } + + // If no location found, try parsing from sub-description text + if (!data.location) { + const subDesc = el.querySelector(".feed-shared-actor__sub-description"); + if (subDesc) { + const subDescText = subDesc.textContent || subDesc.innerText || ""; + + // First, try to get all links in sub-description (location is often a link) + const subDescLinks = subDesc.querySelectorAll("a"); + for (const link of subDescLinks) { + const linkText = link.textContent?.trim() || link.innerText?.trim() || ""; + const linkHref = link.getAttribute("href") || ""; + + // Skip if it's a time/date link or company link + if (linkHref.includes("/company/") || linkText.match(/\d+\s*(minute|hour|day|week|month|year|h|d|w)/i)) { + continue; + } + + // If link text looks like a location + if (linkText && linkText.length > 2 && linkText.length < 100) { + if (linkText.includes(",") || + /(ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|prince edward island|yukon|northwest territories|nunavut|toronto|vancouver|calgary|ottawa|montreal|winnipeg|edmonton|halifax|victoria|regina|saskatoon|windsor|kitchener|hamilton|london|st\.?\s*catharines|oshawa|barrie|greater sudbury|sherbrooke|kelowna|abbotsford|trois-rivières|guelph|cambridge|coquitlam|saanich|saint john|thunder bay|waterloo|delta|chatham|red deer|kamloops|brantford|whitehorse|yellowknife|iqaluit)/i.test(linkText)) { + data.location = linkText; + break; + } + } + } + + // If still no location, try pattern matching on the full text + if (!data.location && subDescText) { + // Look for location patterns (City, Province/State, Country) + const locationPatterns = [ + // Full location: "City, Province, Country" + /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/, + // City, Province + /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*,\s*([A-Z][a-z]+)/, + // Just province/state names + /\b(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut|ON|AB|BC|QC|MB|SK|NS|NB|NL|PE|YT|NT|NU)\b/i, + // Major cities + /\b(Toronto|Vancouver|Calgary|Ottawa|Montreal|Winnipeg|Edmonton|Halifax|Victoria|Regina|Saskatoon)\b/i, + ]; + + for (const pattern of locationPatterns) { + const match = subDescText.match(pattern); + if (match) { + // Get more context around the match + const matchIndex = subDescText.indexOf(match[0]); + const contextStart = Math.max(0, matchIndex - 30); + const contextEnd = Math.min(subDescText.length, matchIndex + match[0].length + 30); + const context = subDescText.substring(contextStart, contextEnd).trim(); + + // Extract just the location part (remove time/date info) + let locationText = match[0].trim(); + // If we have more context, try to get a better location string + if (context.includes(",") && context.length < 100) { + // Try to extract "City, Province" pattern from context + const cityProvinceMatch = context.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+)/); + if (cityProvinceMatch) { + locationText = cityProvinceMatch[0].trim(); + } + } + + data.location = locationText; + break; + } + } + } + + // Last resort: extract any text that looks location-like from sub-description + if (!data.location && subDescText) { + // Split by common separators and look for location-like text + const parts = subDescText.split(/[·•|]/).map(p => p.trim()).filter(p => p.length > 0); + for (const part of parts) { + // Skip if it looks like time/date + if (part.match(/\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i)) { + continue; + } + // Check if it looks like a location + if (part.length > 2 && part.length < 100 && + (part.includes(",") || + /(ontario|alberta|british columbia|quebec|manitoba|toronto|vancouver|calgary|ottawa|montreal)/i.test(part))) { + data.location = part; + break; + } + } + } + } + } + + // Final fallback: look anywhere in the actor section for location-like text + if (!data.location) { + const actorSection = el.querySelector(".feed-shared-actor, .update-components-actor"); + if (actorSection) { + const actorText = actorSection.textContent || actorSection.innerText || ""; + // Look for province names + const provinceMatch = actorText.match(/\b(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)\b/i); + if (provinceMatch) { + // Try to get city, province if available + const cityProvinceMatch = actorText.match(/([A-Z][a-z]+),\s*(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)/i); + if (cityProvinceMatch) { + data.location = cityProvinceMatch[0].trim(); + } else { + data.location = provinceMatch[0].trim(); + } + } + } + } + + // Try to extract from any hover cards or mini profiles in the DOM + if (!data.location) { + // Look for mini profile cards or tooltips + const miniProfileSelectors = [ + "[data-control-name='hovercard']", + ".artdeco-hoverable-trigger", + ".feed-shared-actor__meta", + ".pv-text-details__left-panel", + ]; + + for (const selector of miniProfileSelectors) { + const elem = el.querySelector(selector); + if (elem) { + const text = elem.textContent || elem.innerText || ""; + // Look for location patterns + const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)/i); + if (locationMatch) { + data.location = locationMatch[0].trim(); + break; + } + } + } + } + + // Extract engagement metrics - try multiple approaches + const likesSelectors = [ + ".social-counts-reactions__count", + "[data-test-id='reactions-count']", + ".social-counts__reactions-count", + ".feed-shared-social-action-bar__reactions-count", + "button[aria-label*='reaction']", + "button[aria-label*='like']", + ".social-actions-button__reactions-count", + "[data-test-id='social-actions__reactions-count']", + ]; + + for (const selector of likesSelectors) { + const elem = el.querySelector(selector); + if (elem) { + const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || ""; + const match = text.match(/(\d+)/); + if (match) { + data.likes = parseInt(match[1], 10) || 0; + break; + } + } + } + + // Fallback: Look for any button or element with reaction/like text + if (data.likes === 0) { + const allButtons = el.querySelectorAll("button, span, div"); + for (const btn of allButtons) { + const text = btn.textContent?.trim() || btn.getAttribute("aria-label") || ""; + if (/reaction|like/i.test(text)) { + const match = text.match(/(\d+)/); + if (match) { + data.likes = parseInt(match[1], 10) || 0; + break; + } + } + } + } + + const commentsSelectors = [ + ".social-counts-comments__count", + "[data-test-id='comments-count']", + ".social-counts__comments-count", + ".feed-shared-social-action-bar__comments-count", + "button[aria-label*='comment']", + ".social-actions-button__comments-count", + "[data-test-id='social-actions__comments-count']", + ]; + + for (const selector of commentsSelectors) { + const elem = el.querySelector(selector); + if (elem) { + const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || ""; + const match = text.match(/(\d+)/); + if (match) { + data.comments = parseInt(match[1], 10) || 0; + break; + } + } + } + + // Fallback: Look for any button or element with comment text + if (data.comments === 0) { + const allButtons = el.querySelectorAll("button, span, div"); + for (const btn of allButtons) { + const text = btn.textContent?.trim() || btn.getAttribute("aria-label") || ""; + if (/comment/i.test(text)) { + const match = text.match(/(\d+)/); + if (match) { + data.comments = parseInt(match[1], 10) || 0; + break; + } + } + } + } + + return data; + }, keyword); + + // Clean and format the extracted data + const authorName = cleanText(postData.authorName); + let authorUrl = postData.authorUrl || ""; + if (authorUrl && !authorUrl.startsWith("http")) { + authorUrl = `https://www.linkedin.com${authorUrl}`; } - // Extract engagement metrics - const likesElement = await postElement.$(".social-counts-reactions__count"); - const likesText = likesElement - ? cleanText(await likesElement.textContent()) - : "0"; - - const commentsElement = await postElement.$( - ".social-counts-comments__count" - ); - const commentsText = commentsElement - ? cleanText(await commentsElement.textContent()) - : "0"; - - // Note: LinkedIn search already filters by keyword semantically - // We don't filter by content keyword match because: - // 1. LinkedIn's search is semantic - it finds related posts, not just exact matches - // 2. The keyword might be in comments, hashtags, or metadata, not visible text - // 3. Posts might be about the topic without using the exact keyword - // - // Optional: Log if keyword appears in content (for debugging, but don't filter) - const keywordLower = keyword.toLowerCase(); - const contentLower = content.toLowerCase(); - const hasKeywordInContent = contentLower.includes(keywordLower); - if (!hasKeywordInContent && content.length > 50) { - logger.debug(`ℹ️ Post doesn't contain keyword "${keyword}" in visible content, but including it (LinkedIn search matched it)`); - } + const content = cleanText(postData.content); + const location = cleanText(postData.location); + const timestamp = postData.timestamp || ""; // Validate we have minimum required data - if (!postId && !content) { + if (!postData.postId && !content) { logger.debug(`⏭️ Post filtered: missing both postId and content`); return null; } + // Log extraction results for debugging + const missingFields = []; + if (!authorName) missingFields.push("authorName"); + if (!authorUrl) missingFields.push("authorUrl"); + if (!location) missingFields.push("location"); + if (!timestamp) missingFields.push("timestamp"); + if (postData.likes === 0 && postData.comments === 0) missingFields.push("engagement"); + + if (missingFields.length > 0 && postData.postId) { + logger.debug(`⚠️ Post ${postData.postId.substring(0, 20)}... missing: ${missingFields.join(", ")}`); + + // If location is missing, log sub-description content for debugging + if (!location && process.env.DEBUG_EXTRACTION === "true") { + try { + const subDescInfo = await postElement.evaluate((el) => { + const subDesc = el.querySelector(".feed-shared-actor__sub-description"); + if (subDesc) { + return { + text: subDesc.textContent || subDesc.innerText || "", + html: subDesc.innerHTML.substring(0, 500), + links: Array.from(subDesc.querySelectorAll("a")).map(a => ({ + text: a.textContent?.trim(), + href: a.getAttribute("href") + })) + }; + } + return null; + }); + if (subDescInfo) { + logger.debug(`Sub-description text: "${subDescInfo.text}"`); + logger.debug(`Sub-description links: ${JSON.stringify(subDescInfo.links)}`); + } + } catch (e) { + // Ignore errors in debugging + } + } + + // Optionally log HTML structure for first failed extraction (to help debug) + if (process.env.DEBUG_EXTRACTION === "true" && missingFields.length >= 3) { + try { + const htmlSnippet = await postElement.evaluate((el) => { + // Get the outer HTML of the element (limited to first 2000 chars) + const html = el.outerHTML || ""; + return html.substring(0, 2000); + }); + logger.debug(`HTML structure (first 2000 chars):\n${htmlSnippet}`); + } catch (e) { + // Ignore errors in debugging + } + } + } + return { - postId: cleanText(postId), + postId: cleanText(postData.postId), authorName, authorUrl, - profileLink: authorUrl ? (authorUrl.startsWith("http") ? authorUrl : `https://www.linkedin.com${authorUrl}`) : "", + profileLink: authorUrl, text: content, content: content, location: location, profileLocation: location, // Alias for compatibility timestamp, keyword, - likes: extractNumber(likesText), - comments: extractNumber(commentsText), + likes: postData.likes || 0, + comments: postData.comments || 0, extractedAt: new Date().toISOString(), source: "linkedin", parser: "linkedout-parser", }; } catch (error) { logger.warning(`Error extracting post data: ${error.message}`); + logger.debug(`Stack trace: ${error.stack}`); return null; } }