Refactor AI analysis utilities and enhance LinkedIn parser
- Updated `ai-utils.js` to improve AI response parsing and added timeout handling for API requests. - Modified `linkedin-parser` to refine search keyword handling and improve post extraction reliability. - Enhanced location filtering logic and added more robust selectors for extracting post data. - Improved logging for debugging purposes, including detailed extraction results and fallback mechanisms.
This commit is contained in:
parent
8de65bc04c
commit
bbfd3c84aa
@ -1,305 +1,442 @@
|
||||
const { logger } = require("./logger");
|
||||
|
||||
/**
|
||||
* AI Analysis utilities for post processing with Ollama
|
||||
* Extracted from ai-analyzer-local.js for reuse across parsers
|
||||
*/
|
||||
|
||||
// Default model from environment variable or fallback to "mistral"
|
||||
const DEFAULT_MODEL = process.env.OLLAMA_MODEL || "mistral";
|
||||
|
||||
/**
|
||||
* Check if Ollama is running and the model is available
|
||||
*/
|
||||
async function checkOllamaStatus(
|
||||
model = DEFAULT_MODEL,
|
||||
ollamaHost = "http://localhost:11434"
|
||||
) {
|
||||
try {
|
||||
// Check if Ollama is running
|
||||
const response = await fetch(`${ollamaHost}/api/tags`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`Ollama not running on ${ollamaHost}`);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
const availableModels = data.models.map((m) => m.name);
|
||||
|
||||
logger.ai("Ollama is running");
|
||||
logger.info(
|
||||
`📦 Available models: ${availableModels
|
||||
.map((m) => m.split(":")[0])
|
||||
.join(", ")}`
|
||||
);
|
||||
|
||||
// Check if requested model is available
|
||||
const modelExists = availableModels.some((m) => m.startsWith(model));
|
||||
if (!modelExists) {
|
||||
logger.error(`Model "${model}" not found`);
|
||||
logger.error(`💡 Install it with: ollama pull ${model}`);
|
||||
logger.error(
|
||||
`💡 Or choose from: ${availableModels
|
||||
.map((m) => m.split(":")[0])
|
||||
.join(", ")}`
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
logger.success(`Using model: ${model}`);
|
||||
return true;
|
||||
} catch (error) {
|
||||
logger.error(`Error connecting to Ollama: ${error.message}`);
|
||||
logger.error("💡 Make sure Ollama is installed and running:");
|
||||
logger.error(" 1. Install: https://ollama.ai/");
|
||||
logger.error(" 2. Start: ollama serve");
|
||||
logger.error(` 3. Install model: ollama pull ${model}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Analyze multiple posts using local Ollama
|
||||
*/
|
||||
async function analyzeBatch(
|
||||
posts,
|
||||
context,
|
||||
model = DEFAULT_MODEL,
|
||||
ollamaHost = "http://localhost:11434"
|
||||
) {
|
||||
logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`);
|
||||
|
||||
try {
|
||||
const prompt = `You are an expert at analyzing LinkedIn posts for relevance to specific contexts.
|
||||
|
||||
CONTEXT TO MATCH: "${context}"
|
||||
|
||||
Analyze these ${
|
||||
posts.length
|
||||
} LinkedIn posts and determine if each relates to the context above.
|
||||
|
||||
POSTS:
|
||||
${posts
|
||||
.map(
|
||||
(post, i) => `
|
||||
POST ${i + 1}:
|
||||
"${post.text.substring(0, 400)}${post.text.length > 400 ? "..." : ""}"
|
||||
`
|
||||
)
|
||||
.join("")}
|
||||
|
||||
For each post, provide:
|
||||
- Is it relevant to "${context}"? (YES/NO)
|
||||
- Confidence level (0.0 to 1.0)
|
||||
- Brief reasoning
|
||||
|
||||
Respond in this EXACT format for each post:
|
||||
POST 1: YES/NO | 0.X | brief reason
|
||||
POST 2: YES/NO | 0.X | brief reason
|
||||
POST 3: YES/NO | 0.X | brief reason
|
||||
|
||||
Examples:
|
||||
- For layoff context: "laid off 50 employees" = YES | 0.9 | mentions layoffs
|
||||
- For hiring context: "we're hiring developers" = YES | 0.8 | job posting
|
||||
- Unrelated content = NO | 0.1 | not relevant to context`;
|
||||
|
||||
const response = await fetch(`${ollamaHost}/api/generate`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: model,
|
||||
prompt: prompt,
|
||||
stream: false,
|
||||
options: {
|
||||
temperature: 0.3,
|
||||
top_p: 0.9,
|
||||
},
|
||||
}),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(
|
||||
`Ollama API error: ${response.status} ${response.statusText}`
|
||||
);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
const aiResponse = data.response.trim();
|
||||
|
||||
// Parse the response
|
||||
const analyses = [];
|
||||
const lines = aiResponse.split("\n").filter((line) => line.trim());
|
||||
|
||||
for (let i = 0; i < posts.length; i++) {
|
||||
let analysis = {
|
||||
postIndex: i + 1,
|
||||
isRelevant: false,
|
||||
confidence: 0.5,
|
||||
reasoning: "Could not parse AI response",
|
||||
};
|
||||
|
||||
// Look for lines that match "POST X:" pattern
|
||||
const postPattern = new RegExp(`POST\\s*${i + 1}:?\\s*(.+)`, "i");
|
||||
|
||||
for (const line of lines) {
|
||||
const match = line.match(postPattern);
|
||||
if (match) {
|
||||
const content = match[1].trim();
|
||||
|
||||
// Parse: YES/NO | 0.X | reasoning
|
||||
const parts = content.split("|").map((p) => p.trim());
|
||||
|
||||
if (parts.length >= 3) {
|
||||
analysis.isRelevant = parts[0].toUpperCase().includes("YES");
|
||||
analysis.confidence = Math.max(
|
||||
0,
|
||||
Math.min(1, parseFloat(parts[1]) || 0.5)
|
||||
);
|
||||
analysis.reasoning = parts[2] || "No reasoning provided";
|
||||
} else {
|
||||
// Fallback parsing
|
||||
analysis.isRelevant =
|
||||
content.toUpperCase().includes("YES") ||
|
||||
content.toLowerCase().includes("relevant");
|
||||
analysis.confidence = 0.6;
|
||||
analysis.reasoning = content.substring(0, 100);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
analyses.push(analysis);
|
||||
}
|
||||
|
||||
// If we didn't get enough analyses, fill in defaults
|
||||
while (analyses.length < posts.length) {
|
||||
analyses.push({
|
||||
postIndex: analyses.length + 1,
|
||||
isRelevant: false,
|
||||
confidence: 0.3,
|
||||
reasoning: "AI response parsing failed",
|
||||
});
|
||||
}
|
||||
|
||||
return analyses;
|
||||
} catch (error) {
|
||||
logger.error(`Error in batch AI analysis: ${error.message}`);
|
||||
|
||||
// Fallback: mark all as relevant with low confidence
|
||||
return posts.map((_, i) => ({
|
||||
postIndex: i + 1,
|
||||
isRelevant: true,
|
||||
confidence: 0.3,
|
||||
reasoning: `Analysis failed: ${error.message}`,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Analyze a single post using local Ollama (fallback)
|
||||
*/
|
||||
async function analyzeSinglePost(
|
||||
text,
|
||||
context,
|
||||
model = DEFAULT_MODEL,
|
||||
ollamaHost = "http://localhost:11434"
|
||||
) {
|
||||
const prompt = `Analyze this LinkedIn post for relevance to: "${context}"
|
||||
|
||||
Post: "${text}"
|
||||
|
||||
Is this post relevant to "${context}"? Provide:
|
||||
1. YES or NO
|
||||
2. Confidence (0.0 to 1.0)
|
||||
3. Brief reason
|
||||
|
||||
Format: YES/NO | 0.X | reason`;
|
||||
|
||||
try {
|
||||
const response = await fetch(`${ollamaHost}/api/generate`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: model,
|
||||
prompt: prompt,
|
||||
stream: false,
|
||||
options: {
|
||||
temperature: 0.3,
|
||||
},
|
||||
}),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Ollama API error: ${response.status}`);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
const aiResponse = data.response.trim();
|
||||
|
||||
// Parse response
|
||||
const parts = aiResponse.split("|").map((p) => p.trim());
|
||||
|
||||
if (parts.length >= 3) {
|
||||
return {
|
||||
isRelevant: parts[0].toUpperCase().includes("YES"),
|
||||
confidence: Math.max(0, Math.min(1, parseFloat(parts[1]) || 0.5)),
|
||||
reasoning: parts[2],
|
||||
};
|
||||
} else {
|
||||
// Fallback parsing
|
||||
return {
|
||||
isRelevant:
|
||||
aiResponse.toLowerCase().includes("yes") ||
|
||||
aiResponse.toLowerCase().includes("relevant"),
|
||||
confidence: 0.6,
|
||||
reasoning: aiResponse.substring(0, 100),
|
||||
};
|
||||
}
|
||||
} catch (error) {
|
||||
return {
|
||||
isRelevant: true, // Default to include on error
|
||||
confidence: 0.3,
|
||||
reasoning: `Analysis failed: ${error.message}`,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the most recent results file if none specified
|
||||
*/
|
||||
function findLatestResultsFile(resultsDir = "results") {
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
|
||||
if (!fs.existsSync(resultsDir)) {
|
||||
throw new Error("Results directory not found. Run the scraper first.");
|
||||
}
|
||||
|
||||
const files = fs
|
||||
.readdirSync(resultsDir)
|
||||
.filter(
|
||||
(f) =>
|
||||
(f.startsWith("results-") || f.startsWith("linkedin-results-")) &&
|
||||
f.endsWith(".json") &&
|
||||
!f.includes("-ai-")
|
||||
)
|
||||
.sort()
|
||||
.reverse();
|
||||
|
||||
if (files.length === 0) {
|
||||
throw new Error("No results files found. Run the scraper first.");
|
||||
}
|
||||
|
||||
return path.join(resultsDir, files[0]);
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
checkOllamaStatus,
|
||||
analyzeBatch,
|
||||
analyzeSinglePost,
|
||||
findLatestResultsFile,
|
||||
DEFAULT_MODEL, // Export so other modules can use it
|
||||
};
|
||||
const { logger } = require("./logger");
|
||||
|
||||
/**
|
||||
* AI Analysis utilities for post processing with Ollama
|
||||
* Extracted from ai-analyzer-local.js for reuse across parsers
|
||||
*/
|
||||
|
||||
// Default model from environment variable or fallback to "mistral"
|
||||
const DEFAULT_MODEL = process.env.OLLAMA_MODEL || "mistral";
|
||||
|
||||
/**
|
||||
* Check if Ollama is running and the model is available
|
||||
*/
|
||||
async function checkOllamaStatus(
|
||||
model = DEFAULT_MODEL,
|
||||
ollamaHost = "http://localhost:11434"
|
||||
) {
|
||||
try {
|
||||
// Check if Ollama is running
|
||||
const response = await fetch(`${ollamaHost}/api/tags`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`Ollama not running on ${ollamaHost}`);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
const availableModels = data.models.map((m) => m.name);
|
||||
|
||||
logger.ai("Ollama is running");
|
||||
logger.info(
|
||||
`📦 Available models: ${availableModels
|
||||
.map((m) => m.split(":")[0])
|
||||
.join(", ")}`
|
||||
);
|
||||
|
||||
// Check if requested model is available
|
||||
const modelExists = availableModels.some((m) => m.startsWith(model));
|
||||
if (!modelExists) {
|
||||
logger.error(`Model "${model}" not found`);
|
||||
logger.error(`💡 Install it with: ollama pull ${model}`);
|
||||
logger.error(
|
||||
`💡 Or choose from: ${availableModels
|
||||
.map((m) => m.split(":")[0])
|
||||
.join(", ")}`
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
logger.success(`Using model: ${model}`);
|
||||
return true;
|
||||
} catch (error) {
|
||||
logger.error(`Error connecting to Ollama: ${error.message}`);
|
||||
logger.error("💡 Make sure Ollama is installed and running:");
|
||||
logger.error(" 1. Install: https://ollama.ai/");
|
||||
logger.error(" 2. Start: ollama serve");
|
||||
logger.error(` 3. Install model: ollama pull ${model}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Matches a confidence value such as ".8", "0.85", "1.0", "0", or "1".
const CONFIDENCE_PATTERN = /(0?\.\d+|1\.0|0|1)/;

/**
 * Clamp a raw confidence string to [0, 1].
 * Fixes the previous `parseFloat(x) || 0.5` idiom, which silently turned a
 * legitimate confidence of 0 into 0.5 because 0 is falsy.
 * @param {string} raw - Numeric text captured by CONFIDENCE_PATTERN.
 * @returns {number} Confidence in [0, 1]; 0.5 when the text is not a number.
 */
function clampConfidence(raw) {
  const value = parseFloat(raw);
  return Number.isFinite(value) ? Math.max(0, Math.min(1, value)) : 0.5;
}

/**
 * Parse one "YES | 0.8 | reason" style line into `analysis` (mutated in place).
 * Tolerates ":" or "-" as separators when "|" is absent, and degrades to
 * extracting YES/NO + any number from the raw text when no separator works.
 * @param {string} content - Matched line content with the post number stripped.
 * @param {{isRelevant: boolean, confidence: number, reasoning: string}} analysis
 */
function parseAnalysisContent(content, analysis) {
  // Preferred separator is "|"; fall back to ":" then "-".
  let parts = content.split("|").map((p) => p.trim());
  if (parts.length < 2) {
    parts = content.split(":").map((p) => p.trim());
  }
  if (parts.length < 2) {
    parts = content.split("-").map((p) => p.trim());
  }

  // Relevance: look for YES (any case) or the word "relevant".
  const relevanceText = parts[0] || content;
  analysis.isRelevant =
    relevanceText.toUpperCase().includes("YES") ||
    relevanceText.toLowerCase().includes("relevant");

  // Confidence: second field when structured, otherwise anywhere in the line.
  const confidenceSource = parts.length >= 2 ? parts[1] : content;
  const confidenceMatch = confidenceSource.match(CONFIDENCE_PATTERN);
  if (confidenceMatch) {
    analysis.confidence = clampConfidence(confidenceMatch[1]);
  }

  // Reasoning: everything after the confidence field, or a cleaned-up copy
  // of the whole line when the response was unstructured.
  if (parts.length >= 3) {
    analysis.reasoning =
      parts.slice(2).join(" ").trim() || parts[2] || "No reasoning provided";
  } else if (parts.length === 2) {
    analysis.reasoning = parts[1].substring(0, 200);
  } else {
    const reasoning = content
      .replace(/YES|NO/gi, "")
      .replace(/0?\.\d+|1\.0/g, "")
      .replace(/\|/g, "")
      .trim();
    analysis.reasoning = reasoning || "Analysis provided but format unclear";
  }
}

/**
 * Analyze multiple posts in one Ollama request and parse the per-post verdicts.
 *
 * @param {Array<{text: string}>} posts - Posts to classify; only `.text` is read.
 * @param {string} context - The relevance context the model judges against.
 * @param {string} [model=DEFAULT_MODEL] - Ollama model name.
 * @param {string} [ollamaHost="http://localhost:11434"] - Base URL of the Ollama server.
 * @returns {Promise<Array<{postIndex: number, isRelevant: boolean, confidence: number, reasoning: string}>>}
 *   One analysis per post, in order. On any failure every post is marked
 *   relevant with low confidence so nothing is silently dropped.
 */
async function analyzeBatch(
  posts,
  context,
  model = DEFAULT_MODEL,
  ollamaHost = "http://localhost:11434"
) {
  logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`);

  try {
    const prompt = `Analyze ${posts.length} LinkedIn posts for relevance to: "${context}"

POSTS:
${posts
  .map(
    (post, i) => `
POST ${i + 1}:
"${post.text.substring(0, 400)}${post.text.length > 400 ? "..." : ""}"
`
  )
  .join("")}

REQUIRED FORMAT - Respond with EXACTLY ${posts.length} lines, one per post:
POST 1: YES | 0.8 | reason here
POST 2: NO | 0.2 | reason here
POST 3: YES | 0.9 | reason here

RULES:
- Use YES or NO (uppercase)
- Use pipe character | as separator
- Confidence must be 0.0 to 1.0 (decimal number)
- Keep reasoning brief (one sentence)
- MUST include all ${posts.length} posts in order

Examples:
POST 1: YES | 0.9 | mentions layoffs and job cuts
POST 2: NO | 0.1 | unrelated topic about vacation
POST 3: YES | 0.7 | discusses workforce reduction`;

    // Abort the request if Ollama hangs (5 minutes max).
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 5 * 60 * 1000); // 5 minutes

    try {
      const response = await fetch(`${ollamaHost}/api/generate`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: model,
          prompt: prompt,
          stream: false,
          options: {
            temperature: 0.3,
            top_p: 0.9,
          },
        }),
        signal: controller.signal,
      });

      clearTimeout(timeoutId);

      if (!response.ok) {
        throw new Error(
          `Ollama API error: ${response.status} ${response.statusText}`
        );
      }

      const data = await response.json();
      const aiResponse = data.response.trim();

      const analyses = [];
      const lines = aiResponse.split("\n").filter((line) => line.trim());

      // Log the raw response for debugging.
      logger.debug(`AI Response length: ${aiResponse.length} chars`);
      if (aiResponse.length > 0) {
        logger.debug(`AI Response (first 1000 chars):\n${aiResponse.substring(0, 1000)}`);
      } else {
        logger.warning("⚠️ AI response is empty!");
      }

      for (let i = 0; i < posts.length; i++) {
        const analysis = {
          postIndex: i + 1,
          isRelevant: false,
          confidence: 0.5,
          reasoning: "Could not parse AI response",
        };

        // Stage 1: look for a line explicitly numbered for this post.
        // Numbered patterns are tried first; generic matching is a fallback.
        const numberedPatterns = [
          // Exact format: POST 1: YES | 0.8 | reason
          new RegExp(`POST\\s*${i + 1}:?\\s*(.+)`, "i"),
          // Numbered list: 1. YES | 0.8 | reason
          new RegExp(`^\\s*${i + 1}[.)]\\s*(.+)`, "i"),
          // Just the number: 1: YES | 0.8 | reason
          new RegExp(`^\\s*${i + 1}:\\s*(.+)`, "i"),
        ];

        let found = false;
        let matchedContent = null;

        search: for (const line of lines) {
          for (const pattern of numberedPatterns) {
            const match = line.match(pattern);
            if (match) {
              matchedContent = match[1].trim();
              found = true;
              break search;
            }
          }
        }

        // Stage 2: position-based fallback — assume line i belongs to post i.
        if (!found && lines.length > i) {
          const genericMatch = lines[i].match(/^(?:POST\s*\d+:?\s*)?(.+)$/i);
          if (genericMatch) {
            matchedContent = genericMatch[1].trim();
            found = true;
          }
        }

        if (found && matchedContent) {
          parseAnalysisContent(matchedContent, analysis);
        }

        // Stage 3: fewer lines than posts — reuse the nearest available line.
        if (!found && lines.length > 0) {
          const targetLine = lines[Math.min(i, lines.length - 1)];
          analysis.isRelevant =
            targetLine.toUpperCase().includes("YES") ||
            targetLine.toLowerCase().includes("relevant");
          const confidenceMatch = targetLine.match(CONFIDENCE_PATTERN);
          if (confidenceMatch) {
            analysis.confidence = clampConfidence(confidenceMatch[1]);
          }
          analysis.reasoning =
            targetLine.substring(0, 200).trim() || "Parsed from unstructured response";
          found = true;
        }

        // Stage 4 (last resort): scrape the whole response text.
        if (!found && aiResponse.length > 0) {
          const responseLower = aiResponse.toLowerCase();
          analysis.isRelevant =
            responseLower.includes("relevant") || responseLower.includes("yes");

          // Pick the i-th number in the response as this post's confidence.
          const allConfidenceMatches = aiResponse.match(/(0?\.\d+|1\.0|0|1)/g);
          if (allConfidenceMatches && allConfidenceMatches.length > i) {
            analysis.confidence = clampConfidence(allConfidenceMatches[i]);
          }

          const responseSnippet = aiResponse.substring(i * 100, (i + 1) * 200).trim();
          analysis.reasoning =
            responseSnippet || "Could not parse structured response, using fallback";

          logger.warning(`⚠️ Post ${i + 1}: Using fallback parsing - AI response format unclear`);
        }

        analyses.push(analysis);
      }

      // NOTE: the loop above always pushes exactly one analysis per post, so
      // no padding pass is needed (the old `while` top-up loop was dead code).
      return analyses;
    } catch (error) {
      clearTimeout(timeoutId);
      if (error.name === "AbortError") {
        throw new Error("Request timeout: AI analysis took longer than 5 minutes");
      }
      throw error;
    }
  } catch (error) {
    logger.error(`Error in batch AI analysis: ${error.message}`);

    // Fallback: mark all as relevant with low confidence so nothing is lost.
    return posts.map((_, i) => ({
      postIndex: i + 1,
      isRelevant: true,
      confidence: 0.3,
      reasoning: `Analysis failed: ${error.message}`,
    }));
  }
}
|
||||
|
||||
/**
 * Analyze a single post using local Ollama (fallback for batch analysis).
 *
 * @param {string} text - Full post text to classify.
 * @param {string} context - The relevance context the model judges against.
 * @param {string} [model=DEFAULT_MODEL] - Ollama model name.
 * @param {string} [ollamaHost="http://localhost:11434"] - Base URL of the Ollama server.
 * @returns {Promise<{isRelevant: boolean, confidence: number, reasoning: string}>}
 *   Never rejects: on any failure (including timeout) the post is marked
 *   relevant with low confidence so it is not silently dropped.
 */
async function analyzeSinglePost(
  text,
  context,
  model = DEFAULT_MODEL,
  ollamaHost = "http://localhost:11434"
) {
  const prompt = `Analyze this LinkedIn post for relevance to: "${context}"

Post: "${text}"

Is this post relevant to "${context}"? Provide:
1. YES or NO
2. Confidence (0.0 to 1.0)
3. Brief reason

Format: YES/NO | 0.X | reason`;

  // Abort a hung request after 5 minutes, matching analyzeBatch's timeout
  // handling (the single-post path previously had none and could hang forever).
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 5 * 60 * 1000);

  try {
    const response = await fetch(`${ollamaHost}/api/generate`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model: model,
        prompt: prompt,
        stream: false,
        options: {
          temperature: 0.3,
        },
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`Ollama API error: ${response.status}`);
    }

    const data = await response.json();
    const aiResponse = data.response.trim();

    // Parse "YES/NO | 0.X | reason".
    const parts = aiResponse.split("|").map((p) => p.trim());

    if (parts.length >= 3) {
      // Preserve a confidence of exactly 0 (the old `parseFloat(x) || 0.5`
      // coerced 0 to 0.5 because 0 is falsy).
      const value = parseFloat(parts[1]);
      return {
        isRelevant: parts[0].toUpperCase().includes("YES"),
        confidence: Number.isFinite(value) ? Math.max(0, Math.min(1, value)) : 0.5,
        reasoning: parts[2],
      };
    }

    // Fallback parsing for unstructured responses.
    return {
      isRelevant:
        aiResponse.toLowerCase().includes("yes") ||
        aiResponse.toLowerCase().includes("relevant"),
      confidence: 0.6,
      reasoning: aiResponse.substring(0, 100),
    };
  } catch (error) {
    return {
      isRelevant: true, // Default to include on error
      confidence: 0.3,
      reasoning: `Analysis failed: ${error.message}`,
    };
  } finally {
    clearTimeout(timeoutId);
  }
}
|
||||
|
||||
/**
 * Locate the most recent scraper results file when the caller did not name one.
 *
 * Scans `resultsDir` for files named "results-*" or "linkedin-results-*" with a
 * ".json" extension, skipping AI-annotated outputs ("-ai-"), and returns the
 * lexicographically latest one (filenames embed sortable timestamps).
 *
 * @param {string} [resultsDir="results"] - Directory to scan.
 * @returns {string} Path to the newest matching results file.
 * @throws {Error} When the directory is missing or holds no matching files.
 */
function findLatestResultsFile(resultsDir = "results") {
  const fs = require("fs");
  const path = require("path");

  if (!fs.existsSync(resultsDir)) {
    throw new Error("Results directory not found. Run the scraper first.");
  }

  // Raw scraper output only: right prefix, JSON, and not an AI-annotated copy.
  const isRawResultsFile = (name) => {
    const hasPrefix =
      name.startsWith("results-") || name.startsWith("linkedin-results-");
    return hasPrefix && name.endsWith(".json") && !name.includes("-ai-");
  };

  // Sort newest-first (descending lexicographic order).
  const candidates = fs
    .readdirSync(resultsDir)
    .filter(isRawResultsFile)
    .sort((a, b) => (a > b ? -1 : a < b ? 1 : 0));

  if (candidates.length === 0) {
    throw new Error("No results files found. Run the scraper first.");
  }

  return path.join(resultsDir, candidates[0]);
}
|
||||
|
||||
module.exports = {
|
||||
checkOllamaStatus,
|
||||
analyzeBatch,
|
||||
analyzeSinglePost,
|
||||
findLatestResultsFile,
|
||||
DEFAULT_MODEL, // Export so other modules can use it
|
||||
};
|
||||
|
||||
@ -31,7 +31,7 @@ const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
|
||||
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
|
||||
const HEADLESS = process.env.HEADLESS !== "false";
|
||||
const SEARCH_KEYWORDS =
|
||||
process.env.SEARCH_KEYWORDS || "layoff,downsizing,job cuts";
|
||||
process.env.SEARCH_KEYWORDS || "layoff,downsizing";//,job cuts";
|
||||
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
||||
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false";
|
||||
const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends";
|
||||
|
||||
@ -10,6 +10,7 @@ const {
|
||||
containsAnyKeyword,
|
||||
validateLocationAgainstFilters,
|
||||
extractLocationFromProfile,
|
||||
parseLocationFilters,
|
||||
} = require("ai-analyzer");
|
||||
|
||||
/**
|
||||
@ -48,28 +49,44 @@ async function linkedinStrategy(coreParser, options = {}) {
|
||||
await coreParser.navigateTo(searchUrl, {
|
||||
pageId: "linkedin-main",
|
||||
retries: 2,
|
||||
waitUntil: "networkidle", // Wait for network to be idle
|
||||
});
|
||||
|
||||
// Wait for page to load - use delay utility instead of waitForTimeout
|
||||
await new Promise(resolve => setTimeout(resolve, 3000)); // Give LinkedIn time to render
|
||||
// Wait for page to load and content to render
|
||||
await new Promise(resolve => setTimeout(resolve, 5000)); // Give LinkedIn time to render dynamic content
|
||||
|
||||
// Scroll down a bit to trigger lazy loading
|
||||
try {
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo(0, 500);
|
||||
});
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
} catch (e) {
|
||||
logger.debug(`Could not scroll page: ${e.message}`);
|
||||
}
|
||||
|
||||
// Wait for search results - try multiple selectors
|
||||
let hasResults = false;
|
||||
const possibleSelectors = [
|
||||
".feed-shared-update-v2",
|
||||
"article[data-urn*='urn:li:activity']",
|
||||
"article",
|
||||
".search-results-container",
|
||||
".search-results__list",
|
||||
".reusable-search__result-container",
|
||||
"[data-test-id='search-results']",
|
||||
".feed-shared-update-v2",
|
||||
"article",
|
||||
];
|
||||
|
||||
for (const selector of possibleSelectors) {
|
||||
try {
|
||||
await page.waitForSelector(selector, { timeout: 5000 });
|
||||
hasResults = true;
|
||||
logger.info(`✅ Found results container with selector: ${selector}`);
|
||||
break;
|
||||
await page.waitForSelector(selector, { timeout: 10000 });
|
||||
// Verify we actually have post elements
|
||||
const count = await page.$$(selector).then(elements => elements.length);
|
||||
if (count > 0) {
|
||||
hasResults = true;
|
||||
logger.info(`✅ Found ${count} post elements with selector: ${selector}`);
|
||||
break;
|
||||
}
|
||||
} catch (e) {
|
||||
// Try next selector
|
||||
}
|
||||
@ -100,20 +117,24 @@ async function linkedinStrategy(coreParser, options = {}) {
|
||||
// Validate location if filtering enabled
|
||||
if (locationFilter) {
|
||||
const postLocation = post.location || post.profileLocation || "";
|
||||
// Parse locationFilter string into array if it's a string
|
||||
const locationFiltersArray = typeof locationFilter === 'string'
|
||||
? parseLocationFilters(locationFilter)
|
||||
: locationFilter;
|
||||
const locationValid = validateLocationAgainstFilters(
|
||||
postLocation,
|
||||
locationFilter
|
||||
locationFiltersArray
|
||||
);
|
||||
|
||||
if (!locationValid) {
|
||||
if (!locationValid.isValid) {
|
||||
logger.debug(`⏭️ Post rejected: location "${postLocation}" doesn't match filter "${locationFilter}"`);
|
||||
rejectedResults.push({
|
||||
...post,
|
||||
rejectionReason: `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`,
|
||||
rejectionReason: locationValid.reasoning || `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`,
|
||||
});
|
||||
continue;
|
||||
} else {
|
||||
logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}"`);
|
||||
logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}" (${locationValid.reasoning || 'matched'})`);
|
||||
}
|
||||
}
|
||||
|
||||
@ -156,9 +177,12 @@ async function extractPostsFromPage(page, keyword) {
|
||||
|
||||
try {
|
||||
// Try multiple selectors for post elements (LinkedIn changes these frequently)
|
||||
// Prioritize selectors that are more specific to actual posts
|
||||
const postSelectors = [
|
||||
".feed-shared-update-v2",
|
||||
"article[data-urn*='urn:li:activity']", // Most specific - posts with activity ID
|
||||
".feed-shared-update-v2[data-urn*='urn:li:activity']",
|
||||
"article.feed-shared-update-v2",
|
||||
".feed-shared-update-v2",
|
||||
"[data-urn*='urn:li:activity']",
|
||||
".reusable-search__result-container",
|
||||
".search-result__wrapper",
|
||||
@ -170,11 +194,30 @@ async function extractPostsFromPage(page, keyword) {
|
||||
|
||||
for (const selector of postSelectors) {
|
||||
try {
|
||||
// Wait a bit for elements to be available
|
||||
await page.waitForSelector(selector, { timeout: 3000 }).catch(() => {});
|
||||
postElements = await page.$$(selector);
|
||||
|
||||
// Filter to only elements that have a data-urn attribute (actual posts)
|
||||
if (postElements.length > 0) {
|
||||
usedSelector = selector;
|
||||
logger.info(`✅ Found ${postElements.length} post elements using selector: ${selector}`);
|
||||
break;
|
||||
const validElements = [];
|
||||
for (const elem of postElements) {
|
||||
try {
|
||||
const dataUrn = await elem.getAttribute("data-urn");
|
||||
if (dataUrn && dataUrn.includes("urn:li:activity")) {
|
||||
validElements.push(elem);
|
||||
}
|
||||
} catch (e) {
|
||||
// Element might have been detached, skip it
|
||||
}
|
||||
}
|
||||
|
||||
if (validElements.length > 0) {
|
||||
postElements = validElements;
|
||||
usedSelector = selector;
|
||||
logger.info(`✅ Found ${postElements.length} valid post elements using selector: ${selector}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
// Try next selector
|
||||
@ -199,10 +242,22 @@ async function extractPostsFromPage(page, keyword) {
|
||||
|
||||
for (let i = 0; i < postElements.length; i++) {
|
||||
try {
|
||||
// Scroll element into view to ensure it's fully rendered
|
||||
try {
|
||||
await postElements[i].evaluate((el) => {
|
||||
el.scrollIntoView({ behavior: 'smooth', block: 'center' });
|
||||
});
|
||||
await new Promise(resolve => setTimeout(resolve, 500)); // Small delay for rendering
|
||||
} catch (e) {
|
||||
// Element might already be in view or detached, continue anyway
|
||||
}
|
||||
|
||||
const post = await extractPostData(postElements[i], keyword);
|
||||
if (post) {
|
||||
posts.push(post);
|
||||
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}...`);
|
||||
const hasContent = post.content && post.content.length > 0;
|
||||
const hasAuthor = post.authorName && post.authorName.length > 0;
|
||||
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'})`);
|
||||
} else {
|
||||
logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
|
||||
}
|
||||
@ -222,131 +277,524 @@ async function extractPostsFromPage(page, keyword) {
|
||||
|
||||
/**
 * Extract data from an individual post element.
 *
 * Performs a single `evaluate()` in the browser context and pulls everything
 * (id, author, content, timestamp, location, engagement) directly from the
 * DOM. This is more reliable than repeated ElementHandle selector round-trips,
 * which frequently miss LinkedIn's ever-changing class names.
 *
 * @param {object} postElement - Playwright/Puppeteer ElementHandle for the post's root node.
 * @param {string} keyword - Search keyword this post was found under. Stored on
 *   the result for traceability; NOT used to filter (LinkedIn search is semantic,
 *   so a matched post may never contain the literal keyword in visible text).
 * @returns {Promise<object|null>} Normalized post record, or null when both
 *   postId and content are missing, or when extraction throws.
 */
async function extractPostData(postElement, keyword) {
  try {
    // NOTE: the evaluate callback runs in the page context and must be fully
    // synchronous — no `await`, no ElementHandle APIs, only the DOM element.
    const postData = await postElement.evaluate((el, kw) => {
      const data = {
        postId: "",
        authorName: "",
        authorUrl: "",
        content: "",
        timestamp: "",
        location: "",
        likes: 0,
        comments: 0,
      };

      // Extract post ID from data-urn attribute (with fallbacks).
      data.postId = el.getAttribute("data-urn") ||
        el.getAttribute("data-activity-id") ||
        el.querySelector("[data-urn]")?.getAttribute("data-urn") || "";

      // Extract author name - try multiple selectors and approaches.
      const authorSelectors = [
        ".feed-shared-actor__name",
        ".feed-shared-actor__name-link",
        ".update-components-actor__name",
        ".feed-shared-actor__name a",
        "[data-test-id='actor-name']",
        "span[aria-label*='name']",
        "a[href*='/in/'] span",
        ".feed-shared-actor a span",
        ".feed-shared-actor span",
        ".feed-shared-actor__name-link span",
      ];

      for (const selector of authorSelectors) {
        try {
          const elem = el.querySelector(selector);
          if (elem) {
            const text = elem.textContent?.trim() || elem.innerText?.trim();
            if (text && text.length > 0 && text.length < 100) { // Reasonable name length
              data.authorName = text;
              // Try to get link from same element or parent
              const link = elem.closest("a") || elem.querySelector("a");
              if (link) {
                data.authorUrl = link.getAttribute("href") || "";
              }
              break;
            }
          }
        } catch (e) {
          // Try next selector
        }
      }

      // If author name found but no URL, try to find link separately.
      if (data.authorName && !data.authorUrl) {
        const authorLink = el.querySelector(".feed-shared-actor__name-link, .feed-shared-actor__name a, a[href*='/in/']");
        if (authorLink) {
          data.authorUrl = authorLink.getAttribute("href") || "";
        }
      }

      // Fallback: look for any link with /in/ pattern and derive the name
      // from the link text, a child span, or the parent element's text.
      if (!data.authorName) {
        const profileLinks = el.querySelectorAll("a[href*='/in/']");
        for (const link of profileLinks) {
          // Skip if it's a company link
          if (link.getAttribute("href")?.includes("/company/")) continue;

          const linkText = link.textContent?.trim() || link.innerText?.trim();
          if (linkText && linkText.length > 0 && linkText.length < 100 && !linkText.includes("View")) {
            data.authorName = linkText;
            data.authorUrl = link.getAttribute("href") || "";
            break;
          }

          // Try to get text from first child span
          const childSpan = link.querySelector("span");
          if (childSpan) {
            const spanText = childSpan.textContent?.trim() || childSpan.innerText?.trim();
            if (spanText && spanText.length > 0 && spanText.length < 100) {
              data.authorName = spanText;
              data.authorUrl = link.getAttribute("href") || "";
              break;
            }
          }

          // Try to get text from parent
          const parentText = link.parentElement?.textContent?.trim();
          if (parentText && parentText.length < 100 && !parentText.includes("View")) {
            // Extract just the name part (first line, before any "·" separator)
            const namePart = parentText.split("\n")[0].split("·")[0].trim();
            if (namePart.length > 0 && namePart.length < 100) {
              data.authorName = namePart;
              data.authorUrl = link.getAttribute("href") || "";
              break;
            }
          }
        }
      }

      // Last resort: scan the actor section's text lines for a plausible name.
      if (!data.authorName) {
        const actorSection = el.querySelector(".feed-shared-actor, .update-components-actor, [class*='actor']");
        if (actorSection) {
          const actorText = actorSection.textContent || actorSection.innerText || "";
          const lines = actorText.split("\n").map(l => l.trim()).filter(l => l.length > 0);
          // First non-empty line is often the name; reject time/count-looking lines.
          for (const line of lines) {
            if (line.length > 0 && line.length < 100 &&
                !line.includes("·") &&
                !line.includes("ago") &&
                !line.match(/^\d+/) &&
                !line.toLowerCase().includes("view")) {
              data.authorName = line;
              // Try to find associated profile link
              const link = actorSection.querySelector("a[href*='/in/']");
              if (link) {
                data.authorUrl = link.getAttribute("href") || "";
              }
              break;
            }
          }
        }
      }

      // Extract post content - try multiple selectors.
      const contentSelectors = [
        ".feed-shared-text",
        ".feed-shared-text__text-view",
        ".feed-shared-update-v2__description",
        ".update-components-text",
        "[data-test-id='post-text']",
        ".feed-shared-text span",
        ".feed-shared-update-v2__description-wrapper",
      ];

      for (const selector of contentSelectors) {
        const elem = el.querySelector(selector);
        if (elem) {
          const text = elem.textContent?.trim() || elem.innerText?.trim();
          if (text && text.length > 10) { // Only use if substantial content
            data.content = text;
            break;
          }
        }
      }

      // Extract timestamp from <time> elements or labelled spans.
      const timeSelectors = [
        ".feed-shared-actor__sub-description time",
        "time[datetime]",
        "[data-test-id='timestamp']",
        ".feed-shared-actor__sub-description time[datetime]",
        "time",
        "span[aria-label*='time']",
        "span[aria-label*='ago']",
      ];

      for (const selector of timeSelectors) {
        const elem = el.querySelector(selector);
        if (elem) {
          data.timestamp = elem.getAttribute("datetime") ||
            elem.getAttribute("title") ||
            elem.getAttribute("aria-label") ||
            elem.textContent?.trim() || "";
          if (data.timestamp) break;
        }
      }

      // Fallback: look for time-like patterns in the sub-description text.
      if (!data.timestamp) {
        const subDescForTime = el.querySelector(".feed-shared-actor__sub-description");
        if (subDescForTime) {
          const subText = subDescForTime.textContent || subDescForTime.innerText || "";
          // Patterns like "2h", "3d", "1w", "2 months ago", "just now", etc.
          const timePatterns = [
            /\d+\s*(minute|hour|day|week|month|year)s?\s*ago/i,
            /\d+\s*(h|d|w|mo|yr)/i,
            /(just now|today|yesterday)/i,
          ];
          for (const pattern of timePatterns) {
            const match = subText.match(pattern);
            if (match) {
              data.timestamp = match[0];
              break;
            }
          }
        }
      }

      // Extract location - try multiple approaches.
      const locationSelectors = [
        ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link",
        ".feed-shared-actor__sub-description-link--without-hover",
        "span[aria-label*='location' i]",
        "span[aria-label*='Location']",
        ".feed-shared-actor__sub-description span",
        ".feed-shared-actor__sub-description a",
        "a[href*='/company/']",
        "a[href*='/location/']",
      ];

      for (const selector of locationSelectors) {
        const elem = el.querySelector(selector);
        if (elem) {
          const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || elem.innerText?.trim() || "";
          // Check if it looks like a location (contains comma or common location words)
          if (text && text.length > 2 && text.length < 100) {
            if (text.includes(",") ||
                /(city|province|state|country|region|ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|prince edward island|yukon|northwest territories|nunavut)/i.test(text) ||
                /^[A-Z][a-z]+,\s*[A-Z][a-z]+/i.test(text)) {
              data.location = text;
              break;
            }
          }
        }
      }

      // If no location found, try parsing from sub-description text.
      if (!data.location) {
        const subDesc = el.querySelector(".feed-shared-actor__sub-description");
        if (subDesc) {
          const subDescText = subDesc.textContent || subDesc.innerText || "";

          // First, inspect all links in sub-description (location is often a link).
          const subDescLinks = subDesc.querySelectorAll("a");
          for (const link of subDescLinks) {
            const linkText = link.textContent?.trim() || link.innerText?.trim() || "";
            const linkHref = link.getAttribute("href") || "";

            // Skip if it's a time/date link or company link
            if (linkHref.includes("/company/") || linkText.match(/\d+\s*(minute|hour|day|week|month|year|h|d|w)/i)) {
              continue;
            }

            // If link text looks like a location
            if (linkText && linkText.length > 2 && linkText.length < 100) {
              if (linkText.includes(",") ||
                  /(ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|prince edward island|yukon|northwest territories|nunavut|toronto|vancouver|calgary|ottawa|montreal|winnipeg|edmonton|halifax|victoria|regina|saskatoon|windsor|kitchener|hamilton|london|st\.?\s*catharines|oshawa|barrie|greater sudbury|sherbrooke|kelowna|abbotsford|trois-rivières|guelph|cambridge|coquitlam|saanich|saint john|thunder bay|waterloo|delta|chatham|red deer|kamloops|brantford|whitehorse|yellowknife|iqaluit)/i.test(linkText)) {
                data.location = linkText;
                break;
              }
            }
          }

          // If still no location, try pattern matching on the full text.
          if (!data.location && subDescText) {
            // Look for location patterns (City, Province/State, Country)
            const locationPatterns = [
              // Full location: "City, Province, Country"
              /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/,
              // City, Province
              /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*,\s*([A-Z][a-z]+)/,
              // Just province/state names
              /\b(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut|ON|AB|BC|QC|MB|SK|NS|NB|NL|PE|YT|NT|NU)\b/i,
              // Major cities
              /\b(Toronto|Vancouver|Calgary|Ottawa|Montreal|Winnipeg|Edmonton|Halifax|Victoria|Regina|Saskatoon)\b/i,
            ];

            for (const pattern of locationPatterns) {
              const match = subDescText.match(pattern);
              if (match) {
                // Get more context around the match
                const matchIndex = subDescText.indexOf(match[0]);
                const contextStart = Math.max(0, matchIndex - 30);
                const contextEnd = Math.min(subDescText.length, matchIndex + match[0].length + 30);
                const context = subDescText.substring(contextStart, contextEnd).trim();

                // Extract just the location part (remove time/date info)
                let locationText = match[0].trim();
                // If we have more context, try to get a better location string
                if (context.includes(",") && context.length < 100) {
                  // Try to extract "City, Province" pattern from context
                  const cityProvinceMatch = context.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+)/);
                  if (cityProvinceMatch) {
                    locationText = cityProvinceMatch[0].trim();
                  }
                }

                data.location = locationText;
                break;
              }
            }
          }

          // Last resort: extract any text that looks location-like from sub-description.
          if (!data.location && subDescText) {
            // Split by common separators and look for location-like text
            const parts = subDescText.split(/[·•|]/).map(p => p.trim()).filter(p => p.length > 0);
            for (const part of parts) {
              // Skip if it looks like time/date
              if (part.match(/\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i)) {
                continue;
              }
              // Check if it looks like a location
              if (part.length > 2 && part.length < 100 &&
                  (part.includes(",") ||
                   /(ontario|alberta|british columbia|quebec|manitoba|toronto|vancouver|calgary|ottawa|montreal)/i.test(part))) {
                data.location = part;
                break;
              }
            }
          }
        }
      }

      // Final fallback: look anywhere in the actor section for location-like text.
      if (!data.location) {
        const actorSection = el.querySelector(".feed-shared-actor, .update-components-actor");
        if (actorSection) {
          const actorText = actorSection.textContent || actorSection.innerText || "";
          // Look for province names
          const provinceMatch = actorText.match(/\b(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)\b/i);
          if (provinceMatch) {
            // Try to get "City, Province" if available
            const cityProvinceMatch = actorText.match(/([A-Z][a-z]+),\s*(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)/i);
            if (cityProvinceMatch) {
              data.location = cityProvinceMatch[0].trim();
            } else {
              data.location = provinceMatch[0].trim();
            }
          }
        }
      }

      // Try to extract from any hover cards or mini profiles in the DOM.
      if (!data.location) {
        const miniProfileSelectors = [
          "[data-control-name='hovercard']",
          ".artdeco-hoverable-trigger",
          ".feed-shared-actor__meta",
          ".pv-text-details__left-panel",
        ];

        for (const selector of miniProfileSelectors) {
          const elem = el.querySelector(selector);
          if (elem) {
            const text = elem.textContent || elem.innerText || "";
            const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)/i);
            if (locationMatch) {
              data.location = locationMatch[0].trim();
              break;
            }
          }
        }
      }

      // Extract engagement metrics - likes/reactions.
      const likesSelectors = [
        ".social-counts-reactions__count",
        "[data-test-id='reactions-count']",
        ".social-counts__reactions-count",
        ".feed-shared-social-action-bar__reactions-count",
        "button[aria-label*='reaction']",
        "button[aria-label*='like']",
        ".social-actions-button__reactions-count",
        "[data-test-id='social-actions__reactions-count']",
      ];

      for (const selector of likesSelectors) {
        const elem = el.querySelector(selector);
        if (elem) {
          const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || "";
          const match = text.match(/(\d+)/);
          if (match) {
            data.likes = parseInt(match[1], 10) || 0;
            break;
          }
        }
      }

      // Fallback: look for any button or element with reaction/like text.
      if (data.likes === 0) {
        const allButtons = el.querySelectorAll("button, span, div");
        for (const btn of allButtons) {
          const text = btn.textContent?.trim() || btn.getAttribute("aria-label") || "";
          if (/reaction|like/i.test(text)) {
            const match = text.match(/(\d+)/);
            if (match) {
              data.likes = parseInt(match[1], 10) || 0;
              break;
            }
          }
        }
      }

      // Extract engagement metrics - comments.
      const commentsSelectors = [
        ".social-counts-comments__count",
        "[data-test-id='comments-count']",
        ".social-counts__comments-count",
        ".feed-shared-social-action-bar__comments-count",
        "button[aria-label*='comment']",
        ".social-actions-button__comments-count",
        "[data-test-id='social-actions__comments-count']",
      ];

      for (const selector of commentsSelectors) {
        const elem = el.querySelector(selector);
        if (elem) {
          const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || "";
          const match = text.match(/(\d+)/);
          if (match) {
            data.comments = parseInt(match[1], 10) || 0;
            break;
          }
        }
      }

      // Fallback: look for any button or element with comment text.
      if (data.comments === 0) {
        const allButtons = el.querySelectorAll("button, span, div");
        for (const btn of allButtons) {
          const text = btn.textContent?.trim() || btn.getAttribute("aria-label") || "";
          if (/comment/i.test(text)) {
            const match = text.match(/(\d+)/);
            if (match) {
              data.comments = parseInt(match[1], 10) || 0;
              break;
            }
          }
        }
      }

      return data;
    }, keyword);

    // Clean and format the extracted data.
    const authorName = cleanText(postData.authorName);
    let authorUrl = postData.authorUrl || "";
    if (authorUrl && !authorUrl.startsWith("http")) {
      authorUrl = `https://www.linkedin.com${authorUrl}`;
    }
    const content = cleanText(postData.content);
    const location = cleanText(postData.location);
    const timestamp = postData.timestamp || "";

    // Note: LinkedIn search already filters by keyword semantically.
    // We don't filter by content keyword match because:
    // 1. LinkedIn's search is semantic - it finds related posts, not just exact matches
    // 2. The keyword might be in comments, hashtags, or metadata, not visible text
    // 3. Posts might be about the topic without using the exact keyword
    //
    // Optional: log if keyword appears in content (for debugging, but don't filter).
    const keywordLower = keyword.toLowerCase();
    const contentLower = content.toLowerCase();
    const hasKeywordInContent = contentLower.includes(keywordLower);
    if (!hasKeywordInContent && content.length > 50) {
      logger.debug(`ℹ️ Post doesn't contain keyword "${keyword}" in visible content, but including it (LinkedIn search matched it)`);
    }

    // Validate we have minimum required data.
    if (!postData.postId && !content) {
      logger.debug(`⏭️ Post filtered: missing both postId and content`);
      return null;
    }

    // Log extraction results for debugging.
    const missingFields = [];
    if (!authorName) missingFields.push("authorName");
    if (!authorUrl) missingFields.push("authorUrl");
    if (!location) missingFields.push("location");
    if (!timestamp) missingFields.push("timestamp");
    if (postData.likes === 0 && postData.comments === 0) missingFields.push("engagement");

    if (missingFields.length > 0 && postData.postId) {
      logger.debug(`⚠️ Post ${postData.postId.substring(0, 20)}... missing: ${missingFields.join(", ")}`);

      // If location is missing, log sub-description content for debugging.
      if (!location && process.env.DEBUG_EXTRACTION === "true") {
        try {
          const subDescInfo = await postElement.evaluate((el) => {
            const subDesc = el.querySelector(".feed-shared-actor__sub-description");
            if (subDesc) {
              return {
                text: subDesc.textContent || subDesc.innerText || "",
                html: subDesc.innerHTML.substring(0, 500),
                links: Array.from(subDesc.querySelectorAll("a")).map(a => ({
                  text: a.textContent?.trim(),
                  href: a.getAttribute("href")
                }))
              };
            }
            return null;
          });
          if (subDescInfo) {
            logger.debug(`Sub-description text: "${subDescInfo.text}"`);
            logger.debug(`Sub-description links: ${JSON.stringify(subDescInfo.links)}`);
          }
        } catch (e) {
          // Ignore errors in debugging
        }
      }

      // Optionally log HTML structure for badly-failed extractions (to help debug).
      if (process.env.DEBUG_EXTRACTION === "true" && missingFields.length >= 3) {
        try {
          const htmlSnippet = await postElement.evaluate((el) => {
            // Get the outer HTML of the element (limited to first 2000 chars)
            const html = el.outerHTML || "";
            return html.substring(0, 2000);
          });
          logger.debug(`HTML structure (first 2000 chars):\n${htmlSnippet}`);
        } catch (e) {
          // Ignore errors in debugging
        }
      }
    }

    return {
      postId: cleanText(postData.postId),
      authorName,
      authorUrl,
      profileLink: authorUrl,
      text: content,
      content: content,
      location: location,
      profileLocation: location, // Alias for compatibility
      timestamp,
      keyword,
      likes: postData.likes || 0,
      comments: postData.comments || 0,
      extractedAt: new Date().toISOString(),
      source: "linkedin",
      parser: "linkedout-parser",
    };
  } catch (error) {
    logger.warning(`Error extracting post data: ${error.message}`);
    logger.debug(`Stack trace: ${error.stack}`);
    return null;
  }
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user