linkedout/linkedin-parser/strategies/linkedin-strategy.js

231 lines
6.2 KiB
JavaScript

/**
* LinkedIn Parsing Strategy
*
* Uses core-parser for browser management and ai-analyzer for utilities
*/
const {
logger,
cleanText,
containsAnyKeyword,
validateLocationAgainstFilters,
extractLocationFromProfile,
} = require("ai-analyzer");
/**
 * LinkedIn parsing strategy function.
 *
 * Authenticates through `coreParser`, runs one content search per keyword
 * (sorted by date posted) and collects the posts extracted from each results
 * page until `maxResults` accepted posts have been gathered.
 *
 * @param {object} coreParser - Browser/session helper. This function calls
 *   `createPage`, `authenticate`, `navigateTo` and
 *   `navigationManager.navigateAndWaitFor` on it.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Search terms; one search is run per term.
 * @param {*} [options.locationFilter=null] - When truthy, handed to
 *   `validateLocationAgainstFilters` together with the post's location; posts
 *   that fail the check are moved to `rejectedResults`.
 * @param {number} [options.maxResults=50] - Stop once this many posts are accepted.
 * @param {object} [options.credentials={}] - Forwarded to `coreParser.authenticate`.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 *   Accepted posts, rejected posts (each with a `rejectionReason`) and a summary.
 * @throws Re-throws any error raised during the run after logging it.
 */
async function linkedinStrategy(coreParser, options = {}) {
  const {
    keywords = ["layoff", "downsizing", "job cuts"],
    locationFilter = null,
    maxResults = 50,
    credentials = {},
  } = options;

  const results = [];
  const rejectedResults = [];
  const seenPosts = new Set();

  try {
    // Create main page
    const page = await coreParser.createPage("linkedin-main");

    // Authenticate to LinkedIn
    logger.info("🔐 Authenticating to LinkedIn...");
    await coreParser.authenticate("linkedin", credentials, "linkedin-main");
    logger.info("✅ LinkedIn authentication successful");

    // Search for posts with each keyword
    for (const keyword of keywords) {
      logger.info(`🔍 Searching LinkedIn for: "${keyword}"`);

      const searchUrl = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
        keyword
      )}&sortBy=date_posted`;

      // NOTE(review): the URL is handed to both navigateTo and
      // navigateAndWaitFor; if the latter also navigates, the page is loaded
      // twice per keyword — confirm against core-parser before removing one.
      await coreParser.navigateTo(searchUrl, {
        pageId: "linkedin-main",
        retries: 2,
      });

      // Wait for search results
      const hasResults = await coreParser.navigationManager.navigateAndWaitFor(
        searchUrl,
        ".search-results-container",
        { pageId: "linkedin-main", timeout: 10000 }
      );

      if (!hasResults) {
        logger.warning(`No search results found for keyword: ${keyword}`);
        continue;
      }

      // Extract posts from current page
      const posts = await extractPostsFromPage(page, keyword);

      for (const post of posts) {
        // Skip duplicates. A post without an id would otherwise share the key
        // "" with every other id-less post and be dropped as a false
        // duplicate, so fall back to a key built from its visible fields.
        const dedupeKey =
          post.postId ||
          `no-id:${post.authorUrl}|${post.timestamp}|${post.content}`;
        if (seenPosts.has(dedupeKey)) continue;
        seenPosts.add(dedupeKey);

        // Validate location if filtering enabled
        if (locationFilter) {
          const locationValid = validateLocationAgainstFilters(
            post.location || post.profileLocation,
            locationFilter
          );

          if (!locationValid) {
            rejectedResults.push({
              ...post,
              rejectionReason: "Location filter mismatch",
            });
            continue;
          }
        }

        results.push(post);

        if (results.length >= maxResults) {
          logger.info(`📊 Reached maximum results limit: ${maxResults}`);
          break;
        }
      }

      // The inner break only leaves the post loop; stop searching as well.
      if (results.length >= maxResults) break;
    }

    logger.info(
      `🎯 LinkedIn parsing completed: ${results.length} posts found, ${rejectedResults.length} rejected`
    );

    return {
      results,
      rejectedResults,
      summary: {
        totalPosts: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
      },
    };
  } catch (error) {
    logger.error(`❌ LinkedIn parsing failed: ${error.message}`);
    throw error;
  }
}
/**
 * Collect every post found on the search results page currently loaded.
 *
 * Queries the page for `.feed-shared-update-v2` elements and runs
 * `extractPostData` on each, keeping only truthy results. A failure on one
 * element is logged as a warning and skipped; a failure of the page query
 * itself is logged as an error and whatever was collected so far is returned.
 *
 * @param {object} page - Page handle exposing an async `$$(selector)` method.
 * @param {string} keyword - Search keyword forwarded to `extractPostData`.
 * @returns {Promise<object[]>} Extracted posts (possibly empty).
 */
async function extractPostsFromPage(page, keyword) {
  const collected = [];

  // Handles a single element so that one bad post cannot abort the page.
  const collectFrom = async (element) => {
    try {
      const data = await extractPostData(element, keyword);
      if (!data) return;
      collected.push(data);
    } catch (err) {
      logger.warning(`Failed to extract post data: ${err.message}`);
    }
  };

  try {
    const elements = await page.$$(".feed-shared-update-v2");
    for (const element of elements) {
      await collectFrom(element);
    }
  } catch (err) {
    logger.error(`Failed to extract posts from page: ${err.message}`);
  }

  return collected;
}
/**
 * Build a post record from a single feed element.
 *
 * Reads the post id (`data-urn`), author name and link, body text, timestamp
 * and reaction/comment counts from the element. Returns `null` when the body
 * text does not contain `keyword`, or when anything throws while reading the
 * element (the error is logged as a warning).
 *
 * @param {object} postElement - Element handle exposing async `$(selector)`
 *   and `getAttribute(name)`; matched children expose `textContent()` and
 *   `getAttribute(name)`.
 * @param {string} keyword - Keyword the post content must contain.
 * @returns {Promise<object|null>} The post record, or `null` if skipped/failed.
 */
async function extractPostData(postElement, keyword) {
  // Cleaned text of the first child matching `selector`, or `fallback` if none.
  const textOf = async (selector, fallback) => {
    const node = await postElement.$(selector);
    if (!node) return fallback;
    return cleanText(await node.textContent());
  };

  // Raw attribute value of the first child matching `selector`, or "" if none.
  const attrOf = async (selector, attribute) => {
    const node = await postElement.$(selector);
    if (!node) return "";
    return node.getAttribute(attribute);
  };

  try {
    // Post identity
    const rawUrn = await postElement.getAttribute("data-urn");
    const postId = rawUrn || "";

    // Author
    const authorName = await textOf(".feed-shared-actor__name", "");
    const authorUrl = await attrOf(".feed-shared-actor__name a", "href");

    // Body text
    const content = await textOf(".feed-shared-text", "");

    // Publication time
    const timestamp = await attrOf(
      ".feed-shared-actor__sub-description time",
      "datetime"
    );

    // Engagement counters (still text at this point)
    const likesText = await textOf(".social-counts-reactions__count", "0");
    const commentsText = await textOf(".social-counts-comments__count", "0");

    // Posts that do not mention the keyword are dropped.
    if (!containsAnyKeyword(content, [keyword])) {
      return null;
    }

    return {
      postId: cleanText(postId),
      authorName,
      authorUrl,
      content,
      timestamp,
      keyword,
      likes: extractNumber(likesText),
      comments: extractNumber(commentsText),
      extractedAt: new Date().toISOString(),
      source: "linkedin",
    };
  } catch (err) {
    logger.warning(`Error extracting post data: ${err.message}`);
    return null;
  }
}
/**
 * Extract the first integer from a piece of text.
 *
 * Examples: "15 likes" -> 15, "1,234 reactions" -> 1234, "" -> 0.
 * Comma thousands separators inside the number are ignored so that counts
 * above 999 are not truncated to their leading group.
 *
 * @param {*} text - Text to scan; non-strings are converted with `String()`,
 *   `null`/`undefined` yield 0.
 * @returns {number} The parsed base-10 integer, or 0 when no digits are found.
 */
function extractNumber(text) {
  if (text == null) return 0;

  // A digit followed by any run of digits and commas, e.g. "1,234".
  const match = String(text).match(/\d[\d,]*/);
  if (!match) return 0;

  // Explicit radix so the digits are always read as decimal.
  return Number.parseInt(match[0].replace(/,/g, ""), 10);
}
// Public API of this module: the strategy entry point plus the two page/post
// extraction helpers. `extractNumber` is not exported.
module.exports = {
linkedinStrategy,
extractPostsFromPage,
extractPostData,
};