231 lines
6.2 KiB
JavaScript
231 lines
6.2 KiB
JavaScript
/**
 * LinkedIn Parsing Strategy
 *
 * Uses core-parser for browser management and ai-analyzer for utilities.
 */
|
|
|
|
const {
|
|
logger,
|
|
cleanText,
|
|
containsAnyKeyword,
|
|
validateLocationAgainstFilters,
|
|
extractLocationFromProfile,
|
|
} = require("ai-analyzer");
|
|
|
|
/**
 * LinkedIn parsing strategy.
 *
 * Authenticates through `coreParser`, runs one content search per keyword
 * (sorted by date posted) and collects the posts returned by
 * `extractPostsFromPage`, de-duplicating them by `postId` and optionally
 * filtering them by location.
 *
 * @param {object} coreParser - Browser manager. This function calls
 *   `createPage`, `authenticate`, `navigateTo` and
 *   `navigationManager.navigateAndWaitFor` on it.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Search terms; one search is run per entry.
 * @param {*} [options.locationFilter=null] - Handed to
 *   `validateLocationAgainstFilters`; location filtering is skipped when falsy.
 * @param {number} [options.maxResults=50] - Searching stops once this many
 *   posts have been accepted.
 * @param {object} [options.credentials={}] - Forwarded to `coreParser.authenticate`.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 *   Accepted posts, posts rejected by the location filter (each with a
 *   `rejectionReason`), and a summary of counts and inputs.
 * @throws Re-throws, after logging, any error raised while authenticating or searching.
 */
async function linkedinStrategy(coreParser, options = {}) {
  const {
    keywords = ["layoff", "downsizing", "job cuts"],
    locationFilter = null,
    maxResults = 50,
    credentials = {},
  } = options;

  const results = [];
  const rejectedResults = [];
  const seenPosts = new Set();

  try {
    // Create main page
    const page = await coreParser.createPage("linkedin-main");

    // Authenticate to LinkedIn
    logger.info("🔐 Authenticating to LinkedIn...");
    await coreParser.authenticate("linkedin", credentials, "linkedin-main");
    logger.info("✅ LinkedIn authentication successful");

    // Search for posts with each keyword
    for (const keyword of keywords) {
      logger.info(`🔍 Searching LinkedIn for: "${keyword}"`);

      const searchUrl = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
        keyword
      )}&sortBy=date_posted`;

      await coreParser.navigateTo(searchUrl, {
        pageId: "linkedin-main",
        retries: 2,
      });

      // NOTE(review): navigateAndWaitFor receives the same URL that navigateTo
      // just loaded; if it performs its own navigation the page is loaded
      // twice per keyword — confirm against core-parser.
      const hasResults = await coreParser.navigationManager.navigateAndWaitFor(
        searchUrl,
        ".search-results-container",
        { pageId: "linkedin-main", timeout: 10000 }
      );

      if (!hasResults) {
        logger.warning(`No search results found for keyword: ${keyword}`);
        continue;
      }

      // Extract posts from current page
      const posts = await extractPostsFromPage(page, keyword);

      for (const post of posts) {
        // Skip duplicates. Posts without a data-urn arrive with an empty
        // postId and cannot be told apart, so de-duplicating on "" would
        // silently drop every such post after the first one.
        if (post.postId) {
          if (seenPosts.has(post.postId)) continue;
          seenPosts.add(post.postId);
        }

        // Validate location if filtering enabled.
        // NOTE(review): posts built by extractPostData in this file carry
        // neither `location` nor `profileLocation`, so the validator is
        // called with `undefined` here — confirm that is intended.
        if (locationFilter) {
          const locationValid = validateLocationAgainstFilters(
            post.location || post.profileLocation,
            locationFilter
          );

          if (!locationValid) {
            rejectedResults.push({
              ...post,
              rejectionReason: "Location filter mismatch",
            });
            continue;
          }
        }

        results.push(post);

        if (results.length >= maxResults) {
          logger.info(`📊 Reached maximum results limit: ${maxResults}`);
          break;
        }
      }

      // The inner break only leaves the post loop; stop searching as well.
      if (results.length >= maxResults) break;
    }

    logger.info(
      `🎯 LinkedIn parsing completed: ${results.length} posts found, ${rejectedResults.length} rejected`
    );

    return {
      results,
      rejectedResults,
      summary: {
        totalPosts: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
      },
    };
  } catch (error) {
    logger.error(`❌ LinkedIn parsing failed: ${error.message}`);
    throw error;
  }
}
|
|
|
|
/**
 * Collect the posts found on the current search results page.
 *
 * Queries `page` for every `.feed-shared-update-v2` element and runs
 * `extractPostData` on each in turn. A failure on a single element is logged
 * as a warning and skipped; a failure of the page query itself is logged as
 * an error. In both cases whatever was gathered so far is returned.
 *
 * @param {object} page - Page handle exposing `$$(selector)`.
 * @param {string} keyword - Search keyword forwarded to `extractPostData`.
 * @returns {Promise<object[]>} Extracted posts (possibly empty); never rejects
 *   for extraction failures.
 */
async function extractPostsFromPage(page, keyword) {
  const collected = [];

  try {
    const cards = await page.$$(".feed-shared-update-v2");

    for (const card of cards) {
      try {
        const extracted = await extractPostData(card, keyword);
        // extractPostData yields null for posts it decides to skip.
        if (!extracted) continue;
        collected.push(extracted);
      } catch (err) {
        logger.warning(`Failed to extract post data: ${err.message}`);
      }
    }
  } catch (err) {
    logger.error(`Failed to extract posts from page: ${err.message}`);
  }

  return collected;
}
|
|
|
|
/**
 * Extract data from an individual post element.
 *
 * Reads the post id and text first and returns early when the text does not
 * contain `keyword`, so irrelevant posts do not pay for the author, timestamp
 * and engagement lookups.
 *
 * The returned object has no `location` / `profileLocation` field.
 *
 * @param {object} postElement - Element handle exposing `getAttribute(name)`
 *   and `$(selector)`; matched children must expose `textContent()` and
 *   `getAttribute(name)`.
 * @param {string} keyword - Keyword the post content must contain.
 * @returns {Promise<object|null>} The post record, or `null` when the post is
 *   irrelevant or extraction throws (the error is logged as a warning).
 */
async function extractPostData(postElement, keyword) {
  // Cleaned text of the first match for `selector`, or `fallback` if absent.
  const textOf = async (selector, fallback) => {
    const node = await postElement.$(selector);
    return node ? cleanText(await node.textContent()) : fallback;
  };

  // Raw attribute value of the first match for `selector`, or "" if absent.
  const attributeOf = async (selector, attribute) => {
    const node = await postElement.$(selector);
    return node ? await node.getAttribute(attribute) : "";
  };

  try {
    // Extract post ID
    const postId = (await postElement.getAttribute("data-urn")) || "";

    // Extract post content and bail out before doing any further lookups
    const content = await textOf(".feed-shared-text", "");
    if (!containsAnyKeyword(content, [keyword])) {
      return null; // Skip irrelevant posts
    }

    // Extract author info
    const authorName = await textOf(".feed-shared-actor__name", "");
    const authorUrl = await attributeOf(".feed-shared-actor__name a", "href");

    // Extract timestamp
    const timestamp = await attributeOf(
      ".feed-shared-actor__sub-description time",
      "datetime"
    );

    // Extract engagement metrics; a missing counter is treated as "0"
    const likesText = await textOf(".social-counts-reactions__count", "0");
    const commentsText = await textOf(".social-counts-comments__count", "0");

    return {
      postId: cleanText(postId),
      authorName,
      authorUrl,
      content,
      timestamp,
      keyword,
      likes: extractNumber(likesText),
      comments: extractNumber(commentsText),
      extractedAt: new Date().toISOString(),
      source: "linkedin",
    };
  } catch (error) {
    logger.warning(`Error extracting post data: ${error.message}`);
    return null;
  }
}
|
|
|
|
/**
 * Extract the first integer from a piece of text (e.g. "15 likes" -> 15).
 *
 * Comma-grouped thousands are read as one number ("1,234 reactions" -> 1234).
 * Anything without digits, including `null` and `undefined`, yields 0 instead
 * of throwing.
 *
 * @param {*} text - Value to scan; non-strings are converted with `String()`.
 * @returns {number} The parsed base-10 integer, or 0 when no digits are found.
 */
function extractNumber(text) {
  if (text == null) return 0;

  // Prefer a well-formed comma-grouped number; otherwise take the first run
  // of digits. The lookahead rejects malformed groupings such as "1,2345".
  const match = String(text).match(/\d{1,3}(?:,\d{3})+(?!\d)|\d+/);
  if (!match) return 0;

  return Number.parseInt(match[0].replace(/,/g, ""), 10);
}
|
|
|
|
// Public API: the strategy entry point plus the two extraction helpers.
module.exports = { linkedinStrategy, extractPostsFromPage, extractPostData };
|