linkedout/linkedin-parser/strategies/linkedin-strategy.js
tanyar09 8de65bc04c Add initial project structure for Job Market Intelligence platform
- Created core modules: `ai-analyzer`, `core-parser`, and `job-search-parser`.
- Implemented LinkedIn and job search parsers with integrated AI analysis.
- Added CLI tools for AI analysis and job parsing.
- Included comprehensive README files for each module detailing usage and features.
- Established a `.gitignore` file to exclude unnecessary files.
- Introduced sample data for testing and demonstration purposes.
- Set up package.json files for dependency management across modules.
- Implemented logging and error handling utilities for better debugging and user feedback.
2025-12-12 14:23:01 -05:00

367 lines
12 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* LinkedIn Parsing Strategy
*
* Uses core-parser for browser management and ai-analyzer for utilities
*/
const {
logger,
cleanText,
containsAnyKeyword,
validateLocationAgainstFilters,
extractLocationFromProfile,
} = require("ai-analyzer");
/**
 * CSS selectors probed, in order, to detect that a LinkedIn content-search
 * results page has rendered. LinkedIn changes its markup frequently, so several
 * candidates are tried until one matches.
 */
const LINKEDIN_RESULT_CONTAINER_SELECTORS = [
  ".search-results-container",
  ".search-results__list",
  ".reusable-search__result-container",
  "[data-test-id='search-results']",
  ".feed-shared-update-v2",
  "article",
];

/**
 * Build the key used to de-duplicate posts across keyword searches.
 *
 * Posts may legitimately have an empty `postId` (they are accepted as long as
 * they have content), so falling back to author + content keeps distinct
 * id-less posts from collapsing onto the single key "".
 */
function buildPostDedupKey(post) {
  if (post.postId) {
    return `id:${post.postId}`;
  }
  const author = post.authorUrl || post.authorName || "";
  const body = post.content || post.text || "";
  return `content:${author}|${body}`;
}

/**
 * Wait for the first selector in `selectors` that appears on `page`.
 * Returns the matching selector, or null when none appeared within `timeout` ms each.
 */
async function waitForFirstMatchingSelector(page, selectors, timeout) {
  for (const selector of selectors) {
    try {
      await page.waitForSelector(selector, { timeout });
      return selector;
    } catch (e) {
      // A miss for this selector is expected; fall through to the next candidate.
    }
  }
  return null;
}

/**
 * Best-effort full-page screenshot for debugging a search that rendered no results.
 * Never throws: a failed screenshot is only logged.
 */
async function saveDebugScreenshot(page, keyword) {
  try {
    // Replace whitespace and path-unsafe characters so the keyword cannot
    // produce an invalid file name or escape into another directory.
    const safeKeyword = String(keyword).replace(/[\s\\/:*?"<>|]+/g, "-");
    const screenshotPath = `debug-${safeKeyword}-${Date.now()}.png`;
    await page.screenshot({ path: screenshotPath, fullPage: true });
    logger.info(`📸 Debug screenshot saved: ${screenshotPath}`);
  } catch (e) {
    logger.warning(`Could not take screenshot: ${e.message}`);
  }
}

/**
 * LinkedIn parsing strategy function.
 *
 * Authenticates through `coreParser`, runs one LinkedIn content search per
 * keyword, extracts the posts of each results page, de-duplicates them,
 * optionally filters them by location and stops as soon as `maxResults`
 * posts have been accepted.
 *
 * @param {object} coreParser - Browser manager; `createPage`, `authenticate`
 *   and `navigateTo` are called on it.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Search terms; one search is run per term.
 * @param {*} [options.locationFilter=null] - Forwarded to
 *   `validateLocationAgainstFilters`; a falsy value disables location filtering.
 * @param {number} [options.maxResults=50] - Stop after this many accepted posts.
 * @param {object} [options.credentials={}] - Forwarded to `coreParser.authenticate`.
 * @param {number} [options.renderDelayMs=3000] - Pause after navigation to let
 *   the page render before probing for results.
 * @param {number} [options.selectorTimeoutMs=5000] - Per-selector wait while
 *   probing for the results container.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 * @throws Re-throws any error raised during page creation, authentication or
 *   navigation after logging it.
 */
async function linkedinStrategy(coreParser, options = {}) {
  const {
    keywords = ["layoff", "downsizing", "job cuts"],
    locationFilter = null,
    maxResults = 50,
    credentials = {},
    renderDelayMs = 3000,
    selectorTimeoutMs = 5000,
  } = options;

  const results = [];
  const rejectedResults = [];
  const seenPostKeys = new Set();

  try {
    // Create main page
    const page = await coreParser.createPage("linkedin-main");

    // Authenticate to LinkedIn
    logger.info("🔐 Authenticating to LinkedIn...");
    await coreParser.authenticate("linkedin", credentials, "linkedin-main");
    logger.info("✅ LinkedIn authentication successful");

    // Search for posts with each keyword
    for (const keyword of keywords) {
      logger.info(`🔍 Searching LinkedIn for: "${keyword}"`);

      const searchUrl = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
        keyword
      )}&sortBy=date_posted`;
      await coreParser.navigateTo(searchUrl, {
        pageId: "linkedin-main",
        retries: 2,
      });

      // Give LinkedIn time to render before probing for result containers.
      await new Promise((resolve) => setTimeout(resolve, renderDelayMs));

      const matchedSelector = await waitForFirstMatchingSelector(
        page,
        LINKEDIN_RESULT_CONTAINER_SELECTORS,
        selectorTimeoutMs
      );
      if (!matchedSelector) {
        logger.warning(`⚠️ No search results container found for keyword: ${keyword}`);
        await saveDebugScreenshot(page, keyword);
        continue;
      }
      logger.info(`✅ Found results container with selector: ${matchedSelector}`);

      // Extract posts from current page
      const posts = await extractPostsFromPage(page, keyword);
      logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);

      for (const post of posts) {
        // Skip duplicates (the same post can surface for several keywords)
        const dedupKey = buildPostDedupKey(post);
        if (seenPostKeys.has(dedupKey)) continue;
        seenPostKeys.add(dedupKey);

        // Validate location if filtering enabled
        if (locationFilter) {
          const postLocation = post.location || post.profileLocation || "";
          const locationValid = validateLocationAgainstFilters(
            postLocation,
            locationFilter
          );
          if (!locationValid) {
            logger.debug(`⏭️ Post rejected: location "${postLocation}" doesn't match filter "${locationFilter}"`);
            rejectedResults.push({
              ...post,
              rejectionReason: `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`,
            });
            continue;
          }
          logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}"`);
        }

        results.push(post);
        if (results.length >= maxResults) {
          logger.info(`📊 Reached maximum results limit: ${maxResults}`);
          break;
        }
      }

      // Stop issuing further searches once the limit has been reached.
      if (results.length >= maxResults) break;
    }

    logger.info(
      `🎯 LinkedIn parsing completed: ${results.length} posts found, ${rejectedResults.length} rejected`
    );

    return {
      results,
      rejectedResults,
      summary: {
        totalPosts: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
      },
    };
  } catch (error) {
    logger.error(`❌ LinkedIn parsing failed: ${error.message}`);
    throw error;
  }
}
/**
 * Collect every post that can be parsed from the search results page
 * currently loaded in `page`.
 *
 * Candidate selectors are probed in order and the first one yielding at least
 * one element wins. Each matched element is handed to `extractPostData`;
 * elements that yield nothing or throw are skipped. Failures are logged rather
 * than propagated, so the function always resolves to an array (possibly empty).
 *
 * @param {object} page - Page handle; `$$`, `title` and `url` are called on it.
 * @param {string} keyword - Search keyword, forwarded to `extractPostData`.
 * @returns {Promise<object[]>} The successfully extracted posts.
 */
async function extractPostsFromPage(page, keyword) {
  const collected = [];

  try {
    // LinkedIn changes its markup frequently, so several candidates are probed.
    const candidateSelectors = [
      ".feed-shared-update-v2",
      "article.feed-shared-update-v2",
      "[data-urn*='urn:li:activity']",
      ".reusable-search__result-container",
      ".search-result__wrapper",
      "article",
    ];

    let elements = [];
    for (const candidate of candidateSelectors) {
      try {
        elements = await page.$$(candidate);
        if (elements.length > 0) {
          logger.info(`✅ Found ${elements.length} post elements using selector: ${candidate}`);
          break;
        }
      } catch (e) {
        // This candidate failed; move on to the next one.
      }
    }

    // Nothing matched: record where we are to help diagnose markup changes.
    if (elements.length === 0) {
      logger.warning(`⚠️ No post elements found with any selector. Page might have different structure.`);
      try {
        const title = await page.title();
        const currentUrl = page.url();
        logger.info(`📄 Page title: ${title}`);
        logger.info(`🔗 Page URL: ${currentUrl}`);
      } catch (e) {
        // Diagnostics are optional.
      }
      return collected;
    }

    const total = elements.length;
    logger.info(`🔍 Processing ${total} post elements...`);

    for (const [index, element] of elements.entries()) {
      const position = index + 1;
      try {
        const parsed = await extractPostData(element, keyword);
        if (!parsed) {
          logger.debug(`⏭️ Post ${position}/${total} filtered out (no keyword match or missing data)`);
          continue;
        }
        collected.push(parsed);
        logger.debug(`✅ Extracted post ${position}/${total}: ${parsed.postId.substring(0, 20)}...`);
      } catch (error) {
        logger.warning(`❌ Failed to extract post ${position} data: ${error.message}`);
      }
    }

    logger.info(`✅ Successfully extracted ${collected.length} valid posts from ${total} elements`);
  } catch (error) {
    logger.error(`❌ Failed to extract posts from page: ${error.message}`);
    logger.error(`Stack trace: ${error.stack}`);
  }

  return collected;
}
/**
 * Selectors tried, in order, to read the author's location from the
 * sub-description of a post.
 */
const POST_LOCATION_SELECTORS = [
  ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link",
  ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link--without-hover",
  ".feed-shared-actor__sub-description span[aria-label*='location']",
  ".feed-shared-actor__sub-description span[aria-label*='Location']",
];

/** Matches "City, Province/State" with an optional ", Country" in free text. */
const POST_LOCATION_PATTERN =
  /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/;

/** Cleaned text of the first descendant matching `selector`, or `fallback` when absent. */
async function readElementText(scope, selector, fallback = "") {
  const node = await scope.$(selector);
  return node ? cleanText(await node.textContent()) : fallback;
}

/** Attribute of the first descendant matching `selector`; "" when the element or attribute is missing. */
async function readElementAttribute(scope, selector, attribute) {
  const node = await scope.$(selector);
  if (!node) return "";
  return (await node.getAttribute(attribute)) || "";
}

/** Turn a LinkedIn href (absolute, protocol-relative or relative) into an absolute URL; "" for empty input. */
function toAbsoluteProfileLink(href) {
  if (!href) return "";
  if (href.startsWith("http")) return href;
  if (href.startsWith("//")) return `https:${href}`;
  // Insert the separating slash when the relative href lacks one.
  return `https://www.linkedin.com${href.startsWith("/") ? "" : "/"}${href}`;
}

/**
 * Best-effort location lookup: dedicated selectors first, then a
 * "City, Region[, Country]" pattern search in the sub-description text.
 * Never throws; returns "" when no location can be determined.
 */
async function resolvePostLocation(postElement) {
  for (const selector of POST_LOCATION_SELECTORS) {
    try {
      const node = await postElement.$(selector);
      const raw = node ? await node.textContent() : "";
      if (raw && raw.trim()) {
        return cleanText(raw);
      }
    } catch (e) {
      // Location is optional; note the failure and try the next selector.
      logger.debug(`Location selector "${selector}" failed: ${e.message}`);
    }
  }

  try {
    const subDescription = await postElement.$(".feed-shared-actor__sub-description");
    const subDescriptionText = subDescription ? await subDescription.textContent() : "";
    const match = (subDescriptionText || "").match(POST_LOCATION_PATTERN);
    if (match) {
      return cleanText(match[0]);
    }
  } catch (e) {
    // Location extraction failed, continue without it.
    logger.debug(`Location pattern lookup failed: ${e.message}`);
  }

  return "";
}

/**
 * Extract data from an individual post element.
 *
 * Reads the post id (`data-urn`), author name/link, visible text, timestamp,
 * location and engagement counts from `postElement`. Posts are NOT filtered
 * by keyword: LinkedIn's search already matched them, and the keyword may
 * live in comments, hashtags or metadata rather than in the visible text.
 *
 * @param {object} postElement - Element handle; `$` and `getAttribute` are
 *   called on it, `textContent`/`getAttribute` on the handles `$` returns.
 * @param {string} keyword - Search keyword that surfaced the post; stored on
 *   the result and used only for a debug log.
 * @returns {Promise<object|null>} The post record, or null when the element
 *   has neither an id nor content, or when extraction throws (logged as a warning).
 */
async function extractPostData(postElement, keyword) {
  try {
    // Extract post ID
    const postId = (await postElement.getAttribute("data-urn")) || "";

    // Extract author info
    const authorName = await readElementText(postElement, ".feed-shared-actor__name");
    const authorUrl = await readElementAttribute(
      postElement,
      ".feed-shared-actor__name a",
      "href"
    );

    // Extract post content
    const content = await readElementText(postElement, ".feed-shared-text");

    // Extract timestamp
    const timestamp = await readElementAttribute(
      postElement,
      ".feed-shared-actor__sub-description time",
      "datetime"
    );

    // Extract location from the post's sub-description
    const location = await resolvePostLocation(postElement);

    // Extract engagement metrics
    const likesText = await readElementText(
      postElement,
      ".social-counts-reactions__count",
      "0"
    );
    const commentsText = await readElementText(
      postElement,
      ".social-counts-comments__count",
      "0"
    );

    // Debug aid only (never filters). Guarded so that a missing or non-string
    // keyword cannot throw here and cause the whole post to be discarded.
    const keywordLower = typeof keyword === "string" ? keyword.toLowerCase() : "";
    const contentLower = typeof content === "string" ? content.toLowerCase() : "";
    if (keywordLower && !contentLower.includes(keywordLower) && content.length > 50) {
      logger.debug(` Post doesn't contain keyword "${keyword}" in visible content, but including it (LinkedIn search matched it)`);
    }

    // Validate we have minimum required data
    if (!postId && !content) {
      logger.debug(`⏭️ Post filtered: missing both postId and content`);
      return null;
    }

    return {
      postId: cleanText(postId),
      authorName,
      authorUrl,
      profileLink: toAbsoluteProfileLink(authorUrl),
      text: content,
      content: content,
      location: location,
      profileLocation: location, // Alias for compatibility
      timestamp,
      keyword,
      likes: extractNumber(likesText),
      comments: extractNumber(commentsText),
      extractedAt: new Date().toISOString(),
      source: "linkedin",
      parser: "linkedout-parser",
    };
  } catch (error) {
    logger.warning(`Error extracting post data: ${error.message}`);
    return null;
  }
}
/**
 * Extract the first integer from text (e.g., "15 likes" -> 15).
 *
 * Thousands separators are honoured ("1,234 reactions" -> 1234), and
 * null/undefined or non-string input is tolerated.
 *
 * @param {*} text - Text that may contain a count.
 * @returns {number} The first integer found, or 0 when there is none.
 */
function extractNumber(text) {
  if (text == null) return 0;

  // Prefer a comma-grouped number ("1,234,567"); otherwise take the first digit run.
  const match = String(text).match(/\d{1,3}(?:,\d{3})+|\d+/);
  if (!match) return 0;

  return Number.parseInt(match[0].replace(/,/g, ""), 10);
}
// Public API of this module: the strategy entry point plus the two
// extraction functions it is built on.
module.exports = {
linkedinStrategy,
extractPostsFromPage,
extractPostData,
};