- Created core modules: `ai-analyzer`, `core-parser`, and `job-search-parser`.
- Implemented LinkedIn and job search parsers with integrated AI analysis.
- Added CLI tools for AI analysis and job parsing.
- Included comprehensive README files for each module detailing usage and features.
- Established a `.gitignore` file to exclude unnecessary files.
- Introduced sample data for testing and demonstration purposes.
- Set up package.json files for dependency management across modules.
- Implemented logging and error-handling utilities for better debugging and user feedback.
367 lines · 12 KiB · JavaScript
/**
|
||
* LinkedIn Parsing Strategy
|
||
*
|
||
* Uses core-parser for browser management and ai-analyzer for utilities
|
||
*/
|
||
|
||
const {
|
||
logger,
|
||
cleanText,
|
||
containsAnyKeyword,
|
||
validateLocationAgainstFilters,
|
||
extractLocationFromProfile,
|
||
} = require("ai-analyzer");
|
||
|
||
/**
|
||
* LinkedIn parsing strategy function
|
||
*/
|
||
async function linkedinStrategy(coreParser, options = {}) {
|
||
const {
|
||
keywords = ["layoff", "downsizing", "job cuts"],
|
||
locationFilter = null,
|
||
maxResults = 50,
|
||
credentials = {},
|
||
} = options;
|
||
|
||
const results = [];
|
||
const rejectedResults = [];
|
||
const seenPosts = new Set();
|
||
const seenProfiles = new Set();
|
||
|
||
try {
|
||
// Create main page
|
||
const page = await coreParser.createPage("linkedin-main");
|
||
|
||
// Authenticate to LinkedIn
|
||
logger.info("🔐 Authenticating to LinkedIn...");
|
||
await coreParser.authenticate("linkedin", credentials, "linkedin-main");
|
||
logger.info("✅ LinkedIn authentication successful");
|
||
|
||
// Search for posts with each keyword
|
||
for (const keyword of keywords) {
|
||
logger.info(`🔍 Searching LinkedIn for: "${keyword}"`);
|
||
|
||
const searchUrl = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
|
||
keyword
|
||
)}&sortBy=date_posted`;
|
||
|
||
await coreParser.navigateTo(searchUrl, {
|
||
pageId: "linkedin-main",
|
||
retries: 2,
|
||
});
|
||
|
||
// Wait for page to load - use delay utility instead of waitForTimeout
|
||
await new Promise(resolve => setTimeout(resolve, 3000)); // Give LinkedIn time to render
|
||
|
||
// Wait for search results - try multiple selectors
|
||
let hasResults = false;
|
||
const possibleSelectors = [
|
||
".search-results-container",
|
||
".search-results__list",
|
||
".reusable-search__result-container",
|
||
"[data-test-id='search-results']",
|
||
".feed-shared-update-v2",
|
||
"article",
|
||
];
|
||
|
||
for (const selector of possibleSelectors) {
|
||
try {
|
||
await page.waitForSelector(selector, { timeout: 5000 });
|
||
hasResults = true;
|
||
logger.info(`✅ Found results container with selector: ${selector}`);
|
||
break;
|
||
} catch (e) {
|
||
// Try next selector
|
||
}
|
||
}
|
||
|
||
if (!hasResults) {
|
||
logger.warning(`⚠️ No search results container found for keyword: ${keyword}`);
|
||
// Take screenshot for debugging
|
||
try {
|
||
const screenshotPath = `debug-${keyword.replace(/\s+/g, '-')}-${Date.now()}.png`;
|
||
await page.screenshot({ path: screenshotPath, fullPage: true });
|
||
logger.info(`📸 Debug screenshot saved: ${screenshotPath}`);
|
||
} catch (e) {
|
||
logger.warning(`Could not take screenshot: ${e.message}`);
|
||
}
|
||
continue;
|
||
}
|
||
|
||
// Extract posts from current page
|
||
const posts = await extractPostsFromPage(page, keyword);
|
||
logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);
|
||
|
||
for (const post of posts) {
|
||
// Skip duplicates
|
||
if (seenPosts.has(post.postId)) continue;
|
||
seenPosts.add(post.postId);
|
||
|
||
// Validate location if filtering enabled
|
||
if (locationFilter) {
|
||
const postLocation = post.location || post.profileLocation || "";
|
||
const locationValid = validateLocationAgainstFilters(
|
||
postLocation,
|
||
locationFilter
|
||
);
|
||
|
||
if (!locationValid) {
|
||
logger.debug(`⏭️ Post rejected: location "${postLocation}" doesn't match filter "${locationFilter}"`);
|
||
rejectedResults.push({
|
||
...post,
|
||
rejectionReason: `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`,
|
||
});
|
||
continue;
|
||
} else {
|
||
logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}"`);
|
||
}
|
||
}
|
||
|
||
results.push(post);
|
||
|
||
if (results.length >= maxResults) {
|
||
logger.info(`📊 Reached maximum results limit: ${maxResults}`);
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (results.length >= maxResults) break;
|
||
}
|
||
|
||
logger.info(
|
||
`🎯 LinkedIn parsing completed: ${results.length} posts found, ${rejectedResults.length} rejected`
|
||
);
|
||
|
||
return {
|
||
results,
|
||
rejectedResults,
|
||
summary: {
|
||
totalPosts: results.length,
|
||
totalRejected: rejectedResults.length,
|
||
keywords: keywords.join(", "),
|
||
locationFilter,
|
||
},
|
||
};
|
||
} catch (error) {
|
||
logger.error(`❌ LinkedIn parsing failed: ${error.message}`);
|
||
throw error;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Extract posts from current search results page
|
||
*/
|
||
async function extractPostsFromPage(page, keyword) {
|
||
const posts = [];
|
||
|
||
try {
|
||
// Try multiple selectors for post elements (LinkedIn changes these frequently)
|
||
const postSelectors = [
|
||
".feed-shared-update-v2",
|
||
"article.feed-shared-update-v2",
|
||
"[data-urn*='urn:li:activity']",
|
||
".reusable-search__result-container",
|
||
".search-result__wrapper",
|
||
"article",
|
||
];
|
||
|
||
let postElements = [];
|
||
let usedSelector = null;
|
||
|
||
for (const selector of postSelectors) {
|
||
try {
|
||
postElements = await page.$$(selector);
|
||
if (postElements.length > 0) {
|
||
usedSelector = selector;
|
||
logger.info(`✅ Found ${postElements.length} post elements using selector: ${selector}`);
|
||
break;
|
||
}
|
||
} catch (e) {
|
||
// Try next selector
|
||
}
|
||
}
|
||
|
||
if (postElements.length === 0) {
|
||
logger.warning(`⚠️ No post elements found with any selector. Page might have different structure.`);
|
||
// Log page title and URL for debugging
|
||
try {
|
||
const pageTitle = await page.title();
|
||
const pageUrl = page.url();
|
||
logger.info(`📄 Page title: ${pageTitle}`);
|
||
logger.info(`🔗 Page URL: ${pageUrl}`);
|
||
} catch (e) {
|
||
// Ignore
|
||
}
|
||
return posts;
|
||
}
|
||
|
||
logger.info(`🔍 Processing ${postElements.length} post elements...`);
|
||
|
||
for (let i = 0; i < postElements.length; i++) {
|
||
try {
|
||
const post = await extractPostData(postElements[i], keyword);
|
||
if (post) {
|
||
posts.push(post);
|
||
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}...`);
|
||
} else {
|
||
logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
|
||
}
|
||
} catch (error) {
|
||
logger.warning(`❌ Failed to extract post ${i + 1} data: ${error.message}`);
|
||
}
|
||
}
|
||
|
||
logger.info(`✅ Successfully extracted ${posts.length} valid posts from ${postElements.length} elements`);
|
||
} catch (error) {
|
||
logger.error(`❌ Failed to extract posts from page: ${error.message}`);
|
||
logger.error(`Stack trace: ${error.stack}`);
|
||
}
|
||
|
||
return posts;
|
||
}
|
||
|
||
/**
|
||
* Extract data from individual post element
|
||
*/
|
||
async function extractPostData(postElement, keyword) {
|
||
try {
|
||
// Extract post ID
|
||
const postId = (await postElement.getAttribute("data-urn")) || "";
|
||
|
||
// Extract author info
|
||
const authorElement = await postElement.$(".feed-shared-actor__name");
|
||
const authorName = authorElement
|
||
? cleanText(await authorElement.textContent())
|
||
: "";
|
||
|
||
const authorLinkElement = await postElement.$(".feed-shared-actor__name a");
|
||
const authorUrl = authorLinkElement
|
||
? await authorLinkElement.getAttribute("href")
|
||
: "";
|
||
|
||
// Extract post content
|
||
const contentElement = await postElement.$(".feed-shared-text");
|
||
const content = contentElement
|
||
? cleanText(await contentElement.textContent())
|
||
: "";
|
||
|
||
// Extract timestamp
|
||
const timeElement = await postElement.$(
|
||
".feed-shared-actor__sub-description time"
|
||
);
|
||
const timestamp = timeElement
|
||
? await timeElement.getAttribute("datetime")
|
||
: "";
|
||
|
||
// Extract location from profile (try multiple selectors)
|
||
let location = "";
|
||
const locationSelectors = [
|
||
".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link",
|
||
".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link--without-hover",
|
||
".feed-shared-actor__sub-description span[aria-label*='location']",
|
||
".feed-shared-actor__sub-description span[aria-label*='Location']",
|
||
];
|
||
|
||
for (const selector of locationSelectors) {
|
||
try {
|
||
const locationElement = await postElement.$(selector);
|
||
if (locationElement) {
|
||
const locationText = await locationElement.textContent();
|
||
if (locationText && locationText.trim()) {
|
||
location = cleanText(locationText);
|
||
break;
|
||
}
|
||
}
|
||
} catch (e) {
|
||
// Try next selector
|
||
}
|
||
}
|
||
|
||
// If no location found in sub-description, try to extract from author link hover or profile
|
||
if (!location) {
|
||
try {
|
||
// Try to get location from data attributes or other sources
|
||
const subDescElement = await postElement.$(".feed-shared-actor__sub-description");
|
||
if (subDescElement) {
|
||
const subDescText = await subDescElement.textContent();
|
||
// Look for location patterns (City, Province/State, Country)
|
||
const locationMatch = subDescText.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/);
|
||
if (locationMatch) {
|
||
location = cleanText(locationMatch[0]);
|
||
}
|
||
}
|
||
} catch (e) {
|
||
// Location extraction failed, continue without it
|
||
}
|
||
}
|
||
|
||
// Extract engagement metrics
|
||
const likesElement = await postElement.$(".social-counts-reactions__count");
|
||
const likesText = likesElement
|
||
? cleanText(await likesElement.textContent())
|
||
: "0";
|
||
|
||
const commentsElement = await postElement.$(
|
||
".social-counts-comments__count"
|
||
);
|
||
const commentsText = commentsElement
|
||
? cleanText(await commentsElement.textContent())
|
||
: "0";
|
||
|
||
// Note: LinkedIn search already filters by keyword semantically
|
||
// We don't filter by content keyword match because:
|
||
// 1. LinkedIn's search is semantic - it finds related posts, not just exact matches
|
||
// 2. The keyword might be in comments, hashtags, or metadata, not visible text
|
||
// 3. Posts might be about the topic without using the exact keyword
|
||
//
|
||
// Optional: Log if keyword appears in content (for debugging, but don't filter)
|
||
const keywordLower = keyword.toLowerCase();
|
||
const contentLower = content.toLowerCase();
|
||
const hasKeywordInContent = contentLower.includes(keywordLower);
|
||
if (!hasKeywordInContent && content.length > 50) {
|
||
logger.debug(`ℹ️ Post doesn't contain keyword "${keyword}" in visible content, but including it (LinkedIn search matched it)`);
|
||
}
|
||
|
||
// Validate we have minimum required data
|
||
if (!postId && !content) {
|
||
logger.debug(`⏭️ Post filtered: missing both postId and content`);
|
||
return null;
|
||
}
|
||
|
||
return {
|
||
postId: cleanText(postId),
|
||
authorName,
|
||
authorUrl,
|
||
profileLink: authorUrl ? (authorUrl.startsWith("http") ? authorUrl : `https://www.linkedin.com${authorUrl}`) : "",
|
||
text: content,
|
||
content: content,
|
||
location: location,
|
||
profileLocation: location, // Alias for compatibility
|
||
timestamp,
|
||
keyword,
|
||
likes: extractNumber(likesText),
|
||
comments: extractNumber(commentsText),
|
||
extractedAt: new Date().toISOString(),
|
||
source: "linkedin",
|
||
parser: "linkedout-parser",
|
||
};
|
||
} catch (error) {
|
||
logger.warning(`Error extracting post data: ${error.message}`);
|
||
return null;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Extract numbers from text (e.g., "15 likes" -> 15)
|
||
*/
|
||
function extractNumber(text) {
|
||
const match = text.match(/\d+/);
|
||
return match ? parseInt(match[0]) : 0;
|
||
}
|
||
|
||
module.exports = {
|
||
linkedinStrategy,
|
||
extractPostsFromPage,
|
||
extractPostData,
|
||
};
|