/*
 * Change notes:
 * - Updated `ai-utils.js` to improve AI response parsing and added timeout
 *   handling for API requests.
 * - Modified `linkedin-parser` to refine search keyword handling and improve
 *   post extraction reliability.
 * - Enhanced location filtering logic and added more robust selectors for
 *   extracting post data.
 * - Improved logging for debugging purposes, including detailed extraction
 *   results and fallback mechanisms.
 *
 * File info: 815 lines, 31 KiB, JavaScript
 */
/**
 * LinkedIn Parsing Strategy
 *
 * Uses core-parser for browser management and ai-analyzer for utilities
 */

const {
  logger,
  cleanText,
  containsAnyKeyword,
  validateLocationAgainstFilters,
  extractLocationFromProfile,
  parseLocationFilters,
} = require("ai-analyzer");

/**
 * Wait until one of the known search-result selectors matches at least one
 * element on the page.
 *
 * Selectors are tried in order, each with a 10 second timeout; a selector that
 * times out (or otherwise fails) is skipped in favour of the next one.
 *
 * @param {object} page - Page handle exposing `waitForSelector()` and `$$()`.
 * @returns {Promise<boolean>} `true` as soon as one selector matches, else `false`.
 */
async function waitForSearchResults(page) {
  const possibleSelectors = [
    ".feed-shared-update-v2",
    "article[data-urn*='urn:li:activity']",
    "article",
    ".search-results-container",
    ".search-results__list",
    ".reusable-search__result-container",
    "[data-test-id='search-results']",
  ];

  for (const selector of possibleSelectors) {
    try {
      await page.waitForSelector(selector, { timeout: 10000 });
      // Verify we actually have matching elements
      const elements = await page.$$(selector);
      if (elements.length > 0) {
        logger.info(`✅ Found ${elements.length} post elements with selector: ${selector}`);
        return true;
      }
    } catch (e) {
      // Best effort: a selector that never shows up just means "try the next one".
      logger.debug(`Selector "${selector}" not available: ${e.message}`);
    }
  }

  return false;
}

/**
 * LinkedIn parsing strategy function.
 *
 * Authenticates through `coreParser`, runs one content search per keyword,
 * extracts the posts of each result page, drops duplicates, applies the
 * optional location filter and stops once `maxResults` posts were accepted.
 *
 * @param {object} coreParser - Browser manager exposing `createPage()`,
 *   `authenticate()` and `navigateTo()`.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Search keywords, searched in order.
 * @param {string|Array|null} [options.locationFilter] - Location filter; a
 *   string is parsed once with `parseLocationFilters`, any other truthy value
 *   is handed to `validateLocationAgainstFilters` unchanged.
 * @param {number} [options.maxResults=50] - Stop after this many accepted posts.
 * @param {object} [options.credentials] - Passed through to `authenticate()`.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 * @throws Re-throws any error raised by authentication, navigation or
 *   extraction after logging it.
 */
async function linkedinStrategy(coreParser, options = {}) {
  const {
    keywords = ["layoff", "downsizing", "job cuts"],
    locationFilter = null,
    maxResults = 50,
    credentials = {},
  } = options;

  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const results = [];
  const rejectedResults = [];
  const seenPosts = new Set();

  try {
    // The filter does not change between posts, so normalise it exactly once.
    let locationFilters = null;
    if (locationFilter) {
      locationFilters =
        typeof locationFilter === "string"
          ? parseLocationFilters(locationFilter)
          : locationFilter;
    }

    // Create main page
    const page = await coreParser.createPage("linkedin-main");

    // Authenticate to LinkedIn
    logger.info("🔐 Authenticating to LinkedIn...");
    await coreParser.authenticate("linkedin", credentials, "linkedin-main");
    logger.info("✅ LinkedIn authentication successful");

    // Search for posts with each keyword
    for (const keyword of keywords) {
      logger.info(`🔍 Searching LinkedIn for: "${keyword}"`);

      const searchUrl = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
        keyword
      )}&sortBy=date_posted`;

      await coreParser.navigateTo(searchUrl, {
        pageId: "linkedin-main",
        retries: 2,
        waitUntil: "networkidle", // Wait for network to be idle
      });

      // Give LinkedIn time to render dynamic content
      await sleep(5000);

      // Scroll down a bit to trigger lazy loading
      try {
        await page.evaluate(() => {
          window.scrollTo(0, 500);
        });
        await sleep(2000);
      } catch (e) {
        logger.debug(`Could not scroll page: ${e.message}`);
      }

      const hasResults = await waitForSearchResults(page);
      if (!hasResults) {
        logger.warning(`⚠️ No search results container found for keyword: ${keyword}`);
        // Take screenshot for debugging
        try {
          // Keep only letters, digits, "_" and "-" so a keyword such as
          // "a/b" cannot produce an invalid or nested file path.
          const safeKeyword = keyword.replace(/[^\p{L}\p{N}_-]+/gu, "-");
          const screenshotPath = `debug-${safeKeyword}-${Date.now()}.png`;
          await page.screenshot({ path: screenshotPath, fullPage: true });
          logger.info(`📸 Debug screenshot saved: ${screenshotPath}`);
        } catch (e) {
          logger.warning(`Could not take screenshot: ${e.message}`);
        }
        continue;
      }

      // Extract posts from current page
      const posts = await extractPostsFromPage(page, keyword);
      logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);

      for (const post of posts) {
        // Skip duplicates. Posts can legitimately come back without a postId
        // (only content is required), so fall back to author + content;
        // otherwise every id-less post would collide on the empty string.
        const dedupeKey = post.postId || `${post.authorUrl || ""}|${post.content || ""}`;
        if (seenPosts.has(dedupeKey)) continue;
        seenPosts.add(dedupeKey);

        // Validate location if filtering enabled
        if (locationFilter) {
          const postLocation = post.location || post.profileLocation || "";
          const locationValid = validateLocationAgainstFilters(postLocation, locationFilters);

          if (!locationValid.isValid) {
            logger.debug(`⏭️ Post rejected: location "${postLocation}" doesn't match filter "${locationFilter}"`);
            rejectedResults.push({
              ...post,
              rejectionReason:
                locationValid.reasoning ||
                `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`,
            });
            continue;
          }

          logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}" (${locationValid.reasoning || 'matched'})`);
        }

        results.push(post);

        if (results.length >= maxResults) {
          logger.info(`📊 Reached maximum results limit: ${maxResults}`);
          break;
        }
      }

      if (results.length >= maxResults) break;
    }

    logger.info(
      `🎯 LinkedIn parsing completed: ${results.length} posts found, ${rejectedResults.length} rejected`
    );

    return {
      results,
      rejectedResults,
      summary: {
        totalPosts: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
      },
    };
  } catch (error) {
    logger.error(`❌ LinkedIn parsing failed: ${error.message}`);
    throw error;
  }
}

/**
 * Extract posts from the current search results page.
 *
 * Candidate selectors are tried from most to least specific. Only elements
 * whose own `data-urn` attribute contains "urn:li:activity" count as posts;
 * the first selector that yields at least one such element wins. Each post
 * element is scrolled into view and handed to `extractPostData`.
 *
 * Errors are logged, never thrown: on failure the posts collected so far are
 * returned.
 *
 * @param {object} page - Page handle exposing `waitForSelector()`, `$$()`,
 *   `title()` and `url()`.
 * @param {string} keyword - Search keyword, forwarded to `extractPostData`.
 * @returns {Promise<object[]>} Extracted posts (possibly empty).
 */
async function extractPostsFromPage(page, keyword) {
  const posts = [];
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  try {
    // Try multiple selectors for post elements (LinkedIn changes these frequently)
    // Prioritize selectors that are more specific to actual posts
    const postSelectors = [
      "article[data-urn*='urn:li:activity']", // Most specific - posts with activity ID
      ".feed-shared-update-v2[data-urn*='urn:li:activity']",
      "article.feed-shared-update-v2",
      ".feed-shared-update-v2",
      "[data-urn*='urn:li:activity']",
      ".reusable-search__result-container",
      ".search-result__wrapper",
      "article",
    ];

    // Only ever holds validated post elements. Candidates of a selector that
    // failed validation must not leak into processing, otherwise arbitrary
    // non-post elements get extracted and the "nothing found" diagnostics
    // below are skipped.
    let postElements = [];

    for (const selector of postSelectors) {
      try {
        // Wait a bit for elements to be available; a timeout is not an error here
        await page.waitForSelector(selector, { timeout: 3000 }).catch(() => {});
        const candidates = await page.$$(selector);

        const validElements = [];
        for (const candidate of candidates) {
          try {
            const dataUrn = await candidate.getAttribute("data-urn");
            if (dataUrn && dataUrn.includes("urn:li:activity")) {
              validElements.push(candidate);
            }
          } catch (e) {
            // Element might have been detached, skip it
          }
        }

        if (validElements.length > 0) {
          postElements = validElements;
          logger.info(`✅ Found ${postElements.length} valid post elements using selector: ${selector}`);
          break;
        }
      } catch (e) {
        // Best effort: fall through to the next selector.
        logger.debug(`Selector "${selector}" failed: ${e.message}`);
      }
    }

    if (postElements.length === 0) {
      logger.warning(`⚠️ No post elements found with any selector. Page might have different structure.`);
      // Log page title and URL for debugging
      try {
        const pageTitle = await page.title();
        const pageUrl = page.url();
        logger.info(`📄 Page title: ${pageTitle}`);
        logger.info(`🔗 Page URL: ${pageUrl}`);
      } catch (e) {
        // Diagnostics only; ignore
      }
      return posts;
    }

    logger.info(`🔍 Processing ${postElements.length} post elements...`);

    for (let i = 0; i < postElements.length; i++) {
      const element = postElements[i];
      try {
        // Scroll element into view to ensure it's fully rendered
        try {
          await element.evaluate((el) => {
            el.scrollIntoView({ behavior: 'smooth', block: 'center' });
          });
          await sleep(500); // Small delay for rendering
        } catch (e) {
          // Element might already be in view or detached, continue anyway
        }

        const post = await extractPostData(element, keyword);
        if (post) {
          posts.push(post);
          const hasContent = post.content && post.content.length > 0;
          const hasAuthor = post.authorName && post.authorName.length > 0;
          logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'})`);
        } else {
          logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
        }
      } catch (error) {
        logger.warning(`❌ Failed to extract post ${i + 1} data: ${error.message}`);
      }
    }

    logger.info(`✅ Successfully extracted ${posts.length} valid posts from ${postElements.length} elements`);
  } catch (error) {
    logger.error(`❌ Failed to extract posts from page: ${error.message}`);
    logger.error(`Stack trace: ${error.stack}`);
  }

  return posts;
}

/**
 * Extract data from an individual post element.
 *
 * All DOM reads happen inside a single `evaluate()` call on the element. The
 * callback is serialised and run in the page, so every helper it needs is
 * declared inside it; nothing from this module's scope is reachable there.
 *
 * Each field is resolved through an ordered list of selectors followed by
 * progressively looser text heuristics; the first strategy that produces a
 * value wins.
 *
 * @param {object} postElement - Element handle exposing `evaluate(fn)`.
 * @param {string} keyword - Search keyword the post was found with; copied
 *   into the result.
 * @returns {Promise<object|null>} The normalised post, or `null` when neither
 *   a post id nor any content could be extracted, or when extraction threw
 *   (the error is logged, not re-thrown).
 */
async function extractPostData(postElement, keyword) {
  try {
    const postData = await postElement.evaluate((el) => {
      // ---- shared constants -------------------------------------------
      const SUB_DESCRIPTION = ".feed-shared-actor__sub-description";
      const PROVINCE_NAMES =
        "Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut";
      const MAJOR_CITIES =
        "Toronto|Vancouver|Calgary|Ottawa|Montreal|Winnipeg|Edmonton|Halifax|Victoria|Regina|Saskatoon";
      const OTHER_CITIES =
        /windsor|kitchener|hamilton|london|st\.?\s*catharines|oshawa|barrie|greater sudbury|sherbrooke|kelowna|abbotsford|trois-rivières|guelph|cambridge|coquitlam|saanich|saint john|thunder bay|waterloo|delta|chatham|red deer|kamloops|brantford|whitehorse|yellowknife|iqaluit/
          .source;

      const PROVINCE_RE = new RegExp(`\\b(${PROVINCE_NAMES})\\b`, "i");
      // Two-letter codes stay case-sensitive: matched case-insensitively,
      // "ON" would also hit the ordinary English word "on".
      const PROVINCE_CODE_RE = /\b(ON|AB|BC|QC|MB|SK|NS|NB|NL|PE|YT|NT|NU)\b/;
      const MAJOR_CITY_RE = new RegExp(`\\b(${MAJOR_CITIES})\\b`, "i");
      const LOCATION_WORD_RE = new RegExp(
        `(city|province|state|country|region|${PROVINCE_NAMES})`,
        "i"
      );
      const KNOWN_PLACE_RE = new RegExp(
        `(${PROVINCE_NAMES}|${MAJOR_CITIES}|${OTHER_CITIES})`,
        "i"
      );
      const CITY_PROVINCE_RE = new RegExp(`([A-Z][a-z]+),\\s*(${PROVINCE_NAMES})`, "i");
      const MULTIWORD_CITY_PROVINCE_RE = new RegExp(
        `([A-Z][a-z]+(?:\\s+[A-Z][a-z]+)*),\\s*(${PROVINCE_NAMES})`,
        "i"
      );

      // ---- small helpers ----------------------------------------------
      const textOf = (node) => node.textContent?.trim() || node.innerText?.trim() || "";
      const rawTextOf = (node) => node.textContent || node.innerText || "";
      const hrefOf = (node) => node.getAttribute("href") || "";
      const isPlausibleName = (text) => text.length > 0 && text.length < 100;
      const isPlausiblePlace = (text) => text.length > 2 && text.length < 100;

      // First integer in `text`, honouring "," thousands separators
      // ("1,234" -> 1234). Returns null when there is no digit at all.
      const parseCount = (text) => {
        const match = text.match(/\d[\d,]*/);
        if (!match) return null;
        return parseInt(match[0].replace(/,/g, ""), 10) || 0;
      };

      // ---- author -----------------------------------------------------
      const findAuthor = () => {
        const author = { name: "", url: "" };

        const authorSelectors = [
          ".feed-shared-actor__name",
          ".feed-shared-actor__name-link",
          ".update-components-actor__name",
          ".feed-shared-actor__name a",
          "[data-test-id='actor-name']",
          "span[aria-label*='name']",
          "a[href*='/in/'] span",
          ".feed-shared-actor a span",
          ".feed-shared-actor span",
          ".feed-shared-actor__name-link span",
        ];

        for (const selector of authorSelectors) {
          const node = el.querySelector(selector);
          if (!node) continue;
          const text = textOf(node);
          if (!isPlausibleName(text)) continue;
          author.name = text;
          // Try to get link from same element or parent
          const link = node.closest("a") || node.querySelector("a");
          if (link) author.url = hrefOf(link);
          break;
        }

        // Name found but no URL: look for the profile link separately
        if (author.name && !author.url) {
          const link = el.querySelector(
            ".feed-shared-actor__name-link, .feed-shared-actor__name a, a[href*='/in/']"
          );
          if (link) author.url = hrefOf(link);
        }

        // Fallback: any /in/ link, taking the name from the link, its first
        // span, or its parent's text
        if (!author.name) {
          for (const link of el.querySelectorAll("a[href*='/in/']")) {
            // Skip if it's a company link
            if (link.getAttribute("href")?.includes("/company/")) continue;

            const linkText = textOf(link);
            if (isPlausibleName(linkText) && !linkText.includes("View")) {
              author.name = linkText;
              author.url = hrefOf(link);
              break;
            }

            const childSpan = link.querySelector("span");
            if (childSpan) {
              const spanText = textOf(childSpan);
              if (isPlausibleName(spanText)) {
                author.name = spanText;
                author.url = hrefOf(link);
                break;
              }
            }

            const parentText = link.parentElement?.textContent?.trim();
            if (parentText && parentText.length < 100 && !parentText.includes("View")) {
              // Keep just the name part (first line, before any "·")
              const namePart = parentText.split("\n")[0].split("·")[0].trim();
              if (isPlausibleName(namePart)) {
                author.name = namePart;
                author.url = hrefOf(link);
                break;
              }
            }
          }
        }

        // Last resort: first line of the actor section that does not look
        // like metadata
        if (!author.name) {
          const actorSection = el.querySelector(
            ".feed-shared-actor, .update-components-actor, [class*='actor']"
          );
          if (actorSection) {
            const lines = rawTextOf(actorSection)
              .split("\n")
              .map((line) => line.trim())
              .filter((line) => line.length > 0);
            for (const line of lines) {
              const looksLikeMetadata =
                line.includes("·") ||
                line.includes("ago") ||
                /^\d+/.test(line) ||
                line.toLowerCase().includes("view");
              if (line.length >= 100 || looksLikeMetadata) continue;
              author.name = line;
              const link = actorSection.querySelector("a[href*='/in/']");
              if (link) author.url = hrefOf(link);
              break;
            }
          }
        }

        return author;
      };

      // ---- content ----------------------------------------------------
      const findContent = () => {
        const contentSelectors = [
          ".feed-shared-text",
          ".feed-shared-text__text-view",
          ".feed-shared-update-v2__description",
          ".update-components-text",
          "[data-test-id='post-text']",
          ".feed-shared-text span",
          ".feed-shared-update-v2__description-wrapper",
        ];
        for (const selector of contentSelectors) {
          const node = el.querySelector(selector);
          if (!node) continue;
          const text = textOf(node);
          if (text.length > 10) return text; // Only use if substantial content
        }
        return "";
      };

      // ---- timestamp --------------------------------------------------
      const findTimestamp = () => {
        const timeSelectors = [
          ".feed-shared-actor__sub-description time",
          "time[datetime]",
          "[data-test-id='timestamp']",
          ".feed-shared-actor__sub-description time[datetime]",
          "time",
          "span[aria-label*='time']",
          "span[aria-label*='ago']",
        ];
        for (const selector of timeSelectors) {
          const node = el.querySelector(selector);
          if (!node) continue;
          const value =
            node.getAttribute("datetime") ||
            node.getAttribute("title") ||
            node.getAttribute("aria-label") ||
            node.textContent?.trim() ||
            "";
          if (value) return value;
        }

        // Fallback: relative-time patterns such as "2h", "3d", "2 months ago"
        const subDesc = el.querySelector(SUB_DESCRIPTION);
        if (subDesc) {
          const subDescText = rawTextOf(subDesc);
          const timePatterns = [
            /\d+\s*(minute|hour|day|week|month|year)s?\s*ago/i,
            // Word boundaries keep "3 developers" from being read as "3 d".
            /\b\d+\s*(h|d|w|mo|yr)\b/i,
            /(just now|today|yesterday)/i,
          ];
          for (const pattern of timePatterns) {
            const match = subDescText.match(pattern);
            if (match) return match[0];
          }
        }
        return "";
      };

      // ---- location ---------------------------------------------------
      const locationFromSubDescriptionText = (subDescText) => {
        const locationPatterns = [
          // Full location: "City, Province, Country"
          /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/,
          // City, Province
          /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*,\s*([A-Z][a-z]+)/,
          // Province names, then province codes
          PROVINCE_RE,
          PROVINCE_CODE_RE,
          // Major cities
          MAJOR_CITY_RE,
        ];

        for (const pattern of locationPatterns) {
          const match = subDescText.match(pattern);
          if (!match) continue;

          // Look at up to 30 characters either side of the match; if that
          // window contains a "City, Province" pair, prefer it.
          const contextStart = Math.max(0, match.index - 30);
          const contextEnd = Math.min(subDescText.length, match.index + match[0].length + 30);
          const context = subDescText.substring(contextStart, contextEnd).trim();

          let locationText = match[0].trim();
          if (context.includes(",") && context.length < 100) {
            const cityProvince = context.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+)/);
            if (cityProvince) locationText = cityProvince[0].trim();
          }
          return locationText;
        }

        // Last resort: split on separators and take the first location-like part
        const parts = subDescText
          .split(/[·•|]/)
          .map((part) => part.trim())
          .filter((part) => part.length > 0);
        for (const part of parts) {
          // Skip if it looks like time/date
          if (/\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i.test(part)) continue;
          if (
            isPlausiblePlace(part) &&
            (part.includes(",") ||
              /(ontario|alberta|british columbia|quebec|manitoba|toronto|vancouver|calgary|ottawa|montreal)/i.test(part))
          ) {
            return part;
          }
        }
        return "";
      };

      const findLocation = () => {
        const locationSelectors = [
          ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link",
          ".feed-shared-actor__sub-description-link--without-hover",
          "span[aria-label*='location' i]",
          "span[aria-label*='Location']",
          ".feed-shared-actor__sub-description span",
          ".feed-shared-actor__sub-description a",
          "a[href*='/company/']",
          "a[href*='/location/']",
        ];

        // 1. Dedicated selectors whose text looks like a location
        for (const selector of locationSelectors) {
          const node = el.querySelector(selector);
          if (!node) continue;
          const text =
            node.textContent?.trim() ||
            node.getAttribute("aria-label") ||
            node.innerText?.trim() ||
            "";
          if (isPlausiblePlace(text) && (text.includes(",") || LOCATION_WORD_RE.test(text))) {
            return text;
          }
        }

        // 2. Sub-description: links first, then pattern matching on its text
        const subDesc = el.querySelector(SUB_DESCRIPTION);
        if (subDesc) {
          for (const link of subDesc.querySelectorAll("a")) {
            const linkText = textOf(link);
            // Skip company links and links whose text is a time/date
            if (
              hrefOf(link).includes("/company/") ||
              /\d+\s*(minute|hour|day|week|month|year|h|d|w)/i.test(linkText)
            ) {
              continue;
            }
            if (
              isPlausiblePlace(linkText) &&
              (linkText.includes(",") || KNOWN_PLACE_RE.test(linkText))
            ) {
              return linkText;
            }
          }

          const subDescText = rawTextOf(subDesc);
          if (subDescText) {
            const fromText = locationFromSubDescriptionText(subDescText);
            if (fromText) return fromText;
          }
        }

        // 3. Province name anywhere in the actor section
        const actorSection = el.querySelector(".feed-shared-actor, .update-components-actor");
        if (actorSection) {
          const actorText = rawTextOf(actorSection);
          const provinceMatch = actorText.match(PROVINCE_RE);
          if (provinceMatch) {
            const cityProvince = actorText.match(CITY_PROVINCE_RE);
            return (cityProvince || provinceMatch)[0].trim();
          }
        }

        // 4. Hover cards / mini profiles
        const miniProfileSelectors = [
          "[data-control-name='hovercard']",
          ".artdeco-hoverable-trigger",
          ".feed-shared-actor__meta",
          ".pv-text-details__left-panel",
        ];
        for (const selector of miniProfileSelectors) {
          const node = el.querySelector(selector);
          if (!node) continue;
          const match = rawTextOf(node).match(MULTIWORD_CITY_PROVINCE_RE);
          if (match) return match[0].trim();
        }

        return "";
      };

      // ---- engagement -------------------------------------------------
      const findCount = (selectors, labelPattern) => {
        let count = 0;
        for (const selector of selectors) {
          const node = el.querySelector(selector);
          if (!node) continue;
          const parsed = parseCount(
            node.textContent?.trim() || node.getAttribute("aria-label") || ""
          );
          if (parsed !== null) {
            count = parsed;
            break;
          }
        }
        if (count !== 0) return count;

        // Fallback scan. Only aria-labels and the text of leaf elements are
        // considered: a container's textContent includes the whole post body,
        // so any number in the post would be mistaken for the count.
        for (const node of el.querySelectorAll("button, span, div")) {
          const candidates = [node.getAttribute("aria-label") || ""];
          if (node.childElementCount === 0) {
            candidates.push(node.textContent?.trim() || "");
          }
          for (const text of candidates) {
            if (!labelPattern.test(text)) continue;
            const parsed = parseCount(text);
            if (parsed !== null) return parsed;
          }
        }
        return 0;
      };

      const likesSelectors = [
        ".social-counts-reactions__count",
        "[data-test-id='reactions-count']",
        ".social-counts__reactions-count",
        ".feed-shared-social-action-bar__reactions-count",
        "button[aria-label*='reaction']",
        "button[aria-label*='like']",
        ".social-actions-button__reactions-count",
        "[data-test-id='social-actions__reactions-count']",
      ];
      const commentsSelectors = [
        ".social-counts-comments__count",
        "[data-test-id='comments-count']",
        ".social-counts__comments-count",
        ".feed-shared-social-action-bar__comments-count",
        "button[aria-label*='comment']",
        ".social-actions-button__comments-count",
        "[data-test-id='social-actions__comments-count']",
      ];

      // ---- assemble ---------------------------------------------------
      const author = findAuthor();
      return {
        // Post ID comes from the element's own data-urn, its
        // data-activity-id, or the first descendant carrying a data-urn
        postId:
          el.getAttribute("data-urn") ||
          el.getAttribute("data-activity-id") ||
          el.querySelector("[data-urn]")?.getAttribute("data-urn") ||
          "",
        authorName: author.name,
        authorUrl: author.url,
        content: findContent(),
        timestamp: findTimestamp(),
        location: findLocation(),
        likes: findCount(likesSelectors, /reaction|like/i),
        comments: findCount(commentsSelectors, /comment/i),
      };
    });

    // Clean and format the extracted data
    const authorName = cleanText(postData.authorName);
    let authorUrl = postData.authorUrl || "";
    if (authorUrl && !authorUrl.startsWith("http")) {
      authorUrl = `https://www.linkedin.com${authorUrl}`;
    }

    const content = cleanText(postData.content);
    const location = cleanText(postData.location);
    const timestamp = postData.timestamp || "";

    // Validate we have minimum required data
    if (!postData.postId && !content) {
      logger.debug(`⏭️ Post filtered: missing both postId and content`);
      return null;
    }

    // Log extraction results for debugging
    const missingFields = [];
    if (!authorName) missingFields.push("authorName");
    if (!authorUrl) missingFields.push("authorUrl");
    if (!location) missingFields.push("location");
    if (!timestamp) missingFields.push("timestamp");
    if (postData.likes === 0 && postData.comments === 0) missingFields.push("engagement");

    if (missingFields.length > 0 && postData.postId) {
      logger.debug(`⚠️ Post ${postData.postId.substring(0, 20)}... missing: ${missingFields.join(", ")}`);

      const debugExtraction = process.env.DEBUG_EXTRACTION === "true";

      // If location is missing, log sub-description content for debugging
      if (!location && debugExtraction) {
        try {
          const subDescInfo = await postElement.evaluate((el) => {
            const subDesc = el.querySelector(".feed-shared-actor__sub-description");
            if (!subDesc) return null;
            return {
              text: subDesc.textContent || subDesc.innerText || "",
              html: subDesc.innerHTML.substring(0, 500),
              links: Array.from(subDesc.querySelectorAll("a")).map((a) => ({
                text: a.textContent?.trim(),
                href: a.getAttribute("href"),
              })),
            };
          });
          if (subDescInfo) {
            logger.debug(`Sub-description text: "${subDescInfo.text}"`);
            logger.debug(`Sub-description links: ${JSON.stringify(subDescInfo.links)}`);
          }
        } catch (e) {
          // Diagnostics only; never let them fail the extraction
        }
      }

      // Optionally log the HTML structure when most fields are missing
      if (debugExtraction && missingFields.length >= 3) {
        try {
          const htmlSnippet = await postElement.evaluate((el) =>
            (el.outerHTML || "").substring(0, 2000)
          );
          logger.debug(`HTML structure (first 2000 chars):\n${htmlSnippet}`);
        } catch (e) {
          // Diagnostics only; never let them fail the extraction
        }
      }
    }

    return {
      postId: cleanText(postData.postId),
      authorName,
      authorUrl,
      profileLink: authorUrl,
      text: content,
      content: content,
      location: location,
      profileLocation: location, // Alias for compatibility
      timestamp,
      keyword,
      likes: postData.likes || 0,
      comments: postData.comments || 0,
      extractedAt: new Date().toISOString(),
      source: "linkedin",
      parser: "linkedout-parser",
    };
  } catch (error) {
    logger.warning(`Error extracting post data: ${error.message}`);
    logger.debug(`Stack trace: ${error.stack}`);
    return null;
  }
}

/**
 * Extract the first integer from text (e.g., "15 likes" -> 15).
 *
 * "," thousands separators are honoured ("1,234 reactions" -> 1234).
 *
 * @param {*} text - Value to scan; non-strings are converted with `String()`,
 *   `null`/`undefined` are treated as empty.
 * @returns {number} The parsed integer, or 0 when no digit is present.
 */
function extractNumber(text) {
  if (text == null) return 0;
  const match = String(text).match(/\d[\d,]*/);
  if (!match) return 0;
  return Number.parseInt(match[0].replace(/,/g, ""), 10) || 0;
}

// Public API of this module: the strategy entry point plus the two
// extraction helpers it is built from.
module.exports = { linkedinStrategy, extractPostsFromPage, extractPostData };