- Created core modules: `ai-analyzer`, `core-parser`, and `job-search-parser`.
- Implemented LinkedIn and job search parsers with integrated AI analysis.
- Added CLI tools for AI analysis and job parsing.
- Included comprehensive README files for each module detailing usage and features.
- Established a `.gitignore` file to exclude unnecessary files.
- Introduced sample data for testing and demonstration purposes.
- Set up package.json files for dependency management across modules.
- Implemented logging and error-handling utilities for better debugging and user feedback.
367 lines · 12 KiB · JavaScript
/**
|
||
* LinkedIn Parsing Strategy
|
||
*
|
||
* Uses core-parser for browser management and ai-analyzer for utilities
|
||
*/
|
||
|
||
const {
|
||
logger,
|
||
cleanText,
|
||
containsAnyKeyword,
|
||
validateLocationAgainstFilters,
|
||
extractLocationFromProfile,
|
||
} = require("ai-analyzer");
|
||
|
||
/**
|
||
* LinkedIn parsing strategy function
|
||
*/
|
||
async function linkedinStrategy(coreParser, options = {}) {
|
||
const {
|
||
keywords = ["layoff", "downsizing", "job cuts"],
|
||
locationFilter = null,
|
||
maxResults = 50,
|
||
credentials = {},
|
||
} = options;
|
||
|
||
const results = [];
|
||
const rejectedResults = [];
|
||
const seenPosts = new Set();
|
||
const seenProfiles = new Set();
|
||
|
||
try {
|
||
// Create main page
|
||
const page = await coreParser.createPage("linkedin-main");
|
||
|
||
// Authenticate to LinkedIn
|
||
logger.info("🔐 Authenticating to LinkedIn...");
|
||
await coreParser.authenticate("linkedin", credentials, "linkedin-main");
|
||
logger.info("✅ LinkedIn authentication successful");
|
||
|
||
// Search for posts with each keyword
|
||
for (const keyword of keywords) {
|
||
logger.info(`🔍 Searching LinkedIn for: "${keyword}"`);
|
||
|
||
const searchUrl = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
|
||
keyword
|
||
)}&sortBy=date_posted`;
|
||
|
||
await coreParser.navigateTo(searchUrl, {
|
||
pageId: "linkedin-main",
|
||
retries: 2,
|
||
});
|
||
|
||
// Wait for page to load - use delay utility instead of waitForTimeout
|
||
await new Promise(resolve => setTimeout(resolve, 3000)); // Give LinkedIn time to render
|
||
|
||
// Wait for search results - try multiple selectors
|
||
let hasResults = false;
|
||
const possibleSelectors = [
|
||
".search-results-container",
|
||
".search-results__list",
|
||
".reusable-search__result-container",
|
||
"[data-test-id='search-results']",
|
||
".feed-shared-update-v2",
|
||
"article",
|
||
];
|
||
|
||
for (const selector of possibleSelectors) {
|
||
try {
|
||
await page.waitForSelector(selector, { timeout: 5000 });
|
||
hasResults = true;
|
||
logger.info(`✅ Found results container with selector: ${selector}`);
|
||
break;
|
||
} catch (e) {
|
||
// Try next selector
|
||
}
|
||
}
|
||
|
||
if (!hasResults) {
|
||
logger.warning(`⚠️ No search results container found for keyword: ${keyword}`);
|
||
// Take screenshot for debugging
|
||
try {
|
||
const screenshotPath = `debug-${keyword.replace(/\s+/g, '-')}-${Date.now()}.png`;
|
||
await page.screenshot({ path: screenshotPath, fullPage: true });
|
||
logger.info(`📸 Debug screenshot saved: ${screenshotPath}`);
|
||
} catch (e) {
|
||
logger.warning(`Could not take screenshot: ${e.message}`);
|
||
}
|
||
continue;
|
||
}
|
||
|
||
// Extract posts from current page
|
||
const posts = await extractPostsFromPage(page, keyword);
|
||
logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);
|
||
|
||
for (const post of posts) {
|
||
// Skip duplicates
|
||
if (seenPosts.has(post.postId)) continue;
|
||
seenPosts.add(post.postId);
|
||
|
||
// Validate location if filtering enabled
|
||
if (locationFilter) {
|
||
const postLocation = post.location || post.profileLocation || "";
|
||
const locationValid = validateLocationAgainstFilters(
|
||
postLocation,
|
||
locationFilter
|
||
);
|
||
|
||
if (!locationValid) {
|
||
logger.debug(`⏭️ Post rejected: location "${postLocation}" doesn't match filter "${locationFilter}"`);
|
||
rejectedResults.push({
|
||
...post,
|
||
rejectionReason: `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`,
|
||
});
|
||
continue;
|
||
} else {
|
||
logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}"`);
|
||
}
|
||
}
|
||
|
||
results.push(post);
|
||
|
||
if (results.length >= maxResults) {
|
||
logger.info(`📊 Reached maximum results limit: ${maxResults}`);
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (results.length >= maxResults) break;
|
||
}
|
||
|
||
logger.info(
|
||
`🎯 LinkedIn parsing completed: ${results.length} posts found, ${rejectedResults.length} rejected`
|
||
);
|
||
|
||
return {
|
||
results,
|
||
rejectedResults,
|
||
summary: {
|
||
totalPosts: results.length,
|
||
totalRejected: rejectedResults.length,
|
||
keywords: keywords.join(", "),
|
||
locationFilter,
|
||
},
|
||
};
|
||
} catch (error) {
|
||
logger.error(`❌ LinkedIn parsing failed: ${error.message}`);
|
||
throw error;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Extract posts from current search results page
|
||
*/
|
||
async function extractPostsFromPage(page, keyword) {
|
||
const posts = [];
|
||
|
||
try {
|
||
// Try multiple selectors for post elements (LinkedIn changes these frequently)
|
||
const postSelectors = [
|
||
".feed-shared-update-v2",
|
||
"article.feed-shared-update-v2",
|
||
"[data-urn*='urn:li:activity']",
|
||
".reusable-search__result-container",
|
||
".search-result__wrapper",
|
||
"article",
|
||
];
|
||
|
||
let postElements = [];
|
||
let usedSelector = null;
|
||
|
||
for (const selector of postSelectors) {
|
||
try {
|
||
postElements = await page.$$(selector);
|
||
if (postElements.length > 0) {
|
||
usedSelector = selector;
|
||
logger.info(`✅ Found ${postElements.length} post elements using selector: ${selector}`);
|
||
break;
|
||
}
|
||
} catch (e) {
|
||
// Try next selector
|
||
}
|
||
}
|
||
|
||
if (postElements.length === 0) {
|
||
logger.warning(`⚠️ No post elements found with any selector. Page might have different structure.`);
|
||
// Log page title and URL for debugging
|
||
try {
|
||
const pageTitle = await page.title();
|
||
const pageUrl = page.url();
|
||
logger.info(`📄 Page title: ${pageTitle}`);
|
||
logger.info(`🔗 Page URL: ${pageUrl}`);
|
||
} catch (e) {
|
||
// Ignore
|
||
}
|
||
return posts;
|
||
}
|
||
|
||
logger.info(`🔍 Processing ${postElements.length} post elements...`);
|
||
|
||
for (let i = 0; i < postElements.length; i++) {
|
||
try {
|
||
const post = await extractPostData(postElements[i], keyword);
|
||
if (post) {
|
||
posts.push(post);
|
||
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}...`);
|
||
} else {
|
||
logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
|
||
}
|
||
} catch (error) {
|
||
logger.warning(`❌ Failed to extract post ${i + 1} data: ${error.message}`);
|
||
}
|
||
}
|
||
|
||
logger.info(`✅ Successfully extracted ${posts.length} valid posts from ${postElements.length} elements`);
|
||
} catch (error) {
|
||
logger.error(`❌ Failed to extract posts from page: ${error.message}`);
|
||
logger.error(`Stack trace: ${error.stack}`);
|
||
}
|
||
|
||
return posts;
|
||
}
|
||
|
||
/**
|
||
* Extract data from individual post element
|
||
*/
|
||
async function extractPostData(postElement, keyword) {
|
||
try {
|
||
// Extract post ID
|
||
const postId = (await postElement.getAttribute("data-urn")) || "";
|
||
|
||
// Extract author info
|
||
const authorElement = await postElement.$(".feed-shared-actor__name");
|
||
const authorName = authorElement
|
||
? cleanText(await authorElement.textContent())
|
||
: "";
|
||
|
||
const authorLinkElement = await postElement.$(".feed-shared-actor__name a");
|
||
const authorUrl = authorLinkElement
|
||
? await authorLinkElement.getAttribute("href")
|
||
: "";
|
||
|
||
// Extract post content
|
||
const contentElement = await postElement.$(".feed-shared-text");
|
||
const content = contentElement
|
||
? cleanText(await contentElement.textContent())
|
||
: "";
|
||
|
||
// Extract timestamp
|
||
const timeElement = await postElement.$(
|
||
".feed-shared-actor__sub-description time"
|
||
);
|
||
const timestamp = timeElement
|
||
? await timeElement.getAttribute("datetime")
|
||
: "";
|
||
|
||
// Extract location from profile (try multiple selectors)
|
||
let location = "";
|
||
const locationSelectors = [
|
||
".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link",
|
||
".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link--without-hover",
|
||
".feed-shared-actor__sub-description span[aria-label*='location']",
|
||
".feed-shared-actor__sub-description span[aria-label*='Location']",
|
||
];
|
||
|
||
for (const selector of locationSelectors) {
|
||
try {
|
||
const locationElement = await postElement.$(selector);
|
||
if (locationElement) {
|
||
const locationText = await locationElement.textContent();
|
||
if (locationText && locationText.trim()) {
|
||
location = cleanText(locationText);
|
||
break;
|
||
}
|
||
}
|
||
} catch (e) {
|
||
// Try next selector
|
||
}
|
||
}
|
||
|
||
// If no location found in sub-description, try to extract from author link hover or profile
|
||
if (!location) {
|
||
try {
|
||
// Try to get location from data attributes or other sources
|
||
const subDescElement = await postElement.$(".feed-shared-actor__sub-description");
|
||
if (subDescElement) {
|
||
const subDescText = await subDescElement.textContent();
|
||
// Look for location patterns (City, Province/State, Country)
|
||
const locationMatch = subDescText.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/);
|
||
if (locationMatch) {
|
||
location = cleanText(locationMatch[0]);
|
||
}
|
||
}
|
||
} catch (e) {
|
||
// Location extraction failed, continue without it
|
||
}
|
||
}
|
||
|
||
// Extract engagement metrics
|
||
const likesElement = await postElement.$(".social-counts-reactions__count");
|
||
const likesText = likesElement
|
||
? cleanText(await likesElement.textContent())
|
||
: "0";
|
||
|
||
const commentsElement = await postElement.$(
|
||
".social-counts-comments__count"
|
||
);
|
||
const commentsText = commentsElement
|
||
? cleanText(await commentsElement.textContent())
|
||
: "0";
|
||
|
||
// Note: LinkedIn search already filters by keyword semantically
|
||
// We don't filter by content keyword match because:
|
||
// 1. LinkedIn's search is semantic - it finds related posts, not just exact matches
|
||
// 2. The keyword might be in comments, hashtags, or metadata, not visible text
|
||
// 3. Posts might be about the topic without using the exact keyword
|
||
//
|
||
// Optional: Log if keyword appears in content (for debugging, but don't filter)
|
||
const keywordLower = keyword.toLowerCase();
|
||
const contentLower = content.toLowerCase();
|
||
const hasKeywordInContent = contentLower.includes(keywordLower);
|
||
if (!hasKeywordInContent && content.length > 50) {
|
||
logger.debug(`ℹ️ Post doesn't contain keyword "${keyword}" in visible content, but including it (LinkedIn search matched it)`);
|
||
}
|
||
|
||
// Validate we have minimum required data
|
||
if (!postId && !content) {
|
||
logger.debug(`⏭️ Post filtered: missing both postId and content`);
|
||
return null;
|
||
}
|
||
|
||
return {
|
||
postId: cleanText(postId),
|
||
authorName,
|
||
authorUrl,
|
||
profileLink: authorUrl ? (authorUrl.startsWith("http") ? authorUrl : `https://www.linkedin.com${authorUrl}`) : "",
|
||
text: content,
|
||
content: content,
|
||
location: location,
|
||
profileLocation: location, // Alias for compatibility
|
||
timestamp,
|
||
keyword,
|
||
likes: extractNumber(likesText),
|
||
comments: extractNumber(commentsText),
|
||
extractedAt: new Date().toISOString(),
|
||
source: "linkedin",
|
||
parser: "linkedout-parser",
|
||
};
|
||
} catch (error) {
|
||
logger.warning(`Error extracting post data: ${error.message}`);
|
||
return null;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Extract numbers from text (e.g., "15 likes" -> 15)
|
||
*/
|
||
function extractNumber(text) {
|
||
const match = text.match(/\d+/);
|
||
return match ? parseInt(match[0]) : 0;
|
||
}
|
||
|
||
module.exports = {
|
||
linkedinStrategy,
|
||
extractPostsFromPage,
|
||
extractPostData,
|
||
};
|