// linkedout/linkedin-parser/strategies/linkedin-strategy.js

/**
 * LinkedIn Parsing Strategy
 *
 * Uses core-parser for browser management and ai-analyzer for utilities
 */

const {
  logger,
  cleanText,
  containsAnyKeyword,
  validateLocationAgainstFilters,
  extractLocationFromProfile,
  parseLocationFilters,
} = require("ai-analyzer");

/**
 * LinkedIn parsing strategy function.
 *
 * Authenticates through `coreParser`, runs one LinkedIn content search per
 * keyword (sorted by date posted), extracts the posts found on each results
 * page, drops duplicates and, when a location filter is given, moves posts
 * whose location does not validate into `rejectedResults`.
 *
 * @param {object} coreParser - Browser manager. This function calls its
 *   `createPage`, `authenticate` and `navigateTo` methods.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Search terms; one search is run per term.
 * @param {string|Array|null} [options.locationFilter=null] - Filter string
 *   (parsed with `parseLocationFilters`) or an already-parsed value handed
 *   as-is to `validateLocationAgainstFilters`. A falsy value disables filtering.
 * @param {number} [options.maxResults=50] - Stop once this many posts are accepted.
 * @param {object} [options.credentials={}] - Passed through to `coreParser.authenticate`.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 *   Accepted posts, rejected posts (each with a `rejectionReason`) and a summary.
 * @throws Re-throws, after logging, any error raised while authenticating,
 *   navigating or filtering.
 */
async function linkedinStrategy(coreParser, options = {}) {
  const {
    keywords = ["layoff", "downsizing", "job cuts"],
    locationFilter = null,
    maxResults = 50,
    credentials = {},
  } = options;

  const results = [];
  const rejectedResults = [];
  const seenPosts = new Set();

  // The filter never changes during a run, so parse it at most once, and only
  // when a post actually needs validating.
  let parsedLocationFilters;
  const getLocationFilters = () => {
    if (parsedLocationFilters === undefined) {
      parsedLocationFilters =
        typeof locationFilter === "string"
          ? parseLocationFilters(locationFilter)
          : locationFilter;
    }
    return parsedLocationFilters;
  };

  try {
    // Create main page
    const page = await coreParser.createPage("linkedin-main");

    // Authenticate to LinkedIn
    logger.info("🔐 Authenticating to LinkedIn...");
    await coreParser.authenticate("linkedin", credentials, "linkedin-main");
    logger.info("✅ LinkedIn authentication successful");

    // Search for posts with each keyword
    for (const keyword of keywords) {
      logger.info(`🔍 Searching LinkedIn for: "${keyword}"`);

      const searchUrl = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
        keyword
      )}&sortBy=date_posted`;

      await coreParser.navigateTo(searchUrl, {
        pageId: "linkedin-main",
        retries: 2,
        waitUntil: "networkidle", // Wait for network to be idle
      });

      // Give LinkedIn time to render dynamic content
      await new Promise((resolve) => setTimeout(resolve, 5000));

      // Scroll down a bit to trigger lazy loading (best effort)
      try {
        await page.evaluate(() => {
          window.scrollTo(0, 500);
        });
        await new Promise((resolve) => setTimeout(resolve, 2000));
      } catch (e) {
        logger.debug(`Could not scroll page: ${e.message}`);
      }

      // Wait for search results - try multiple selectors
      let hasResults = false;
      const possibleSelectors = [
        ".feed-shared-update-v2",
        "article[data-urn*='urn:li:activity']",
        "article",
        ".search-results-container",
        ".search-results__list",
        ".reusable-search__result-container",
        "[data-test-id='search-results']",
      ];

      for (const selector of possibleSelectors) {
        try {
          await page.waitForSelector(selector, { timeout: 10000 });
          // Verify we actually have matching elements
          const count = (await page.$$(selector)).length;
          if (count > 0) {
            hasResults = true;
            logger.info(`✅ Found ${count} post elements with selector: ${selector}`);
            break;
          }
        } catch (e) {
          // This selector did not show up in time; try the next one.
        }
      }

      if (!hasResults) {
        logger.warning(`⚠️  No search results container found for keyword: ${keyword}`);
        // Take screenshot for debugging
        try {
          // Keep only letters, digits, '.', '_' and '-' so a keyword such as
          // "a/b" or "c:d" cannot produce an invalid or nested file path.
          const safeKeyword = String(keyword).replace(/[^\p{L}\p{N}._-]+/gu, "-");
          const screenshotPath = `debug-${safeKeyword}-${Date.now()}.png`;
          await page.screenshot({ path: screenshotPath, fullPage: true });
          logger.info(`📸 Debug screenshot saved: ${screenshotPath}`);
        } catch (e) {
          logger.warning(`Could not take screenshot: ${e.message}`);
        }
        continue;
      }

      // Extract posts from current page
      const posts = await extractPostsFromPage(page, keyword);
      logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);

      for (const post of posts) {
        // Skip duplicates. Posts without an ID are keyed on author + content;
        // keying them all on "" would discard every ID-less post but the first.
        const dedupeKey =
          post.postId || `${post.authorName || ""}|${post.content || ""}`;
        if (seenPosts.has(dedupeKey)) continue;
        seenPosts.add(dedupeKey);

        // Validate location if filtering enabled
        if (locationFilter) {
          const postLocation = post.location || post.profileLocation || "";
          const locationValid = validateLocationAgainstFilters(
            postLocation,
            getLocationFilters()
          );

          if (!locationValid.isValid) {
            logger.debug(`⏭️  Post rejected: location "${postLocation}" doesn't match filter "${locationFilter}"`);
            rejectedResults.push({
              ...post,
              rejectionReason: locationValid.reasoning || `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`,
            });
            continue;
          }
          logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}" (${locationValid.reasoning || 'matched'})`);
        }

        results.push(post);

        if (results.length >= maxResults) {
          logger.info(`📊 Reached maximum results limit: ${maxResults}`);
          break;
        }
      }

      if (results.length >= maxResults) break;
    }

    logger.info(
      `🎯 LinkedIn parsing completed: ${results.length} posts found, ${rejectedResults.length} rejected`
    );

    return {
      results,
      rejectedResults,
      summary: {
        totalPosts: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
      },
    };
  } catch (error) {
    logger.error(`❌ LinkedIn parsing failed: ${error.message}`);
    throw error;
  }
}

/**
 * Extract posts from the current search results page.
 *
 * Tries a list of selectors from most to least specific. For each one, only
 * the matched elements whose own `data-urn` attribute contains
 * "urn:li:activity" are kept; the first selector that yields at least one such
 * element wins. Each kept element is scrolled into view and passed to
 * `extractPostData`.
 *
 * @param {object} page - Page object; `waitForSelector`, `$$`, `title` and
 *   `url` are called on it, and `getAttribute` / `evaluate` on the handles
 *   returned by `$$`.
 * @param {string} keyword - Search keyword, forwarded to `extractPostData`.
 * @returns {Promise<object[]>} Extracted posts. Errors are logged rather than
 *   thrown, so the array may be empty or partial.
 */
async function extractPostsFromPage(page, keyword) {
  const posts = [];

  try {
    // LinkedIn changes its markup frequently, so probe several selectors,
    // most specific first.
    const postSelectors = [
      "article[data-urn*='urn:li:activity']", // Most specific - posts with activity ID
      ".feed-shared-update-v2[data-urn*='urn:li:activity']",
      "article.feed-shared-update-v2",
      ".feed-shared-update-v2",
      "[data-urn*='urn:li:activity']",
      ".reusable-search__result-container",
      ".search-result__wrapper",
      "article",
    ];

    // Holds validated handles only. Raw matches live in a per-selector local so
    // that unvalidated elements from the last selector can never leak into the
    // processing loop when nothing validates.
    let postElements = [];

    for (const selector of postSelectors) {
      try {
        // Give the selector a short chance to appear; a timeout is not fatal.
        await page.waitForSelector(selector, { timeout: 3000 }).catch(() => {});
        const candidates = await page.$$(selector);

        const validElements = [];
        for (const elem of candidates) {
          try {
            const dataUrn = await elem.getAttribute("data-urn");
            if (dataUrn && dataUrn.includes("urn:li:activity")) {
              validElements.push(elem);
            }
          } catch (e) {
            // Element might have been detached, skip it
          }
        }

        if (validElements.length > 0) {
          postElements = validElements;
          logger.info(`✅ Found ${postElements.length} valid post elements using selector: ${selector}`);
          break;
        }
      } catch (e) {
        logger.debug(`Selector "${selector}" could not be queried: ${e.message}`);
      }
    }

    if (postElements.length === 0) {
      logger.warning(`⚠️  No post elements found with any selector. Page might have different structure.`);
      // Log page title and URL for debugging
      try {
        const pageTitle = await page.title();
        const pageUrl = page.url();
        logger.info(`📄 Page title: ${pageTitle}`);
        logger.info(`🔗 Page URL: ${pageUrl}`);
      } catch (e) {
        // Diagnostics only; ignore failures here.
      }
      return posts;
    }

    logger.info(`🔍 Processing ${postElements.length} post elements...`);

    for (let i = 0; i < postElements.length; i++) {
      try {
        // Scroll element into view to ensure it's fully rendered
        try {
          await postElements[i].evaluate((el) => {
            el.scrollIntoView({ behavior: 'smooth', block: 'center' });
          });
          await new Promise((resolve) => setTimeout(resolve, 500)); // Small delay for rendering
        } catch (e) {
          // Element might already be in view or detached, continue anyway
        }

        const post = await extractPostData(postElements[i], keyword);
        if (post) {
          posts.push(post);
          const hasContent = post.content && post.content.length > 0;
          const hasAuthor = post.authorName && post.authorName.length > 0;
          logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'})`);
        } else {
          logger.debug(`⏭️  Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
        }
      } catch (error) {
        logger.warning(`❌ Failed to extract post ${i + 1} data: ${error.message}`);
      }
    }

    logger.info(`✅ Successfully extracted ${posts.length} valid posts from ${postElements.length} elements`);
  } catch (error) {
    logger.error(`❌ Failed to extract posts from page: ${error.message}`);
    logger.error(`Stack trace: ${error.stack}`);
  }

  return posts;
}

/**
 * Extract data from an individual post element.
 *
 * Runs `scrapePostFields` inside the page via `postElement.evaluate`, then
 * cleans the raw strings with `cleanText`, turns a relative author URL into an
 * absolute linkedin.com URL and assembles the post record.
 *
 * @param {object} postElement - Element handle exposing `evaluate(fn)`.
 * @param {string} keyword - Search keyword that surfaced the post; copied into
 *   the returned record.
 * @returns {Promise<object|null>} The post record, or `null` when the element
 *   has neither a post ID nor content, or when extraction throws.
 */
async function extractPostData(postElement, keyword) {
  try {
    // Read everything straight from the DOM element in one round trip.
    const postData = await postElement.evaluate(scrapePostFields);

    // Clean and format the extracted data
    const authorName = cleanText(postData.authorName);
    let authorUrl = postData.authorUrl || "";
    if (authorUrl && !authorUrl.startsWith("http")) {
      authorUrl = `https://www.linkedin.com${authorUrl}`;
    }

    const content = cleanText(postData.content);
    const location = cleanText(postData.location);
    const timestamp = postData.timestamp || "";

    // Validate we have minimum required data
    if (!postData.postId && !content) {
      logger.debug(`⏭️  Post filtered: missing both postId and content`);
      return null;
    }

    // Log extraction results for debugging
    const missingFields = [];
    if (!authorName) missingFields.push("authorName");
    if (!authorUrl) missingFields.push("authorUrl");
    if (!location) missingFields.push("location");
    if (!timestamp) missingFields.push("timestamp");
    if (postData.likes === 0 && postData.comments === 0) missingFields.push("engagement");

    if (missingFields.length > 0 && postData.postId) {
      logger.debug(`⚠️  Post ${postData.postId.substring(0, 20)}... missing: ${missingFields.join(", ")}`);

      const debugExtraction = process.env.DEBUG_EXTRACTION === "true";

      // If location is missing, log sub-description content for debugging
      if (!location && debugExtraction) {
        try {
          const subDescInfo = await postElement.evaluate((el) => {
            const subDesc = el.querySelector(".feed-shared-actor__sub-description");
            if (subDesc) {
              return {
                text: subDesc.textContent || subDesc.innerText || "",
                html: subDesc.innerHTML.substring(0, 500),
                links: Array.from(subDesc.querySelectorAll("a")).map(a => ({
                  text: a.textContent?.trim(),
                  href: a.getAttribute("href")
                }))
              };
            }
            return null;
          });
          if (subDescInfo) {
            logger.debug(`Sub-description text: "${subDescInfo.text}"`);
            logger.debug(`Sub-description links: ${JSON.stringify(subDescInfo.links)}`);
          }
        } catch (e) {
          // Ignore errors in debugging
        }
      }

      // Optionally log the HTML structure when several fields are missing
      if (debugExtraction && missingFields.length >= 3) {
        try {
          const htmlSnippet = await postElement.evaluate((el) => {
            // Get the outer HTML of the element (limited to first 2000 chars)
            const html = el.outerHTML || "";
            return html.substring(0, 2000);
          });
          logger.debug(`HTML structure (first 2000 chars):\n${htmlSnippet}`);
        } catch (e) {
          // Ignore errors in debugging
        }
      }
    }

    return {
      postId: cleanText(postData.postId),
      authorName,
      authorUrl,
      profileLink: authorUrl,
      text: content,
      content: content,
      location: location,
      profileLocation: location, // Alias for compatibility
      timestamp,
      keyword,
      likes: postData.likes || 0,
      comments: postData.comments || 0,
      extractedAt: new Date().toISOString(),
      source: "linkedin",
      parser: "linkedout-parser",
    };
  } catch (error) {
    logger.warning(`Error extracting post data: ${error.message}`);
    logger.debug(`Stack trace: ${error.stack}`);
    return null;
  }
}

/**
 * Page-side scraper handed to `postElement.evaluate`.
 *
 * The function is handed to `evaluate` to run against the DOM element, so it
 * must stay self-contained: it may only use `el`, DOM APIs and helpers declared
 * inside its own body, never module-level names.
 *
 * @param {Element} el - Root DOM element of one post.
 * @returns {{postId: string, authorName: string, authorUrl: string,
 *   content: string, timestamp: string, location: string, likes: number,
 *   comments: number}} Raw, uncleaned field values ("" / 0 when not found).
 */
function scrapePostFields(el) {
  const PROVINCE_NAMES =
    "Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut";
  const provinceName = new RegExp(`\\b(${PROVINCE_NAMES})\\b`, "i");
  // Two-letter codes are matched case-sensitively: with the `i` flag, ordinary
  // words such as "on" or "pe" were being reported as a location.
  const provinceCode = /\b(ON|AB|BC|QC|MB|SK|NS|NB|NL|PE|YT|NT|NU)\b/;
  const cityCommaProvince = new RegExp(`([A-Z][a-z]+),\\s*(${PROVINCE_NAMES})`, "i");
  const wordsCommaProvince = new RegExp(
    `([A-Z][a-z]+(?:\\s+[A-Z][a-z]+)*),\\s*(${PROVINCE_NAMES})`,
    "i"
  );
  const locationWord = new RegExp(
    `(city|province|state|country|region|${PROVINCE_NAMES})`,
    "i"
  );
  const knownPlace = /(ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|prince edward island|yukon|northwest territories|nunavut|toronto|vancouver|calgary|ottawa|montreal|winnipeg|edmonton|halifax|victoria|regina|saskatoon|windsor|kitchener|hamilton|london|st\.?\s*catharines|oshawa|barrie|greater sudbury|sherbrooke|kelowna|abbotsford|trois-rivières|guelph|cambridge|coquitlam|saanich|saint john|thunder bay|waterloo|delta|chatham|red deer|kamloops|brantford|whitehorse|yellowknife|iqaluit)/i;
  // Fallback count scanning ignores elements with more text than this, so that
  // wrapper elements holding the whole post body cannot supply the "count".
  const MAX_COUNT_LABEL_LENGTH = 80;

  const trimmedText = (node) => node.textContent?.trim() || node.innerText?.trim() || "";
  const rawText = (node) => node.textContent || node.innerText || "";
  const hrefOf = (node) => node.getAttribute("href") || "";
  const isShort = (text) => text.length > 0 && text.length < 100;
  const looksLikeLocation = (text, placePattern) =>
    text.length > 2 && text.length < 100 && (text.includes(",") || placePattern.test(text));

  // Name candidates around a profile link: the link text, its first <span>,
  // then the leading part of the parent's text.
  const nameNearLink = (link) => {
    const linkText = trimmedText(link);
    if (isShort(linkText) && !linkText.includes("View")) return linkText;

    const span = link.querySelector("span");
    const spanText = span ? trimmedText(span) : "";
    if (isShort(spanText)) return spanText;

    const parentText = link.parentElement?.textContent?.trim() || "";
    if (parentText && parentText.length < 100 && !parentText.includes("View")) {
      // Keep just the name part (first line, before any "·" separator)
      const namePart = parentText.split("\n")[0].split("·")[0].trim();
      if (isShort(namePart)) return namePart;
    }
    return "";
  };

  // Author name and profile URL, via progressively looser strategies.
  const findAuthor = () => {
    const authorSelectors = [
      ".feed-shared-actor__name",
      ".feed-shared-actor__name-link",
      ".update-components-actor__name",
      ".feed-shared-actor__name a",
      "[data-test-id='actor-name']",
      "span[aria-label*='name']",
      "a[href*='/in/'] span",
      ".feed-shared-actor a span",
      ".feed-shared-actor span",
      ".feed-shared-actor__name-link span",
    ];

    for (const selector of authorSelectors) {
      const node = el.querySelector(selector);
      if (!node) continue;
      const name = trimmedText(node);
      if (!isShort(name)) continue; // Reasonable name length only

      // Prefer a link on the same element or an ancestor, else look it up separately
      const ownLink = node.closest("a") || node.querySelector("a");
      let url = ownLink ? hrefOf(ownLink) : "";
      if (!url) {
        const authorLink = el.querySelector(
          ".feed-shared-actor__name-link, .feed-shared-actor__name a, a[href*='/in/']"
        );
        if (authorLink) url = hrefOf(authorLink);
      }
      return { name, url };
    }

    // Fallback: any /in/ profile link, taking the name from nearby text
    for (const link of el.querySelectorAll("a[href*='/in/']")) {
      if (link.getAttribute("href")?.includes("/company/")) continue;
      const name = nameNearLink(link);
      if (name) return { name, url: hrefOf(link) };
    }

    // Last resort: first plausible line of text in the actor section
    const actorSection = el.querySelector(
      ".feed-shared-actor, .update-components-actor, [class*='actor']"
    );
    if (actorSection) {
      const lines = rawText(actorSection)
        .split("\n")
        .map((l) => l.trim())
        .filter((l) => l.length > 0);
      for (const line of lines) {
        if (
          line.length < 100 &&
          !line.includes("·") &&
          !line.includes("ago") &&
          !line.match(/^\d+/) &&
          !line.toLowerCase().includes("view")
        ) {
          const link = actorSection.querySelector("a[href*='/in/']");
          return { name: line, url: link ? hrefOf(link) : "" };
        }
      }
    }

    return { name: "", url: "" };
  };

  // Post body text: first selector whose text is longer than 10 characters.
  const findContent = () => {
    const contentSelectors = [
      ".feed-shared-text",
      ".feed-shared-text__text-view",
      ".feed-shared-update-v2__description",
      ".update-components-text",
      "[data-test-id='post-text']",
      ".feed-shared-text span",
      ".feed-shared-update-v2__description-wrapper",
    ];
    for (const selector of contentSelectors) {
      const node = el.querySelector(selector);
      if (!node) continue;
      const text = trimmedText(node);
      if (text.length > 10) return text;
    }
    return "";
  };

  // Timestamp: a time-like element's attributes/text, else a relative-time
  // pattern ("2h", "3 days ago", "yesterday") in the sub-description.
  const findTimestamp = () => {
    const timeSelectors = [
      ".feed-shared-actor__sub-description time",
      "time[datetime]",
      "[data-test-id='timestamp']",
      ".feed-shared-actor__sub-description time[datetime]",
      "time",
      "span[aria-label*='time']",
      "span[aria-label*='ago']",
    ];
    for (const selector of timeSelectors) {
      const node = el.querySelector(selector);
      if (!node) continue;
      const stamp =
        node.getAttribute("datetime") ||
        node.getAttribute("title") ||
        node.getAttribute("aria-label") ||
        node.textContent?.trim() ||
        "";
      if (stamp) return stamp;
    }

    const subDesc = el.querySelector(".feed-shared-actor__sub-description");
    if (subDesc) {
      const subDescText = rawText(subDesc);
      const timePatterns = [
        /\d+\s*(minute|hour|day|week|month|year)s?\s*ago/i,
        /\d+\s*(h|d|w|mo|yr)/i,
        /(just now|today|yesterday)/i,
      ];
      for (const pattern of timePatterns) {
        const match = subDescText.match(pattern);
        if (match) return match[0];
      }
    }
    return "";
  };

  // Location from the actor sub-description: links first, then patterns over
  // the full text, then individual "·"-separated parts.
  const locationFromSubDescription = (subDesc) => {
    const subDescText = rawText(subDesc);

    for (const link of subDesc.querySelectorAll("a")) {
      const linkText = trimmedText(link);
      // Skip company links and links whose text looks like a time/date
      if (
        hrefOf(link).includes("/company/") ||
        /\d+\s*(minute|hour|day|week|month|year|h|d|w)/i.test(linkText)
      ) {
        continue;
      }
      if (looksLikeLocation(linkText, knownPlace)) return linkText;
    }

    if (!subDescText) return "";

    const locationPatterns = [
      // Full location: "City, Province, Country"
      /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/,
      // City, Province
      /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*,\s*([A-Z][a-z]+)/,
      // Province names (any case), then upper-case province codes
      provinceName,
      provinceCode,
      // Major cities
      /\b(Toronto|Vancouver|Calgary|Ottawa|Montreal|Winnipeg|Edmonton|Halifax|Victoria|Regina|Saskatoon)\b/i,
    ];

    for (const pattern of locationPatterns) {
      const match = subDescText.match(pattern);
      if (!match) continue;

      // Look at up to 30 characters either side of the match for a fuller
      // "City, Province" string.
      const matchIndex = subDescText.indexOf(match[0]);
      const contextStart = Math.max(0, matchIndex - 30);
      const contextEnd = Math.min(subDescText.length, matchIndex + match[0].length + 30);
      const context = subDescText.substring(contextStart, contextEnd).trim();

      let locationText = match[0].trim();
      if (context.includes(",") && context.length < 100) {
        const cityProvinceMatch = context.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+)/);
        if (cityProvinceMatch) {
          locationText = cityProvinceMatch[0].trim();
        }
      }
      return locationText;
    }

    // Last resort: split on common separators and test each part
    const parts = subDescText
      .split(/[·•|]/)
      .map((p) => p.trim())
      .filter((p) => p.length > 0);
    for (const part of parts) {
      // Skip parts that look like a time/date
      if (part.match(/\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i)) continue;
      if (
        looksLikeLocation(
          part,
          /(ontario|alberta|british columbia|quebec|manitoba|toronto|vancouver|calgary|ottawa|montreal)/i
        )
      ) {
        return part;
      }
    }
    return "";
  };

  // Location, via dedicated selectors, the sub-description, the actor section
  // and finally hover-card / mini-profile containers.
  const findLocation = () => {
    const locationSelectors = [
      ".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link",
      ".feed-shared-actor__sub-description-link--without-hover",
      "span[aria-label*='location' i]",
      "span[aria-label*='Location']",
      ".feed-shared-actor__sub-description span",
      ".feed-shared-actor__sub-description a",
      "a[href*='/company/']",
      "a[href*='/location/']",
    ];
    for (const selector of locationSelectors) {
      const node = el.querySelector(selector);
      if (!node) continue;
      const text =
        node.textContent?.trim() ||
        node.getAttribute("aria-label") ||
        node.innerText?.trim() ||
        "";
      if (looksLikeLocation(text, locationWord)) return text;
    }

    const subDesc = el.querySelector(".feed-shared-actor__sub-description");
    if (subDesc) {
      const fromSubDesc = locationFromSubDescription(subDesc);
      if (fromSubDesc) return fromSubDesc;
    }

    const actorSection = el.querySelector(".feed-shared-actor, .update-components-actor");
    if (actorSection) {
      const actorText = rawText(actorSection);
      const provinceMatch = actorText.match(provinceName);
      if (provinceMatch) {
        // Prefer "City, Province" when available
        const cityProvinceMatch = actorText.match(cityCommaProvince);
        return (cityProvinceMatch || provinceMatch)[0].trim();
      }
    }

    const miniProfileSelectors = [
      "[data-control-name='hovercard']",
      ".artdeco-hoverable-trigger",
      ".feed-shared-actor__meta",
      ".pv-text-details__left-panel",
    ];
    for (const selector of miniProfileSelectors) {
      const node = el.querySelector(selector);
      if (!node) continue;
      const locationMatch = rawText(node).match(wordsCommaProvince);
      if (locationMatch) return locationMatch[0].trim();
    }

    return "";
  };

  // First integer in a label, accepting thousands separators ("1,234" -> 1234).
  // Returns null when the label holds no digits.
  const parseCount = (label) => {
    const match = label.match(/\d[\d,]*/);
    return match ? parseInt(match[0].replace(/,/g, ""), 10) || 0 : null;
  };
  const labelOf = (node) => node.textContent?.trim() || node.getAttribute("aria-label") || "";

  // Engagement count: dedicated selectors first; when that yields 0, scan short
  // button/span/div labels that mention `labelPattern`.
  const readCount = (selectors, labelPattern) => {
    let count = 0;
    for (const selector of selectors) {
      const node = el.querySelector(selector);
      if (!node) continue;
      const parsed = parseCount(labelOf(node));
      if (parsed !== null) {
        count = parsed;
        break;
      }
    }
    if (count !== 0) return count;

    for (const node of el.querySelectorAll("button, span, div")) {
      const label = labelOf(node);
      if (label.length > MAX_COUNT_LABEL_LENGTH || !labelPattern.test(label)) continue;
      const parsed = parseCount(label);
      if (parsed !== null) return parsed;
    }
    return 0;
  };

  const author = findAuthor();

  return {
    // Post ID from data-urn (on the element or a descendant) or data-activity-id
    postId:
      el.getAttribute("data-urn") ||
      el.getAttribute("data-activity-id") ||
      el.querySelector("[data-urn]")?.getAttribute("data-urn") ||
      "",
    authorName: author.name,
    authorUrl: author.url,
    content: findContent(),
    timestamp: findTimestamp(),
    location: findLocation(),
    likes: readCount(
      [
        ".social-counts-reactions__count",
        "[data-test-id='reactions-count']",
        ".social-counts__reactions-count",
        ".feed-shared-social-action-bar__reactions-count",
        "button[aria-label*='reaction']",
        "button[aria-label*='like']",
        ".social-actions-button__reactions-count",
        "[data-test-id='social-actions__reactions-count']",
      ],
      /reaction|like/i
    ),
    comments: readCount(
      [
        ".social-counts-comments__count",
        "[data-test-id='comments-count']",
        ".social-counts__comments-count",
        ".feed-shared-social-action-bar__comments-count",
        "button[aria-label*='comment']",
        ".social-actions-button__comments-count",
        "[data-test-id='social-actions__comments-count']",
      ],
      /comment/i
    ),
  };
}

/**
 * Extract the first run of digits from a value (e.g., "15 likes" -> 15).
 *
 * @param {*} text - Value to scan; it is converted to a string first, and
 *   `null` / `undefined` are treated as an empty string.
 * @returns {number} The first integer found, parsed as base 10, or 0 when the
 *   value contains no digits.
 */
function extractNumber(text) {
  const match = String(text ?? "").match(/\d+/);
  // Explicit radix so the digits are always read as decimal.
  return match ? Number.parseInt(match[0], 10) : 0;
}

// Public API: the top-level strategy plus the two extraction functions it is
// built on (linkedinStrategy -> extractPostsFromPage -> extractPostData).
module.exports = {
  linkedinStrategy,
  extractPostsFromPage,
  extractPostData,
};