Enhance job search parser with LinkedIn strategy and configuration updates

- Added LinkedIn jobs parsing strategy to support job extraction from LinkedIn.
- Updated job search parser to include the new site strategy and improved argument parsing for max pages and exclusion of rejected results.
- Enhanced README documentation to reflect new features and usage examples.
- Refactored existing strategies for consistency and improved error handling.
2025-12-16 23:17:12 -05:00 · 2025-12-16 23:17:12 -05:00 · 4099b23744
commit 4099b23744
parent bbfd3c84aa
8 changed files with 2431 additions and 888 deletions
--- a/core-parser/index.js
+++ b/core-parser/index.js
@ -62,3 +62,5 @@ class CoreParser {

 module.exports = CoreParser;

+
+
--- a/job-search-parser/README.md
+++ b/job-search-parser/README.md
@ -60,13 +60,48 @@ JOB_TYPES="full time,contract" node index.js --sites=skipthedrive
 node index.js --sites=skipthedrive --demo
 ```

+#### LinkedIn Jobs Parser
+
+Professional network job postings with comprehensive job data.
+
+**Features:**
+
+- LinkedIn authentication support
+- Keyword-based job search
+- Location filtering (both LinkedIn location and post-extraction filter)
+- Multi-page result parsing with pagination
+- Job type and experience level extraction
+- Automatic duplicate detection
+- Infinite scroll handling
+
+**Requirements:**
+
+- LinkedIn credentials (username and password) must be set in `.env` file:
+  ```env
+  LINKEDIN_USERNAME=your_email@example.com
+  LINKEDIN_PASSWORD=your_password
+  LINKEDIN_JOB_LOCATION=Canada  # Optional: LinkedIn location filter
+  ```
+
+**Usage:**
+
+```bash
+# Search LinkedIn jobs
+node index.js --sites=linkedin --keywords="software engineer,developer"
+
+# Search with location filter
+node index.js --sites=linkedin --keywords="co-op" --location="Ontario"
+
+# Combine multiple sites
+node index.js --sites=linkedin,skipthedrive --keywords="intern,co-op"
+```
+
 ### 🚧 Planned Parsers

 - **Indeed**: Comprehensive job aggregator
 - **Glassdoor**: Jobs with company reviews and salary data
 - **Monster**: Traditional job board
 - **SimplyHired**: Job aggregator with salary estimates
- **LinkedIn Jobs**: Professional network job postings
 - **AngelList**: Startup and tech jobs
 - **Remote.co**: Dedicated remote work jobs
 - **FlexJobs**: Flexible and remote positions
@ -92,23 +127,21 @@ Create a `.env` file in the parser directory:

 ```env
 # Job Search Configuration
-SEARCH_SOURCES=linkedin,indeed,glassdoor
-TARGET_ROLES=software engineer,data scientist,product manager
-LOCATION_FILTER=Toronto,Vancouver,Calgary
-EXPERIENCE_LEVELS=entry,mid,senior
-REMOTE_PREFERENCE=remote,hybrid,onsite
+SEARCH_KEYWORDS=software engineer,developer,programmer
+LOCATION_FILTER=Ontario,Canada
+MAX_PAGES=5
+
+# LinkedIn Configuration (required for LinkedIn jobs)
+LINKEDIN_USERNAME=your_email@example.com
+LINKEDIN_PASSWORD=your_password
+LINKEDIN_JOB_LOCATION=Canada  # Optional: LinkedIn location search

 # Analysis Configuration
-ENABLE_SALARY_ANALYSIS=true
-ENABLE_SKILL_ANALYSIS=true
-ENABLE_TREND_ANALYSIS=true
-MIN_SALARY=50000
-MAX_SALARY=200000
+ENABLE_AI_ANALYSIS=false
+HEADLESS=true

 # Output Configuration
-OUTPUT_FORMAT=json,csv
-SAVE_RAW_DATA=true
-ANALYSIS_INTERVAL=daily
+OUTPUT_FORMAT=json
 ```

 ### Command Line Options
--- a/job-search-parser/index.js
+++ b/job-search-parser/index.js
@ -10,6 +10,7 @@ const path = require("path");
 const fs = require("fs");
 const CoreParser = require("../core-parser");
 const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
+const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
 const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer");

 // Load environment variables
@ -18,14 +19,16 @@ require("dotenv").config({ path: path.join(__dirname, ".env") });
 // Configuration from environment
 const HEADLESS = process.env.HEADLESS !== "false";
 const SEARCH_KEYWORDS =
-  process.env.SEARCH_KEYWORDS || "software engineer,developer,programmer";
+  process.env.SEARCH_KEYWORDS || "co-op,intern";//"software engineer,developer,programmer";
 const LOCATION_FILTER = process.env.LOCATION_FILTER;
 const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
 const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
+const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";

 // Available site strategies
 const SITE_STRATEGIES = {
  skipthedrive: skipthedriveStrategy,
+  linkedin: linkedinJobsStrategy,
  // Add more site strategies here
  // indeed: indeedStrategy,
  // glassdoor: glassdoorStrategy,
@ -41,6 +44,7 @@ function parseArguments() {
    keywords: null,
    locationFilter: null,
    maxPages: MAX_PAGES,
+    excludeRejected: EXCLUDE_REJECTED,
  };

  args.forEach((arg) => {
@ -57,7 +61,15 @@ function parseArguments() {
    } else if (arg.startsWith("--location=")) {
      options.locationFilter = arg.split("=")[1];
    } else if (arg.startsWith("--max-pages=")) {
-      options.maxPages = parseInt(arg.split("=")[1]) || MAX_PAGES;
+      const value = arg.split("=")[1];
+      // Support "all" or "0" to mean unlimited pages
+      if (value === "all" || value === "0") {
+        options.maxPages = 0; // 0 means unlimited
+      } else {
+        options.maxPages = parseInt(value) || MAX_PAGES;
+      }
+    } else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
+      options.excludeRejected = true;
    }
  });

@ -84,6 +96,7 @@ async function startJobSearchParser(options = {}) {
      finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
    const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
    const sites = finalOptions.sites;
+    const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED;

    logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
    logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
@ -108,18 +121,46 @@ async function startJobSearchParser(options = {}) {
        logger.step(`\n🌐 Parsing ${site}...`);
        const startTime = Date.now();

-        const parseResult = await strategy(coreParser, {
+        // Prepare strategy options
+        const strategyOptions = {
          keywords,
          locationFilter,
          maxPages: finalOptions.maxPages,
-        });
+        };
+
+        // Add credentials for LinkedIn
+        if (site === "linkedin") {
+          const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
+          const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
+          
+          if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
+            logger.error(`❌ LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`);
+            siteResults[site] = {
+              count: 0,
+              rejected: 0,
+              duration: "0s",
+              error: "LinkedIn credentials not found",
+            };
+            continue;
+          }
+
+          strategyOptions.credentials = {
+            username: LINKEDIN_USERNAME,
+            password: LINKEDIN_PASSWORD,
+          };
+          strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
+        }
+
+        const parseResult = await strategy(coreParser, strategyOptions);

        const { results, rejectedResults, summary } = parseResult;
        const duration = ((Date.now() - startTime) / 1000).toFixed(2);

        // Collect results
+        logger.info(`📦 Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`);
        allResults.push(...results);
        allRejectedResults.push(...rejectedResults);
+        logger.info(`📦 Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`);

        siteResults[site] = {
          count: results.length,
@ -162,6 +203,9 @@ async function startJobSearchParser(options = {}) {
    }

    // Save results
+    logger.info(`💾 Preparing to save: ${allResults.length} results, ${allRejectedResults.length} rejected`);
+    logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`);
+    
    const outputData = {
      metadata: {
        extractedAt: new Date().toISOString(),
@ -171,12 +215,22 @@ async function startJobSearchParser(options = {}) {
        keywords: keywords.join(", "),
        locationFilter,
        analysisResults,
+        rejectedJobsExcluded: excludeRejected,
      },
      results: allResults,
-      rejectedResults: allRejectedResults,
      siteResults,
    };
    
+    // Always include rejectedResults if not excluded (make it explicit, not using spread)
+    if (!excludeRejected) {
+      outputData.rejectedResults = allRejectedResults;
+      logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`);
+    } else {
+      logger.info(`⏭️  Excluding rejected results (EXCLUDE_REJECTED=true)`);
+    }
+    
+    logger.info(`💾 Final output: ${outputData.results.length} results, ${outputData.rejectedResults?.length || 0} rejected`);
+
    const resultsDir = path.join(__dirname, "results");
    if (!fs.existsSync(resultsDir)) {
      fs.mkdirSync(resultsDir, { recursive: true });
--- a/job-search-parser/strategies/linkedin-jobs-strategy.js
+++ b/job-search-parser/strategies/linkedin-jobs-strategy.js
--- a/job-search-parser/strategies/skipthedrive-strategy.js
+++ b/job-search-parser/strategies/skipthedrive-strategy.js
@ -67,14 +67,11 @@ async function skipthedriveStrategy(coreParser, options = {}) {
        });

        // Wait for job listings to load
-        const hasResults = await coreParser
-          .waitForSelector(
-            "#loops-wrapper",
-            {
+        const hasResults = await page
+          .waitForSelector("#loops-wrapper", {
            timeout: 5000,
-            },
-            "skipthedrive-main"
-          )
+          })
+          .then(() => true)
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
            return false;
--- a/linkedin-parser/index.js
+++ b/linkedin-parser/index.js
@ -31,12 +31,13 @@ const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
 const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
 const HEADLESS = process.env.HEADLESS !== "false";
 const SEARCH_KEYWORDS =
-  process.env.SEARCH_KEYWORDS || "layoff,downsizing";//,job cuts";
+  process.env.SEARCH_KEYWORDS || "layoff";//,downsizing";//,job cuts";
 const LOCATION_FILTER = process.env.LOCATION_FILTER;
 const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false";
 const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends";
 const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
 const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50;
+const EXTRACT_LOCATION_FROM_PROFILE = process.env.EXTRACT_LOCATION_FROM_PROFILE === "true";

 /**
 * Main LinkedIn parser function
@ -71,6 +72,7 @@ async function startLinkedInParser(options = {}) {
      keywords,
      locationFilter: LOCATION_FILTER,
      maxResults: MAX_RESULTS,
+      extractLocationFromProfile: EXTRACT_LOCATION_FROM_PROFILE,
      credentials: {
        username: LINKEDIN_USERNAME,
        password: LINKEDIN_PASSWORD,
--- a/linkedin-parser/strategies/linkedin-strategy.js
+++ b/linkedin-parser/strategies/linkedin-strategy.js
@ -21,6 +21,7 @@ async function linkedinStrategy(coreParser, options = {}) {
    keywords = ["layoff", "downsizing", "job cuts"],
    locationFilter = null,
    maxResults = 50,
+    extractLocationFromProfile = false,
    credentials = {},
  } = options;

@ -106,7 +107,7 @@ async function linkedinStrategy(coreParser, options = {}) {
      }

      // Extract posts from current page
-      const posts = await extractPostsFromPage(page, keyword);
+      const posts = await extractPostsFromPage(page, keyword, extractLocationFromProfile);
      logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);

      for (const post of posts) {
@ -172,7 +173,7 @@ async function linkedinStrategy(coreParser, options = {}) {
 /**
 * Extract posts from current search results page
 */
-async function extractPostsFromPage(page, keyword) {
+async function extractPostsFromPage(page, keyword, extractLocationFromProfile = false) {
  const posts = [];

  try {
@ -254,10 +255,26 @@ async function extractPostsFromPage(page, keyword) {

        const post = await extractPostData(postElements[i], keyword);
        if (post) {
+          // If location is missing and we're enabled to extract from profile, try to get it
+          if (!post.location && extractLocationFromProfile && post.authorUrl) {
+            try {
+              logger.debug(`📍 Location missing for post ${i + 1}, attempting to extract from profile...`);
+              const profileLocation = await extractLocationFromProfilePage(page, post.authorUrl);
+              if (profileLocation) {
+                post.location = profileLocation;
+                post.profileLocation = profileLocation;
+                logger.debug(`✅ Extracted location from profile: ${profileLocation}`);
+              }
+            } catch (error) {
+              logger.debug(`⚠️  Could not extract location from profile: ${error.message}`);
+            }
+          }
+          
          posts.push(post);
          const hasContent = post.content && post.content.length > 0;
          const hasAuthor = post.authorName && post.authorName.length > 0;
-          logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'})`);
+          const hasLocation = post.location && post.location.length > 0;
+          logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'}, location: ${hasLocation ? 'yes' : 'no'})`);
        } else {
          logger.debug(`⏭️  Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
        }
@ -627,6 +644,42 @@ async function extractPostData(postElement, keyword) {
        }
      }
      
+      // Try to extract from data attributes or hidden elements
+      if (!data.location) {
+        // Check for data attributes that might contain location
+        const actorSection = el.querySelector(".feed-shared-actor");
+        if (actorSection) {
+          // Check all data attributes
+          for (const attr of actorSection.attributes) {
+            if (attr.name.startsWith("data-") && attr.value) {
+              const value = attr.value.toLowerCase();
+              // Look for location-like patterns in data attributes
+              if (/(ontario|alberta|british columbia|quebec|toronto|vancouver|calgary|ottawa|montreal)/i.test(value)) {
+                // Try to extract the actual location text
+                const locationMatch = attr.value.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
+                if (locationMatch) {
+                  data.location = locationMatch[0];
+                  break;
+                }
+              }
+            }
+          }
+          
+          // Check for hidden spans or divs with location info
+          const hiddenElements = actorSection.querySelectorAll("span[style*='display: none'], div[style*='display: none'], [aria-hidden='true']");
+          for (const hiddenElem of hiddenElements) {
+            const text = hiddenElem.textContent || hiddenElem.getAttribute("aria-label") || "";
+            if (text && /(ontario|alberta|british columbia|quebec|toronto|vancouver)/i.test(text)) {
+              const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
+              if (locationMatch) {
+                data.location = locationMatch[0].trim();
+                break;
+              }
+            }
+          }
+        }
+      }
+
      // Extract engagement metrics - try multiple approaches
      const likesSelectors = [
        ".social-counts-reactions__count",
@ -799,6 +852,48 @@ async function extractPostData(postElement, keyword) {
  }
 }

+/**
+ * Extract location from a LinkedIn profile page
+ */
+async function extractLocationFromProfilePage(page, profileUrl) {
+  try {
+    // Ensure URL is complete
+    let fullUrl = profileUrl;
+    if (!fullUrl.startsWith("http")) {
+      fullUrl = `https://www.linkedin.com${fullUrl}`;
+    }
+    
+    // Remove query parameters that might cause issues
+    fullUrl = fullUrl.split("?")[0];
+    
+    // Open profile in new tab
+    const profilePage = await page.context().newPage();
+    
+    try {
+      await profilePage.goto(fullUrl, {
+        waitUntil: "domcontentloaded",
+        timeout: 15000,
+      });
+      
+      // Wait a bit for content to load
+      await new Promise(resolve => setTimeout(resolve, 2000));
+      
+      // Use the extractLocationFromProfile utility from ai-analyzer
+      const location = await extractLocationFromProfile(profilePage);
+      
+      await profilePage.close();
+      
+      return location;
+    } catch (error) {
+      await profilePage.close();
+      throw error;
+    }
+  } catch (error) {
+    logger.debug(`Failed to extract location from profile ${profileUrl}: ${error.message}`);
+    return "";
+  }
+}
+
 /**
 * Extract numbers from text (e.g., "15 likes" -> 15)
 */
				`@ -62,3 +62,5 @@ class CoreParser {`

				`module.exports = CoreParser;`