Enhance job search parser with LinkedIn strategy and configuration updates

- Added LinkedIn jobs parsing strategy to support job extraction from LinkedIn.
- Updated job search parser to include the new site strategy and improved argument parsing for max pages and exclusion of rejected results.
- Enhanced README documentation to reflect new features and usage examples.
- Refactored existing strategies for consistency and improved error handling.
2025-12-16 23:17:12 -05:00 · 2025-12-16 23:17:12 -05:00 · 4099b23744
commit 4099b23744
parent bbfd3c84aa
8 changed files with 2431 additions and 888 deletions
--- a/core-parser/index.js
+++ b/core-parser/index.js
@ -62,3 +62,5 @@ class CoreParser {
 module.exports = CoreParser;
--- a/job-search-parser/README.md
+++ b/job-search-parser/README.md
@ -60,13 +60,48 @@ JOB_TYPES="full time,contract" node index.js --sites=skipthedrive
 node index.js --sites=skipthedrive --demo
 ```
 #### LinkedIn Jobs Parser
 Professional network job postings with comprehensive job data.
 **Features:**
 - LinkedIn authentication support
 - Keyword-based job search
 - Location filtering (both LinkedIn location and post-extraction filter)
 - Multi-page result parsing with pagination
 - Job type and experience level extraction
 - Automatic duplicate detection
 - Infinite scroll handling
 **Requirements:**
 - LinkedIn credentials (username and password) must be set in `.env` file:
  ```env
  LINKEDIN_USERNAME=your_email@example.com
  LINKEDIN_PASSWORD=your_password
  LINKEDIN_JOB_LOCATION=Canada  # Optional: LinkedIn location filter
  ```
 **Usage:**
 ```bash
 # Search LinkedIn jobs
 node index.js --sites=linkedin --keywords="software engineer,developer"
 # Search with location filter
 node index.js --sites=linkedin --keywords="co-op" --location="Ontario"
 # Combine multiple sites
 node index.js --sites=linkedin,skipthedrive --keywords="intern,co-op"
 ```
 ### 🚧 Planned Parsers
 - **Indeed**: Comprehensive job aggregator
 - **Glassdoor**: Jobs with company reviews and salary data
 - **Monster**: Traditional job board
 - **SimplyHired**: Job aggregator with salary estimates
 - **LinkedIn Jobs**: Professional network job postings (now implemented — see "LinkedIn Jobs Parser" above)
 - **AngelList**: Startup and tech jobs
 - **Remote.co**: Dedicated remote work jobs
 - **FlexJobs**: Flexible and remote positions
@ -92,23 +127,21 @@ Create a `.env` file in the parser directory:
 ```env
 # Job Search Configuration
-SEARCH_SOURCES=linkedin,indeed,glassdoor
+SEARCH_KEYWORDS=software engineer,developer,programmer
-TARGET_ROLES=software engineer,data scientist,product manager
+LOCATION_FILTER=Ontario,Canada
-LOCATION_FILTER=Toronto,Vancouver,Calgary
+MAX_PAGES=5
-EXPERIENCE_LEVELS=entry,mid,senior
+
-REMOTE_PREFERENCE=remote,hybrid,onsite
+# LinkedIn Configuration (required for LinkedIn jobs)
 LINKEDIN_USERNAME=your_email@example.com
 LINKEDIN_PASSWORD=your_password
 LINKEDIN_JOB_LOCATION=Canada  # Optional: LinkedIn location search
 # Analysis Configuration
-ENABLE_SALARY_ANALYSIS=true
+ENABLE_AI_ANALYSIS=false
-ENABLE_SKILL_ANALYSIS=true
+HEADLESS=true
 ENABLE_TREND_ANALYSIS=true
 MIN_SALARY=50000
 MAX_SALARY=200000
 # Output Configuration
-OUTPUT_FORMAT=json,csv
+OUTPUT_FORMAT=json
 SAVE_RAW_DATA=true
 ANALYSIS_INTERVAL=daily
 ```
 ### Command Line Options
--- a/job-search-parser/index.js
+++ b/job-search-parser/index.js
@ -10,6 +10,7 @@ const path = require("path");
 const fs = require("fs");
 const CoreParser = require("../core-parser");
 const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
 const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
 const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer");
 // Load environment variables
@ -18,14 +19,16 @@ require("dotenv").config({ path: path.join(__dirname, ".env") });
 // Configuration from environment
 const HEADLESS = process.env.HEADLESS !== "false";
 const SEARCH_KEYWORDS =
-  process.env.SEARCH_KEYWORDS || "software engineer,developer,programmer";
+  process.env.SEARCH_KEYWORDS || "co-op,intern";//"software engineer,developer,programmer";
 const LOCATION_FILTER = process.env.LOCATION_FILTER;
 const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
 const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
 const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
 // Available site strategies
 const SITE_STRATEGIES = {
  skipthedrive: skipthedriveStrategy,
  linkedin: linkedinJobsStrategy,
  // Add more site strategies here
  // indeed: indeedStrategy,
  // glassdoor: glassdoorStrategy,
@ -41,6 +44,7 @@ function parseArguments() {
    keywords: null,
    locationFilter: null,
    maxPages: MAX_PAGES,
    excludeRejected: EXCLUDE_REJECTED,
  };
  args.forEach((arg) => {
@ -57,7 +61,15 @@ function parseArguments() {
    } else if (arg.startsWith("--location=")) {
      options.locationFilter = arg.split("=")[1];
    } else if (arg.startsWith("--max-pages=")) {
-      options.maxPages = parseInt(arg.split("=")[1]) || MAX_PAGES;
+      const value = arg.split("=")[1];
      // Support "all" or "0" to mean unlimited pages
      if (value === "all" || value === "0") {
        options.maxPages = 0; // 0 means unlimited
      } else {
        options.maxPages = parseInt(value) || MAX_PAGES;
      }
    } else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
      options.excludeRejected = true;
    }
  });
@ -84,6 +96,7 @@ async function startJobSearchParser(options = {}) {
      finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
    const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
    const sites = finalOptions.sites;
    const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED;
    logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
    logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
@ -108,18 +121,46 @@ async function startJobSearchParser(options = {}) {
        logger.step(`\n🌐 Parsing ${site}...`);
        const startTime = Date.now();
-        const parseResult = await strategy(coreParser, {
+        // Prepare strategy options
        const strategyOptions = {
          keywords,
          locationFilter,
          maxPages: finalOptions.maxPages,
-        });
+        };
        // Add credentials for LinkedIn
        if (site === "linkedin") {
          const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
          const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
          if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
            logger.error(`❌ LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`);
            siteResults[site] = {
              count: 0,
              rejected: 0,
              duration: "0s",
              error: "LinkedIn credentials not found",
            };
            continue;
          }
          strategyOptions.credentials = {
            username: LINKEDIN_USERNAME,
            password: LINKEDIN_PASSWORD,
          };
          strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
        }
        const parseResult = await strategy(coreParser, strategyOptions);
        const { results, rejectedResults, summary } = parseResult;
        const duration = ((Date.now() - startTime) / 1000).toFixed(2);
        // Collect results
        logger.info(`📦 Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`);
        allResults.push(...results);
        allRejectedResults.push(...rejectedResults);
        logger.info(`📦 Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`);
        siteResults[site] = {
          count: results.length,
@ -162,6 +203,9 @@ async function startJobSearchParser(options = {}) {
    }
    // Save results
    logger.info(`💾 Preparing to save: ${allResults.length} results, ${allRejectedResults.length} rejected`);
    logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`);
    const outputData = {
      metadata: {
        extractedAt: new Date().toISOString(),
@ -171,12 +215,22 @@ async function startJobSearchParser(options = {}) {
        keywords: keywords.join(", "),
        locationFilter,
        analysisResults,
        rejectedJobsExcluded: excludeRejected,
      },
      results: allResults,
      rejectedResults: allRejectedResults,
      siteResults,
    };
    // Always include rejectedResults if not excluded (make it explicit, not using spread)
    if (!excludeRejected) {
      outputData.rejectedResults = allRejectedResults;
      logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`);
    } else {
      logger.info(`⏭️  Excluding rejected results (EXCLUDE_REJECTED=true)`);
    }
    logger.info(`💾 Final output: ${outputData.results.length} results, ${outputData.rejectedResults?.length || 0} rejected`);
    const resultsDir = path.join(__dirname, "results");
    if (!fs.existsSync(resultsDir)) {
      fs.mkdirSync(resultsDir, { recursive: true });
--- a/job-search-parser/strategies/linkedin-jobs-strategy.js
+++ b/job-search-parser/strategies/linkedin-jobs-strategy.js
--- a/job-search-parser/strategies/skipthedrive-strategy.js
+++ b/job-search-parser/strategies/skipthedrive-strategy.js
@ -67,14 +67,11 @@ async function skipthedriveStrategy(coreParser, options = {}) {
        });
        // Wait for job listings to load
-        const hasResults = await coreParser
+        const hasResults = await page
-          .waitForSelector(
+          .waitForSelector("#loops-wrapper", {
-            "#loops-wrapper",
+            timeout: 5000,
-            {
+          })
-              timeout: 5000,
+          .then(() => true)
            },
            "skipthedrive-main"
          )
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
            return false;
--- a/linkedin-parser/index.js
+++ b/linkedin-parser/index.js
@ -31,12 +31,13 @@ const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
 const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
 const HEADLESS = process.env.HEADLESS !== "false";
 const SEARCH_KEYWORDS =
-  process.env.SEARCH_KEYWORDS || "layoff,downsizing";//,job cuts";
+  process.env.SEARCH_KEYWORDS || "layoff";//,downsizing";//,job cuts";
 const LOCATION_FILTER = process.env.LOCATION_FILTER;
 const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false";
 const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends";
 const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
 const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50;
 const EXTRACT_LOCATION_FROM_PROFILE = process.env.EXTRACT_LOCATION_FROM_PROFILE === "true";
 /**
 * Main LinkedIn parser function
@ -71,6 +72,7 @@ async function startLinkedInParser(options = {}) {
      keywords,
      locationFilter: LOCATION_FILTER,
      maxResults: MAX_RESULTS,
      extractLocationFromProfile: EXTRACT_LOCATION_FROM_PROFILE,
      credentials: {
        username: LINKEDIN_USERNAME,
        password: LINKEDIN_PASSWORD,
--- a/linkedin-parser/strategies/linkedin-strategy.js
+++ b/linkedin-parser/strategies/linkedin-strategy.js
@ -21,6 +21,7 @@ async function linkedinStrategy(coreParser, options = {}) {
    keywords = ["layoff", "downsizing", "job cuts"],
    locationFilter = null,
    maxResults = 50,
    extractLocationFromProfile = false,
    credentials = {},
  } = options;
@ -106,7 +107,7 @@ async function linkedinStrategy(coreParser, options = {}) {
      }
      // Extract posts from current page
-      const posts = await extractPostsFromPage(page, keyword);
+      const posts = await extractPostsFromPage(page, keyword, extractLocationFromProfile);
      logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);
      for (const post of posts) {
@ -172,7 +173,7 @@ async function linkedinStrategy(coreParser, options = {}) {
 /**
 * Extract posts from current search results page
 */
-async function extractPostsFromPage(page, keyword) {
+async function extractPostsFromPage(page, keyword, extractLocationFromProfile = false) {
  const posts = [];
  try {
@ -254,10 +255,26 @@ async function extractPostsFromPage(page, keyword) {
        const post = await extractPostData(postElements[i], keyword);
        if (post) {
          // If location is missing and we're enabled to extract from profile, try to get it
          if (!post.location && extractLocationFromProfile && post.authorUrl) {
            try {
              logger.debug(`📍 Location missing for post ${i + 1}, attempting to extract from profile...`);
              const profileLocation = await extractLocationFromProfilePage(page, post.authorUrl);
              if (profileLocation) {
                post.location = profileLocation;
                post.profileLocation = profileLocation;
                logger.debug(`✅ Extracted location from profile: ${profileLocation}`);
              }
            } catch (error) {
              logger.debug(`⚠️  Could not extract location from profile: ${error.message}`);
            }
          }
          posts.push(post);
          const hasContent = post.content && post.content.length > 0;
          const hasAuthor = post.authorName && post.authorName.length > 0;
-          logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'})`);
+          const hasLocation = post.location && post.location.length > 0;
          logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'}, location: ${hasLocation ? 'yes' : 'no'})`);
        } else {
          logger.debug(`⏭️  Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
        }
@ -627,6 +644,42 @@ async function extractPostData(postElement, keyword) {
        }
      }
      // Try to extract from data attributes or hidden elements
      if (!data.location) {
        // Check for data attributes that might contain location
        const actorSection = el.querySelector(".feed-shared-actor");
        if (actorSection) {
          // Check all data attributes
          for (const attr of actorSection.attributes) {
            if (attr.name.startsWith("data-") && attr.value) {
              const value = attr.value.toLowerCase();
              // Look for location-like patterns in data attributes
              if (/(ontario|alberta|british columbia|quebec|toronto|vancouver|calgary|ottawa|montreal)/i.test(value)) {
                // Try to extract the actual location text
                const locationMatch = attr.value.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
                if (locationMatch) {
                  data.location = locationMatch[0];
                  break;
                }
              }
            }
          }
          // Check for hidden spans or divs with location info
          const hiddenElements = actorSection.querySelectorAll("span[style*='display: none'], div[style*='display: none'], [aria-hidden='true']");
          for (const hiddenElem of hiddenElements) {
            const text = hiddenElem.textContent || hiddenElem.getAttribute("aria-label") || "";
            if (text && /(ontario|alberta|british columbia|quebec|toronto|vancouver)/i.test(text)) {
              const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
              if (locationMatch) {
                data.location = locationMatch[0].trim();
                break;
              }
            }
          }
        }
      }
      // Extract engagement metrics - try multiple approaches
      const likesSelectors = [
        ".social-counts-reactions__count",
@ -799,6 +852,48 @@ async function extractPostData(postElement, keyword) {
  }
 }
 /**
 * Extract a location by opening a LinkedIn profile URL in a separate tab.
 *
 * Site-relative profile paths are prefixed with https://www.linkedin.com and
 * everything from the first "?" onward is dropped before navigating. The lookup
 * is best-effort: any failure (bad URL, navigation error, extraction error,
 * failure to close the tab) is logged at debug level and reported as "".
 *
 * @param {object} page - Page whose context() is used to open the extra tab.
 * @param {string} profileUrl - Absolute or site-relative profile URL.
 * @returns {Promise<string>} Whatever extractLocationFromProfile resolves to
 *   for the profile tab, or "" when anything goes wrong.
 */
 async function extractLocationFromProfilePage(page, profileUrl) {
  try {
    // Ensure URL is complete
    const absoluteUrl = profileUrl.startsWith("http")
      ? profileUrl
      : `https://www.linkedin.com${profileUrl}`;
    // Remove query parameters that might cause issues
    const [targetUrl] = absoluteUrl.split("?");
    // Open profile in new tab
    const profilePage = await page.context().newPage();
    try {
      await profilePage.goto(targetUrl, {
        waitUntil: "domcontentloaded",
        timeout: 15000,
      });
      // Wait a bit for content to load
      await new Promise((resolve) => setTimeout(resolve, 2000));
      // Use the extractLocationFromProfile utility from ai-analyzer
      return await extractLocationFromProfile(profilePage);
    } finally {
      // Single cleanup point: the tab is closed exactly once on success and on
      // failure, so a failing close() is never retried from an error handler.
      await profilePage.close();
    }
  } catch (error) {
    logger.debug(`Failed to extract location from profile ${profileUrl}: ${error.message}`);
    return "";
  }
 }
 /**
 * Extract numbers from text (e.g., "15 likes" -> 15)
 */
`@ -62,3 +62,5 @@ class CoreParser {`

	`module.exports = CoreParser;`	`module.exports = CoreParser;`