From 4099b237446b3f7fd5437848cabe298e085dbc03 Mon Sep 17 00:00:00 2001 From: tanyar09 Date: Tue, 16 Dec 2025 23:17:12 -0500 Subject: [PATCH] Enhance job search parser with LinkedIn strategy and configuration updates - Added LinkedIn jobs parsing strategy to support job extraction from LinkedIn. - Updated job search parser to include new site strategy and improved argument parsing for max pages and exclusion of rejected results. - Enhanced README documentation to reflect new features and usage examples. - Refactored existing strategies for consistency and improved error handling. --- core-parser/index.js | 2 + job-search-parser/README.md | 1027 +++++++------ job-search-parser/index.js | 64 +- .../strategies/linkedin-jobs-strategy.js | 1360 +++++++++++++++++ .../strategies/skipthedrive-strategy.js | 601 ++++---- linkedin-parser/index.js | 4 +- .../strategies/linkedin-strategy.js | 101 +- test/ai-analyzer.test.js | 160 +- 8 files changed, 2431 insertions(+), 888 deletions(-) create mode 100644 job-search-parser/strategies/linkedin-jobs-strategy.js diff --git a/core-parser/index.js b/core-parser/index.js index ae1a0e0..a721783 100644 --- a/core-parser/index.js +++ b/core-parser/index.js @@ -62,3 +62,5 @@ class CoreParser { module.exports = CoreParser; + + diff --git a/job-search-parser/README.md b/job-search-parser/README.md index 487b08e..5896666 100644 --- a/job-search-parser/README.md +++ b/job-search-parser/README.md @@ -1,497 +1,530 @@ -# Job Search Parser - Job Market Intelligence - -Specialized parser for job market intelligence, tracking job postings, market trends, and competitive analysis. Focuses on tech roles and industry insights. 
- -## ๐ŸŽฏ Purpose - -The Job Search Parser is designed to: - -- **Track Job Market Trends**: Monitor demand for specific roles and skills -- **Competitive Intelligence**: Analyze salary ranges and requirements -- **Industry Insights**: Track hiring patterns across different sectors -- **Skill Gap Analysis**: Identify in-demand technologies and frameworks -- **Market Demand Forecasting**: Predict job market trends - -## ๐Ÿš€ Features - -### Core Functionality - -- **Multi-Source Aggregation**: Collect job data from multiple platforms -- **Role-Specific Tracking**: Focus on tech roles and emerging positions -- **Skill Analysis**: Extract and categorize required skills -- **Salary Intelligence**: Track compensation ranges and trends -- **Company Intelligence**: Monitor hiring companies and patterns - -### Advanced Features - -- **Market Trend Analysis**: Identify growing and declining job categories -- **Geographic Distribution**: Track job distribution by location -- **Experience Level Analysis**: Entry, mid, senior level tracking -- **Remote Work Trends**: Monitor remote/hybrid work patterns -- **Technology Stack Tracking**: Framework and tool popularity - -## ๐ŸŒ Supported Job Sites - -### โœ… Implemented Parsers - -#### SkipTheDrive Parser - -Remote job board specializing in work-from-home positions. 
- -**Features:** - -- Keyword-based job search with relevance sorting -- Job type filtering (full-time, part-time, contract) -- Multi-page result parsing with pagination -- Featured/sponsored job identification -- AI-powered job relevance analysis -- Automatic duplicate detection - -**Usage:** - -```bash -# Parse SkipTheDrive for QA automation jobs -node index.js --sites=skipthedrive --keywords="automation qa,qa engineer" - -# Filter by job type -JOB_TYPES="full time,contract" node index.js --sites=skipthedrive - -# Run demo with limited results -node index.js --sites=skipthedrive --demo -``` - -### ๐Ÿšง Planned Parsers - -- **Indeed**: Comprehensive job aggregator -- **Glassdoor**: Jobs with company reviews and salary data -- **Monster**: Traditional job board -- **SimplyHired**: Job aggregator with salary estimates -- **LinkedIn Jobs**: Professional network job postings -- **AngelList**: Startup and tech jobs -- **Remote.co**: Dedicated remote work jobs -- **FlexJobs**: Flexible and remote positions - -## ๐Ÿ“ฆ Installation - -```bash -# Install dependencies -npm install - -# Run tests -npm test - -# Run demo -node demo.js -``` - -## ๐Ÿ”ง Configuration - -### Environment Variables - -Create a `.env` file in the parser directory: - -```env -# Job Search Configuration -SEARCH_SOURCES=linkedin,indeed,glassdoor -TARGET_ROLES=software engineer,data scientist,product manager -LOCATION_FILTER=Toronto,Vancouver,Calgary -EXPERIENCE_LEVELS=entry,mid,senior -REMOTE_PREFERENCE=remote,hybrid,onsite - -# Analysis Configuration -ENABLE_SALARY_ANALYSIS=true -ENABLE_SKILL_ANALYSIS=true -ENABLE_TREND_ANALYSIS=true -MIN_SALARY=50000 -MAX_SALARY=200000 - -# Output Configuration -OUTPUT_FORMAT=json,csv -SAVE_RAW_DATA=true -ANALYSIS_INTERVAL=daily -``` - -### Command Line Options - -```bash -# Basic usage -node index.js - -# Specific roles -node index.js --roles="frontend developer,backend developer" - -# Geographic focus -node index.js --locations="Toronto,Vancouver" - -# Experience 
level -node index.js --experience="senior" - -# Output format -node index.js --output=results/job-market-analysis.json -``` - -**Available Options:** - -- `--roles="role1,role2"`: Target job roles -- `--locations="city1,city2"`: Geographic focus -- `--experience="entry|mid|senior"`: Experience level -- `--remote="remote|hybrid|onsite"`: Remote work preference -- `--salary-min=NUMBER`: Minimum salary filter -- `--salary-max=NUMBER`: Maximum salary filter -- `--output=FILE`: Output filename -- `--format=json|csv`: Output format -- `--trends`: Enable trend analysis -- `--skills`: Enable skill analysis - -## ๐Ÿ“Š Keywords - -### Role-Specific Keywords - -Place keyword CSV files in the `keywords/` directory: - -``` -job-search-parser/ -โ”œโ”€โ”€ keywords/ -โ”‚ โ”œโ”€โ”€ job-search-keywords.csv # General job search terms -โ”‚ โ”œโ”€โ”€ tech-roles.csv # Technology roles -โ”‚ โ”œโ”€โ”€ data-roles.csv # Data science roles -โ”‚ โ”œโ”€โ”€ management-roles.csv # Management positions -โ”‚ โ””โ”€โ”€ emerging-roles.csv # Emerging job categories -โ””โ”€โ”€ index.js -``` - -### Tech Roles Keywords - -```csv -keyword -software engineer -frontend developer -backend developer -full stack developer -data scientist -machine learning engineer -devops engineer -site reliability engineer -cloud architect -security engineer -mobile developer -iOS developer -Android developer -react developer -vue developer -angular developer -node.js developer -python developer -java developer -golang developer -rust developer -data engineer -analytics engineer -``` - -### Data Science Keywords - -```csv -keyword -data scientist -machine learning engineer -data analyst -business analyst -data engineer -analytics engineer -ML engineer -AI engineer -statistician -quantitative analyst -research scientist -data architect -BI developer -ETL developer -``` - -## ๐Ÿ“ˆ Usage Examples - -### Basic Job Search - -```bash -# Standard job market analysis -node index.js - -# Specific tech roles -node index.js 
--roles="software engineer,data scientist" - -# Geographic focus -node index.js --locations="Toronto,Vancouver,Calgary" -``` - -### Advanced Analysis - -```bash -# Senior level positions -node index.js --experience="senior" --salary-min=100000 - -# Remote work opportunities -node index.js --remote="remote" --roles="frontend developer" - -# Trend analysis -node index.js --trends --skills --output=results/trends.json -``` - -### Market Intelligence - -```bash -# Salary analysis -node index.js --salary-min=80000 --salary-max=150000 - -# Skill gap analysis -node index.js --skills --roles="machine learning engineer" - -# Competitive intelligence -node index.js --companies="Google,Microsoft,Amazon" -``` - -## ๐Ÿ“Š Output Format - -### JSON Structure - -```json -{ - "metadata": { - "timestamp": "2024-01-15T10:30:00Z", - "search_parameters": { - "roles": ["software engineer", "data scientist"], - "locations": ["Toronto", "Vancouver"], - "experience_levels": ["mid", "senior"], - "remote_preference": ["remote", "hybrid"] - }, - "total_jobs_found": 1250, - "analysis_duration_seconds": 45 - }, - "market_overview": { - "total_jobs": 1250, - "average_salary": 95000, - "salary_range": { - "min": 65000, - "max": 180000, - "median": 92000 - }, - "remote_distribution": { - "remote": 45, - "hybrid": 35, - "onsite": 20 - }, - "experience_distribution": { - "entry": 15, - "mid": 45, - "senior": 40 - } - }, - "trends": { - "growing_skills": [ - { "skill": "React", "growth_rate": 25 }, - { "skill": "Python", "growth_rate": 18 }, - { "skill": "AWS", "growth_rate": 22 } - ], - "declining_skills": [ - { "skill": "jQuery", "growth_rate": -12 }, - { "skill": "PHP", "growth_rate": -8 } - ], - "emerging_roles": ["AI Engineer", "DevSecOps Engineer", "Data Engineer"] - }, - "jobs": [ - { - "id": "job_1", - "title": "Senior Software Engineer", - "company": "TechCorp", - "location": "Toronto, Ontario", - "remote_type": "hybrid", - "salary": { - "min": 100000, - "max": 140000, - "currency": "CAD" - 
}, - "required_skills": ["React", "Node.js", "TypeScript", "AWS"], - "preferred_skills": ["GraphQL", "Docker", "Kubernetes"], - "experience_level": "senior", - "job_url": "https://example.com/job/1", - "posted_date": "2024-01-10T09:00:00Z", - "scraped_at": "2024-01-15T10:30:00Z" - } - ], - "analysis": { - "skill_demand": { - "React": { "count": 45, "avg_salary": 98000 }, - "Python": { "count": 38, "avg_salary": 102000 }, - "AWS": { "count": 32, "avg_salary": 105000 } - }, - "company_insights": { - "top_hirers": [ - { "company": "TechCorp", "jobs": 25 }, - { "company": "StartupXYZ", "jobs": 18 } - ], - "salary_leaders": [ - { "company": "BigTech", "avg_salary": 120000 }, - { "company": "FinTech", "avg_salary": 115000 } - ] - } - } -} -``` - -### CSV Output - -The parser can also generate CSV files for easy analysis: - -```csv -job_id,title,company,location,remote_type,salary_min,salary_max,required_skills,experience_level,posted_date -job_1,Senior Software Engineer,TechCorp,Toronto,hybrid,100000,140000,"React,Node.js,TypeScript",senior,2024-01-10 -job_2,Data Scientist,DataCorp,Vancouver,remote,90000,130000,"Python,SQL,ML",mid,2024-01-09 -``` - -## ๐Ÿ”’ Security & Best Practices - -### Data Privacy - -- Respect job site terms of service -- Implement appropriate rate limiting -- Store data securely and responsibly -- Anonymize sensitive information - -### Rate Limiting - -- Implement delays between requests -- Respect API rate limits -- Use multiple data sources -- Monitor for blocking/detection - -### Legal Compliance - -- Educational and research purposes only -- Respect website terms of service -- Implement data retention policies -- Monitor for legal changes - -## ๐Ÿงช Testing - -### Run Tests - -```bash -# All tests -npm test - -# Specific test suites -npm test -- --testNamePattern="JobSearch" -npm test -- --testNamePattern="Analysis" -npm test -- --testNamePattern="Trends" -``` - -### Test Coverage - -```bash -npm run test:coverage -``` - -## ๐Ÿš€ Performance 
Optimization - -### Recommended Settings - -#### Fast Analysis - -```bash -node index.js --roles="software engineer" --locations="Toronto" -``` - -#### Comprehensive Analysis - -```bash -node index.js --trends --skills --experience="all" -``` - -#### Focused Intelligence - -```bash -node index.js --salary-min=80000 --remote="remote" --trends -``` - -### Performance Tips - -- Use specific role filters to reduce data volume -- Implement caching for repeated searches -- Use parallel processing for multiple sources -- Optimize data storage and retrieval - -## ๐Ÿ”ง Troubleshooting - -### Common Issues - -#### Rate Limiting - -```bash -# Reduce request frequency -export REQUEST_DELAY=2000 -node index.js -``` - -#### Data Source Issues - -```bash -# Use specific sources -node index.js --sources="linkedin,indeed" - -# Check source availability -node index.js --test-sources -``` - -#### Output Issues - -```bash -# Check output directory -mkdir -p results -node index.js --output=results/analysis.json - -# Verify file permissions -chmod 755 results/ -``` - -## ๐Ÿ“ˆ Monitoring & Analytics - -### Key Metrics - -- **Job Volume**: Total jobs found per search -- **Salary Trends**: Average and median salary changes -- **Skill Demand**: Most requested skills -- **Remote Adoption**: Remote work trend analysis -- **Market Velocity**: Job posting frequency - -### Dashboard Integration - -- Real-time market monitoring -- Trend visualization -- Salary benchmarking -- Skill gap analysis -- Competitive intelligence - -## ๐Ÿค Contributing - -### Development Setup - -1. Fork the repository -2. Create feature branch -3. Add tests for new functionality -4. Ensure all tests pass -5. Submit pull request - -### Code Standards - -- Follow existing code style -- Add JSDoc comments -- Maintain test coverage -- Update documentation - -## ๐Ÿ“„ License - -This parser is part of the LinkedOut platform and follows the same licensing terms. 
- ---- - -**Note**: This tool is designed for educational and research purposes. Always respect website terms of service and implement appropriate rate limiting and ethical usage practices. +# Job Search Parser - Job Market Intelligence + +Specialized parser for job market intelligence, tracking job postings, market trends, and competitive analysis. Focuses on tech roles and industry insights. + +## ๐ŸŽฏ Purpose + +The Job Search Parser is designed to: + +- **Track Job Market Trends**: Monitor demand for specific roles and skills +- **Competitive Intelligence**: Analyze salary ranges and requirements +- **Industry Insights**: Track hiring patterns across different sectors +- **Skill Gap Analysis**: Identify in-demand technologies and frameworks +- **Market Demand Forecasting**: Predict job market trends + +## ๐Ÿš€ Features + +### Core Functionality + +- **Multi-Source Aggregation**: Collect job data from multiple platforms +- **Role-Specific Tracking**: Focus on tech roles and emerging positions +- **Skill Analysis**: Extract and categorize required skills +- **Salary Intelligence**: Track compensation ranges and trends +- **Company Intelligence**: Monitor hiring companies and patterns + +### Advanced Features + +- **Market Trend Analysis**: Identify growing and declining job categories +- **Geographic Distribution**: Track job distribution by location +- **Experience Level Analysis**: Entry, mid, senior level tracking +- **Remote Work Trends**: Monitor remote/hybrid work patterns +- **Technology Stack Tracking**: Framework and tool popularity + +## ๐ŸŒ Supported Job Sites + +### โœ… Implemented Parsers + +#### SkipTheDrive Parser + +Remote job board specializing in work-from-home positions. 
+ +**Features:** + +- Keyword-based job search with relevance sorting +- Job type filtering (full-time, part-time, contract) +- Multi-page result parsing with pagination +- Featured/sponsored job identification +- AI-powered job relevance analysis +- Automatic duplicate detection + +**Usage:** + +```bash +# Parse SkipTheDrive for QA automation jobs +node index.js --sites=skipthedrive --keywords="automation qa,qa engineer" + +# Filter by job type +JOB_TYPES="full time,contract" node index.js --sites=skipthedrive + +# Run demo with limited results +node index.js --sites=skipthedrive --demo +``` + +#### LinkedIn Jobs Parser + +Professional network job postings with comprehensive job data. + +**Features:** + +- LinkedIn authentication support +- Keyword-based job search +- Location filtering (both LinkedIn location and post-extraction filter) +- Multi-page result parsing with pagination +- Job type and experience level extraction +- Automatic duplicate detection +- Infinite scroll handling + +**Requirements:** + +- LinkedIn credentials (username and password) must be set in `.env` file: + ```env + LINKEDIN_USERNAME=your_email@example.com + LINKEDIN_PASSWORD=your_password + LINKEDIN_JOB_LOCATION=Canada # Optional: LinkedIn location filter + ``` + +**Usage:** + +```bash +# Search LinkedIn jobs +node index.js --sites=linkedin --keywords="software engineer,developer" + +# Search with location filter +node index.js --sites=linkedin --keywords="co-op" --location="Ontario" + +# Combine multiple sites +node index.js --sites=linkedin,skipthedrive --keywords="intern,co-op" +``` + +### ๐Ÿšง Planned Parsers + +- **Indeed**: Comprehensive job aggregator +- **Glassdoor**: Jobs with company reviews and salary data +- **Monster**: Traditional job board +- **SimplyHired**: Job aggregator with salary estimates +- **AngelList**: Startup and tech jobs +- **Remote.co**: Dedicated remote work jobs +- **FlexJobs**: Flexible and remote positions + +## ๐Ÿ“ฆ Installation + +```bash +# Install 
dependencies +npm install + +# Run tests +npm test + +# Run demo +node demo.js +``` + +## ๐Ÿ”ง Configuration + +### Environment Variables + +Create a `.env` file in the parser directory: + +```env +# Job Search Configuration +SEARCH_KEYWORDS=software engineer,developer,programmer +LOCATION_FILTER=Ontario,Canada +MAX_PAGES=5 + +# LinkedIn Configuration (required for LinkedIn jobs) +LINKEDIN_USERNAME=your_email@example.com +LINKEDIN_PASSWORD=your_password +LINKEDIN_JOB_LOCATION=Canada # Optional: LinkedIn location search + +# Analysis Configuration +ENABLE_AI_ANALYSIS=false +HEADLESS=true + +# Output Configuration +OUTPUT_FORMAT=json +``` + +### Command Line Options + +```bash +# Basic usage +node index.js + +# Specific roles +node index.js --roles="frontend developer,backend developer" + +# Geographic focus +node index.js --locations="Toronto,Vancouver" + +# Experience level +node index.js --experience="senior" + +# Output format +node index.js --output=results/job-market-analysis.json +``` + +**Available Options:** + +- `--roles="role1,role2"`: Target job roles +- `--locations="city1,city2"`: Geographic focus +- `--experience="entry|mid|senior"`: Experience level +- `--remote="remote|hybrid|onsite"`: Remote work preference +- `--salary-min=NUMBER`: Minimum salary filter +- `--salary-max=NUMBER`: Maximum salary filter +- `--output=FILE`: Output filename +- `--format=json|csv`: Output format +- `--trends`: Enable trend analysis +- `--skills`: Enable skill analysis + +## ๐Ÿ“Š Keywords + +### Role-Specific Keywords + +Place keyword CSV files in the `keywords/` directory: + +``` +job-search-parser/ +โ”œโ”€โ”€ keywords/ +โ”‚ โ”œโ”€โ”€ job-search-keywords.csv # General job search terms +โ”‚ โ”œโ”€โ”€ tech-roles.csv # Technology roles +โ”‚ โ”œโ”€โ”€ data-roles.csv # Data science roles +โ”‚ โ”œโ”€โ”€ management-roles.csv # Management positions +โ”‚ โ””โ”€โ”€ emerging-roles.csv # Emerging job categories +โ””โ”€โ”€ index.js +``` + +### Tech Roles Keywords + +```csv +keyword 
+software engineer +frontend developer +backend developer +full stack developer +data scientist +machine learning engineer +devops engineer +site reliability engineer +cloud architect +security engineer +mobile developer +iOS developer +Android developer +react developer +vue developer +angular developer +node.js developer +python developer +java developer +golang developer +rust developer +data engineer +analytics engineer +``` + +### Data Science Keywords + +```csv +keyword +data scientist +machine learning engineer +data analyst +business analyst +data engineer +analytics engineer +ML engineer +AI engineer +statistician +quantitative analyst +research scientist +data architect +BI developer +ETL developer +``` + +## ๐Ÿ“ˆ Usage Examples + +### Basic Job Search + +```bash +# Standard job market analysis +node index.js + +# Specific tech roles +node index.js --roles="software engineer,data scientist" + +# Geographic focus +node index.js --locations="Toronto,Vancouver,Calgary" +``` + +### Advanced Analysis + +```bash +# Senior level positions +node index.js --experience="senior" --salary-min=100000 + +# Remote work opportunities +node index.js --remote="remote" --roles="frontend developer" + +# Trend analysis +node index.js --trends --skills --output=results/trends.json +``` + +### Market Intelligence + +```bash +# Salary analysis +node index.js --salary-min=80000 --salary-max=150000 + +# Skill gap analysis +node index.js --skills --roles="machine learning engineer" + +# Competitive intelligence +node index.js --companies="Google,Microsoft,Amazon" +``` + +## ๐Ÿ“Š Output Format + +### JSON Structure + +```json +{ + "metadata": { + "timestamp": "2024-01-15T10:30:00Z", + "search_parameters": { + "roles": ["software engineer", "data scientist"], + "locations": ["Toronto", "Vancouver"], + "experience_levels": ["mid", "senior"], + "remote_preference": ["remote", "hybrid"] + }, + "total_jobs_found": 1250, + "analysis_duration_seconds": 45 + }, + "market_overview": { + 
"total_jobs": 1250, + "average_salary": 95000, + "salary_range": { + "min": 65000, + "max": 180000, + "median": 92000 + }, + "remote_distribution": { + "remote": 45, + "hybrid": 35, + "onsite": 20 + }, + "experience_distribution": { + "entry": 15, + "mid": 45, + "senior": 40 + } + }, + "trends": { + "growing_skills": [ + { "skill": "React", "growth_rate": 25 }, + { "skill": "Python", "growth_rate": 18 }, + { "skill": "AWS", "growth_rate": 22 } + ], + "declining_skills": [ + { "skill": "jQuery", "growth_rate": -12 }, + { "skill": "PHP", "growth_rate": -8 } + ], + "emerging_roles": ["AI Engineer", "DevSecOps Engineer", "Data Engineer"] + }, + "jobs": [ + { + "id": "job_1", + "title": "Senior Software Engineer", + "company": "TechCorp", + "location": "Toronto, Ontario", + "remote_type": "hybrid", + "salary": { + "min": 100000, + "max": 140000, + "currency": "CAD" + }, + "required_skills": ["React", "Node.js", "TypeScript", "AWS"], + "preferred_skills": ["GraphQL", "Docker", "Kubernetes"], + "experience_level": "senior", + "job_url": "https://example.com/job/1", + "posted_date": "2024-01-10T09:00:00Z", + "scraped_at": "2024-01-15T10:30:00Z" + } + ], + "analysis": { + "skill_demand": { + "React": { "count": 45, "avg_salary": 98000 }, + "Python": { "count": 38, "avg_salary": 102000 }, + "AWS": { "count": 32, "avg_salary": 105000 } + }, + "company_insights": { + "top_hirers": [ + { "company": "TechCorp", "jobs": 25 }, + { "company": "StartupXYZ", "jobs": 18 } + ], + "salary_leaders": [ + { "company": "BigTech", "avg_salary": 120000 }, + { "company": "FinTech", "avg_salary": 115000 } + ] + } + } +} +``` + +### CSV Output + +The parser can also generate CSV files for easy analysis: + +```csv +job_id,title,company,location,remote_type,salary_min,salary_max,required_skills,experience_level,posted_date +job_1,Senior Software Engineer,TechCorp,Toronto,hybrid,100000,140000,"React,Node.js,TypeScript",senior,2024-01-10 +job_2,Data 
Scientist,DataCorp,Vancouver,remote,90000,130000,"Python,SQL,ML",mid,2024-01-09 +``` + +## ๐Ÿ”’ Security & Best Practices + +### Data Privacy + +- Respect job site terms of service +- Implement appropriate rate limiting +- Store data securely and responsibly +- Anonymize sensitive information + +### Rate Limiting + +- Implement delays between requests +- Respect API rate limits +- Use multiple data sources +- Monitor for blocking/detection + +### Legal Compliance + +- Educational and research purposes only +- Respect website terms of service +- Implement data retention policies +- Monitor for legal changes + +## ๐Ÿงช Testing + +### Run Tests + +```bash +# All tests +npm test + +# Specific test suites +npm test -- --testNamePattern="JobSearch" +npm test -- --testNamePattern="Analysis" +npm test -- --testNamePattern="Trends" +``` + +### Test Coverage + +```bash +npm run test:coverage +``` + +## ๐Ÿš€ Performance Optimization + +### Recommended Settings + +#### Fast Analysis + +```bash +node index.js --roles="software engineer" --locations="Toronto" +``` + +#### Comprehensive Analysis + +```bash +node index.js --trends --skills --experience="all" +``` + +#### Focused Intelligence + +```bash +node index.js --salary-min=80000 --remote="remote" --trends +``` + +### Performance Tips + +- Use specific role filters to reduce data volume +- Implement caching for repeated searches +- Use parallel processing for multiple sources +- Optimize data storage and retrieval + +## ๐Ÿ”ง Troubleshooting + +### Common Issues + +#### Rate Limiting + +```bash +# Reduce request frequency +export REQUEST_DELAY=2000 +node index.js +``` + +#### Data Source Issues + +```bash +# Use specific sources +node index.js --sources="linkedin,indeed" + +# Check source availability +node index.js --test-sources +``` + +#### Output Issues + +```bash +# Check output directory +mkdir -p results +node index.js --output=results/analysis.json + +# Verify file permissions +chmod 755 results/ +``` + +## ๐Ÿ“ˆ 
Monitoring & Analytics + +### Key Metrics + +- **Job Volume**: Total jobs found per search +- **Salary Trends**: Average and median salary changes +- **Skill Demand**: Most requested skills +- **Remote Adoption**: Remote work trend analysis +- **Market Velocity**: Job posting frequency + +### Dashboard Integration + +- Real-time market monitoring +- Trend visualization +- Salary benchmarking +- Skill gap analysis +- Competitive intelligence + +## ๐Ÿค Contributing + +### Development Setup + +1. Fork the repository +2. Create feature branch +3. Add tests for new functionality +4. Ensure all tests pass +5. Submit pull request + +### Code Standards + +- Follow existing code style +- Add JSDoc comments +- Maintain test coverage +- Update documentation + +## ๐Ÿ“„ License + +This parser is part of the LinkedOut platform and follows the same licensing terms. + +--- + +**Note**: This tool is designed for educational and research purposes. Always respect website terms of service and implement appropriate rate limiting and ethical usage practices. 
diff --git a/job-search-parser/index.js b/job-search-parser/index.js index 64a47df..656eed9 100644 --- a/job-search-parser/index.js +++ b/job-search-parser/index.js @@ -10,6 +10,7 @@ const path = require("path"); const fs = require("fs"); const CoreParser = require("../core-parser"); const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy"); +const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy"); const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer"); // Load environment variables @@ -18,14 +19,16 @@ require("dotenv").config({ path: path.join(__dirname, ".env") }); // Configuration from environment const HEADLESS = process.env.HEADLESS !== "false"; const SEARCH_KEYWORDS = - process.env.SEARCH_KEYWORDS || "software engineer,developer,programmer"; + process.env.SEARCH_KEYWORDS || "co-op,intern";//"software engineer,developer,programmer"; const LOCATION_FILTER = process.env.LOCATION_FILTER; const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true"; const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5; +const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true"; // Available site strategies const SITE_STRATEGIES = { skipthedrive: skipthedriveStrategy, + linkedin: linkedinJobsStrategy, // Add more site strategies here // indeed: indeedStrategy, // glassdoor: glassdoorStrategy, @@ -41,6 +44,7 @@ function parseArguments() { keywords: null, locationFilter: null, maxPages: MAX_PAGES, + excludeRejected: EXCLUDE_REJECTED, }; args.forEach((arg) => { @@ -57,7 +61,15 @@ function parseArguments() { } else if (arg.startsWith("--location=")) { options.locationFilter = arg.split("=")[1]; } else if (arg.startsWith("--max-pages=")) { - options.maxPages = parseInt(arg.split("=")[1]) || MAX_PAGES; + const value = arg.split("=")[1]; + // Support "all" or "0" to mean unlimited pages + if (value === "all" || value === "0") { + options.maxPages = 0; // 0 means unlimited + } else { + options.maxPages = 
parseInt(value) || MAX_PAGES; + } + } else if (arg === "--no-rejected" || arg === "--exclude-rejected") { + options.excludeRejected = true; } }); @@ -84,6 +96,7 @@ async function startJobSearchParser(options = {}) { finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim()); const locationFilter = finalOptions.locationFilter || LOCATION_FILTER; const sites = finalOptions.sites; + const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED; logger.info(`๐Ÿ“ฆ Selected job sites: ${sites.join(", ")}`); logger.info(`๐Ÿ” Search Keywords: ${keywords.join(", ")}`); @@ -108,18 +121,46 @@ async function startJobSearchParser(options = {}) { logger.step(`\n๐ŸŒ Parsing ${site}...`); const startTime = Date.now(); - const parseResult = await strategy(coreParser, { + // Prepare strategy options + const strategyOptions = { keywords, locationFilter, maxPages: finalOptions.maxPages, - }); + }; + + // Add credentials for LinkedIn + if (site === "linkedin") { + const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME; + const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD; + + if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) { + logger.error(`โŒ LinkedIn credentials not found. 
Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`); + siteResults[site] = { + count: 0, + rejected: 0, + duration: "0s", + error: "LinkedIn credentials not found", + }; + continue; + } + + strategyOptions.credentials = { + username: LINKEDIN_USERNAME, + password: LINKEDIN_PASSWORD, + }; + strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || ""; + } + + const parseResult = await strategy(coreParser, strategyOptions); const { results, rejectedResults, summary } = parseResult; const duration = ((Date.now() - startTime) / 1000).toFixed(2); // Collect results + logger.info(`๐Ÿ“ฆ Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`); allResults.push(...results); allRejectedResults.push(...rejectedResults); + logger.info(`๐Ÿ“ฆ Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`); siteResults[site] = { count: results.length, @@ -162,6 +203,9 @@ async function startJobSearchParser(options = {}) { } // Save results + logger.info(`๐Ÿ’พ Preparing to save: ${allResults.length} results, ${allRejectedResults.length} rejected`); + logger.info(`๐Ÿ’พ EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`); + const outputData = { metadata: { extractedAt: new Date().toISOString(), @@ -171,11 +215,21 @@ async function startJobSearchParser(options = {}) { keywords: keywords.join(", "), locationFilter, analysisResults, + rejectedJobsExcluded: excludeRejected, }, results: allResults, - rejectedResults: allRejectedResults, siteResults, }; + + // Always include rejectedResults if not excluded (make it explicit, not using spread) + if (!excludeRejected) { + outputData.rejectedResults = allRejectedResults; + logger.info(`โœ… Including ${allRejectedResults.length} rejected results in output`); + } else { + logger.info(`โญ๏ธ Excluding rejected results (EXCLUDE_REJECTED=true)`); + } + + logger.info(`๐Ÿ’พ Final output: ${outputData.results.length} results, 
${outputData.rejectedResults?.length || 0} rejected`); const resultsDir = path.join(__dirname, "results"); if (!fs.existsSync(resultsDir)) { diff --git a/job-search-parser/strategies/linkedin-jobs-strategy.js b/job-search-parser/strategies/linkedin-jobs-strategy.js new file mode 100644 index 0000000..9cc4299 --- /dev/null +++ b/job-search-parser/strategies/linkedin-jobs-strategy.js @@ -0,0 +1,1360 @@ +/** + * LinkedIn Jobs Parsing Strategy + * + * Uses core-parser for browser management and ai-analyzer for utilities + */ + +const { + logger, + cleanText, + validateLocationAgainstFilters, + parseLocationFilters, + containsAnyKeyword, +} = require("ai-analyzer"); + +/** + * LinkedIn Jobs URL builder + */ +function buildJobSearchUrl(keyword, location = "", filters = {}) { + const baseUrl = "https://www.linkedin.com/jobs/search/"; + + // Always wrap keywords in quotes to ensure exact phrase matching + // LinkedIn's search treats unquoted keywords as individual words (OR logic) + // e.g., "co-op" becomes "co" OR "op", "software engineer" becomes "software" OR "engineer" + // Wrapping in quotes forces LinkedIn to search for the exact phrase + // URLSearchParams will properly encode the quotes + const searchKeyword = `"${keyword}"`; + + const params = new URLSearchParams({ + keywords: searchKeyword, + sortBy: "DD", // Date posted (newest first) + }); + + if (location) { + params.append("location", location); + } + + // Add additional filters + if (filters.experienceLevel) { + params.append("f_E", filters.experienceLevel); + } + if (filters.jobType) { + params.append("f_JT", filters.jobType); // F=Full-time, P=Part-time, C=Contract, T=Temporary, I=Internship + } + if (filters.remote) { + params.append("f_WT", "2"); // 2 = Remote + } + + return `${baseUrl}?${params.toString()}`; +} + +/** + * LinkedIn Jobs parsing strategy function + */ +async function linkedinJobsStrategy(coreParser, options = {}) { + const { + keywords = ["software engineer", "developer"], + 
/** Resolve after `ms` milliseconds. */
function pauseLinkedinJobs(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Return true when `page` can still evaluate script (i.e. it has not been closed). */
async function isLinkedinPageResponsive(page) {
  try {
    await page.evaluate(() => document.readyState);
    return true;
  } catch {
    return false;
  }
}

/** Best-effort log of whatever result-count text the search page exposes. */
async function logLinkedinResultsInfo(page) {
  try {
    const resultsText = await page.evaluate(() => {
      // Look for result count text like "Showing X results" or "X jobs"
      const possibleTexts = [
        document.querySelector("h1")?.textContent,
        document.querySelector(".results-context-header__job-count")?.textContent,
        document.querySelector("[class*='results-count']")?.textContent,
        document.querySelector("[class*='job-count']")?.textContent,
      ].filter(Boolean);
      return possibleTexts.join(" | ") || "No results count found";
    });
    logger.info(`๐Ÿ“Š LinkedIn results info: ${resultsText}`);
  } catch (e) {
    logger.debug(`Could not get results count: ${e.message}`);
  }
}

/**
 * Probe the known job-list container selectors in parallel and return the
 * first one (in list order) that matches, or null when none appears in time.
 */
async function findLinkedinResultsContainer(page) {
  const candidateSelectors = [
    ".jobs-search-results-list",
    ".jobs-search-results",
    "[data-test-id='job-search-results-list']",
    ".scaffold-layout__list-container",
    "ul.scaffold-layout__list-container",
    ".jobs-search__results-list",
    "main .scaffold-layout__list",
  ];

  const probes = await Promise.all(
    candidateSelectors.map(async (selector) => {
      try {
        await page.waitForSelector(selector, { timeout: 3000 });
        const matches = await page.$$(selector);
        return matches.length > 0 ? selector : null;
      } catch {
        // Selector did not appear within the timeout; try the others.
        return null;
      }
    })
  );

  return probes.find(Boolean) ?? null;
}

/** Log page title/URL/structure and save a screenshot when no job list was found. */
async function captureLinkedinNoResultsDebugInfo(page, keyword) {
  try {
    const pageTitle = await page.title();
    const pageUrl = page.url();
    logger.info(`๐Ÿ“„ Page title: ${pageTitle}`);
    logger.info(`๐Ÿ”— Page URL: ${pageUrl}`);

    // Check for common LinkedIn elements
    const hasMain = await page.$("main").then((el) => el !== null).catch(() => false);
    const hasJobsSection = await page.$("[class*='job']").then((el) => el !== null).catch(() => false);
    logger.info(`๐Ÿ” Debug - Has main: ${hasMain}, Has jobs section: ${hasJobsSection}`);

    // Keep only filename-safe characters: a keyword such as "UI/UX" would
    // otherwise inject a path separator into the screenshot path.
    const safeKeyword = String(keyword).replace(/[^a-zA-Z0-9_-]+/g, "-");
    const screenshotPath = `debug-linkedin-jobs-${safeKeyword}-${Date.now()}.png`;
    await page.screenshot({ path: screenshotPath, fullPage: true });
    logger.info(`๐Ÿ“ธ Debug screenshot saved: ${screenshotPath}`);
  } catch (e) {
    logger.warning(`Could not capture debug info: ${e.message}`);
  }
}

/**
 * Walk LinkedIn's "Next" pagination, extracting jobs from every page.
 *
 * @returns {Promise<{jobs: object[], pagesProcessed: number}>} All extracted
 *   jobs and the number of the last page that was visited.
 */
async function collectLinkedinJobsAcrossPages(page, keyword, locationFilter, maxPages) {
  const jobItemSelector =
    "li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item";

  // 0 (or any non-positive / non-numeric value) means "no page limit".
  const requestedPages = Number(maxPages);
  const pageLimit = requestedPages > 0 ? requestedPages : Number.POSITIVE_INFINITY;
  const jobs = [];
  let currentPage = 1;

  logger.info(`๐Ÿ“„ Processing pages (max: ${Number.isFinite(pageLimit) ? pageLimit : 'unlimited'}) for "${keyword}"...`);

  while (currentPage <= pageLimit) {
    logger.info(`๐Ÿ“„ Processing page ${currentPage}...`);

    // Wait for page to fully load
    await pauseLinkedinJobs(2000);

    const pageJobs = await extractJobsFromPage(page, keyword, locationFilter);
    logger.info(`๐Ÿ“‹ Extracted ${pageJobs.length} jobs from page ${currentPage}`);

    if (pageJobs.length === 0) {
      logger.warning(`โš ๏ธ No jobs found on page ${currentPage}, stopping pagination`);
      break;
    }
    jobs.push(...pageJobs);

    const hasNext = await hasNextPageAvailable(page);
    if (!hasNext) {
      logger.info(`โœ… No more pages available. Total jobs extracted: ${jobs.length}`);
      break;
    }

    if (currentPage >= pageLimit) {
      logger.info(`๐Ÿ“Š Reached max pages limit (${pageLimit}). Total jobs extracted: ${jobs.length}`);
      break;
    }

    logger.info(`โžก๏ธ Navigating to page ${currentPage + 1}...`);
    const navigationSuccess = await navigateToNextPage(page);
    if (!navigationSuccess) {
      logger.warning(`โš ๏ธ Failed to navigate to next page, stopping pagination`);
      break;
    }
    currentPage++;

    // Quick verification that job elements are present after navigation.
    const jobCount = await page
      .$$eval(jobItemSelector, (elements) => elements.length)
      .catch(() => 0);
    if (jobCount === 0) {
      logger.warning(`โš ๏ธ No job elements found on page ${currentPage} after navigation, stopping pagination`);
      break;
    }
    logger.debug(`โœ… Page ${currentPage} loaded with ${jobCount} job elements`);
  }

  return { jobs, pagesProcessed: currentPage };
}

/**
 * Derive an identifier for a job that has no `jobId`: the numeric id from a
 * `/jobs/view/<id>` URL when present, otherwise a slug of the whole URL.
 * Returns null when the job has neither an id nor a URL.
 */
function deriveLinkedinJobId(job) {
  if (job.jobId) {
    return job.jobId;
  }
  if (!job.jobUrl) {
    return null;
  }
  const urlMatch = job.jobUrl.match(/\/jobs\/view\/(\d+)/);
  return urlMatch ? urlMatch[1] : `linkedin-${job.jobUrl.replace(/[^a-zA-Z0-9]/g, '-')}`;
}

/**
 * LinkedIn Jobs parsing strategy.
 *
 * Authenticates through `coreParser`, runs one search per keyword, walks the
 * result pages, de-duplicates jobs by id across keywords and splits them into
 * accepted and location-rejected lists. Keyword matching is not re-validated:
 * LinkedIn already filtered by keyword, and the card snippet may not contain
 * the phrase even when the full description does.
 *
 * Errors while processing one keyword are logged and the next keyword is
 * tried; a fatal error (e.g. authentication) is logged and whatever was
 * collected so far is returned with `summary.error` set. The function does
 * not reject for those cases.
 *
 * @param {object} coreParser - Must provide `createPage(pageId)`,
 *   `authenticate(site, credentials, pageId)` and `navigateTo(url, options)`.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Search phrases.
 * @param {string|Array|null} [options.locationFilter=null] - Filter string
 *   (parsed with `parseLocationFilters`) or an already parsed filter list.
 * @param {number} [options.maxPages=5] - Pages per keyword; 0 means unlimited.
 * @param {object} [options.credentials={}] - Passed to `coreParser.authenticate`.
 * @param {string} [options.location=""] - LinkedIn location search text.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 */
async function linkedinJobsStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer"],
    locationFilter = null,
    maxPages = 5,
    credentials = {},
    location = "", // LinkedIn location search (e.g., "Canada", "Toronto, Ontario, Canada")
  } = options;

  const pageId = "linkedin-jobs-main";
  const debugEnabled = process.env.DEBUG === "true";
  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set();

  try {
    const page = await coreParser.createPage(pageId);

    logger.info("๐Ÿ” Authenticating to LinkedIn...");
    await coreParser.authenticate("linkedin", credentials, pageId);
    logger.info("โœ… LinkedIn authentication successful");

    logger.info("๐Ÿš€ Starting LinkedIn Jobs parser...");
    logger.info(`๐Ÿ” Keywords: ${keywords.join(", ")}`);
    logger.info(`๐Ÿ“ Location Filter: ${locationFilter || "None"}`);
    logger.info(`๐ŸŒ LinkedIn Location: ${location || "None"}`);
    logger.info(`๐Ÿ“„ Max Pages: ${maxPages}`);

    for (const keyword of keywords) {
      logger.info(`\n๐Ÿ” Searching LinkedIn Jobs for: "${keyword}"`);

      const searchUrl = buildJobSearchUrl(keyword, location);
      logger.info(`๐Ÿ”— Search URL: ${searchUrl}`);

      // A closed browser cannot recover; stop searching but keep what we have.
      if (!(await isLinkedinPageResponsive(page))) {
        logger.error(`โŒ Page is no longer accessible: Page is no longer valid - browser may have closed`);
        logger.info(`โš ๏ธ Preserving ${results.length} jobs found so far`);
        break;
      }

      try {
        await coreParser.navigateTo(searchUrl, {
          pageId,
          retries: 2,
          waitUntil: "networkidle",
        });

        // Short settle delay on top of the networkidle wait.
        await pauseLinkedinJobs(2000);

        logger.info(`๐Ÿ“ Current page URL: ${page.url()}`);
        await logLinkedinResultsInfo(page);

        // Single small scroll to trigger lazy loading of the list.
        try {
          await page.evaluate(() => {
            window.scrollTo(0, 500);
          });
          await pauseLinkedinJobs(1000);
        } catch (e) {
          logger.debug(`Could not scroll page: ${e.message}`);
        }

        const containerSelector = await findLinkedinResultsContainer(page);
        if (!containerSelector) {
          logger.warning(`โš ๏ธ No job results container found for keyword: ${keyword}`);
          await captureLinkedinNoResultsDebugInfo(page, keyword);
          continue;
        }
        logger.info(`โœ… Found job results container with selector: ${containerSelector}`);

        const { jobs: allJobs, pagesProcessed } = await collectLinkedinJobsAcrossPages(
          page,
          keyword,
          locationFilter,
          maxPages
        );
        logger.info(`๐Ÿ“‹ Extracted ${allJobs.length} total jobs across ${pagesProcessed} page(s)`);

        if (!(await isLinkedinPageResponsive(page))) {
          logger.warning(`โš ๏ธ Page became invalid after extraction, but we have ${allJobs.length} jobs extracted`);
        }

        if (allJobs.length > 0 && debugEnabled) {
          const sampleJob = allJobs[0];
          logger.debug(`๐Ÿ“ Sample job: ID=${sampleJob.jobId}, Title=${sampleJob.title}, Location=${sampleJob.location || 'N/A'}, Company=${sampleJob.company || 'N/A'}`);
        }

        // Parse the filter once per keyword instead of once per job.
        let locationFiltersArray = null;
        if (locationFilter) {
          locationFiltersArray = typeof locationFilter === 'string'
            ? parseLocationFilters(locationFilter)
            : locationFilter;
        }

        let duplicateCount = 0;
        let locationRejectedCount = 0;
        let addedCount = 0;
        let noJobIdCount = 0;

        for (const job of allJobs) {
          if (!job.jobId) {
            noJobIdCount++;
            const derivedId = deriveLinkedinJobId(job);
            if (!derivedId) {
              logger.warning(`โš ๏ธ Job has no jobId or URL, skipping: ${job.title || 'Unknown'}`);
              continue;
            }
            job.jobId = derivedId;
          }

          if (seenJobs.has(job.jobId)) {
            duplicateCount++;
            if (debugEnabled) {
              logger.debug(`โญ๏ธ Skipping duplicate job: ${job.jobId} - ${job.title}`);
            }
            continue;
          }
          seenJobs.add(job.jobId);

          if (locationFiltersArray) {
            const locationValid = validateLocationAgainstFilters(job.location, locationFiltersArray);
            if (!locationValid.isValid) {
              const rejectionReason = locationValid.reasoning || "Location filter mismatch";
              locationRejectedCount++;
              rejectedResults.push({ ...job, rejectionReason });
              if (debugEnabled) {
                logger.debug(`๐Ÿ“ Rejected location: "${job.location}" - ${rejectionReason}`);
              }
              continue;
            }
          }

          results.push(job);
          addedCount++;
        }

        logger.info(`๐Ÿ“Š Processing complete: ${addedCount} added, ${locationRejectedCount} location-rejected, ${duplicateCount} duplicates, ${noJobIdCount} had no jobId`);
        logger.info(`๐Ÿ“Š Current results count: ${results.length} jobs accumulated so far`);
      } catch (error) {
        // One failing keyword must not discard jobs from earlier keywords.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
        logger.error(`Stack: ${error.stack}`);
        logger.info(`โš ๏ธ Preserving ${results.length} jobs found before error`);
      }
    }

    logger.info(`๐Ÿ“Š Final results check: results.length=${results.length}, rejectedResults.length=${rejectedResults.length}`);

    if (results.length > 0) {
      logger.info(`๐Ÿ“ First result sample: ${JSON.stringify(results[0], null, 2).substring(0, 200)}...`);
    }

    logger.info(
      `๐ŸŽฏ LinkedIn Jobs parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
    );

    if (results.length === 0 && rejectedResults.length === 0) {
      logger.warning(`โš ๏ธ No jobs found or rejected - this might indicate an extraction issue`);
    }

    const returnValue = {
      // Copies, so callers cannot mutate this function's working arrays.
      results: [...results],
      rejectedResults: [...rejectedResults],
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "linkedin-jobs",
      },
    };

    logger.info(`๐Ÿ“ฆ Returning: ${returnValue.results.length} results, ${returnValue.rejectedResults.length} rejected`);
    return returnValue;
  } catch (error) {
    logger.error(`โŒ LinkedIn Jobs parsing failed: ${error.message}`);
    logger.error(`Stack: ${error.stack}`);
    // Return whatever results we have, even if there was an error
    logger.info(`โš ๏ธ Returning ${results.length} jobs found before fatal error`);
    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "linkedin-jobs",
        error: error.message,
      },
    };
  }
}
/**
 * Scroll the results page repeatedly so lazily loaded job cards get rendered.
 *
 * Scrolling stops when the number of job cards has not changed for
 * `maxNoChangeAttempts` consecutive checks or after `maxScrollAttempts`
 * scrolls, whichever comes first. Failures are logged as warnings and never
 * thrown, so this is safe to call as a best-effort step.
 *
 * @param {object} page - Browser page exposing `$$eval` and `evaluate`.
 * @param {object} [options] - Tuning knobs; defaults match the previous
 *   hard-coded values.
 * @param {number} [options.maxScrollAttempts=50] - Upper bound on scrolls.
 * @param {number} [options.maxNoChangeAttempts=3] - Unchanged counts in a row
 *   that end the loop.
 * @param {number} [options.scrollDelayMs=2500] - Wait after each full scroll.
 * @param {number} [options.nudgeDelayMs=1000] - Wait after the extra
 *   incremental scroll done on every third attempt.
 * @param {number} [options.settleDelayMs=2000] - Wait after the final scroll.
 * @returns {Promise<void>}
 */
async function scrollToLoadJobs(page, options = {}) {
  const {
    maxScrollAttempts = 50,
    maxNoChangeAttempts = 3,
    scrollDelayMs = 2500,
    nudgeDelayMs = 1000,
    settleDelayMs = 2000,
  } = options ?? {};

  // Single definition of the job-card selector used for every count below.
  const jobItemSelector =
    "li[data-occludable-job-id], li.jobs-search-results__list-item, .scaffold-layout__list-item";
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  // A failed count is treated as "no jobs" rather than aborting the scroll.
  const countJobs = () =>
    page.$$eval(jobItemSelector, (elements) => elements.length).catch(() => 0);

  try {
    let previousJobCount = 0;
    let scrollAttempts = 0;
    let noChangeCount = 0; // consecutive checks without new job cards

    logger.info(`๐Ÿ“œ Starting to scroll and load jobs...`);

    while (scrollAttempts < maxScrollAttempts) {
      const currentJobCount = await countJobs();

      if (currentJobCount === previousJobCount && scrollAttempts > 0) {
        noChangeCount++;
        if (noChangeCount >= maxNoChangeAttempts) {
          logger.info(`๐Ÿ“Š Loaded ${currentJobCount} jobs after ${scrollAttempts} scrolls (no new jobs for ${noChangeCount} attempts)`);
          break;
        }
      } else {
        noChangeCount = 0;
      }
      previousJobCount = currentJobCount;

      // Smooth scroll to the bottom to trigger lazy loading.
      await page.evaluate(() => {
        window.scrollTo({
          top: document.body.scrollHeight,
          behavior: 'smooth'
        });
      });
      await pause(scrollDelayMs);

      // Every third attempt, also nudge by a fixed increment.
      if (scrollAttempts % 3 === 0) {
        await page.evaluate(() => {
          window.scrollBy(0, 1000);
        });
        await pause(nudgeDelayMs);
      }

      scrollAttempts++;

      // Log progress every 5 scrolls
      if (scrollAttempts % 5 === 0) {
        const newCount = await countJobs();
        logger.info(`๐Ÿ“œ Scrolled ${scrollAttempts} times, loaded ${newCount} jobs so far...`);
      }
    }

    // Final scroll to ensure everything is loaded
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight);
    });
    await pause(settleDelayMs);

    const finalCount = await countJobs();
    logger.info(`โœ… Finished scrolling. Total jobs loaded: ${finalCount}`);
  } catch (error) {
    logger.warning(`Could not scroll page: ${error.message}`);
  }
}