Enhance job search parser with LinkedIn strategy and configuration updates
- Added LinkedIn jobs parsing strategy to support job extraction from LinkedIn. - Updated job search parser to include new site strategy and improved argument parsing for max pages and exclusion of rejected results. - Enhanced README documentation to reflect new features and usage examples. - Refactored existing strategies for consistency and improved error handling.
This commit is contained in:
parent
bbfd3c84aa
commit
4099b23744
@ -62,3 +62,5 @@ class CoreParser {
|
||||
|
||||
module.exports = CoreParser;
|
||||
|
||||
|
||||
|
||||
|
||||
@ -60,13 +60,48 @@ JOB_TYPES="full time,contract" node index.js --sites=skipthedrive
|
||||
node index.js --sites=skipthedrive --demo
|
||||
```
|
||||
|
||||
#### LinkedIn Jobs Parser
|
||||
|
||||
Professional network job postings with comprehensive job data.
|
||||
|
||||
**Features:**
|
||||
|
||||
- LinkedIn authentication support
|
||||
- Keyword-based job search
|
||||
- Location filtering (both LinkedIn location and post-extraction filter)
|
||||
- Multi-page result parsing with pagination
|
||||
- Job type and experience level extraction
|
||||
- Automatic duplicate detection
|
||||
- Infinite scroll handling
|
||||
|
||||
**Requirements:**
|
||||
|
||||
- LinkedIn credentials (username and password) must be set in `.env` file:
|
||||
```env
|
||||
LINKEDIN_USERNAME=your_email@example.com
|
||||
LINKEDIN_PASSWORD=your_password
|
||||
LINKEDIN_JOB_LOCATION=Canada # Optional: LinkedIn location filter
|
||||
```
|
||||
|
||||
**Usage:**
|
||||
|
||||
```bash
|
||||
# Search LinkedIn jobs
|
||||
node index.js --sites=linkedin --keywords="software engineer,developer"
|
||||
|
||||
# Search with location filter
|
||||
node index.js --sites=linkedin --keywords="co-op" --location="Ontario"
|
||||
|
||||
# Combine multiple sites
|
||||
node index.js --sites=linkedin,skipthedrive --keywords="intern,co-op"
|
||||
```
|
||||
|
||||
### 🚧 Planned Parsers
|
||||
|
||||
- **Indeed**: Comprehensive job aggregator
|
||||
- **Glassdoor**: Jobs with company reviews and salary data
|
||||
- **Monster**: Traditional job board
|
||||
- **SimplyHired**: Job aggregator with salary estimates
|
||||
- **LinkedIn Jobs**: Professional network job postings
|
||||
- **AngelList**: Startup and tech jobs
|
||||
- **Remote.co**: Dedicated remote work jobs
|
||||
- **FlexJobs**: Flexible and remote positions
|
||||
@ -92,23 +127,21 @@ Create a `.env` file in the parser directory:
|
||||
|
||||
```env
|
||||
# Job Search Configuration
|
||||
SEARCH_SOURCES=linkedin,indeed,glassdoor
|
||||
TARGET_ROLES=software engineer,data scientist,product manager
|
||||
LOCATION_FILTER=Toronto,Vancouver,Calgary
|
||||
EXPERIENCE_LEVELS=entry,mid,senior
|
||||
REMOTE_PREFERENCE=remote,hybrid,onsite
|
||||
SEARCH_KEYWORDS=software engineer,developer,programmer
|
||||
LOCATION_FILTER=Ontario,Canada
|
||||
MAX_PAGES=5
|
||||
|
||||
# LinkedIn Configuration (required for LinkedIn jobs)
|
||||
LINKEDIN_USERNAME=your_email@example.com
|
||||
LINKEDIN_PASSWORD=your_password
|
||||
LINKEDIN_JOB_LOCATION=Canada # Optional: LinkedIn location search
|
||||
|
||||
# Analysis Configuration
|
||||
ENABLE_SALARY_ANALYSIS=true
|
||||
ENABLE_SKILL_ANALYSIS=true
|
||||
ENABLE_TREND_ANALYSIS=true
|
||||
MIN_SALARY=50000
|
||||
MAX_SALARY=200000
|
||||
ENABLE_AI_ANALYSIS=false
|
||||
HEADLESS=true
|
||||
|
||||
# Output Configuration
|
||||
OUTPUT_FORMAT=json,csv
|
||||
SAVE_RAW_DATA=true
|
||||
ANALYSIS_INTERVAL=daily
|
||||
OUTPUT_FORMAT=json
|
||||
```
|
||||
|
||||
### Command Line Options
|
||||
|
||||
@ -10,6 +10,7 @@ const path = require("path");
|
||||
const fs = require("fs");
|
||||
const CoreParser = require("../core-parser");
|
||||
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
|
||||
const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
|
||||
const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer");
|
||||
|
||||
// Load environment variables
|
||||
@ -18,14 +19,16 @@ require("dotenv").config({ path: path.join(__dirname, ".env") });
|
||||
// Configuration from environment
|
||||
const HEADLESS = process.env.HEADLESS !== "false";
|
||||
const SEARCH_KEYWORDS =
|
||||
process.env.SEARCH_KEYWORDS || "software engineer,developer,programmer";
|
||||
process.env.SEARCH_KEYWORDS || "co-op,intern";//"software engineer,developer,programmer";
|
||||
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
||||
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
|
||||
const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
|
||||
const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
|
||||
|
||||
// Available site strategies
|
||||
const SITE_STRATEGIES = {
|
||||
skipthedrive: skipthedriveStrategy,
|
||||
linkedin: linkedinJobsStrategy,
|
||||
// Add more site strategies here
|
||||
// indeed: indeedStrategy,
|
||||
// glassdoor: glassdoorStrategy,
|
||||
@ -41,6 +44,7 @@ function parseArguments() {
|
||||
keywords: null,
|
||||
locationFilter: null,
|
||||
maxPages: MAX_PAGES,
|
||||
excludeRejected: EXCLUDE_REJECTED,
|
||||
};
|
||||
|
||||
args.forEach((arg) => {
|
||||
@ -57,7 +61,15 @@ function parseArguments() {
|
||||
} else if (arg.startsWith("--location=")) {
|
||||
options.locationFilter = arg.split("=")[1];
|
||||
} else if (arg.startsWith("--max-pages=")) {
|
||||
options.maxPages = parseInt(arg.split("=")[1]) || MAX_PAGES;
|
||||
const value = arg.split("=")[1];
|
||||
// Support "all" or "0" to mean unlimited pages
|
||||
if (value === "all" || value === "0") {
|
||||
options.maxPages = 0; // 0 means unlimited
|
||||
} else {
|
||||
options.maxPages = parseInt(value) || MAX_PAGES;
|
||||
}
|
||||
} else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
|
||||
options.excludeRejected = true;
|
||||
}
|
||||
});
|
||||
|
||||
@ -84,6 +96,7 @@ async function startJobSearchParser(options = {}) {
|
||||
finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
|
||||
const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
|
||||
const sites = finalOptions.sites;
|
||||
const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED;
|
||||
|
||||
logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
|
||||
logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
|
||||
@ -108,18 +121,46 @@ async function startJobSearchParser(options = {}) {
|
||||
logger.step(`\n🌐 Parsing ${site}...`);
|
||||
const startTime = Date.now();
|
||||
|
||||
const parseResult = await strategy(coreParser, {
|
||||
// Prepare strategy options
|
||||
const strategyOptions = {
|
||||
keywords,
|
||||
locationFilter,
|
||||
maxPages: finalOptions.maxPages,
|
||||
});
|
||||
};
|
||||
|
||||
// Add credentials for LinkedIn
|
||||
if (site === "linkedin") {
|
||||
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
|
||||
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
|
||||
|
||||
if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
|
||||
logger.error(`❌ LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`);
|
||||
siteResults[site] = {
|
||||
count: 0,
|
||||
rejected: 0,
|
||||
duration: "0s",
|
||||
error: "LinkedIn credentials not found",
|
||||
};
|
||||
continue;
|
||||
}
|
||||
|
||||
strategyOptions.credentials = {
|
||||
username: LINKEDIN_USERNAME,
|
||||
password: LINKEDIN_PASSWORD,
|
||||
};
|
||||
strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
|
||||
}
|
||||
|
||||
const parseResult = await strategy(coreParser, strategyOptions);
|
||||
|
||||
const { results, rejectedResults, summary } = parseResult;
|
||||
const duration = ((Date.now() - startTime) / 1000).toFixed(2);
|
||||
|
||||
// Collect results
|
||||
logger.info(`📦 Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`);
|
||||
allResults.push(...results);
|
||||
allRejectedResults.push(...rejectedResults);
|
||||
logger.info(`📦 Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`);
|
||||
|
||||
siteResults[site] = {
|
||||
count: results.length,
|
||||
@ -162,6 +203,9 @@ async function startJobSearchParser(options = {}) {
|
||||
}
|
||||
|
||||
// Save results
|
||||
logger.info(`💾 Preparing to save: ${allResults.length} results, ${allRejectedResults.length} rejected`);
|
||||
logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`);
|
||||
|
||||
const outputData = {
|
||||
metadata: {
|
||||
extractedAt: new Date().toISOString(),
|
||||
@ -171,12 +215,22 @@ async function startJobSearchParser(options = {}) {
|
||||
keywords: keywords.join(", "),
|
||||
locationFilter,
|
||||
analysisResults,
|
||||
rejectedJobsExcluded: excludeRejected,
|
||||
},
|
||||
results: allResults,
|
||||
rejectedResults: allRejectedResults,
|
||||
siteResults,
|
||||
};
|
||||
|
||||
// Always include rejectedResults if not excluded (make it explicit, not using spread)
|
||||
if (!excludeRejected) {
|
||||
outputData.rejectedResults = allRejectedResults;
|
||||
logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`);
|
||||
} else {
|
||||
logger.info(`⏭️ Excluding rejected results (EXCLUDE_REJECTED=true)`);
|
||||
}
|
||||
|
||||
logger.info(`💾 Final output: ${outputData.results.length} results, ${outputData.rejectedResults?.length || 0} rejected`);
|
||||
|
||||
const resultsDir = path.join(__dirname, "results");
|
||||
if (!fs.existsSync(resultsDir)) {
|
||||
fs.mkdirSync(resultsDir, { recursive: true });
|
||||
|
||||
1360
job-search-parser/strategies/linkedin-jobs-strategy.js
Normal file
1360
job-search-parser/strategies/linkedin-jobs-strategy.js
Normal file
File diff suppressed because it is too large
Load Diff
@ -67,14 +67,11 @@ async function skipthedriveStrategy(coreParser, options = {}) {
|
||||
});
|
||||
|
||||
// Wait for job listings to load
|
||||
const hasResults = await coreParser
|
||||
.waitForSelector(
|
||||
"#loops-wrapper",
|
||||
{
|
||||
timeout: 5000,
|
||||
},
|
||||
"skipthedrive-main"
|
||||
)
|
||||
const hasResults = await page
|
||||
.waitForSelector("#loops-wrapper", {
|
||||
timeout: 5000,
|
||||
})
|
||||
.then(() => true)
|
||||
.catch(() => {
|
||||
logger.warning(`No results found for keyword: ${keyword}`);
|
||||
return false;
|
||||
|
||||
@ -31,12 +31,13 @@ const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
|
||||
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
|
||||
const HEADLESS = process.env.HEADLESS !== "false";
|
||||
const SEARCH_KEYWORDS =
|
||||
process.env.SEARCH_KEYWORDS || "layoff,downsizing";//,job cuts";
|
||||
process.env.SEARCH_KEYWORDS || "layoff";//,downsizing";//,job cuts";
|
||||
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
||||
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false";
|
||||
const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends";
|
||||
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
|
||||
const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50;
|
||||
const EXTRACT_LOCATION_FROM_PROFILE = process.env.EXTRACT_LOCATION_FROM_PROFILE === "true";
|
||||
|
||||
/**
|
||||
* Main LinkedIn parser function
|
||||
@ -71,6 +72,7 @@ async function startLinkedInParser(options = {}) {
|
||||
keywords,
|
||||
locationFilter: LOCATION_FILTER,
|
||||
maxResults: MAX_RESULTS,
|
||||
extractLocationFromProfile: EXTRACT_LOCATION_FROM_PROFILE,
|
||||
credentials: {
|
||||
username: LINKEDIN_USERNAME,
|
||||
password: LINKEDIN_PASSWORD,
|
||||
|
||||
@ -21,6 +21,7 @@ async function linkedinStrategy(coreParser, options = {}) {
|
||||
keywords = ["layoff", "downsizing", "job cuts"],
|
||||
locationFilter = null,
|
||||
maxResults = 50,
|
||||
extractLocationFromProfile = false,
|
||||
credentials = {},
|
||||
} = options;
|
||||
|
||||
@ -106,7 +107,7 @@ async function linkedinStrategy(coreParser, options = {}) {
|
||||
}
|
||||
|
||||
// Extract posts from current page
|
||||
const posts = await extractPostsFromPage(page, keyword);
|
||||
const posts = await extractPostsFromPage(page, keyword, extractLocationFromProfile);
|
||||
logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);
|
||||
|
||||
for (const post of posts) {
|
||||
@ -172,7 +173,7 @@ async function linkedinStrategy(coreParser, options = {}) {
|
||||
/**
|
||||
* Extract posts from current search results page
|
||||
*/
|
||||
async function extractPostsFromPage(page, keyword) {
|
||||
async function extractPostsFromPage(page, keyword, extractLocationFromProfile = false) {
|
||||
const posts = [];
|
||||
|
||||
try {
|
||||
@ -254,10 +255,26 @@ async function extractPostsFromPage(page, keyword) {
|
||||
|
||||
const post = await extractPostData(postElements[i], keyword);
|
||||
if (post) {
|
||||
// If location is missing and we're enabled to extract from profile, try to get it
|
||||
if (!post.location && extractLocationFromProfile && post.authorUrl) {
|
||||
try {
|
||||
logger.debug(`📍 Location missing for post ${i + 1}, attempting to extract from profile...`);
|
||||
const profileLocation = await extractLocationFromProfilePage(page, post.authorUrl);
|
||||
if (profileLocation) {
|
||||
post.location = profileLocation;
|
||||
post.profileLocation = profileLocation;
|
||||
logger.debug(`✅ Extracted location from profile: ${profileLocation}`);
|
||||
}
|
||||
} catch (error) {
|
||||
logger.debug(`⚠️ Could not extract location from profile: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
posts.push(post);
|
||||
const hasContent = post.content && post.content.length > 0;
|
||||
const hasAuthor = post.authorName && post.authorName.length > 0;
|
||||
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'})`);
|
||||
const hasLocation = post.location && post.location.length > 0;
|
||||
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'}, location: ${hasLocation ? 'yes' : 'no'})`);
|
||||
} else {
|
||||
logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
|
||||
}
|
||||
@ -627,6 +644,42 @@ async function extractPostData(postElement, keyword) {
|
||||
}
|
||||
}
|
||||
|
||||
// Try to extract from data attributes or hidden elements
|
||||
if (!data.location) {
|
||||
// Check for data attributes that might contain location
|
||||
const actorSection = el.querySelector(".feed-shared-actor");
|
||||
if (actorSection) {
|
||||
// Check all data attributes
|
||||
for (const attr of actorSection.attributes) {
|
||||
if (attr.name.startsWith("data-") && attr.value) {
|
||||
const value = attr.value.toLowerCase();
|
||||
// Look for location-like patterns in data attributes
|
||||
if (/(ontario|alberta|british columbia|quebec|toronto|vancouver|calgary|ottawa|montreal)/i.test(value)) {
|
||||
// Try to extract the actual location text
|
||||
const locationMatch = attr.value.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
|
||||
if (locationMatch) {
|
||||
data.location = locationMatch[0];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for hidden spans or divs with location info
|
||||
const hiddenElements = actorSection.querySelectorAll("span[style*='display: none'], div[style*='display: none'], [aria-hidden='true']");
|
||||
for (const hiddenElem of hiddenElements) {
|
||||
const text = hiddenElem.textContent || hiddenElem.getAttribute("aria-label") || "";
|
||||
if (text && /(ontario|alberta|british columbia|quebec|toronto|vancouver)/i.test(text)) {
|
||||
const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
|
||||
if (locationMatch) {
|
||||
data.location = locationMatch[0].trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Extract engagement metrics - try multiple approaches
|
||||
const likesSelectors = [
|
||||
".social-counts-reactions__count",
|
||||
@ -799,6 +852,48 @@ async function extractPostData(postElement, keyword) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract location from a LinkedIn profile page
|
||||
*/
|
||||
async function extractLocationFromProfilePage(page, profileUrl) {
|
||||
try {
|
||||
// Ensure URL is complete
|
||||
let fullUrl = profileUrl;
|
||||
if (!fullUrl.startsWith("http")) {
|
||||
fullUrl = `https://www.linkedin.com${fullUrl}`;
|
||||
}
|
||||
|
||||
// Remove query parameters that might cause issues
|
||||
fullUrl = fullUrl.split("?")[0];
|
||||
|
||||
// Open profile in new tab
|
||||
const profilePage = await page.context().newPage();
|
||||
|
||||
try {
|
||||
await profilePage.goto(fullUrl, {
|
||||
waitUntil: "domcontentloaded",
|
||||
timeout: 15000,
|
||||
});
|
||||
|
||||
// Wait a bit for content to load
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
|
||||
// Use the extractLocationFromProfile utility from ai-analyzer
|
||||
const location = await extractLocationFromProfile(profilePage);
|
||||
|
||||
await profilePage.close();
|
||||
|
||||
return location;
|
||||
} catch (error) {
|
||||
await profilePage.close();
|
||||
throw error;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.debug(`Failed to extract location from profile ${profileUrl}: ${error.message}`);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract numbers from text (e.g., "15 likes" -> 15)
|
||||
*/
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user