Enhance job search parser with advanced keyword filtering and job detail extraction
- Implemented grouped AND/OR logic for keyword searches, allowing for more flexible job matching criteria.
- Added a minimum date filter to restrict job results to postings after a specified date.
- Enhanced job detail extraction to include role duties and job requirements from job descriptions.
- Updated README with new command line options and examples for using date filters and keyword logic.
- Improved logging to provide clearer insights into keyword matching logic and job search parameters.
This commit is contained in:
parent
00c4cf1b6f
commit
47cdc03fb8
@ -69,34 +69,83 @@ async function analyzeBatch(
|
|||||||
logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`);
|
logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const prompt = `Analyze ${posts.length} LinkedIn posts for relevance to: "${context}"
|
// Detect if context is about a student profile
|
||||||
|
const isStudentContext = /student|undergraduate|first year|second year|third year|fourth year|freshman|sophomore|junior|senior|co-op|internship/i.test(context);
|
||||||
|
|
||||||
|
// Build enhanced prompt based on context type
|
||||||
|
let analysisInstructions = "";
|
||||||
|
if (isStudentContext) {
|
||||||
|
analysisInstructions = `
|
||||||
|
ANALYSIS FOCUS (Student Context Detected):
|
||||||
|
- Pay special attention to the "Requirements" section
|
||||||
|
- Evaluate if the job requirements match the student's level (${context})
|
||||||
|
- Consider: Are requirements too advanced? Are they appropriate for entry-level/co-op/internship?
|
||||||
|
- Check if the role duties are suitable for a student's skill level
|
||||||
|
- Look for keywords like "co-op", "internship", "entry-level", "student", "junior"
|
||||||
|
- If requirements mention "years of experience", "senior", "expert", "PhD", etc., this may not be suitable
|
||||||
|
- If requirements are reasonable for a student (basic skills, willingness to learn), mark as relevant`;
|
||||||
|
} else {
|
||||||
|
analysisInstructions = `
|
||||||
|
ANALYSIS FOCUS:
|
||||||
|
- Evaluate overall relevance to: "${context}"
|
||||||
|
- Consider job title, description, duties, and requirements
|
||||||
|
- Assess if the job matches the specified criteria`;
|
||||||
|
}
|
||||||
|
|
||||||
POSTS:
|
const prompt = `Analyze ${posts.length} job postings for relevance to: "${context}"
|
||||||
|
|
||||||
|
${analysisInstructions}
|
||||||
|
|
||||||
|
JOB POSTINGS:
|
||||||
${posts
|
${posts
|
||||||
.map(
|
.map(
|
||||||
(post, i) => `
|
(post, i) => {
|
||||||
POST ${i + 1}:
|
// For student contexts, prioritize Requirements section if text is too long
|
||||||
"${post.text.substring(0, 400)}${post.text.length > 400 ? "..." : ""}"
|
let jobText = post.text;
|
||||||
`
|
if (isStudentContext && jobText.length > 1200) {
|
||||||
|
// Try to extract Requirements section if present
|
||||||
|
const requirementsMatch = jobText.match(/Requirements?:[\s\S]{0,600}/i);
|
||||||
|
const dutiesMatch = jobText.match(/Role Duties?:[\s\S]{0,300}/i);
|
||||||
|
const titleMatch = jobText.match(/Title:[\s\S]{0,100}/i);
|
||||||
|
|
||||||
|
if (requirementsMatch) {
|
||||||
|
// Prioritize: Title + Requirements (most important for students)
|
||||||
|
jobText = (titleMatch ? titleMatch[0] + "\n\n" : "") +
|
||||||
|
(requirementsMatch ? requirementsMatch[0] : "") +
|
||||||
|
(dutiesMatch ? "\n\n" + dutiesMatch[0] : "");
|
||||||
|
} else {
|
||||||
|
// Fallback to truncation
|
||||||
|
jobText = jobText.substring(0, 1200) + "...";
|
||||||
|
}
|
||||||
|
} else if (jobText.length > 1200) {
|
||||||
|
jobText = jobText.substring(0, 1200) + "...";
|
||||||
|
}
|
||||||
|
|
||||||
|
return `
|
||||||
|
JOB ${i + 1}:
|
||||||
|
${jobText}
|
||||||
|
`;
|
||||||
|
}
|
||||||
)
|
)
|
||||||
.join("")}
|
.join("")}
|
||||||
|
|
||||||
REQUIRED FORMAT - Respond with EXACTLY ${posts.length} lines, one per post:
|
REQUIRED FORMAT - Respond with EXACTLY ${posts.length} lines, one per post:
|
||||||
POST 1: YES | 0.8 | reason here
|
JOB 1: YES | 0.8 | reason here
|
||||||
POST 2: NO | 0.2 | reason here
|
JOB 2: NO | 0.2 | reason here
|
||||||
POST 3: YES | 0.9 | reason here
|
JOB 3: YES | 0.9 | reason here
|
||||||
|
|
||||||
RULES:
|
RULES:
|
||||||
- Use YES or NO (uppercase)
|
- Use YES or NO (uppercase)
|
||||||
- Use pipe character | as separator
|
- Use pipe character | as separator
|
||||||
- Confidence must be 0.0 to 1.0 (decimal number)
|
- Confidence must be 0.0 to 1.0 (decimal number)
|
||||||
- Keep reasoning brief (one sentence)
|
- Keep reasoning brief (one sentence)
|
||||||
- MUST include all ${posts.length} posts in order
|
- MUST include all ${posts.length} jobs in order
|
||||||
|
${isStudentContext ? "- When analyzing requirements, explicitly mention if requirements are too advanced or appropriate for the student level" : ""}
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
POST 1: YES | 0.9 | mentions layoffs and job cuts
|
JOB 1: YES | 0.9 | co-op position suitable for first year students
|
||||||
POST 2: NO | 0.1 | unrelated topic about vacation
|
JOB 2: NO | 0.2 | requires 5+ years experience, too advanced
|
||||||
POST 3: YES | 0.7 | discusses workforce reduction`;
|
JOB 3: YES | 0.7 | entry-level role with basic requirements appropriate for students`;
|
||||||
|
|
||||||
// Add timeout to prevent hanging (5 minutes max)
|
// Add timeout to prevent hanging (5 minutes max)
|
||||||
const controller = new AbortController();
|
const controller = new AbortController();
|
||||||
|
|||||||
@ -45,6 +45,43 @@ function containsAnyKeyword(text, keywords) {
|
|||||||
return keywords.some((keyword) => lowerText.includes(keyword.toLowerCase()));
|
return keywords.some((keyword) => lowerText.includes(keyword.toLowerCase()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Check if text contains ALL of the specified keywords (case insensitive).
 *
 * Consistency fixes versus the sibling helpers in this module:
 * - An empty keyword list now returns false (previously `every` on an empty
 *   array vacuously returned true, which would have matched every job);
 *   this matches `containsAnyKeyword` (empty → false) and the explicit
 *   empty-group rejection in `matchesKeywordGroups`.
 * - Keywords are trimmed before comparison, matching `matchesKeywordGroups`.
 *
 * @param {string} text - Text to search in
 * @param {Array<string>} keywords - Keywords that must all be present
 * @returns {boolean} - True only if every keyword occurs in the text
 */
function containsAllKeywords(text, keywords) {
  if (!text || !Array.isArray(keywords) || keywords.length === 0) {
    return false;
  }

  const lowerText = text.toLowerCase();
  return keywords.every((keyword) =>
    lowerText.includes(keyword.toLowerCase().trim())
  );
}
|
||||||
|
|
||||||
|
/**
 * Test text against grouped keyword logic: every group must be satisfied
 * (AND across groups), and a group is satisfied when any one of its
 * keywords appears in the text (OR within a group). Matching is
 * case-insensitive and each keyword is trimmed before comparison.
 *
 * @param {string} text - Text to search in
 * @param {Array<Array<string>>} keywordGroups - Array of keyword groups, each group is an array of OR keywords
 * @returns {boolean} - True if text matches all groups (AND logic) and at least one keyword in each group (OR logic)
 */
function matchesKeywordGroups(text, keywordGroups) {
  // Guard clauses: no text, malformed input, or nothing to match against.
  if (!text || !Array.isArray(keywordGroups) || keywordGroups.length === 0) {
    return false;
  }

  const haystack = text.toLowerCase();

  for (const group of keywordGroups) {
    // A malformed or empty group can never be satisfied.
    if (!Array.isArray(group) || group.length === 0) {
      return false;
    }

    // OR within the group: one hit is enough to satisfy it.
    const groupSatisfied = group.some((keyword) =>
      haystack.includes(keyword.toLowerCase().trim())
    );

    // AND across groups: a single unsatisfied group fails the whole match.
    if (!groupSatisfied) {
      return false;
    }
  }

  return true;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validate if text meets basic quality criteria
|
* Validate if text meets basic quality criteria
|
||||||
*/
|
*/
|
||||||
@ -101,6 +138,8 @@ function normalizeUrl(url) {
|
|||||||
module.exports = {
|
module.exports = {
|
||||||
cleanText,
|
cleanText,
|
||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
|
containsAllKeywords,
|
||||||
|
matchesKeywordGroups,
|
||||||
isValidText,
|
isValidText,
|
||||||
extractDomain,
|
extractDomain,
|
||||||
normalizeUrl,
|
normalizeUrl,
|
||||||
|
|||||||
@ -92,10 +92,32 @@ node index.js --sites=linkedin --keywords="software engineer,developer"
|
|||||||
# Search with location filter
|
# Search with location filter
|
||||||
node index.js --sites=linkedin --keywords="co-op" --location="Ontario"
|
node index.js --sites=linkedin --keywords="co-op" --location="Ontario"
|
||||||
|
|
||||||
|
# Search with date filter (jobs posted after specific date)
|
||||||
|
node index.js --sites=linkedin --keywords="co-op" --min-date="2025-12-01"
|
||||||
|
|
||||||
|
# Combine filters
|
||||||
|
node index.js --sites=linkedin --keywords="co-op" --location="Ontario" --min-date="2025-12-01"
|
||||||
|
|
||||||
# Combine multiple sites
|
# Combine multiple sites
|
||||||
node index.js --sites=linkedin,skipthedrive --keywords="intern,co-op"
|
node index.js --sites=linkedin,skipthedrive --keywords="intern,co-op"
|
||||||
|
|
||||||
|
# Use AND logic - jobs must match ALL keywords (e.g., "co-op" AND "summer 2026")
|
||||||
|
node index.js --sites=linkedin --keywords="co-op,summer 2026" --and
|
||||||
|
|
||||||
|
# Use grouped AND/OR logic - (co-op OR intern) AND (summer 2026)
|
||||||
|
# Use | (pipe) for OR within groups, , (comma) to separate AND groups
|
||||||
|
node index.js --sites=linkedin --keywords="co-op|intern,summer 2026" --and
|
||||||
|
|
||||||
|
# Multiple AND groups - (co-op OR intern) AND (summer 2026) AND (remote)
|
||||||
|
node index.js --sites=linkedin --keywords="co-op|intern,summer 2026,remote" --and
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Date Filter Notes:**
|
||||||
|
- The date filter uses LinkedIn's `f_TPR` parameter to filter at the LinkedIn level before parsing
|
||||||
|
- Format: `YYYY-MM-DD` (e.g., `2025-12-01`)
|
||||||
|
- LinkedIn supports relative timeframes up to ~30 days
|
||||||
|
- For dates older than 30 days, LinkedIn may limit results to the maximum supported timeframe
|
||||||
|
|
||||||
### 🚧 Planned Parsers
|
### 🚧 Planned Parsers
|
||||||
|
|
||||||
- **Indeed**: Comprehensive job aggregator
|
- **Indeed**: Comprehensive job aggregator
|
||||||
@ -128,6 +150,9 @@ Create a `.env` file in the parser directory:
|
|||||||
```env
|
```env
|
||||||
# Job Search Configuration
|
# Job Search Configuration
|
||||||
SEARCH_KEYWORDS=software engineer,developer,programmer
|
SEARCH_KEYWORDS=software engineer,developer,programmer
|
||||||
|
# For grouped AND/OR logic, use pipe (|) for OR within groups and comma (,) for AND groups:
|
||||||
|
# SEARCH_KEYWORDS=co-op|intern,summer 2026,remote # (co-op OR intern) AND (summer 2026) AND (remote)
|
||||||
|
USE_AND_LOGIC=false # Set to "true" to enable AND logic (required for grouped keywords)
|
||||||
LOCATION_FILTER=Ontario,Canada
|
LOCATION_FILTER=Ontario,Canada
|
||||||
MAX_PAGES=5
|
MAX_PAGES=5
|
||||||
|
|
||||||
@ -136,6 +161,9 @@ LINKEDIN_USERNAME=your_email@example.com
|
|||||||
LINKEDIN_PASSWORD=your_password
|
LINKEDIN_PASSWORD=your_password
|
||||||
LINKEDIN_JOB_LOCATION=Canada # Optional: LinkedIn location search
|
LINKEDIN_JOB_LOCATION=Canada # Optional: LinkedIn location search
|
||||||
|
|
||||||
|
# Date Filter (LinkedIn only - filters at LinkedIn level before parsing)
|
||||||
|
MIN_DATE=2025-12-01 # Format: YYYY-MM-DD (jobs posted after this date)
|
||||||
|
|
||||||
# Analysis Configuration
|
# Analysis Configuration
|
||||||
ENABLE_AI_ANALYSIS=false
|
ENABLE_AI_ANALYSIS=false
|
||||||
HEADLESS=true
|
HEADLESS=true
|
||||||
@ -144,6 +172,22 @@ HEADLESS=true
|
|||||||
OUTPUT_FORMAT=json # Options: "json", "csv", or "both"
|
OUTPUT_FORMAT=json # Options: "json", "csv", or "both"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Keyword Examples in .env:**
|
||||||
|
|
||||||
|
```env
|
||||||
|
# Simple OR logic (default) - matches ANY keyword
|
||||||
|
SEARCH_KEYWORDS=co-op,intern
|
||||||
|
USE_AND_LOGIC=false
|
||||||
|
|
||||||
|
# Simple AND logic - matches ALL keywords
|
||||||
|
SEARCH_KEYWORDS=co-op,summer 2026
|
||||||
|
USE_AND_LOGIC=true
|
||||||
|
|
||||||
|
# Grouped AND/OR logic - (co-op OR intern) AND (summer 2026) AND (remote)
|
||||||
|
SEARCH_KEYWORDS=co-op|intern,summer 2026,remote
|
||||||
|
USE_AND_LOGIC=true
|
||||||
|
```
|
||||||
|
|
||||||
### Command Line Options
|
### Command Line Options
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -168,16 +212,34 @@ node index.js --no-rejected
|
|||||||
# Output format (json, csv, or both)
|
# Output format (json, csv, or both)
|
||||||
node index.js --output=csv
|
node index.js --output=csv
|
||||||
node index.js --output=both
|
node index.js --output=both
|
||||||
|
|
||||||
|
# Date filter (LinkedIn only - filters at LinkedIn level)
|
||||||
|
node index.js --sites=linkedin --min-date="2025-12-01"
|
||||||
|
|
||||||
|
# Use AND logic for keywords (all keywords must match)
|
||||||
|
node index.js --sites=linkedin --keywords="co-op,summer 2026" --and
|
||||||
|
|
||||||
|
# Use grouped AND/OR logic: (co-op OR intern) AND (summer 2026)
|
||||||
|
# Use | (pipe) for OR within groups, , (comma) to separate AND groups
|
||||||
|
node index.js --sites=linkedin --keywords="co-op|intern,summer 2026" --and
|
||||||
|
|
||||||
|
# Multiple AND groups: (co-op OR intern) AND (summer 2026) AND (remote)
|
||||||
|
node index.js --sites=linkedin --keywords="co-op|intern,summer 2026,remote" --and
|
||||||
```
|
```
|
||||||
|
|
||||||
**Available Options:**
|
**Available Options:**
|
||||||
|
|
||||||
- `--sites="site1,site2"`: Job sites to parse (linkedin, skipthedrive)
|
- `--sites="site1,site2"`: Job sites to parse (linkedin, skipthedrive)
|
||||||
- `--keywords="keyword1,keyword2"`: Search keywords
|
- `--keywords="keyword1,keyword2"`: Search keywords
|
||||||
|
- Use `|` (pipe) to separate OR keywords within a group: `"co-op|intern"` means "co-op" OR "intern"
|
||||||
|
- Use `,` (comma) to separate AND groups when using `--and`: `"co-op|intern,summer 2026"` means (co-op OR intern) AND (summer 2026)
|
||||||
- `--location="LOCATION"`: Location filter
|
- `--location="LOCATION"`: Location filter
|
||||||
- `--max-pages=NUMBER`: Maximum pages to parse (0 or "all" for unlimited)
|
- `--max-pages=NUMBER`: Maximum pages to parse (0 or "all" for unlimited)
|
||||||
|
- `--min-date="YYYY-MM-DD"`: Minimum posted date filter (LinkedIn only - filters at LinkedIn level before parsing)
|
||||||
- `--no-rejected` or `--exclude-rejected`: Exclude rejected results from output
|
- `--no-rejected` or `--exclude-rejected`: Exclude rejected results from output
|
||||||
- `--output=FORMAT` or `--format=FORMAT`: Output format - "json", "csv", or "both" (default: "json")
|
- `--output=FORMAT` or `--format=FORMAT`: Output format - "json", "csv", or "both" (default: "json")
|
||||||
|
- `--and` or `--all-keywords`: Use AND logic for keywords (all keywords must match). Default is OR logic (any keyword matches)
|
||||||
|
- When combined with `|` (pipe) in keywords, enables grouped AND/OR logic
|
||||||
|
|
||||||
## 📊 Keywords
|
## 📊 Keywords
|
||||||
|
|
||||||
|
|||||||
@ -28,6 +28,8 @@ const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
|
|||||||
const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
|
const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
|
||||||
const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
|
const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
|
||||||
const OUTPUT_FORMAT = process.env.OUTPUT_FORMAT || "json"; // "json", "csv", or "both"
|
const OUTPUT_FORMAT = process.env.OUTPUT_FORMAT || "json"; // "json", "csv", or "both"
|
||||||
|
const MIN_DATE = process.env.MIN_DATE; // Minimum posted date (format: YYYY-MM-DD)
|
||||||
|
const USE_AND_LOGIC = process.env.USE_AND_LOGIC === "true"; // Use AND logic for keywords
|
||||||
|
|
||||||
// Available site strategies
|
// Available site strategies
|
||||||
const SITE_STRATEGIES = {
|
const SITE_STRATEGIES = {
|
||||||
@ -50,6 +52,8 @@ function parseArguments() {
|
|||||||
maxPages: MAX_PAGES,
|
maxPages: MAX_PAGES,
|
||||||
excludeRejected: EXCLUDE_REJECTED,
|
excludeRejected: EXCLUDE_REJECTED,
|
||||||
outputFormat: OUTPUT_FORMAT,
|
outputFormat: OUTPUT_FORMAT,
|
||||||
|
minDate: MIN_DATE,
|
||||||
|
useAndLogic: USE_AND_LOGIC, // Use AND logic instead of OR logic for keywords (from env or CLI)
|
||||||
};
|
};
|
||||||
|
|
||||||
args.forEach((arg) => {
|
args.forEach((arg) => {
|
||||||
@ -82,6 +86,10 @@ function parseArguments() {
|
|||||||
} else {
|
} else {
|
||||||
logger.warning(`⚠️ Unknown output format: ${format}. Using default: json`);
|
logger.warning(`⚠️ Unknown output format: ${format}. Using default: json`);
|
||||||
}
|
}
|
||||||
|
} else if (arg.startsWith("--min-date=")) {
|
||||||
|
options.minDate = arg.split("=")[1];
|
||||||
|
} else if (arg === "--and" || arg === "--all-keywords") {
|
||||||
|
options.useAndLogic = true; // CLI flag overrides env variable
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -104,15 +112,35 @@ async function startJobSearchParser(options = {}) {
|
|||||||
logger.step("🚀 Job Search Parser Starting...");
|
logger.step("🚀 Job Search Parser Starting...");
|
||||||
|
|
||||||
// Parse keywords
|
// Parse keywords
|
||||||
const keywords =
|
let keywords =
|
||||||
finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
|
finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
|
||||||
|
|
||||||
|
// Parse keyword groups if AND logic is enabled and keywords contain pipe (|) separator
|
||||||
|
// Format: "co-op|intern,summer 2026" means (co-op OR intern) AND (summer 2026)
|
||||||
|
let keywordGroups = null;
|
||||||
|
if (finalOptions.useAndLogic && keywords.some(k => k.includes('|'))) {
|
||||||
|
keywordGroups = keywords.map(group =>
|
||||||
|
group.split('|').map(k => k.trim()).filter(k => k.length > 0)
|
||||||
|
);
|
||||||
|
logger.info(`🔍 Keyword Groups: ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
|
||||||
|
}
|
||||||
|
|
||||||
const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
|
const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
|
||||||
const sites = finalOptions.sites;
|
const sites = finalOptions.sites;
|
||||||
const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED;
|
const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED;
|
||||||
|
|
||||||
logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
|
logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
|
||||||
logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
|
logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
|
||||||
|
if (keywordGroups) {
|
||||||
|
logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
|
||||||
|
} else {
|
||||||
|
logger.info(`🔗 Keyword Logic: ${finalOptions.useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
|
||||||
|
}
|
||||||
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
||||||
|
const minDate = finalOptions.minDate || MIN_DATE;
|
||||||
|
if (minDate) {
|
||||||
|
logger.info(`📅 Min Date Filter: ${minDate} (jobs posted after this date)`);
|
||||||
|
}
|
||||||
logger.info(
|
logger.info(
|
||||||
`🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? "Enabled" : "Disabled"}`
|
`🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? "Enabled" : "Disabled"}`
|
||||||
);
|
);
|
||||||
@ -124,6 +152,96 @@ async function startJobSearchParser(options = {}) {
|
|||||||
const allResults = [];
|
const allResults = [];
|
||||||
const allRejectedResults = [];
|
const allRejectedResults = [];
|
||||||
const siteResults = {};
|
const siteResults = {};
|
||||||
|
let analysisResults = null;
|
||||||
|
|
||||||
|
// Initialize results directory and file for incremental saving
|
||||||
|
const resultsDir = path.join(__dirname, "results");
|
||||||
|
if (!fs.existsSync(resultsDir)) {
|
||||||
|
fs.mkdirSync(resultsDir, { recursive: true });
|
||||||
|
}
|
||||||
|
|
||||||
|
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
||||||
|
const outputFormat = finalOptions.outputFormat || OUTPUT_FORMAT;
|
||||||
|
let incrementalJsonFilepath = null;
|
||||||
|
let incrementalCsvFilepath = null;
|
||||||
|
|
||||||
|
// Initialize incremental save files
|
||||||
|
if (outputFormat === "json" || outputFormat === "both") {
|
||||||
|
const jsonFilename = `job-search-results-${timestamp}.json`;
|
||||||
|
incrementalJsonFilepath = path.join(resultsDir, jsonFilename);
|
||||||
|
}
|
||||||
|
if (outputFormat === "csv" || outputFormat === "both") {
|
||||||
|
const csvFilename = `job-search-results-${timestamp}.csv`;
|
||||||
|
incrementalCsvFilepath = path.join(resultsDir, csvFilename);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Save results incrementally as they're found
|
||||||
|
*/
|
||||||
|
const saveIncrementalResults = (currentResults, currentRejectedResults, currentSiteResults, currentAnalysisResults = null, isComplete = false) => {
|
||||||
|
try {
|
||||||
|
const outputData = {
|
||||||
|
metadata: {
|
||||||
|
extractedAt: new Date().toISOString(),
|
||||||
|
parser: "job-search-parser",
|
||||||
|
version: "2.0.0",
|
||||||
|
sites: sites,
|
||||||
|
keywords: keywords.join(", "),
|
||||||
|
locationFilter,
|
||||||
|
aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
|
||||||
|
aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined,
|
||||||
|
aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined,
|
||||||
|
analysisResults: currentAnalysisResults,
|
||||||
|
rejectedJobsExcluded: excludeRejected,
|
||||||
|
isComplete: isComplete,
|
||||||
|
lastUpdated: new Date().toISOString(),
|
||||||
|
},
|
||||||
|
results: currentResults,
|
||||||
|
siteResults: currentSiteResults,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!excludeRejected) {
|
||||||
|
outputData.rejectedResults = currentRejectedResults;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save JSON incrementally
|
||||||
|
if (incrementalJsonFilepath) {
|
||||||
|
fs.writeFileSync(incrementalJsonFilepath, JSON.stringify(outputData, null, 2));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save CSV incrementally (convert on each save)
|
||||||
|
if (incrementalCsvFilepath) {
|
||||||
|
const csvContent = convertResultsToCsv(outputData);
|
||||||
|
fs.writeFileSync(incrementalCsvFilepath, csvContent);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isComplete) {
|
||||||
|
logger.info(`💾 Incremental save: ${currentResults.length} results saved to ${incrementalJsonFilepath || incrementalCsvFilepath}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.warning(`⚠️ Failed to save incremental results: ${error.message}`);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Save initial empty state
|
||||||
|
saveIncrementalResults([], [], {}, null, false);
|
||||||
|
|
||||||
|
// Set up signal handlers for graceful shutdown
|
||||||
|
let isShuttingDown = false;
|
||||||
|
const gracefulShutdown = async (signal) => {
|
||||||
|
if (isShuttingDown) return;
|
||||||
|
isShuttingDown = true;
|
||||||
|
|
||||||
|
logger.warning(`\n⚠️ Received ${signal}, saving current results before exit...`);
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
|
||||||
|
logger.info(`💾 Saved ${allResults.length} results before shutdown`);
|
||||||
|
|
||||||
|
await coreParser.cleanup();
|
||||||
|
process.exit(0);
|
||||||
|
};
|
||||||
|
|
||||||
|
process.on('SIGINT', () => gracefulShutdown('SIGINT'));
|
||||||
|
process.on('SIGTERM', () => gracefulShutdown('SIGTERM'));
|
||||||
|
|
||||||
// Process each selected site
|
// Process each selected site
|
||||||
for (const site of sites) {
|
for (const site of sites) {
|
||||||
@ -140,8 +258,10 @@ async function startJobSearchParser(options = {}) {
|
|||||||
// Prepare strategy options
|
// Prepare strategy options
|
||||||
const strategyOptions = {
|
const strategyOptions = {
|
||||||
keywords,
|
keywords,
|
||||||
|
keywordGroups, // Pass grouped keywords if available
|
||||||
locationFilter,
|
locationFilter,
|
||||||
maxPages: finalOptions.maxPages,
|
maxPages: finalOptions.maxPages,
|
||||||
|
useAndLogic: finalOptions.useAndLogic || false,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Add credentials for LinkedIn
|
// Add credentials for LinkedIn
|
||||||
@ -165,6 +285,7 @@ async function startJobSearchParser(options = {}) {
|
|||||||
password: LINKEDIN_PASSWORD,
|
password: LINKEDIN_PASSWORD,
|
||||||
};
|
};
|
||||||
strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
|
strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
|
||||||
|
strategyOptions.minDate = minDate; // Add date filter for LinkedIn
|
||||||
}
|
}
|
||||||
|
|
||||||
const parseResult = await strategy(coreParser, strategyOptions);
|
const parseResult = await strategy(coreParser, strategyOptions);
|
||||||
@ -188,6 +309,9 @@ async function startJobSearchParser(options = {}) {
|
|||||||
logger.success(
|
logger.success(
|
||||||
`✅ ${site} completed in ${duration}s - Found ${results.length} jobs`
|
`✅ ${site} completed in ${duration}s - Found ${results.length} jobs`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Save results incrementally after each site
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`❌ ${site} parsing failed: ${error.message}`);
|
logger.error(`❌ ${site} parsing failed: ${error.message}`);
|
||||||
siteResults[site] = {
|
siteResults[site] = {
|
||||||
@ -196,23 +320,41 @@ async function startJobSearchParser(options = {}) {
|
|||||||
duration: "0s",
|
duration: "0s",
|
||||||
error: error.message,
|
error: error.message,
|
||||||
};
|
};
|
||||||
|
// Save even on error to preserve what we have
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// AI Analysis if enabled
|
// AI Analysis if enabled
|
||||||
let analysisResults = null;
|
// Save results before AI analysis (in case AI analysis takes a long time)
|
||||||
|
if (allResults.length > 0) {
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, null, false);
|
||||||
|
}
|
||||||
|
|
||||||
if (ENABLE_AI_ANALYSIS && allResults.length > 0) {
|
if (ENABLE_AI_ANALYSIS && allResults.length > 0) {
|
||||||
logger.step("🧠 Running AI Analysis...");
|
logger.step("🧠 Running AI Analysis...");
|
||||||
|
|
||||||
const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL);
|
const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL);
|
||||||
if (ollamaAvailable) {
|
if (ollamaAvailable) {
|
||||||
// Prepare data for analysis (analyzeBatch expects objects with 'text' field)
|
// Prepare data for analysis (analyzeBatch expects objects with 'text' field)
|
||||||
const analysisData = allResults.map((job) => ({
|
const analysisData = allResults.map((job) => {
|
||||||
text: `${job.title || ""} at ${job.company || ""}. ${job.description || ""}`.trim(),
|
// Build comprehensive text including all available job information
|
||||||
location: job.location || "",
|
const parts = [];
|
||||||
keyword: job.keyword || "",
|
if (job.title) parts.push(`Title: ${job.title}`);
|
||||||
timestamp: job.extractedAt || job.postedDate || "",
|
if (job.company) parts.push(`Company: ${job.company}`);
|
||||||
}));
|
if (job.description) parts.push(`Description: ${job.description}`);
|
||||||
|
if (job.roleDuties) parts.push(`Role Duties: ${job.roleDuties}`);
|
||||||
|
if (job.jobRequirements) parts.push(`Requirements: ${job.jobRequirements}`);
|
||||||
|
|
||||||
|
return {
|
||||||
|
text: parts.join("\n\n"),
|
||||||
|
location: job.location || "",
|
||||||
|
keyword: job.keyword || "",
|
||||||
|
timestamp: job.extractedAt || job.postedDate || "",
|
||||||
|
roleDuties: job.roleDuties || "",
|
||||||
|
jobRequirements: job.jobRequirements || "",
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
// Process in smaller batches to avoid timeouts (5 jobs per batch)
|
// Process in smaller batches to avoid timeouts (5 jobs per batch)
|
||||||
const BATCH_SIZE = parseInt(process.env.AI_BATCH_SIZE) || 5;
|
const BATCH_SIZE = parseInt(process.env.AI_BATCH_SIZE) || 5;
|
||||||
@ -263,68 +405,32 @@ async function startJobSearchParser(options = {}) {
|
|||||||
logger.success(
|
logger.success(
|
||||||
`✅ AI Analysis completed for ${allResults.length} jobs`
|
`✅ AI Analysis completed for ${allResults.length} jobs`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Save results after AI analysis completes
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
|
||||||
} else {
|
} else {
|
||||||
logger.warning("⚠️ Ollama not available, skipping AI analysis");
|
logger.warning("⚠️ Ollama not available, skipping AI analysis");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save results
|
// Final save with complete flag
|
||||||
logger.info(`💾 Preparing to save: ${allResults.length} results, ${allRejectedResults.length} rejected`);
|
logger.info(`💾 Preparing final save: ${allResults.length} results, ${allRejectedResults.length} rejected`);
|
||||||
logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`);
|
logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`);
|
||||||
|
|
||||||
const outputData = {
|
|
||||||
metadata: {
|
|
||||||
extractedAt: new Date().toISOString(),
|
|
||||||
parser: "job-search-parser",
|
|
||||||
version: "2.0.0",
|
|
||||||
sites: sites,
|
|
||||||
keywords: keywords.join(", "),
|
|
||||||
locationFilter,
|
|
||||||
aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
|
|
||||||
aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined,
|
|
||||||
aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined,
|
|
||||||
analysisResults,
|
|
||||||
rejectedJobsExcluded: excludeRejected,
|
|
||||||
},
|
|
||||||
results: allResults,
|
|
||||||
siteResults,
|
|
||||||
};
|
|
||||||
|
|
||||||
// Always include rejectedResults if not excluded (make it explicit, not using spread)
|
|
||||||
if (!excludeRejected) {
|
if (!excludeRejected) {
|
||||||
outputData.rejectedResults = allRejectedResults;
|
|
||||||
logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`);
|
logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`);
|
||||||
} else {
|
} else {
|
||||||
logger.info(`⏭️ Excluding rejected results (EXCLUDE_REJECTED=true)`);
|
logger.info(`⏭️ Excluding rejected results (EXCLUDE_REJECTED=true)`);
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(`💾 Final output: ${outputData.results.length} results, ${outputData.rejectedResults?.length || 0} rejected`);
|
logger.info(`💾 Final output: ${allResults.length} results, ${allRejectedResults.length} rejected`);
|
||||||
|
|
||||||
const resultsDir = path.join(__dirname, "results");
|
// Final save with isComplete flag
|
||||||
if (!fs.existsSync(resultsDir)) {
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, true);
|
||||||
fs.mkdirSync(resultsDir, { recursive: true });
|
|
||||||
}
|
|
||||||
|
|
||||||
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
|
||||||
const outputFormat = finalOptions.outputFormat || OUTPUT_FORMAT;
|
|
||||||
const savedFiles = [];
|
const savedFiles = [];
|
||||||
|
if (incrementalJsonFilepath) savedFiles.push(incrementalJsonFilepath);
|
||||||
// Save JSON if format is "json" or "both"
|
if (incrementalCsvFilepath) savedFiles.push(incrementalCsvFilepath);
|
||||||
if (outputFormat === "json" || outputFormat === "both") {
|
|
||||||
const jsonFilename = `job-search-results-${timestamp}.json`;
|
|
||||||
const jsonFilepath = path.join(resultsDir, jsonFilename);
|
|
||||||
fs.writeFileSync(jsonFilepath, JSON.stringify(outputData, null, 2));
|
|
||||||
savedFiles.push(jsonFilepath);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Save CSV if format is "csv" or "both"
|
|
||||||
if (outputFormat === "csv" || outputFormat === "both") {
|
|
||||||
const csvFilename = `job-search-results-${timestamp}.csv`;
|
|
||||||
const csvFilepath = path.join(resultsDir, csvFilename);
|
|
||||||
const csvContent = convertResultsToCsv(outputData);
|
|
||||||
fs.writeFileSync(csvFilepath, csvContent);
|
|
||||||
savedFiles.push(csvFilepath);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Final summary
|
// Final summary
|
||||||
logger.step("\n📊 Job Search Parser Summary");
|
logger.step("\n📊 Job Search Parser Summary");
|
||||||
@ -348,6 +454,31 @@ async function startJobSearchParser(options = {}) {
|
|||||||
|
|
||||||
logger.success("\n✅ Job Search Parser completed successfully!");
|
logger.success("\n✅ Job Search Parser completed successfully!");
|
||||||
|
|
||||||
|
// Construct output data for return
|
||||||
|
const outputData = {
|
||||||
|
metadata: {
|
||||||
|
extractedAt: new Date().toISOString(),
|
||||||
|
parser: "job-search-parser",
|
||||||
|
version: "2.0.0",
|
||||||
|
sites: sites,
|
||||||
|
keywords: keywords.join(", "),
|
||||||
|
locationFilter,
|
||||||
|
aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
|
||||||
|
aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined,
|
||||||
|
aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined,
|
||||||
|
analysisResults: analysisResults,
|
||||||
|
rejectedJobsExcluded: excludeRejected,
|
||||||
|
isComplete: true,
|
||||||
|
lastUpdated: new Date().toISOString(),
|
||||||
|
},
|
||||||
|
results: allResults,
|
||||||
|
siteResults: siteResults,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!excludeRejected) {
|
||||||
|
outputData.rejectedResults = allRejectedResults;
|
||||||
|
}
|
||||||
|
|
||||||
return outputData;
|
return outputData;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`❌ Job Search Parser failed: ${error.message}`);
|
logger.error(`❌ Job Search Parser failed: ${error.message}`);
|
||||||
|
|||||||
@ -13,6 +13,7 @@ const {
|
|||||||
logger,
|
logger,
|
||||||
cleanText,
|
cleanText,
|
||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
|
containsAllKeywords,
|
||||||
parseLocationFilters,
|
parseLocationFilters,
|
||||||
validateLocationAgainstFilters,
|
validateLocationAgainstFilters,
|
||||||
extractLocationFromProfile,
|
extractLocationFromProfile,
|
||||||
@ -125,10 +126,12 @@ async function parseSkipTheDrive(options = {}) {
|
|||||||
headless = process.env.HEADLESS !== "false",
|
headless = process.env.HEADLESS !== "false",
|
||||||
enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
|
enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
|
||||||
aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
|
aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
|
||||||
|
useAndLogic = false, // Use AND logic instead of OR logic for keywords
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
logger.step("Starting SkipTheDrive parser...");
|
logger.step("Starting SkipTheDrive parser...");
|
||||||
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
||||||
|
logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
|
||||||
logger.info(
|
logger.info(
|
||||||
`📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
|
`📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
|
||||||
);
|
);
|
||||||
@ -154,8 +157,12 @@ async function parseSkipTheDrive(options = {}) {
|
|||||||
const seenJobs = new Set();
|
const seenJobs = new Set();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Search for each keyword
|
// For AND logic, combine all keywords into a single search query
|
||||||
for (const keyword of keywords) {
|
// For OR logic, search each keyword separately
|
||||||
|
const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords;
|
||||||
|
|
||||||
|
// Search for each keyword (or combined keyword for AND logic)
|
||||||
|
for (const keyword of searchKeywords) {
|
||||||
logger.info(`\n🔍 Searching for: ${keyword}`);
|
logger.info(`\n🔍 Searching for: ${keyword}`);
|
||||||
|
|
||||||
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
|
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
|
||||||
@ -208,11 +215,17 @@ async function parseSkipTheDrive(options = {}) {
|
|||||||
|
|
||||||
// Validate job against keywords
|
// Validate job against keywords
|
||||||
const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
|
const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
|
||||||
if (!containsAnyKeyword(fullText, keywords)) {
|
const keywordMatch = useAndLogic
|
||||||
|
? containsAllKeywords(fullText, keywords)
|
||||||
|
: containsAnyKeyword(fullText, keywords);
|
||||||
|
|
||||||
|
if (!keywordMatch) {
|
||||||
rejectedResults.push({
|
rejectedResults.push({
|
||||||
...jobData,
|
...jobData,
|
||||||
rejected: true,
|
rejected: true,
|
||||||
reason: "Keywords not found in job listing",
|
reason: useAndLogic
|
||||||
|
? "Not all keywords found in job listing"
|
||||||
|
: "Keywords not found in job listing",
|
||||||
});
|
});
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -44,6 +44,8 @@ function convertJobsToCsv(jobs, metadata = null) {
|
|||||||
"jobUrl",
|
"jobUrl",
|
||||||
"postedDate",
|
"postedDate",
|
||||||
"description",
|
"description",
|
||||||
|
"roleDuties",
|
||||||
|
"jobRequirements",
|
||||||
"jobType",
|
"jobType",
|
||||||
"experienceLevel",
|
"experienceLevel",
|
||||||
"keyword",
|
"keyword",
|
||||||
|
|||||||
@ -10,6 +10,8 @@ const {
|
|||||||
validateLocationAgainstFilters,
|
validateLocationAgainstFilters,
|
||||||
parseLocationFilters,
|
parseLocationFilters,
|
||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
|
containsAllKeywords,
|
||||||
|
matchesKeywordGroups,
|
||||||
} = require("ai-analyzer");
|
} = require("ai-analyzer");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -34,6 +36,28 @@ function buildJobSearchUrl(keyword, location = "", filters = {}) {
|
|||||||
params.append("location", location);
|
params.append("location", location);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add date filter if provided (f_TPR parameter)
|
||||||
|
// LinkedIn uses f_TPR=r<seconds> where seconds is the time range
|
||||||
|
if (filters.minDate) {
|
||||||
|
try {
|
||||||
|
const minDate = new Date(filters.minDate);
|
||||||
|
const now = new Date();
|
||||||
|
const secondsDiff = Math.floor((now - minDate) / 1000);
|
||||||
|
|
||||||
|
// LinkedIn supports relative timeframes (f_TPR parameter)
|
||||||
|
// If date is in the future, don't add filter
|
||||||
|
if (secondsDiff > 0) {
|
||||||
|
// LinkedIn typically supports up to ~30 days (2592000 seconds)
|
||||||
|
// For dates older than 30 days, we'll still add it but LinkedIn may limit results
|
||||||
|
const maxSeconds = 2592000; // 30 days
|
||||||
|
const timeRange = Math.min(secondsDiff, maxSeconds);
|
||||||
|
params.append("f_TPR", `r${timeRange}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.warning(`⚠️ Invalid date format for minDate: ${filters.minDate}. Expected format: YYYY-MM-DD`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Add additional filters
|
// Add additional filters
|
||||||
if (filters.experienceLevel) {
|
if (filters.experienceLevel) {
|
||||||
params.append("f_E", filters.experienceLevel);
|
params.append("f_E", filters.experienceLevel);
|
||||||
@ -54,10 +78,13 @@ function buildJobSearchUrl(keyword, location = "", filters = {}) {
|
|||||||
async function linkedinJobsStrategy(coreParser, options = {}) {
|
async function linkedinJobsStrategy(coreParser, options = {}) {
|
||||||
const {
|
const {
|
||||||
keywords = ["software engineer", "developer"],
|
keywords = ["software engineer", "developer"],
|
||||||
|
keywordGroups = null, // Array of keyword groups for grouped AND/OR logic
|
||||||
locationFilter = null,
|
locationFilter = null,
|
||||||
maxPages = 5,
|
maxPages = 5,
|
||||||
credentials = {},
|
credentials = {},
|
||||||
location = "", // LinkedIn location search (e.g., "Canada", "Toronto, Ontario, Canada")
|
location = "", // LinkedIn location search (e.g., "Canada", "Toronto, Ontario, Canada")
|
||||||
|
minDate = null, // Minimum posted date (format: YYYY-MM-DD)
|
||||||
|
useAndLogic = false, // Use AND logic instead of OR logic for keywords
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
const results = [];
|
const results = [];
|
||||||
@ -79,15 +106,39 @@ async function linkedinJobsStrategy(coreParser, options = {}) {
|
|||||||
|
|
||||||
logger.info("🚀 Starting LinkedIn Jobs parser...");
|
logger.info("🚀 Starting LinkedIn Jobs parser...");
|
||||||
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
||||||
|
if (keywordGroups) {
|
||||||
|
logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
|
||||||
|
} else {
|
||||||
|
logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
|
||||||
|
}
|
||||||
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
||||||
logger.info(`🌍 LinkedIn Location: ${location || "None"}`);
|
logger.info(`🌍 LinkedIn Location: ${location || "None"}`);
|
||||||
logger.info(`📄 Max Pages: ${maxPages}`);
|
logger.info(`📄 Max Pages: ${maxPages}`);
|
||||||
|
if (minDate) {
|
||||||
|
logger.info(`📅 Min Date Filter: ${minDate} (jobs posted after this date)`);
|
||||||
|
}
|
||||||
|
|
||||||
// Search for each keyword
|
// Determine search keywords based on logic type
|
||||||
for (const keyword of keywords) {
|
let searchKeywords;
|
||||||
|
if (keywordGroups) {
|
||||||
|
// For grouped AND/OR logic, search each keyword in each group (OR within groups)
|
||||||
|
// We'll combine results and filter to ensure all groups match (AND between groups)
|
||||||
|
searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups
|
||||||
|
} else if (useAndLogic) {
|
||||||
|
// For simple AND logic, combine all keywords into a single search query
|
||||||
|
searchKeywords = [keywords.join(" ")];
|
||||||
|
} else {
|
||||||
|
// For OR logic, search each keyword separately
|
||||||
|
searchKeywords = keywords;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Search for each keyword (or combined keyword for AND logic)
|
||||||
|
for (const keyword of searchKeywords) {
|
||||||
logger.info(`\n🔍 Searching LinkedIn Jobs for: "${keyword}"`);
|
logger.info(`\n🔍 Searching LinkedIn Jobs for: "${keyword}"`);
|
||||||
|
|
||||||
const searchUrl = buildJobSearchUrl(keyword, location);
|
const searchUrl = buildJobSearchUrl(keyword, location, {
|
||||||
|
minDate: minDate,
|
||||||
|
});
|
||||||
logger.info(`🔗 Search URL: ${searchUrl}`);
|
logger.info(`🔗 Search URL: ${searchUrl}`);
|
||||||
|
|
||||||
// Check if page is still valid before proceeding
|
// Check if page is still valid before proceeding
|
||||||
@ -220,7 +271,7 @@ async function linkedinJobsStrategy(coreParser, options = {}) {
|
|||||||
await new Promise((resolve) => setTimeout(resolve, 2000));
|
await new Promise((resolve) => setTimeout(resolve, 2000));
|
||||||
|
|
||||||
// Extract jobs from current page
|
// Extract jobs from current page
|
||||||
const pageJobs = await extractJobsFromPage(page, keyword, locationFilter);
|
const pageJobs = await extractJobsFromPage(page, keyword, locationFilter, coreParser);
|
||||||
logger.info(`📋 Extracted ${pageJobs.length} jobs from page ${currentPage}`);
|
logger.info(`📋 Extracted ${pageJobs.length} jobs from page ${currentPage}`);
|
||||||
|
|
||||||
if (pageJobs.length === 0) {
|
if (pageJobs.length === 0) {
|
||||||
@ -317,10 +368,35 @@ async function linkedinJobsStrategy(coreParser, options = {}) {
|
|||||||
}
|
}
|
||||||
seenJobs.add(job.jobId);
|
seenJobs.add(job.jobId);
|
||||||
|
|
||||||
// REMOVED: Keyword validation - LinkedIn already filtered by keyword in search results
|
// Validate keywords based on logic type
|
||||||
// If LinkedIn returned this job in search results, it matches the keyword.
|
if (keywordGroups) {
|
||||||
// The snippet might not contain the keyword, but the full description does.
|
// Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR)
|
||||||
// Trust LinkedIn's search algorithm rather than re-validating against snippets.
|
const fullText = `${job.title} ${job.description} ${job.company}`;
|
||||||
|
if (!matchesKeywordGroups(fullText, keywordGroups)) {
|
||||||
|
rejectedResults.push({
|
||||||
|
...job,
|
||||||
|
rejectionReason: "Job does not match all keyword groups",
|
||||||
|
});
|
||||||
|
if (process.env.DEBUG === "true") {
|
||||||
|
logger.debug(`🔍 Rejected (grouped logic): "${job.title}" - does not match all groups`);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else if (useAndLogic) {
|
||||||
|
// Simple AND logic: all keywords must match
|
||||||
|
const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase();
|
||||||
|
if (!containsAllKeywords(fullText, keywords)) {
|
||||||
|
rejectedResults.push({
|
||||||
|
...job,
|
||||||
|
rejectionReason: "Not all keywords found in job listing",
|
||||||
|
});
|
||||||
|
if (process.env.DEBUG === "true") {
|
||||||
|
logger.debug(`🔍 Rejected (AND logic): "${job.title}" - not all keywords found`);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// For OR logic, trust LinkedIn's search results (already filtered)
|
||||||
|
|
||||||
// Validate location if filtering enabled
|
// Validate location if filtering enabled
|
||||||
if (locationFilter) {
|
if (locationFilter) {
|
||||||
@ -514,7 +590,7 @@ async function scrollToLoadJobs(page) {
|
|||||||
/**
|
/**
|
||||||
* Extract jobs from current page
|
* Extract jobs from current page
|
||||||
*/
|
*/
|
||||||
async function extractJobsFromPage(page, keyword, locationFilter) {
|
async function extractJobsFromPage(page, keyword, locationFilter, coreParser = null) {
|
||||||
const jobs = [];
|
const jobs = [];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -644,7 +720,7 @@ async function extractJobsFromPage(page, keyword, locationFilter) {
|
|||||||
logger.debug(`Could not scroll/hover job element ${i}: ${scrollError.message}`);
|
logger.debug(`Could not scroll/hover job element ${i}: ${scrollError.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const job = await extractJobData(jobElement, keyword);
|
const job = await extractJobData(jobElement, keyword, page, coreParser);
|
||||||
if (job && (job.title || job.jobId)) {
|
if (job && (job.title || job.jobId)) {
|
||||||
// Only add if we have at least a title or jobId
|
// Only add if we have at least a title or jobId
|
||||||
jobs.push(job);
|
jobs.push(job);
|
||||||
@ -671,10 +747,240 @@ async function extractJobsFromPage(page, keyword, locationFilter) {
|
|||||||
return jobs;
|
return jobs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract full job description from job detail page
|
||||||
|
*/
|
||||||
|
async function extractFullJobDescription(coreParser, jobUrl) {
|
||||||
|
try {
|
||||||
|
if (!jobUrl) {
|
||||||
|
return { fullDescription: "", roleDuties: "", jobRequirements: "" };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a separate page for detail extraction to avoid disrupting search results
|
||||||
|
const detailPage = await coreParser.createPage(`linkedin-job-detail-${Date.now()}`);
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Navigate to job detail page
|
||||||
|
await detailPage.goto(jobUrl, { waitUntil: "networkidle2", timeout: 30000 }).catch(() => {});
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait for content to load
|
||||||
|
|
||||||
|
const jobDetails = await detailPage.evaluate(() => {
|
||||||
|
const details = {
|
||||||
|
fullDescription: "",
|
||||||
|
roleDuties: "",
|
||||||
|
jobRequirements: "",
|
||||||
|
};
|
||||||
|
|
||||||
|
// Try multiple selectors for job description container
|
||||||
|
const descriptionSelectors = [
|
||||||
|
".description__text",
|
||||||
|
".show-more-less-html__markup",
|
||||||
|
"[class*='description__text']",
|
||||||
|
"[class*='job-description']",
|
||||||
|
".jobs-description__text",
|
||||||
|
".jobs-box__html-content",
|
||||||
|
"[data-test-id='job-description']",
|
||||||
|
".jobs-details__main-content",
|
||||||
|
".jobs-description-content__text",
|
||||||
|
];
|
||||||
|
|
||||||
|
let descriptionElement = null;
|
||||||
|
for (const selector of descriptionSelectors) {
|
||||||
|
descriptionElement = document.querySelector(selector);
|
||||||
|
if (descriptionElement) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (descriptionElement) {
|
||||||
|
details.fullDescription = descriptionElement.textContent?.trim() ||
|
||||||
|
descriptionElement.innerText?.trim() || "";
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we didn't find description, try to get from main content area
|
||||||
|
if (!details.fullDescription) {
|
||||||
|
const mainContent = document.querySelector("main") ||
|
||||||
|
document.querySelector("[class*='jobs-details']") ||
|
||||||
|
document.querySelector("[class*='job-details']");
|
||||||
|
if (mainContent) {
|
||||||
|
details.fullDescription = mainContent.textContent?.trim() ||
|
||||||
|
mainContent.innerText?.trim() || "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return details;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Parse duties and requirements from full description
|
||||||
|
const parsed = parseDutiesAndRequirements(jobDetails.fullDescription);
|
||||||
|
|
||||||
|
return {
|
||||||
|
fullDescription: jobDetails.fullDescription,
|
||||||
|
roleDuties: parsed.duties,
|
||||||
|
jobRequirements: parsed.requirements,
|
||||||
|
};
|
||||||
|
} finally {
|
||||||
|
// Close the detail page to free resources
|
||||||
|
try {
|
||||||
|
await detailPage.close();
|
||||||
|
} catch (closeError) {
|
||||||
|
// Ignore close errors
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.warning(`Failed to extract full job description from ${jobUrl}: ${error.message}`);
|
||||||
|
return { fullDescription: "", roleDuties: "", jobRequirements: "" };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse job description to separate role duties from job requirements
|
||||||
|
*/
|
||||||
|
function parseDutiesAndRequirements(description) {
|
||||||
|
if (!description || description.trim().length === 0) {
|
||||||
|
return { duties: "", requirements: "" };
|
||||||
|
}
|
||||||
|
|
||||||
|
const duties = [];
|
||||||
|
const requirements = [];
|
||||||
|
|
||||||
|
// Common section headers that indicate duties/responsibilities
|
||||||
|
const dutiesKeywords = [
|
||||||
|
/responsibilities?:/i,
|
||||||
|
/duties?:/i,
|
||||||
|
/what you['\u2019]ll do/i,
|
||||||
|
/key responsibilities/i,
|
||||||
|
/your role/i,
|
||||||
|
/position overview/i,
|
||||||
|
/about the role/i,
|
||||||
|
/role overview/i,
|
||||||
|
/what we need/i,
|
||||||
|
/you will:/i,
|
||||||
|
/you['\u2019]ll be responsible/i,
|
||||||
|
];
|
||||||
|
|
||||||
|
// Common section headers that indicate requirements/qualifications
|
||||||
|
const requirementsKeywords = [
|
||||||
|
/requirements?:/i,
|
||||||
|
/qualifications?:/i,
|
||||||
|
/must have/i,
|
||||||
|
/required:/i,
|
||||||
|
/what you['\u2019]ll bring/i,
|
||||||
|
/you have:/i,
|
||||||
|
/skills required/i,
|
||||||
|
/minimum requirements/i,
|
||||||
|
/preferred qualifications/i,
|
||||||
|
/education:/i,
|
||||||
|
/experience:/i,
|
||||||
|
/you must have/i,
|
||||||
|
/we['\u2019]re looking for/i,
|
||||||
|
];
|
||||||
|
|
||||||
|
// Split description into sections (by common delimiters)
|
||||||
|
const sections = description.split(/\n\s*\n|\r\n\s*\r\n/).filter(s => s.trim().length > 0);
|
||||||
|
|
||||||
|
let currentSection = "duties"; // Default to duties
|
||||||
|
let dutiesText = "";
|
||||||
|
let requirementsText = "";
|
||||||
|
|
||||||
|
for (const section of sections) {
|
||||||
|
const sectionLower = section.toLowerCase();
|
||||||
|
|
||||||
|
// Check if this section is about requirements
|
||||||
|
let isRequirementsSection = false;
|
||||||
|
for (const keyword of requirementsKeywords) {
|
||||||
|
if (keyword.test(section)) {
|
||||||
|
isRequirementsSection = true;
|
||||||
|
currentSection = "requirements";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if this section is about duties/responsibilities
|
||||||
|
if (!isRequirementsSection) {
|
||||||
|
for (const keyword of dutiesKeywords) {
|
||||||
|
if (keyword.test(section)) {
|
||||||
|
currentSection = "duties";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add to appropriate section
|
||||||
|
if (currentSection === "requirements") {
|
||||||
|
requirementsText += (requirementsText ? "\n\n" : "") + section.trim();
|
||||||
|
} else {
|
||||||
|
dutiesText += (dutiesText ? "\n\n" : "") + section.trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we couldn't split by sections, try to find bullet points or numbered lists
|
||||||
|
if (!dutiesText && !requirementsText) {
|
||||||
|
const lines = description.split(/\n/);
|
||||||
|
let foundRequirementsHeader = false;
|
||||||
|
|
||||||
|
for (let i = 0; i < lines.length; i++) {
|
||||||
|
const line = lines[i].trim();
|
||||||
|
if (line.length === 0) continue;
|
||||||
|
|
||||||
|
// Check if this line is a requirements header
|
||||||
|
for (const keyword of requirementsKeywords) {
|
||||||
|
if (keyword.test(line)) {
|
||||||
|
foundRequirementsHeader = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (foundRequirementsHeader) {
|
||||||
|
requirementsText += (requirementsText ? "\n" : "") + line;
|
||||||
|
} else {
|
||||||
|
// Check if it's a duties header
|
||||||
|
let isDutiesHeader = false;
|
||||||
|
for (const keyword of dutiesKeywords) {
|
||||||
|
if (keyword.test(line)) {
|
||||||
|
isDutiesHeader = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isDutiesHeader) {
|
||||||
|
// Add to duties if we haven't found requirements header yet
|
||||||
|
if (!foundRequirementsHeader) {
|
||||||
|
dutiesText += (dutiesText ? "\n" : "") + line;
|
||||||
|
} else {
|
||||||
|
requirementsText += (requirementsText ? "\n" : "") + line;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
dutiesText += (dutiesText ? "\n" : "") + line;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: if we still have nothing separated, put first 60% in duties, rest in requirements
|
||||||
|
if (!dutiesText && !requirementsText && description) {
|
||||||
|
const midPoint = Math.floor(description.length * 0.6);
|
||||||
|
const lastRequirementsKeyword = description.toLowerCase().lastIndexOf("requirement");
|
||||||
|
const lastQualificationsKeyword = description.toLowerCase().lastIndexOf("qualification");
|
||||||
|
const splitPoint = Math.max(
|
||||||
|
lastRequirementsKeyword > 0 ? lastRequirementsKeyword : midPoint,
|
||||||
|
lastQualificationsKeyword > 0 ? lastQualificationsKeyword : midPoint
|
||||||
|
);
|
||||||
|
|
||||||
|
dutiesText = description.substring(0, splitPoint).trim();
|
||||||
|
requirementsText = description.substring(splitPoint).trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
duties: dutiesText.trim(),
|
||||||
|
requirements: requirementsText.trim(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract data from individual job element
|
* Extract data from individual job element
|
||||||
*/
|
*/
|
||||||
async function extractJobData(jobElement, keyword) {
|
async function extractJobData(jobElement, keyword, page = null, coreParser = null) {
|
||||||
try {
|
try {
|
||||||
const jobData = await jobElement.evaluate((el) => {
|
const jobData = await jobElement.evaluate((el) => {
|
||||||
const data = {
|
const data = {
|
||||||
@ -1191,6 +1497,20 @@ async function extractJobData(jobElement, keyword) {
|
|||||||
// Generate job ID if not found
|
// Generate job ID if not found
|
||||||
const jobId = jobData.jobId || `linkedin-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
|
const jobId = jobData.jobId || `linkedin-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
|
||||||
|
|
||||||
|
// Extract full job details if coreParser and jobUrl are provided
|
||||||
|
let fullDetails = { fullDescription: "", roleDuties: "", jobRequirements: "" };
|
||||||
|
if (coreParser && jobUrl) {
|
||||||
|
try {
|
||||||
|
fullDetails = await extractFullJobDescription(coreParser, jobUrl);
|
||||||
|
// If we got full description, update the description field
|
||||||
|
if (fullDetails.fullDescription) {
|
||||||
|
jobData.description = fullDetails.fullDescription;
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.debug(`Could not extract full job details for ${jobUrl}: ${error.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
jobId,
|
jobId,
|
||||||
title,
|
title,
|
||||||
@ -1198,7 +1518,9 @@ async function extractJobData(jobElement, keyword) {
|
|||||||
location: cleanText(jobData.location),
|
location: cleanText(jobData.location),
|
||||||
jobUrl,
|
jobUrl,
|
||||||
postedDate: jobData.postedDate,
|
postedDate: jobData.postedDate,
|
||||||
description: cleanText(jobData.description),
|
description: cleanText(fullDetails.fullDescription || jobData.description),
|
||||||
|
roleDuties: cleanText(fullDetails.roleDuties),
|
||||||
|
jobRequirements: cleanText(fullDetails.jobRequirements),
|
||||||
jobType: jobData.jobType,
|
jobType: jobData.jobType,
|
||||||
experienceLevel: jobData.experienceLevel,
|
experienceLevel: jobData.experienceLevel,
|
||||||
keyword,
|
keyword,
|
||||||
|
|||||||
@ -8,6 +8,8 @@ const {
|
|||||||
logger,
|
logger,
|
||||||
cleanText,
|
cleanText,
|
||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
|
containsAllKeywords,
|
||||||
|
matchesKeywordGroups,
|
||||||
validateLocationAgainstFilters,
|
validateLocationAgainstFilters,
|
||||||
} = require("ai-analyzer");
|
} = require("ai-analyzer");
|
||||||
|
|
||||||
@ -34,9 +36,11 @@ function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
|
|||||||
async function skipthedriveStrategy(coreParser, options = {}) {
|
async function skipthedriveStrategy(coreParser, options = {}) {
|
||||||
const {
|
const {
|
||||||
keywords = ["software engineer", "developer", "programmer"],
|
keywords = ["software engineer", "developer", "programmer"],
|
||||||
|
keywordGroups = null, // Array of keyword groups for grouped AND/OR logic
|
||||||
locationFilter = null,
|
locationFilter = null,
|
||||||
maxPages = 5,
|
maxPages = 5,
|
||||||
jobTypes = [],
|
jobTypes = [],
|
||||||
|
useAndLogic = false, // Use AND logic instead of OR logic for keywords
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
const results = [];
|
const results = [];
|
||||||
@ -49,11 +53,29 @@ async function skipthedriveStrategy(coreParser, options = {}) {
|
|||||||
|
|
||||||
logger.info("🚀 Starting SkipTheDrive parser...");
|
logger.info("🚀 Starting SkipTheDrive parser...");
|
||||||
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
||||||
|
if (keywordGroups) {
|
||||||
|
logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
|
||||||
|
} else {
|
||||||
|
logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
|
||||||
|
}
|
||||||
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
||||||
logger.info(`📄 Max Pages: ${maxPages}`);
|
logger.info(`📄 Max Pages: ${maxPages}`);
|
||||||
|
|
||||||
// Search for each keyword
|
// Determine search keywords based on logic type
|
||||||
for (const keyword of keywords) {
|
let searchKeywords;
|
||||||
|
if (keywordGroups) {
|
||||||
|
// For grouped AND/OR logic, search each keyword in each group (OR within groups)
|
||||||
|
searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups
|
||||||
|
} else if (useAndLogic) {
|
||||||
|
// For simple AND logic, combine all keywords into a single search query
|
||||||
|
searchKeywords = [keywords.join(" ")];
|
||||||
|
} else {
|
||||||
|
// For OR logic, search each keyword separately
|
||||||
|
searchKeywords = keywords;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Search for each keyword (or combined keyword for AND logic)
|
||||||
|
for (const keyword of searchKeywords) {
|
||||||
logger.info(`\n🔍 Searching for: ${keyword}`);
|
logger.info(`\n🔍 Searching for: ${keyword}`);
|
||||||
|
|
||||||
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
|
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
|
||||||
@ -92,7 +114,10 @@ async function skipthedriveStrategy(coreParser, options = {}) {
|
|||||||
const pageJobs = await extractJobsFromPage(
|
const pageJobs = await extractJobsFromPage(
|
||||||
page,
|
page,
|
||||||
keyword,
|
keyword,
|
||||||
locationFilter
|
locationFilter,
|
||||||
|
keywords,
|
||||||
|
keywordGroups,
|
||||||
|
useAndLogic
|
||||||
);
|
);
|
||||||
|
|
||||||
for (const job of pageJobs) {
|
for (const job of pageJobs) {
|
||||||
@ -100,6 +125,29 @@ async function skipthedriveStrategy(coreParser, options = {}) {
|
|||||||
if (seenJobs.has(job.jobId)) continue;
|
if (seenJobs.has(job.jobId)) continue;
|
||||||
seenJobs.add(job.jobId);
|
seenJobs.add(job.jobId);
|
||||||
|
|
||||||
|
// Validate keywords based on logic type
|
||||||
|
if (keywordGroups) {
|
||||||
|
// Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR)
|
||||||
|
const fullText = `${job.title} ${job.description} ${job.company}`;
|
||||||
|
if (!matchesKeywordGroups(fullText, keywordGroups)) {
|
||||||
|
rejectedResults.push({
|
||||||
|
...job,
|
||||||
|
rejectionReason: "Job does not match all keyword groups",
|
||||||
|
});
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else if (useAndLogic) {
|
||||||
|
// Simple AND logic: all keywords must match
|
||||||
|
const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase();
|
||||||
|
if (!containsAllKeywords(fullText, keywords)) {
|
||||||
|
rejectedResults.push({
|
||||||
|
...job,
|
||||||
|
rejectionReason: "Not all keywords found in job listing",
|
||||||
|
});
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Validate location if filtering enabled
|
// Validate location if filtering enabled
|
||||||
if (locationFilter) {
|
if (locationFilter) {
|
||||||
const locationValid = validateLocationAgainstFilters(
|
const locationValid = validateLocationAgainstFilters(
|
||||||
@ -160,7 +208,7 @@ async function skipthedriveStrategy(coreParser, options = {}) {
|
|||||||
/**
|
/**
|
||||||
* Extract jobs from current page
|
* Extract jobs from current page
|
||||||
*/
|
*/
|
||||||
async function extractJobsFromPage(page, keyword, locationFilter) {
|
async function extractJobsFromPage(page, keyword, locationFilter, allKeywords = [], keywordGroups = null, useAndLogic = false) {
|
||||||
const jobs = [];
|
const jobs = [];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -184,6 +232,147 @@ async function extractJobsFromPage(page, keyword, locationFilter) {
|
|||||||
return jobs;
|
return jobs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Parse a job description into two parts: role duties and job requirements.
 *
 * The description is split on blank lines into paragraph-like sections; each
 * section is routed to "duties" or "requirements" based on common section
 * headers (e.g. "Responsibilities:", "Requirements:"). Sections with no
 * recognizable header inherit the most recent section type, defaulting to
 * duties. Two defensive fallbacks handle descriptions that fail to split:
 * a line-by-line pass, then a positional split.
 *
 * @param {string} description - Raw job description text (may be null/empty).
 * @returns {{duties: string, requirements: string}} Separated text; either
 *   field may be "" when nothing could be attributed to it.
 */
function parseDutiesAndRequirements(description) {
  if (!description || description.trim().length === 0) {
    return { duties: "", requirements: "" };
  }

  // Common section headers that indicate duties/responsibilities
  const dutiesKeywords = [
    /responsibilities?:/i,
    /duties?:/i,
    /what you['\u2019]ll do/i,
    /key responsibilities/i,
    /your role/i,
    /position overview/i,
    /about the role/i,
    /role overview/i,
    /what we need/i,
    /you will:/i,
    /you['\u2019]ll be responsible/i,
  ];

  // Common section headers that indicate requirements/qualifications
  const requirementsKeywords = [
    /requirements?:/i,
    /qualifications?:/i,
    /must have/i,
    /required:/i,
    /what you['\u2019]ll bring/i,
    /you have:/i,
    /skills required/i,
    /minimum requirements/i,
    /preferred qualifications/i,
    /education:/i,
    /experience:/i,
    /you must have/i,
    /we['\u2019]re looking for/i,
  ];

  // None of the patterns carry /g, so .test() here is stateless.
  const matchesAny = (patterns, text) => patterns.some((re) => re.test(text));

  // Split description into sections (by blank-line delimiters)
  const sections = description
    .split(/\n\s*\n|\r\n\s*\r\n/)
    .filter((s) => s.trim().length > 0);

  let currentSection = "duties"; // Text before any header defaults to duties
  let dutiesText = "";
  let requirementsText = "";

  for (const section of sections) {
    // A requirements header wins over a duties header when both appear
    // in the same section (matches the original precedence).
    if (matchesAny(requirementsKeywords, section)) {
      currentSection = "requirements";
    } else if (matchesAny(dutiesKeywords, section)) {
      currentSection = "duties";
    }

    if (currentSection === "requirements") {
      requirementsText += (requirementsText ? "\n\n" : "") + section.trim();
    } else {
      dutiesText += (dutiesText ? "\n\n" : "") + section.trim();
    }
  }

  // NOTE(review): given the non-empty guard above, the section loop always
  // appends at least one section, so the fallbacks below are defensive only.

  // Fallback 1: route line-by-line, switching permanently to "requirements"
  // once a requirements header line is seen. (Equivalent to the original
  // logic: the original's nested duties-header checks all resolved to
  // "duties before the header, requirements after".)
  if (!dutiesText && !requirementsText) {
    let foundRequirementsHeader = false;

    for (const rawLine of description.split(/\n/)) {
      const line = rawLine.trim();
      if (line.length === 0) continue;

      if (matchesAny(requirementsKeywords, line)) {
        foundRequirementsHeader = true;
      }

      if (foundRequirementsHeader) {
        requirementsText += (requirementsText ? "\n" : "") + line;
      } else {
        dutiesText += (dutiesText ? "\n" : "") + line;
      }
    }
  }

  // Fallback 2: positional split — prefer the last occurrence of a
  // requirements-ish word, otherwise cut 60% of the way into the text.
  if (!dutiesText && !requirementsText) {
    const lower = description.toLowerCase();
    const midPoint = Math.floor(description.length * 0.6);
    const lastRequirementsKeyword = lower.lastIndexOf("requirement");
    const lastQualificationsKeyword = lower.lastIndexOf("qualification");
    // >= 0 (was > 0): a keyword at index 0 is a valid hit, not "not found".
    const splitPoint = Math.max(
      lastRequirementsKeyword >= 0 ? lastRequirementsKeyword : midPoint,
      lastQualificationsKeyword >= 0 ? lastQualificationsKeyword : midPoint
    );

    dutiesText = description.substring(0, splitPoint).trim();
    requirementsText = description.substring(splitPoint).trim();
  }

  return {
    duties: dutiesText.trim(),
    requirements: requirementsText.trim(),
  };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract data from individual job element
|
* Extract data from individual job element
|
||||||
*/
|
*/
|
||||||
@ -242,6 +431,9 @@ async function extractJobData(jobElement, keyword) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parse duties and requirements from description if available
|
||||||
|
const parsed = parseDutiesAndRequirements(description);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
jobId,
|
jobId,
|
||||||
title,
|
title,
|
||||||
@ -252,6 +444,8 @@ async function extractJobData(jobElement, keyword) {
|
|||||||
dateText,
|
dateText,
|
||||||
daysAgo,
|
daysAgo,
|
||||||
description,
|
description,
|
||||||
|
roleDuties: parsed.duties,
|
||||||
|
jobRequirements: parsed.requirements,
|
||||||
isFeatured,
|
isFeatured,
|
||||||
keyword,
|
keyword,
|
||||||
extractedAt: new Date().toISOString(),
|
extractedAt: new Date().toISOString(),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user