- Introduced a new Indeed parsing strategy to support job extraction from Indeed, including advanced filtering options. - Updated job search parser to include Indeed in the site strategies, allowing for combined searches with other job sites. - Enhanced README documentation with detailed usage instructions for the Indeed parser, including examples for keyword and location filtering. - Improved logging for Indeed parsing to provide insights into job extraction processes and potential CAPTCHA handling.
503 lines
18 KiB
JavaScript
503 lines
18 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
/**
|
|
* Job Search Parser - Refactored
|
|
*
|
|
* Uses core-parser for browser management and site-specific strategies for parsing logic
|
|
*/
|
|
|
|
const path = require("path");
|
|
const fs = require("fs");
|
|
const CoreParser = require("../core-parser");
|
|
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
|
|
const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
|
|
const { indeedStrategy } = require("./strategies/indeed-strategy");
|
|
const { logger, analyzeBatch, checkOllamaStatus, DEFAULT_MODEL } = require("ai-analyzer");
|
|
const { convertResultsToCsv } = require("./src/csv-utils");
|
|
|
|
// Load environment variables
|
|
require("dotenv").config({ path: path.join(__dirname, ".env") });
|
|
|
|
// Configuration from environment.
// Every value below is read once, at module load, from process.env.

// Headless unless HEADLESS is exactly the string "false".
const HEADLESS = process.env.HEADLESS !== "false";

// Comma-separated default search terms
// (another example value: "software engineer,developer,programmer").
const SEARCH_KEYWORDS = process.env.SEARCH_KEYWORDS || "co-op,intern";

const LOCATION_FILTER = process.env.LOCATION_FILTER;

// AI analysis is opt-in: only the exact string "true" enables it.
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
const AI_CONTEXT =
  process.env.AI_CONTEXT ||
  "Job market analysis focusing on job postings, skills, and trends";
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;

// Page limit per site. Mirrors the --max-pages CLI flag: "all" or "0" means
// unlimited (stored as 0); anything that is not a positive integer falls back
// to 5. Parsed with an explicit radix so the value is always read as decimal.
const MAX_PAGES = (() => {
  const raw = String(process.env.MAX_PAGES || "").trim().toLowerCase();
  if (raw === "all" || raw === "0") {
    return 0;
  }
  const parsed = Number.parseInt(raw, 10);
  return parsed > 0 ? parsed : 5;
})();

const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
const OUTPUT_FORMAT = process.env.OUTPUT_FORMAT || "json"; // "json", "csv", or "both"
const MIN_DATE = process.env.MIN_DATE; // Minimum posted date (format: YYYY-MM-DD)
const USE_AND_LOGIC = process.env.USE_AND_LOGIC === "true"; // Use AND logic for keywords

// Available site strategies, keyed by the name accepted in --sites=...
const SITE_STRATEGIES = {
  skipthedrive: skipthedriveStrategy,
  linkedin: linkedinJobsStrategy,
  indeed: indeedStrategy,
  // Add more site strategies here
  // glassdoor: glassdoorStrategy,
};
|
|
|
|
/**
 * Parse command line arguments.
 *
 * Reads process.argv (everything after the script path) and overlays the
 * recognised flags on top of defaults taken from the environment-derived
 * module constants. Unrecognised arguments are ignored.
 *
 * Recognised flags:
 *   --sites=a,b            comma-separated site names (default: skipthedrive)
 *   --keywords=a,b         comma-separated keywords
 *   --location=TEXT        location filter
 *   --max-pages=N|all|0    page limit; "all" or "0" means unlimited (stored as 0)
 *   --no-rejected, --exclude-rejected
 *   --output=FMT, --format=FMT   one of json, csv, both
 *   --min-date=YYYY-MM-DD
 *   --and, --all-keywords  require all keywords to match
 *
 * @returns {{sites: string[], keywords: (string[]|null), locationFilter: (string|null),
 *   maxPages: number, excludeRejected: boolean, outputFormat: string,
 *   minDate: (string|undefined), useAndLogic: boolean}} Parsed options.
 */
function parseArguments() {
  const args = process.argv.slice(2);
  const options = {
    sites: ["skipthedrive"], // default
    keywords: null,
    locationFilter: null,
    maxPages: MAX_PAGES,
    excludeRejected: EXCLUDE_REJECTED,
    outputFormat: OUTPUT_FORMAT,
    minDate: MIN_DATE,
    useAndLogic: USE_AND_LOGIC, // Use AND logic instead of OR logic for keywords (from env or CLI)
  };

  // The value is everything after the FIRST "=", so values may themselves
  // contain "=" (split("=")[1] would silently truncate them).
  const valueOf = (arg) => arg.slice(arg.indexOf("=") + 1);

  // Split a comma-separated value, trimming entries and dropping empty ones.
  const toList = (raw) =>
    raw
      .split(",")
      .map((entry) => entry.trim())
      .filter((entry) => entry.length > 0);

  args.forEach((arg) => {
    if (arg.startsWith("--sites=")) {
      const sites = toList(valueOf(arg));
      // An empty list keeps the default instead of producing [""]
      if (sites.length > 0) {
        options.sites = sites;
      }
    } else if (arg.startsWith("--keywords=")) {
      const keywords = toList(valueOf(arg));
      // Leave keywords null when nothing usable was given so the caller
      // falls back to its own default keyword list
      if (keywords.length > 0) {
        options.keywords = keywords;
      }
    } else if (arg.startsWith("--location=")) {
      options.locationFilter = valueOf(arg);
    } else if (arg.startsWith("--max-pages=")) {
      const value = valueOf(arg).trim().toLowerCase();
      // Support "all" or "0" to mean unlimited pages
      if (value === "all" || value === "0") {
        options.maxPages = 0; // 0 means unlimited
      } else {
        const parsed = Number.parseInt(value, 10);
        // NaN and negative numbers fall back to the configured default
        options.maxPages = parsed > 0 ? parsed : MAX_PAGES;
      }
    } else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
      options.excludeRejected = true;
    } else if (arg.startsWith("--output=") || arg.startsWith("--format=")) {
      const format = valueOf(arg).trim().toLowerCase();
      if (["json", "csv", "both"].includes(format)) {
        options.outputFormat = format;
      } else {
        // Report the format that will actually be used, which is not
        // necessarily "json" when OUTPUT_FORMAT is set in the environment
        logger.warning(
          `⚠️ Unknown output format: ${format}. Using default: ${options.outputFormat}`
        );
      }
    } else if (arg.startsWith("--min-date=")) {
      options.minDate = valueOf(arg);
    } else if (arg === "--and" || arg === "--all-keywords") {
      options.useAndLogic = true; // CLI flag overrides env variable
    }
  });

  return options;
}
|
|
|
|
/**
 * Main job search parser function.
 *
 * Runs every selected site strategy in sequence with a shared CoreParser,
 * accumulates accepted and rejected jobs, optionally runs AI analysis on the
 * accepted jobs in batches, and writes the results to the "results" directory
 * next to this file. The output file(s) are rewritten after every site and
 * again after AI analysis, so an interrupted run still leaves usable output.
 *
 * While running, SIGINT/SIGTERM trigger a save of the current results, a
 * parser cleanup and process.exit(0). Those signal listeners are removed
 * again before the function settles.
 *
 * @param {Object} [options] - Programmatic overrides, merged over the options
 *   returned by parseArguments(). `keywords` may be an array or a
 *   comma-separated string.
 * @returns {Promise<Object>} `{ metadata, results, siteResults }`, plus
 *   `rejectedResults` unless rejected jobs are excluded.
 * @throws Rethrows any error raised outside the per-site and per-batch
 *   handlers, after logging it. The parser is cleaned up in every case.
 */
async function startJobSearchParser(options = {}) {
  const cliOptions = parseArguments();
  const finalOptions = { ...cliOptions, ...options };

  const coreParser = new CoreParser({
    headless: HEADLESS,
    timeout: 30000,
  });

  // Kept outside the try block so `finally` can unregister them; otherwise
  // every programmatic call would leave two more listeners on `process`.
  let onSigint = null;
  let onSigterm = null;

  try {
    logger.step("🚀 Job Search Parser Starting...");

    // Parse keywords (accept an array, or a comma-separated string)
    let keywords = finalOptions.keywords || SEARCH_KEYWORDS;
    if (!Array.isArray(keywords)) {
      keywords = String(keywords)
        .split(",")
        .map((k) => k.trim());
    }

    // Parse keyword groups if AND logic is enabled and keywords contain pipe (|) separator
    // Format: "co-op|intern,summer 2026" means (co-op OR intern) AND (summer 2026)
    let keywordGroups = null;
    if (finalOptions.useAndLogic && keywords.some((k) => k.includes("|"))) {
      keywordGroups = keywords.map((group) =>
        group
          .split("|")
          .map((k) => k.trim())
          .filter((k) => k.length > 0)
      );
      logger.info(`🔍 Keyword Groups: ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
    }

    const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
    const sites = finalOptions.sites;
    const excludeRejected =
      finalOptions.excludeRejected !== undefined
        ? finalOptions.excludeRejected
        : EXCLUDE_REJECTED;

    logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
    logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
    if (keywordGroups) {
      logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
    } else {
      logger.info(`🔗 Keyword Logic: ${finalOptions.useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
    }
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    const minDate = finalOptions.minDate || MIN_DATE;
    if (minDate) {
      logger.info(`📅 Min Date Filter: ${minDate} (jobs posted after this date)`);
    }
    logger.info(
      `🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? "Enabled" : "Disabled"}`
    );
    if (ENABLE_AI_ANALYSIS) {
      logger.info(` Context: "${AI_CONTEXT}"`);
      logger.info(` Model: ${OLLAMA_MODEL}`);
    }

    const allResults = [];
    const allRejectedResults = [];
    const siteResults = {};
    let analysisResults = null;

    // Initialize results directory and file for incremental saving
    const resultsDir = path.join(__dirname, "results");
    if (!fs.existsSync(resultsDir)) {
      fs.mkdirSync(resultsDir, { recursive: true });
    }

    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");

    // The CLI validates its own flag, but the environment variable and
    // programmatic options do not pass through that check. An unrecognised
    // value would otherwise create no output file at all.
    let outputFormat = String(finalOptions.outputFormat || OUTPUT_FORMAT).toLowerCase();
    if (!["json", "csv", "both"].includes(outputFormat)) {
      logger.warning(`⚠️ Unknown output format: ${outputFormat}. Using default: json`);
      outputFormat = "json";
    }

    let incrementalJsonFilepath = null;
    let incrementalCsvFilepath = null;

    // Initialize incremental save files
    if (outputFormat === "json" || outputFormat === "both") {
      incrementalJsonFilepath = path.join(resultsDir, `job-search-results-${timestamp}.json`);
    }
    if (outputFormat === "csv" || outputFormat === "both") {
      incrementalCsvFilepath = path.join(resultsDir, `job-search-results-${timestamp}.csv`);
    }

    /**
     * Build the output document shared by the on-disk snapshots and the
     * value returned to the caller.
     */
    const buildOutputData = (currentResults, currentRejectedResults, currentSiteResults, currentAnalysisResults, isComplete) => {
      const now = new Date().toISOString();
      const outputData = {
        metadata: {
          extractedAt: now,
          parser: "job-search-parser",
          version: "2.0.0",
          sites: sites,
          keywords: keywords.join(", "),
          locationFilter,
          aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
          aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined,
          aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined,
          analysisResults: currentAnalysisResults,
          rejectedJobsExcluded: excludeRejected,
          isComplete: isComplete,
          lastUpdated: now,
        },
        results: currentResults,
        siteResults: currentSiteResults,
      };

      if (!excludeRejected) {
        outputData.rejectedResults = currentRejectedResults;
      }

      return outputData;
    };

    /**
     * Save results incrementally as they're found.
     * Best effort: a failed write is logged and never aborts the run.
     */
    const saveIncrementalResults = (currentResults, currentRejectedResults, currentSiteResults, currentAnalysisResults = null, isComplete = false) => {
      try {
        const outputData = buildOutputData(
          currentResults,
          currentRejectedResults,
          currentSiteResults,
          currentAnalysisResults,
          isComplete
        );

        // Save JSON incrementally
        if (incrementalJsonFilepath) {
          fs.writeFileSync(incrementalJsonFilepath, JSON.stringify(outputData, null, 2));
        }

        // Save CSV incrementally (convert on each save)
        if (incrementalCsvFilepath) {
          const csvContent = convertResultsToCsv(outputData);
          fs.writeFileSync(incrementalCsvFilepath, csvContent);
        }

        if (!isComplete) {
          logger.info(`💾 Incremental save: ${currentResults.length} results saved to ${incrementalJsonFilepath || incrementalCsvFilepath}`);
        }
      } catch (error) {
        logger.warning(`⚠️ Failed to save incremental results: ${error.message}`);
      }
    };

    // Save initial empty state
    saveIncrementalResults([], [], {}, null, false);

    // Set up signal handlers for graceful shutdown
    let isShuttingDown = false;
    const gracefulShutdown = async (signal) => {
      if (isShuttingDown) return;
      isShuttingDown = true;

      logger.warning(`\n⚠️ Received ${signal}, saving current results before exit...`);
      saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
      logger.info(`💾 Saved ${allResults.length} results before shutdown`);

      try {
        await coreParser.cleanup();
      } catch (cleanupError) {
        // Results are already on disk; a cleanup failure must not leave the
        // process hanging with an unhandled rejection.
        logger.warning(`⚠️ Cleanup failed during shutdown: ${cleanupError.message}`);
      } finally {
        process.exit(0);
      }
    };

    onSigint = () => {
      void gracefulShutdown("SIGINT");
    };
    onSigterm = () => {
      void gracefulShutdown("SIGTERM");
    };
    process.on("SIGINT", onSigint);
    process.on("SIGTERM", onSigterm);

    // Process each selected site
    for (const site of sites) {
      const strategy = SITE_STRATEGIES[site];
      if (!strategy) {
        logger.error(`❌ Unknown site strategy: ${site}`);
        continue;
      }

      try {
        logger.step(`\n🌐 Parsing ${site}...`);
        const startTime = Date.now();

        // Prepare strategy options
        const strategyOptions = {
          keywords,
          keywordGroups, // Pass grouped keywords if available
          locationFilter,
          maxPages: finalOptions.maxPages,
          useAndLogic: finalOptions.useAndLogic || false,
        };

        // Add credentials for LinkedIn
        if (site === "linkedin") {
          const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
          const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;

          if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
            logger.error(`❌ LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`);
            siteResults[site] = {
              count: 0,
              rejected: 0,
              duration: "0s",
              error: "LinkedIn credentials not found",
            };
            continue;
          }

          strategyOptions.credentials = {
            username: LINKEDIN_USERNAME,
            password: LINKEDIN_PASSWORD,
          };
          strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
          strategyOptions.minDate = minDate; // Add date filter for LinkedIn
        }

        const parseResult = await strategy(coreParser, strategyOptions);

        // Default the lists so a strategy that omits one of them does not
        // throw here and lose the results it did return.
        const { results = [], rejectedResults = [], summary } = parseResult;
        const duration = ((Date.now() - startTime) / 1000).toFixed(2);

        // Collect results
        logger.info(`📦 Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`);
        allResults.push(...results);
        allRejectedResults.push(...rejectedResults);
        logger.info(`📦 Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`);

        siteResults[site] = {
          count: results.length,
          rejected: rejectedResults.length,
          duration: `${duration}s`,
          summary,
        };

        logger.success(
          `✅ ${site} completed in ${duration}s - Found ${results.length} jobs`
        );

        // Save results incrementally after each site
        saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
      } catch (error) {
        logger.error(`❌ ${site} parsing failed: ${error.message}`);
        siteResults[site] = {
          count: 0,
          rejected: 0,
          duration: "0s",
          error: error.message,
        };
        // Save even on error to preserve what we have
        saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
      }
    }

    // AI Analysis if enabled
    // Save results before AI analysis (in case AI analysis takes a long time)
    if (allResults.length > 0) {
      saveIncrementalResults(allResults, allRejectedResults, siteResults, null, false);
    }

    if (ENABLE_AI_ANALYSIS && allResults.length > 0) {
      logger.step("🧠 Running AI Analysis...");

      const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL);
      if (ollamaAvailable) {
        // Prepare data for analysis (analyzeBatch expects objects with 'text' field)
        const analysisData = allResults.map((job) => {
          // Build comprehensive text including all available job information
          const parts = [];
          if (job.title) parts.push(`Title: ${job.title}`);
          if (job.company) parts.push(`Company: ${job.company}`);
          if (job.description) parts.push(`Description: ${job.description}`);
          if (job.roleDuties) parts.push(`Role Duties: ${job.roleDuties}`);
          if (job.jobRequirements) parts.push(`Requirements: ${job.jobRequirements}`);

          return {
            text: parts.join("\n\n"),
            location: job.location || "",
            keyword: job.keyword || "",
            timestamp: job.extractedAt || job.postedDate || "",
            roleDuties: job.roleDuties || "",
            jobRequirements: job.jobRequirements || "",
          };
        });

        // Process in smaller batches to avoid timeouts (5 jobs per batch).
        // A zero or negative AI_BATCH_SIZE would make the loop below never
        // terminate, so only positive integers are accepted.
        const parsedBatchSize = Number.parseInt(process.env.AI_BATCH_SIZE, 10);
        const BATCH_SIZE = parsedBatchSize > 0 ? parsedBatchSize : 5;
        const totalBatches = Math.ceil(analysisData.length / BATCH_SIZE);

        // Placeholder used when a job has no usable analysis
        const buildFallbackAnalysis = (jobIndex, reasoning) => ({
          postIndex: jobIndex + 1,
          isRelevant: true,
          confidence: 0.3,
          reasoning,
        });

        analysisResults = [];

        for (let i = 0; i < analysisData.length; i += BATCH_SIZE) {
          const batch = analysisData.slice(i, i + BATCH_SIZE);
          const batchNumber = Math.floor(i / BATCH_SIZE) + 1;

          logger.info(` Processing batch ${batchNumber}/${totalBatches} (${batch.length} jobs)...`);

          try {
            const batchResults = Array.from(
              await analyzeBatch(batch, AI_CONTEXT, OLLAMA_MODEL)
            );
            // Store by absolute job index rather than appending: the embed
            // step below pairs analysisResults[n] with allResults[n], so a
            // batch returning too few entries must not shift later jobs.
            batch.forEach((_, idx) => {
              const entry = batchResults[idx];
              analysisResults[i + idx] =
                entry !== undefined && entry !== null
                  ? entry
                  : buildFallbackAnalysis(i + idx, "Analysis failed: no result returned for this job");
            });
            logger.success(` ✅ Batch ${batchNumber} completed`);
          } catch (error) {
            logger.error(` ❌ Batch ${batchNumber} failed: ${error.message}`);
            // Add fallback results for this batch
            batch.forEach((_, idx) => {
              analysisResults[i + idx] = buildFallbackAnalysis(
                i + idx,
                `Analysis failed: ${error.message}`
              );
            });
          }
        }

        // Embed AI analysis into each job result
        allResults.forEach((job, index) => {
          if (analysisResults && analysisResults[index]) {
            job.aiAnalysis = {
              isRelevant: analysisResults[index].isRelevant,
              confidence: analysisResults[index].confidence,
              reasoning: analysisResults[index].reasoning,
              context: AI_CONTEXT,
              model: OLLAMA_MODEL,
              analyzedAt: new Date().toISOString(),
            };
          }
        });

        logger.success(
          `✅ AI Analysis completed for ${allResults.length} jobs`
        );

        // Save results after AI analysis completes
        saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
      } else {
        logger.warning("⚠️ Ollama not available, skipping AI analysis");
      }
    }

    // Final save with complete flag
    logger.info(`💾 Preparing final save: ${allResults.length} results, ${allRejectedResults.length} rejected`);
    logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`);

    if (!excludeRejected) {
      logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`);
    } else {
      logger.info(`⏭️ Excluding rejected results (EXCLUDE_REJECTED=true)`);
    }

    logger.info(`💾 Final output: ${allResults.length} results, ${allRejectedResults.length} rejected`);

    // Final save with isComplete flag
    saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, true);

    const savedFiles = [];
    if (incrementalJsonFilepath) savedFiles.push(incrementalJsonFilepath);
    if (incrementalCsvFilepath) savedFiles.push(incrementalCsvFilepath);

    // Final summary
    logger.step("\n📊 Job Search Parser Summary");
    logger.success(`✅ Total jobs found: ${allResults.length}`);
    logger.info(`❌ Total rejected: ${allRejectedResults.length}`);
    logger.info(`📁 Results saved to:`);
    savedFiles.forEach((filepath) => {
      logger.info(` ${filepath}`);
    });

    logger.info("\n📈 Results by site:");
    for (const [site, stats] of Object.entries(siteResults)) {
      if (stats.error) {
        logger.error(` ${site}: ERROR - ${stats.error}`);
      } else {
        logger.info(
          ` ${site}: ${stats.count} jobs found, ${stats.rejected} rejected (${stats.duration})`
        );
      }
    }

    logger.success("\n✅ Job Search Parser completed successfully!");

    // Construct output data for return
    return buildOutputData(allResults, allRejectedResults, siteResults, analysisResults, true);
  } catch (error) {
    logger.error(`❌ Job Search Parser failed: ${error.message}`);
    throw error;
  } finally {
    // Unregister first so a late signal cannot trigger a second cleanup
    if (onSigint) process.removeListener("SIGINT", onSigint);
    if (onSigterm) process.removeListener("SIGTERM", onSigterm);
    await coreParser.cleanup();
  }
}
|
|
|
|
// CLI handling: run the parser only when this file is executed directly,
// not when it is loaded with require() by another module.
if (require.main === module) {
  (async () => {
    try {
      await startJobSearchParser();
    } catch (error) {
      console.error("Fatal error:", error.message);
      process.exit(1);
    }
    process.exit(0);
  })();
}

module.exports = { startJobSearchParser };
|