#!/usr/bin/env node

/**
 * Job Search Parser - Refactored
 *
 * Uses core-parser for browser management and site-specific strategies for parsing logic
 */

const path = require("path");
const fs = require("fs");
const CoreParser = require("../core-parser");
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
const { indeedStrategy } = require("./strategies/indeed-strategy");
const {
  logger,
  analyzeBatch,
  checkOllamaStatus,
  DEFAULT_MODEL,
} = require("ai-analyzer");
const { convertResultsToCsv } = require("./src/csv-utils");

// Load environment variables
require("dotenv").config({ path: path.join(__dirname, ".env") });

// Configuration from environment
const HEADLESS = process.env.HEADLESS !== "false";
const SEARCH_KEYWORDS = process.env.SEARCH_KEYWORDS || "co-op,intern"; //"software engineer,developer,programmer";
const LOCATION_FILTER = process.env.LOCATION_FILTER;
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
const AI_CONTEXT =
  process.env.AI_CONTEXT ||
  "Job market analysis focusing on job postings, skills, and trends";
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
// Always pass a radix to parseInt; NaN (unset/garbage env) falls back to 5.
const MAX_PAGES = Number.parseInt(process.env.MAX_PAGES, 10) || 5;
const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
const OUTPUT_FORMAT = process.env.OUTPUT_FORMAT || "json"; // "json", "csv", or "both"
const MIN_DATE = process.env.MIN_DATE; // Minimum posted date (format: YYYY-MM-DD)
const USE_AND_LOGIC = process.env.USE_AND_LOGIC === "true"; // Use AND logic for keywords

// Available site strategies
const SITE_STRATEGIES = {
  skipthedrive: skipthedriveStrategy,
  linkedin: linkedinJobsStrategy,
  indeed: indeedStrategy,
  // Add more site strategies here
  // glassdoor: glassdoorStrategy,
};

/**
 * Parse command line arguments.
 *
 * Supported flags:
 *   --sites=a,b                          comma-separated site keys
 *   --keywords=a,b                       comma-separated search keywords
 *   --location=X                         location filter
 *   --max-pages=N | all | 0              page limit (0/"all" = unlimited)
 *   --no-rejected / --exclude-rejected   drop rejected jobs from output
 *   --output=F / --format=F              json | csv | both
 *   --min-date=YYYY-MM-DD                minimum posted date
 *   --and / --all-keywords               AND logic for keywords
 *
 * @returns {object} Options merged from env-derived defaults and CLI flags.
 */
function parseArguments() {
  const args = process.argv.slice(2);
  const options = {
    sites: ["skipthedrive"], // default
    keywords: null,
    locationFilter: null,
    maxPages: MAX_PAGES,
    excludeRejected: EXCLUDE_REJECTED,
    outputFormat: OUTPUT_FORMAT,
    minDate: MIN_DATE,
    useAndLogic: USE_AND_LOGIC, // Use AND logic instead of OR logic for keywords (from env or CLI)
  };

  args.forEach((arg) => {
    if (arg.startsWith("--sites=")) {
      options.sites = arg
        .split("=")[1]
        .split(",")
        .map((s) => s.trim());
    } else if (arg.startsWith("--keywords=")) {
      options.keywords = arg
        .split("=")[1]
        .split(",")
        .map((k) => k.trim());
    } else if (arg.startsWith("--location=")) {
      options.locationFilter = arg.split("=")[1];
    } else if (arg.startsWith("--max-pages=")) {
      const value = arg.split("=")[1];
      // Support "all" or "0" to mean unlimited pages
      if (value === "all" || value === "0") {
        options.maxPages = 0; // 0 means unlimited
      } else {
        options.maxPages = Number.parseInt(value, 10) || MAX_PAGES;
      }
    } else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
      options.excludeRejected = true;
    } else if (arg.startsWith("--output=") || arg.startsWith("--format=")) {
      const format = arg.split("=")[1].toLowerCase();
      if (["json", "csv", "both"].includes(format)) {
        options.outputFormat = format;
      } else {
        logger.warning(
          `āš ļø Unknown output format: ${format}. Using default: json`
        );
      }
    } else if (arg.startsWith("--min-date=")) {
      options.minDate = arg.split("=")[1];
    } else if (arg === "--and" || arg === "--all-keywords") {
      options.useAndLogic = true; // CLI flag overrides env variable
    }
  });

  return options;
}

/**
 * Main job search parser function.
 *
 * Orchestrates: CLI/env option merging, per-site strategy execution,
 * incremental JSON/CSV saving (including on SIGINT/SIGTERM), optional
 * batched AI analysis via Ollama, and a final summary.
 *
 * @param {object} [options] Programmatic overrides; these take precedence
 *   over CLI flags (spread after cliOptions).
 * @returns {Promise<object>} The final output payload (metadata + results).
 * @throws Rethrows any fatal error after logging; browser is cleaned up
 *   in `finally` either way.
 */
async function startJobSearchParser(options = {}) {
  const cliOptions = parseArguments();
  const finalOptions = { ...cliOptions, ...options };

  const coreParser = new CoreParser({
    headless: HEADLESS,
    timeout: 30000,
  });

  try {
    logger.step("šŸš€ Job Search Parser Starting...");

    // Parse keywords
    let keywords =
      finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());

    // Parse keyword groups if AND logic is enabled and keywords contain pipe (|) separator
    // Format: "co-op|intern,summer 2026" means (co-op OR intern) AND (summer 2026)
    let keywordGroups = null;
    if (finalOptions.useAndLogic && keywords.some((k) => k.includes("|"))) {
      keywordGroups = keywords.map((group) =>
        group
          .split("|")
          .map((k) => k.trim())
          .filter((k) => k.length > 0)
      );
      logger.info(
        `šŸ” Keyword Groups: ${keywordGroups.map((g) => `(${g.join(" OR ")})`).join(" AND ")}`
      );
    }

    const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
    const sites = finalOptions.sites;
    const excludeRejected =
      finalOptions.excludeRejected !== undefined
        ? finalOptions.excludeRejected
        : EXCLUDE_REJECTED;

    logger.info(`šŸ“¦ Selected job sites: ${sites.join(", ")}`);
    logger.info(`šŸ” Search Keywords: ${keywords.join(", ")}`);
    if (keywordGroups) {
      logger.info(
        `šŸ”— Keyword Logic: Grouped AND/OR - ${keywordGroups.map((g) => `(${g.join(" OR ")})`).join(" AND ")}`
      );
    } else {
      logger.info(
        `šŸ”— Keyword Logic: ${finalOptions.useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`
      );
    }
    logger.info(`šŸ“ Location Filter: ${locationFilter || "None"}`);
    const minDate = finalOptions.minDate || MIN_DATE;
    if (minDate) {
      logger.info(`šŸ“… Min Date Filter: ${minDate} (jobs posted after this date)`);
    }
    logger.info(
      `🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? "Enabled" : "Disabled"}`
    );
    if (ENABLE_AI_ANALYSIS) {
      logger.info(`   Context: "${AI_CONTEXT}"`);
      logger.info(`   Model: ${OLLAMA_MODEL}`);
    }

    const allResults = [];
    const allRejectedResults = [];
    const siteResults = {};
    let analysisResults = null;

    // Initialize results directory and file for incremental saving
    const resultsDir = path.join(__dirname, "results");
    if (!fs.existsSync(resultsDir)) {
      fs.mkdirSync(resultsDir, { recursive: true });
    }

    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    const outputFormat = finalOptions.outputFormat || OUTPUT_FORMAT;
    let incrementalJsonFilepath = null;
    let incrementalCsvFilepath = null;

    // Initialize incremental save files
    if (outputFormat === "json" || outputFormat === "both") {
      const jsonFilename = `job-search-results-${timestamp}.json`;
      incrementalJsonFilepath = path.join(resultsDir, jsonFilename);
    }
    if (outputFormat === "csv" || outputFormat === "both") {
      const csvFilename = `job-search-results-${timestamp}.csv`;
      incrementalCsvFilepath = path.join(resultsDir, csvFilename);
    }

    /**
     * Build the canonical output payload. Shared by the incremental saver and
     * the final return value so the two shapes can never drift apart.
     */
    const buildOutputData = (
      currentResults,
      currentRejectedResults,
      currentSiteResults,
      currentAnalysisResults,
      isComplete
    ) => {
      const outputData = {
        metadata: {
          extractedAt: new Date().toISOString(),
          parser: "job-search-parser",
          version: "2.0.0",
          sites: sites,
          keywords: keywords.join(", "),
          locationFilter,
          aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
          aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined,
          aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined,
          analysisResults: currentAnalysisResults,
          rejectedJobsExcluded: excludeRejected,
          isComplete: isComplete,
          lastUpdated: new Date().toISOString(),
        },
        results: currentResults,
        siteResults: currentSiteResults,
      };
      if (!excludeRejected) {
        outputData.rejectedResults = currentRejectedResults;
      }
      return outputData;
    };

    /**
     * Save results incrementally as they're found. Best-effort: failures are
     * logged as warnings and never abort the run.
     */
    const saveIncrementalResults = (
      currentResults,
      currentRejectedResults,
      currentSiteResults,
      currentAnalysisResults = null,
      isComplete = false
    ) => {
      try {
        const outputData = buildOutputData(
          currentResults,
          currentRejectedResults,
          currentSiteResults,
          currentAnalysisResults,
          isComplete
        );

        // Save JSON incrementally
        if (incrementalJsonFilepath) {
          fs.writeFileSync(
            incrementalJsonFilepath,
            JSON.stringify(outputData, null, 2)
          );
        }

        // Save CSV incrementally (convert on each save)
        if (incrementalCsvFilepath) {
          const csvContent = convertResultsToCsv(outputData);
          fs.writeFileSync(incrementalCsvFilepath, csvContent);
        }

        if (!isComplete) {
          logger.info(
            `šŸ’¾ Incremental save: ${currentResults.length} results saved to ${incrementalJsonFilepath || incrementalCsvFilepath}`
          );
        }
      } catch (error) {
        logger.warning(`āš ļø Failed to save incremental results: ${error.message}`);
      }
    };

    // Save initial empty state
    saveIncrementalResults([], [], {}, null, false);

    // Set up signal handlers for graceful shutdown
    let isShuttingDown = false;
    const gracefulShutdown = async (signal) => {
      if (isShuttingDown) return;
      isShuttingDown = true;
      logger.warning(`\nāš ļø Received ${signal}, saving current results before exit...`);
      saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
      logger.info(`šŸ’¾ Saved ${allResults.length} results before shutdown`);
      await coreParser.cleanup();
      process.exit(0);
    };

    // `once` so repeated programmatic invocations of startJobSearchParser
    // don't accumulate listeners (matches the one-shot isShuttingDown guard).
    process.once("SIGINT", () => gracefulShutdown("SIGINT"));
    process.once("SIGTERM", () => gracefulShutdown("SIGTERM"));

    // Process each selected site
    for (const site of sites) {
      const strategy = SITE_STRATEGIES[site];
      if (!strategy) {
        logger.error(`āŒ Unknown site strategy: ${site}`);
        continue;
      }

      try {
        logger.step(`\n🌐 Parsing ${site}...`);
        const startTime = Date.now();

        // Prepare strategy options
        const strategyOptions = {
          keywords,
          keywordGroups, // Pass grouped keywords if available
          locationFilter,
          maxPages: finalOptions.maxPages,
          useAndLogic: finalOptions.useAndLogic || false,
        };

        // Add credentials for LinkedIn
        if (site === "linkedin") {
          const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
          const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
          if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
            logger.error(
              `āŒ LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`
            );
            siteResults[site] = {
              count: 0,
              rejected: 0,
              duration: "0s",
              error: "LinkedIn credentials not found",
            };
            continue;
          }
          strategyOptions.credentials = {
            username: LINKEDIN_USERNAME,
            password: LINKEDIN_PASSWORD,
          };
          strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
          strategyOptions.minDate = minDate; // Add date filter for LinkedIn
        }

        const parseResult = await strategy(coreParser, strategyOptions);
        const { results, rejectedResults, summary } = parseResult;
        const duration = ((Date.now() - startTime) / 1000).toFixed(2);

        // Collect results
        logger.info(
          `šŸ“¦ Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`
        );
        allResults.push(...results);
        allRejectedResults.push(...rejectedResults);
        logger.info(
          `šŸ“¦ Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`
        );

        siteResults[site] = {
          count: results.length,
          rejected: rejectedResults.length,
          duration: `${duration}s`,
          summary,
        };

        logger.success(
          `āœ… ${site} completed in ${duration}s - Found ${results.length} jobs`
        );

        // Save results incrementally after each site
        saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
      } catch (error) {
        logger.error(`āŒ ${site} parsing failed: ${error.message}`);
        siteResults[site] = {
          count: 0,
          rejected: 0,
          duration: "0s",
          error: error.message,
        };
        // Save even on error to preserve what we have
        saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
      }
    }

    // AI Analysis if enabled
    // Save results before AI analysis (in case AI analysis takes a long time)
    if (allResults.length > 0) {
      saveIncrementalResults(allResults, allRejectedResults, siteResults, null, false);
    }

    if (ENABLE_AI_ANALYSIS && allResults.length > 0) {
      logger.step("🧠 Running AI Analysis...");

      const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL);

      if (ollamaAvailable) {
        // Prepare data for analysis (analyzeBatch expects objects with 'text' field)
        const analysisData = allResults.map((job) => {
          // Build comprehensive text including all available job information
          const parts = [];
          if (job.title) parts.push(`Title: ${job.title}`);
          if (job.company) parts.push(`Company: ${job.company}`);
          if (job.description) parts.push(`Description: ${job.description}`);
          if (job.roleDuties) parts.push(`Role Duties: ${job.roleDuties}`);
          if (job.jobRequirements) parts.push(`Requirements: ${job.jobRequirements}`);

          return {
            text: parts.join("\n\n"),
            location: job.location || "",
            keyword: job.keyword || "",
            timestamp: job.extractedAt || job.postedDate || "",
            roleDuties: job.roleDuties || "",
            jobRequirements: job.jobRequirements || "",
          };
        });

        // Process in smaller batches to avoid timeouts (5 jobs per batch)
        const BATCH_SIZE = Number.parseInt(process.env.AI_BATCH_SIZE, 10) || 5;
        analysisResults = [];

        for (let i = 0; i < analysisData.length; i += BATCH_SIZE) {
          const batch = analysisData.slice(i, i + BATCH_SIZE);
          const batchNumber = Math.floor(i / BATCH_SIZE) + 1;
          const totalBatches = Math.ceil(analysisData.length / BATCH_SIZE);

          logger.info(
            `   Processing batch ${batchNumber}/${totalBatches} (${batch.length} jobs)...`
          );

          try {
            const batchResults = await analyzeBatch(
              batch,
              AI_CONTEXT,
              OLLAMA_MODEL
            );
            analysisResults.push(...batchResults);
            logger.success(`   āœ… Batch ${batchNumber} completed`);
          } catch (error) {
            logger.error(`   āŒ Batch ${batchNumber} failed: ${error.message}`);
            // Add fallback results for this batch
            const fallbackResults = batch.map((_, idx) => ({
              postIndex: i + idx + 1,
              isRelevant: true,
              confidence: 0.3,
              reasoning: `Analysis failed: ${error.message}`,
            }));
            analysisResults.push(...fallbackResults);
          }
        }

        // Embed AI analysis into each job result
        allResults.forEach((job, index) => {
          if (analysisResults && analysisResults[index]) {
            job.aiAnalysis = {
              isRelevant: analysisResults[index].isRelevant,
              confidence: analysisResults[index].confidence,
              reasoning: analysisResults[index].reasoning,
              context: AI_CONTEXT,
              model: OLLAMA_MODEL,
              analyzedAt: new Date().toISOString(),
            };
          }
        });

        logger.success(
          `āœ… AI Analysis completed for ${allResults.length} jobs`
        );

        // Save results after AI analysis completes
        saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
      } else {
        logger.warning("āš ļø Ollama not available, skipping AI analysis");
      }
    }

    // Final save with complete flag
    logger.info(
      `šŸ’¾ Preparing final save: ${allResults.length} results, ${allRejectedResults.length} rejected`
    );
    logger.info(
      `šŸ’¾ EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`
    );

    if (!excludeRejected) {
      logger.info(
        `āœ… Including ${allRejectedResults.length} rejected results in output`
      );
    } else {
      logger.info(`ā­ļø Excluding rejected results (EXCLUDE_REJECTED=true)`);
    }

    logger.info(
      `šŸ’¾ Final output: ${allResults.length} results, ${allRejectedResults.length} rejected`
    );

    // Final save with isComplete flag
    saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, true);

    const savedFiles = [];
    if (incrementalJsonFilepath) savedFiles.push(incrementalJsonFilepath);
    if (incrementalCsvFilepath) savedFiles.push(incrementalCsvFilepath);

    // Final summary
    logger.step("\nšŸ“Š Job Search Parser Summary");
    logger.success(`āœ… Total jobs found: ${allResults.length}`);
    logger.info(`āŒ Total rejected: ${allRejectedResults.length}`);
    logger.info(`šŸ“ Results saved to:`);
    savedFiles.forEach((filepath) => {
      logger.info(`   ${filepath}`);
    });

    logger.info("\nšŸ“ˆ Results by site:");
    for (const [site, stats] of Object.entries(siteResults)) {
      if (stats.error) {
        logger.error(`   ${site}: ERROR - ${stats.error}`);
      } else {
        logger.info(
          `   ${site}: ${stats.count} jobs found, ${stats.rejected} rejected (${stats.duration})`
        );
      }
    }

    logger.success("\nāœ… Job Search Parser completed successfully!");

    // Construct output data for return (same shape as the incremental saves)
    return buildOutputData(
      allResults,
      allRejectedResults,
      siteResults,
      analysisResults,
      true
    );
  } catch (error) {
    logger.error(`āŒ Job Search Parser failed: ${error.message}`);
    throw error;
  } finally {
    await coreParser.cleanup();
  }
}

// CLI handling
if (require.main === module) {
  startJobSearchParser()
    .then(() => process.exit(0))
    .catch((error) => {
      console.error("Fatal error:", error.message);
      process.exit(1);
    });
}

module.exports = { startJobSearchParser };