tanyar09 00c4cf1b6f Enhance job search parser with CSV output support
- Added functionality to export job search results in CSV format alongside JSON.
- Introduced command line options for specifying output format: "json", "csv", or "both".
- Updated README to include usage instructions for CSV output and detailed CSV structure.
- Created utility functions for converting job results to CSV format, ensuring proper field escaping.
2025-12-17 16:13:21 -05:00

371 lines
13 KiB
JavaScript

#!/usr/bin/env node
/**
 * Job Search Parser - Refactored
 *
 * Uses core-parser for browser management and site-specific strategies for parsing logic
 */
const path = require("path");
const fs = require("fs");
const CoreParser = require("../core-parser");
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
const { logger, analyzeBatch, checkOllamaStatus, DEFAULT_MODEL } = require("ai-analyzer");
const { convertResultsToCsv } = require("./src/csv-utils");

// Load environment variables from the .env file that sits next to this script.
require("dotenv").config({ path: path.join(__dirname, ".env") });

// Configuration from environment. CLI flags parsed later may override
// maxPages, excludeRejected, and outputFormat per run.
const HEADLESS = process.env.HEADLESS !== "false"; // headless unless explicitly disabled
const SEARCH_KEYWORDS = process.env.SEARCH_KEYWORDS || "co-op,intern";
const LOCATION_FILTER = process.env.LOCATION_FILTER;
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
const AI_CONTEXT = process.env.AI_CONTEXT || "Job market analysis focusing on job postings, skills, and trends";
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
// Always pass a radix to parseInt; NaN (unset/invalid) falls back to 5.
const MAX_PAGES = parseInt(process.env.MAX_PAGES, 10) || 5;
const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
const OUTPUT_FORMAT = process.env.OUTPUT_FORMAT || "json"; // "json", "csv", or "both"

// Available site strategies, keyed by the name accepted by --sites=.
const SITE_STRATEGIES = {
  skipthedrive: skipthedriveStrategy,
  linkedin: linkedinJobsStrategy,
  // Add more site strategies here
  // indeed: indeedStrategy,
  // glassdoor: glassdoorStrategy,
};
/**
 * Parse command line arguments into a run-options object.
 *
 * Recognized flags:
 *   --sites=a,b            comma-separated site strategy names (default: skipthedrive)
 *   --keywords=a,b         comma-separated search keywords (overrides env)
 *   --location=...         location filter string
 *   --max-pages=N|all|0    page limit; "all"/"0" mean unlimited (stored as 0)
 *   --no-rejected / --exclude-rejected   omit rejected jobs from output
 *   --output=f / --format=f              "json", "csv", or "both"
 *
 * Unknown format values log a warning and keep the default. Values are taken
 * from the substring after the FIRST "=" so values that themselves contain
 * "=" are preserved intact (the previous split("=")[1] truncated them).
 *
 * @returns {{sites: string[], keywords: ?string[], locationFilter: ?string,
 *           maxPages: number, excludeRejected: boolean, outputFormat: string}}
 */
function parseArguments() {
  const args = process.argv.slice(2);
  const options = {
    sites: ["skipthedrive"], // default
    keywords: null,
    locationFilter: null,
    maxPages: MAX_PAGES,
    excludeRejected: EXCLUDE_REJECTED,
    outputFormat: OUTPUT_FORMAT,
  };

  // Everything after the first "=", so values containing "=" survive.
  const valueOf = (arg) => arg.slice(arg.indexOf("=") + 1);

  args.forEach((arg) => {
    if (arg.startsWith("--sites=")) {
      options.sites = valueOf(arg)
        .split(",")
        .map((s) => s.trim());
    } else if (arg.startsWith("--keywords=")) {
      options.keywords = valueOf(arg)
        .split(",")
        .map((k) => k.trim());
    } else if (arg.startsWith("--location=")) {
      options.locationFilter = valueOf(arg);
    } else if (arg.startsWith("--max-pages=")) {
      const value = valueOf(arg);
      // Support "all" or "0" to mean unlimited pages
      if (value === "all" || value === "0") {
        options.maxPages = 0; // 0 means unlimited
      } else {
        options.maxPages = parseInt(value, 10) || MAX_PAGES;
      }
    } else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
      options.excludeRejected = true;
    } else if (arg.startsWith("--output=") || arg.startsWith("--format=")) {
      const format = valueOf(arg).toLowerCase();
      if (["json", "csv", "both"].includes(format)) {
        options.outputFormat = format;
      } else {
        logger.warning(`⚠️ Unknown output format: ${format}. Using default: json`);
      }
    }
  });
  return options;
}
/**
 * Main job search parser function.
 *
 * Orchestrates one full run:
 *  1. Merges CLI flags with programmatic `options` (programmatic wins —
 *     options is spread after cliOptions).
 *  2. Runs every selected site strategy against a shared CoreParser browser,
 *     accumulating accepted and rejected results; a failing site is recorded
 *     in siteResults and does not abort the run.
 *  3. Optionally enriches results with AI analysis via Ollama, in batches.
 *  4. Writes combined output to ./results as JSON and/or CSV per outputFormat.
 *
 * @param {Object} [options={}] - Overrides for the parsed CLI options
 *   (sites, keywords, locationFilter, maxPages, excludeRejected, outputFormat).
 * @returns {Promise<Object>} The outputData object written to disk
 *   (metadata, results, siteResults, and rejectedResults unless excluded).
 * @throws Re-throws any fatal error after logging; the browser is always
 *   cleaned up in the finally block.
 */
async function startJobSearchParser(options = {}) {
  const cliOptions = parseArguments();
  // Programmatic options take precedence over CLI flags.
  const finalOptions = { ...cliOptions, ...options };
  const coreParser = new CoreParser({
    headless: HEADLESS,
    timeout: 30000,
  });
  try {
    logger.step("🚀 Job Search Parser Starting...");
    // Parse keywords: explicit option wins, else env/default CSV string.
    const keywords =
      finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
    const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
    const sites = finalOptions.sites;
    // Explicit boolean check (not ||): a caller-supplied `false` must win over env.
    const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED;
    logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
    logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(
      `🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? "Enabled" : "Disabled"}`
    );
    if (ENABLE_AI_ANALYSIS) {
      logger.info(` Context: "${AI_CONTEXT}"`);
      logger.info(` Model: ${OLLAMA_MODEL}`);
    }
    // Accumulators across all sites.
    const allResults = [];
    const allRejectedResults = [];
    const siteResults = {}; // per-site stats keyed by site name
    // Process each selected site
    for (const site of sites) {
      const strategy = SITE_STRATEGIES[site];
      if (!strategy) {
        logger.error(`❌ Unknown site strategy: ${site}`);
        continue;
      }
      try {
        logger.step(`\n🌐 Parsing ${site}...`);
        const startTime = Date.now();
        // Prepare strategy options
        const strategyOptions = {
          keywords,
          locationFilter,
          maxPages: finalOptions.maxPages,
        };
        // Add credentials for LinkedIn
        if (site === "linkedin") {
          const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
          const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
          if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
            logger.error(`❌ LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`);
            // Record the failure for the summary and skip this site.
            siteResults[site] = {
              count: 0,
              rejected: 0,
              duration: "0s",
              error: "LinkedIn credentials not found",
            };
            continue;
          }
          strategyOptions.credentials = {
            username: LINKEDIN_USERNAME,
            password: LINKEDIN_PASSWORD,
          };
          strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
        }
        // Run the strategy; it is expected to resolve with
        // { results, rejectedResults, summary } (see destructuring below).
        const parseResult = await strategy(coreParser, strategyOptions);
        const { results, rejectedResults, summary } = parseResult;
        const duration = ((Date.now() - startTime) / 1000).toFixed(2);
        // Collect results
        logger.info(`📦 Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`);
        allResults.push(...results);
        allRejectedResults.push(...rejectedResults);
        logger.info(`📦 Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`);
        siteResults[site] = {
          count: results.length,
          rejected: rejectedResults.length,
          duration: `${duration}s`,
          summary,
        };
        logger.success(
          `${site} completed in ${duration}s - Found ${results.length} jobs`
        );
      } catch (error) {
        // One site failing must not abort the others: log and keep going.
        logger.error(`${site} parsing failed: ${error.message}`);
        siteResults[site] = {
          count: 0,
          rejected: 0,
          duration: "0s",
          error: error.message,
        };
      }
    }
    // AI Analysis if enabled
    let analysisResults = null;
    if (ENABLE_AI_ANALYSIS && allResults.length > 0) {
      logger.step("🧠 Running AI Analysis...");
      const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL);
      if (ollamaAvailable) {
        // Prepare data for analysis (analyzeBatch expects objects with 'text' field)
        const analysisData = allResults.map((job) => ({
          text: `${job.title || ""} at ${job.company || ""}. ${job.description || ""}`.trim(),
          location: job.location || "",
          keyword: job.keyword || "",
          timestamp: job.extractedAt || job.postedDate || "",
        }));
        // Process in smaller batches to avoid timeouts (5 jobs per batch)
        const BATCH_SIZE = parseInt(process.env.AI_BATCH_SIZE) || 5;
        analysisResults = [];
        for (let i = 0; i < analysisData.length; i += BATCH_SIZE) {
          const batch = analysisData.slice(i, i + BATCH_SIZE);
          const batchNumber = Math.floor(i / BATCH_SIZE) + 1;
          const totalBatches = Math.ceil(analysisData.length / BATCH_SIZE);
          logger.info(` Processing batch ${batchNumber}/${totalBatches} (${batch.length} jobs)...`);
          try {
            const batchResults = await analyzeBatch(
              batch,
              AI_CONTEXT,
              OLLAMA_MODEL
            );
            analysisResults.push(...batchResults);
            logger.success(` ✅ Batch ${batchNumber} completed`);
          } catch (error) {
            logger.error(` ❌ Batch ${batchNumber} failed: ${error.message}`);
            // Add fallback results for this batch so index alignment with
            // allResults is preserved (marked relevant, low confidence).
            const fallbackResults = batch.map((_, idx) => ({
              postIndex: i + idx + 1,
              isRelevant: true,
              confidence: 0.3,
              reasoning: `Analysis failed: ${error.message}`,
            }));
            analysisResults.push(...fallbackResults);
          }
        }
        // Embed AI analysis into each job result
        // NOTE(review): pairing is positional — assumes analyzeBatch returns
        // exactly one entry per input, in order; verify against ai-analyzer.
        allResults.forEach((job, index) => {
          if (analysisResults && analysisResults[index]) {
            job.aiAnalysis = {
              isRelevant: analysisResults[index].isRelevant,
              confidence: analysisResults[index].confidence,
              reasoning: analysisResults[index].reasoning,
              context: AI_CONTEXT,
              model: OLLAMA_MODEL,
              analyzedAt: new Date().toISOString(),
            };
          }
        });
        logger.success(
          `✅ AI Analysis completed for ${allResults.length} jobs`
        );
      } else {
        logger.warning("⚠️ Ollama not available, skipping AI analysis");
      }
    }
    // Save results
    logger.info(`💾 Preparing to save: ${allResults.length} results, ${allRejectedResults.length} rejected`);
    logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`);
    const outputData = {
      metadata: {
        extractedAt: new Date().toISOString(),
        parser: "job-search-parser",
        version: "2.0.0",
        sites: sites,
        keywords: keywords.join(", "),
        locationFilter,
        aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
        // undefined fields are dropped by JSON.stringify, so these only
        // appear in the output file when AI analysis is enabled.
        aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined,
        aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined,
        analysisResults,
        rejectedJobsExcluded: excludeRejected,
      },
      results: allResults,
      siteResults,
    };
    // Always include rejectedResults if not excluded (make it explicit, not using spread)
    if (!excludeRejected) {
      outputData.rejectedResults = allRejectedResults;
      logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`);
    } else {
      logger.info(`⏭️ Excluding rejected results (EXCLUDE_REJECTED=true)`);
    }
    logger.info(`💾 Final output: ${outputData.results.length} results, ${outputData.rejectedResults?.length || 0} rejected`);
    const resultsDir = path.join(__dirname, "results");
    if (!fs.existsSync(resultsDir)) {
      fs.mkdirSync(resultsDir, { recursive: true });
    }
    // ":" and "." are replaced because they are invalid/awkward in filenames.
    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    const outputFormat = finalOptions.outputFormat || OUTPUT_FORMAT;
    const savedFiles = [];
    // Save JSON if format is "json" or "both"
    if (outputFormat === "json" || outputFormat === "both") {
      const jsonFilename = `job-search-results-${timestamp}.json`;
      const jsonFilepath = path.join(resultsDir, jsonFilename);
      fs.writeFileSync(jsonFilepath, JSON.stringify(outputData, null, 2));
      savedFiles.push(jsonFilepath);
    }
    // Save CSV if format is "csv" or "both"
    if (outputFormat === "csv" || outputFormat === "both") {
      const csvFilename = `job-search-results-${timestamp}.csv`;
      const csvFilepath = path.join(resultsDir, csvFilename);
      const csvContent = convertResultsToCsv(outputData);
      fs.writeFileSync(csvFilepath, csvContent);
      savedFiles.push(csvFilepath);
    }
    // Final summary
    logger.step("\n📊 Job Search Parser Summary");
    logger.success(`✅ Total jobs found: ${allResults.length}`);
    logger.info(`❌ Total rejected: ${allRejectedResults.length}`);
    logger.info(`📁 Results saved to:`);
    savedFiles.forEach(filepath => {
      logger.info(` ${filepath}`);
    });
    logger.info("\n📈 Results by site:");
    for (const [site, stats] of Object.entries(siteResults)) {
      if (stats.error) {
        logger.error(` ${site}: ERROR - ${stats.error}`);
      } else {
        logger.info(
          ` ${site}: ${stats.count} jobs found, ${stats.rejected} rejected (${stats.duration})`
        );
      }
    }
    logger.success("\n✅ Job Search Parser completed successfully!");
    return outputData;
  } catch (error) {
    logger.error(`❌ Job Search Parser failed: ${error.message}`);
    throw error;
  } finally {
    // Always release the browser, on success or failure.
    await coreParser.cleanup();
  }
}
// CLI entry point: run the parser only when this file is executed directly
// (node job-search-parser.js), not when it is require()d as a library.
if (require.main === module) {
  (async () => {
    try {
      await startJobSearchParser();
      process.exit(0);
    } catch (error) {
      console.error("Fatal error:", error.message);
      process.exit(1);
    }
  })();
}

module.exports = { startJobSearchParser };