Enhance job search parser with CSV output support

- Added functionality to export job search results in CSV format alongside JSON.
- Introduced command line options for specifying output format: "json", "csv", or "both".
- Updated README to include usage instructions for CSV output and detailed CSV structure.
- Created utility functions for converting job results to CSV format, ensuring proper field escaping.
2025-12-17 16:13:21 -05:00 · 2025-12-17 16:13:21 -05:00 · 00c4cf1b6f
commit 00c4cf1b6f
parent 8f526b3518
3 changed files with 206 additions and 27 deletions
--- a/job-search-parser/README.md
+++ b/job-search-parser/README.md
@ -141,7 +141,7 @@ ENABLE_AI_ANALYSIS=false
 HEADLESS=true

 # Output Configuration
-OUTPUT_FORMAT=json
+OUTPUT_FORMAT=json  # Options: "json", "csv", or "both"
 ```

 ### Command Line Options
@ -150,31 +150,34 @@ OUTPUT_FORMAT=json
 # Basic usage
 node index.js

-# Specific roles
-node index.js --roles="frontend developer,backend developer"
+# Select sites to parse
+node index.js --sites=linkedin,skipthedrive

-# Geographic focus
-node index.js --locations="Toronto,Vancouver"
+# Search keywords
+node index.js --keywords="software engineer,developer"

-# Experience level
-node index.js --experience="senior"
+# Location filter
+node index.js --location="Ontario"

-# Output format
-node index.js --output=results/job-market-analysis.json
+# Max pages to parse
+node index.js --max-pages=10
+
+# Exclude rejected results
+node index.js --no-rejected
+
+# Output format (json, csv, or both)
+node index.js --output=csv
+node index.js --output=both
 ```

 **Available Options:**

- `--roles="role1,role2"`: Target job roles
- `--locations="city1,city2"`: Geographic focus
- `--experience="entry|mid|senior"`: Experience level
- `--remote="remote|hybrid|onsite"`: Remote work preference
- `--salary-min=NUMBER`: Minimum salary filter
- `--salary-max=NUMBER`: Maximum salary filter
- `--output=FILE`: Output filename
- `--format=json|csv`: Output format
- `--trends`: Enable trend analysis
- `--skills`: Enable skill analysis
+- `--sites="site1,site2"`: Job sites to parse (linkedin, skipthedrive)
+- `--keywords="keyword1,keyword2"`: Search keywords
+- `--location="LOCATION"`: Location filter
+- `--max-pages=NUMBER`: Maximum pages to parse (0 or "all" for unlimited)
+- `--no-rejected` or `--exclude-rejected`: Exclude rejected results from output
+- `--output=FORMAT` or `--format=FORMAT`: Output format - "json", "csv", or "both" (default: "json")

 ## 📊 Keywords

@ -373,12 +376,46 @@ node index.js --companies="Google,Microsoft,Amazon"

 ### CSV Output

-The parser can also generate CSV files for easy analysis:
+The parser can generate CSV files for easy spreadsheet analysis. Use `--output=csv` or `OUTPUT_FORMAT=csv` to export results as CSV.
+
+**CSV Columns:**
+- `jobId`: Unique job identifier
+- `title`: Job title
+- `company`: Company name
+- `location`: Job location
+- `jobUrl`: Link to job posting
+- `postedDate`: Date job was posted
+- `description`: Job description
+- `jobType`: Type of job (full-time, part-time, contract, etc.)
+- `experienceLevel`: Required experience level
+- `keyword`: Search keyword that matched
+- `extractedAt`: Timestamp when job was extracted
+- `source`: Source site (e.g., "linkedin-jobs", "skipthedrive")
+- `aiRelevant`: AI analysis relevance (Yes/No; empty when the job has no AI analysis)
+- `aiConfidence`: AI confidence score (0-1)
+- `aiReasoning`: AI reasoning for relevance
+- `aiContext`: AI analysis context
+- `aiModel`: AI model used for analysis
+- `aiAnalyzedAt`: Timestamp of AI analysis
+
+**Example CSV Output:**

 ```csv
-job_id,title,company,location,remote_type,salary_min,salary_max,required_skills,experience_level,posted_date
-job_1,Senior Software Engineer,TechCorp,Toronto,hybrid,100000,140000,"React,Node.js,TypeScript",senior,2024-01-10
-job_2,Data Scientist,DataCorp,Vancouver,remote,90000,130000,"Python,SQL,ML",mid,2024-01-09
+jobId,title,company,location,jobUrl,postedDate,description,jobType,experienceLevel,keyword,extractedAt,source,aiRelevant,aiConfidence,aiReasoning,aiContext,aiModel,aiAnalyzedAt
+4344137241,Web Applications Co-op/Intern,Nokia,Kanata ON (Hybrid),https://www.linkedin.com/jobs/view/4344137241,,"Web Applications Co-op/Intern",,,co-op,2025-12-17T04:50:05.600Z,linkedin-jobs,Yes,0.8,"The post mentions a co-op/intern position",co-op and internship opportunities for First year Math students,mistral,2025-12-17T04:58:33.479Z
+```
+
+**Usage:**
+
+```bash
+# Export as CSV only
+node index.js --output=csv
+
+# Export both JSON and CSV
+node index.js --output=both
+
+# Using environment variable
+OUTPUT_FORMAT=csv node index.js
 ```

 ## 🔒 Security & Best Practices
--- a/job-search-parser/index.js
+++ b/job-search-parser/index.js
@ -12,6 +12,7 @@ const CoreParser = require("../core-parser");
 const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
 const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
 const { logger, analyzeBatch, checkOllamaStatus, DEFAULT_MODEL } = require("ai-analyzer");
+const { convertResultsToCsv } = require("./src/csv-utils");

 // Load environment variables
 require("dotenv").config({ path: path.join(__dirname, ".env") });
@ -26,6 +27,7 @@ const AI_CONTEXT = process.env.AI_CONTEXT || "Job market analysis focusing on jo
 const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
 const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
 const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
+// Output format from the environment: "json", "csv", or "both" (default "json").
+// The value is trimmed, lower-cased and validated here because the save step only
+// recognizes these three exact strings; any other value would match no branch and
+// the run would finish without writing a results file.
+const OUTPUT_FORMAT = (() => {
+  const requested = (process.env.OUTPUT_FORMAT || "json").trim().toLowerCase();
+  if (["json", "csv", "both"].includes(requested)) {
+    return requested;
+  }
+  logger.warning(`⚠️  Unknown OUTPUT_FORMAT: ${requested}. Using default: json`);
+  return "json";
+})();

 // Available site strategies
 const SITE_STRATEGIES = {
@ -47,6 +49,7 @@ function parseArguments() {
    locationFilter: null,
    maxPages: MAX_PAGES,
    excludeRejected: EXCLUDE_REJECTED,
+    outputFormat: OUTPUT_FORMAT,
  };

  args.forEach((arg) => {
@ -72,6 +75,13 @@ function parseArguments() {
      }
    } else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
      options.excludeRejected = true;
+    } else if (arg.startsWith("--output=") || arg.startsWith("--format=")) {
+      const format = arg.split("=")[1].toLowerCase();
+      if (["json", "csv", "both"].includes(format)) {
+        options.outputFormat = format;
+      } else {
+        logger.warning(`⚠️  Unknown output format: ${format}. Using default: json`);
+      }
    }
  });

@ -296,16 +306,34 @@ async function startJobSearchParser(options = {}) {
    }

    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
-    const filename = `job-search-results-${timestamp}.json`;
-    const filepath = path.join(resultsDir, filename);
+    const outputFormat = finalOptions.outputFormat || OUTPUT_FORMAT;
+    const savedFiles = [];

-    fs.writeFileSync(filepath, JSON.stringify(outputData, null, 2));
+    // Save JSON if format is "json" or "both"
+    if (outputFormat === "json" || outputFormat === "both") {
+      const jsonFilename = `job-search-results-${timestamp}.json`;
+      const jsonFilepath = path.join(resultsDir, jsonFilename);
+      fs.writeFileSync(jsonFilepath, JSON.stringify(outputData, null, 2));
+      savedFiles.push(jsonFilepath);
+    }
+
+    // Save CSV if format is "csv" or "both"
+    if (outputFormat === "csv" || outputFormat === "both") {
+      const csvFilename = `job-search-results-${timestamp}.csv`;
+      const csvFilepath = path.join(resultsDir, csvFilename);
+      const csvContent = convertResultsToCsv(outputData);
+      fs.writeFileSync(csvFilepath, csvContent);
+      savedFiles.push(csvFilepath);
+    }

    // Final summary
    logger.step("\n📊 Job Search Parser Summary");
    logger.success(`✅ Total jobs found: ${allResults.length}`);
    logger.info(`❌ Total rejected: ${allRejectedResults.length}`);
-    logger.info(`📁 Results saved to: ${filepath}`);
+    logger.info(`📁 Results saved to:`);
+    savedFiles.forEach(filepath => {
+      logger.info(`   ${filepath}`);
+    });

    logger.info("\n📈 Results by site:");
    for (const [site, stats] of Object.entries(siteResults)) {
--- a/job-search-parser/src/csv-utils.js
+++ b/job-search-parser/src/csv-utils.js
@ -0,0 +1,114 @@
+/**
+ * CSV Utilities
+ * 
+ * Functions for converting job search results to CSV format
+ */
+
+/**
+ * Escapes a single value for use as one CSV field.
+ *
+ * `null` and `undefined` become an empty field; every other value is converted
+ * with `String(value)`. A field containing a comma, a double quote, a line feed
+ * or a carriage return is wrapped in double quotes, with embedded double quotes
+ * doubled.
+ *
+ * @param {*} value - The value to escape
+ * @returns {string} - The escaped value
+ */
+function escapeCsvField(value) {
+  if (value === null || value === undefined) {
+    return "";
+  }
+
+  const stringValue = String(value);
+
+  // "\r" must force quoting too: a bare carriage return is treated as a row
+  // break by many CSV readers, exactly like "\n".
+  if (/[",\r\n]/.test(stringValue)) {
+    return `"${stringValue.replace(/"/g, '""')}"`;
+  }
+
+  return stringValue;
+}
+
+/**
+ * Converts job results to CSV format.
+ *
+ * The output is a header row followed by one row per job, joined with "\n".
+ * Non-AI columns are read from the job object under the same name. AI columns
+ * are read from `job.aiAnalysis`; when a job has no `aiAnalysis`, all of its
+ * AI columns are left empty. `aiRelevant` is written as "Yes" or "No" from the
+ * truthiness of `aiAnalysis.isRelevant`.
+ *
+ * Missing values (`null`/`undefined`) become empty fields; other falsy values
+ * such as `0` or `false` are written out rather than blanked.
+ *
+ * @param {Array} jobs - Array of job objects
+ * @param {Object} metadata - Metadata object (optional; currently unused, kept for callers)
+ * @returns {string} - CSV string, or "" when `jobs` is not a non-empty array
+ */
+function convertJobsToCsv(jobs, metadata = null) {
+  if (!Array.isArray(jobs) || jobs.length === 0) {
+    return "";
+  }
+
+  // Define CSV columns based on job object structure
+  const columns = [
+    "jobId",
+    "title",
+    "company",
+    "location",
+    "jobUrl",
+    "postedDate",
+    "description",
+    "jobType",
+    "experienceLevel",
+    "keyword",
+    "extractedAt",
+    "source",
+    "aiRelevant",
+    "aiConfidence",
+    "aiReasoning",
+    "aiContext",
+    "aiModel",
+    "aiAnalyzedAt"
+  ];
+
+  // CSV column name -> property of job.aiAnalysis that feeds it
+  const aiSourceFields = {
+    aiRelevant: "isRelevant",
+    aiConfidence: "confidence",
+    aiReasoning: "reasoning",
+    aiContext: "context",
+    aiModel: "model",
+    aiAnalyzedAt: "analyzedAt"
+  };
+
+  // Create header row
+  const headerRow = columns.map((col) => escapeCsvField(col)).join(",");
+
+  // Create data rows
+  const dataRows = jobs.map((job) => {
+    const row = columns.map((col) => {
+      const aiKey = aiSourceFields[col];
+      if (aiKey === undefined) {
+        // escapeCsvField already maps null/undefined to "", so no `|| ""` here:
+        // that would also blank legitimate values like 0.
+        return escapeCsvField(job[col]);
+      }
+      if (!job.aiAnalysis) {
+        return "";
+      }
+      if (col === "aiRelevant") {
+        return job.aiAnalysis.isRelevant ? "Yes" : "No";
+      }
+      return escapeCsvField(job.aiAnalysis[aiKey]);
+    });
+    return row.join(",");
+  });
+
+  // Combine header and data rows
+  return [headerRow, ...dataRows].join("\n");
+}
+
+/**
+ * Builds the CSV text for a complete results payload.
+ *
+ * Delegates to convertJobsToCsv with the payload's `results` and `metadata`.
+ * A missing payload, or one whose `results` is falsy, yields an empty string.
+ *
+ * @param {Object} resultsData - Full results object with metadata, results, etc.
+ * @returns {string} - CSV string
+ */
+function convertResultsToCsv(resultsData) {
+  const hasResults = Boolean(resultsData && resultsData.results);
+  return hasResults
+    ? convertJobsToCsv(resultsData.results, resultsData.metadata)
+    : "";
+}
+
+// Public API of this module. index.js requires convertResultsToCsv; the other
+// two functions are the building blocks it is composed from.
+module.exports = {
+  convertJobsToCsv,
+  convertResultsToCsv,
+  escapeCsvField,
+};
+