diff --git a/ai-analyzer/src/ai-utils.js b/ai-analyzer/src/ai-utils.js index a94c9ba..fb49687 100644 --- a/ai-analyzer/src/ai-utils.js +++ b/ai-analyzer/src/ai-utils.js @@ -69,34 +69,83 @@ async function analyzeBatch( logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`); try { - const prompt = `Analyze ${posts.length} LinkedIn posts for relevance to: "${context}" + // Detect if context is about a student profile + const isStudentContext = /student|undergraduate|first year|second year|third year|fourth year|freshman|sophomore|junior|senior|co-op|internship/i.test(context); + + // Build enhanced prompt based on context type + let analysisInstructions = ""; + if (isStudentContext) { + analysisInstructions = ` +ANALYSIS FOCUS (Student Context Detected): +- Pay special attention to the "Requirements" section +- Evaluate if the job requirements match the student's level (${context}) +- Consider: Are requirements too advanced? Are they appropriate for entry-level/co-op/internship? +- Check if the role duties are suitable for a student's skill level +- Look for keywords like "co-op", "internship", "entry-level", "student", "junior" +- If requirements mention "years of experience", "senior", "expert", "PhD", etc., this may not be suitable +- If requirements are reasonable for a student (basic skills, willingness to learn), mark as relevant`; + } else { + analysisInstructions = ` +ANALYSIS FOCUS: +- Evaluate overall relevance to: "${context}" +- Consider job title, description, duties, and requirements +- Assess if the job matches the specified criteria`; + } -POSTS: + const prompt = `Analyze ${posts.length} job postings for relevance to: "${context}" + +${analysisInstructions} + +JOB POSTINGS: ${posts .map( - (post, i) => ` -POST ${i + 1}: -"${post.text.substring(0, 400)}${post.text.length > 400 ? "..." 
: ""}" -` + (post, i) => { + // For student contexts, prioritize Requirements section if text is too long + let jobText = post.text; + if (isStudentContext && jobText.length > 1200) { + // Try to extract Requirements section if present + const requirementsMatch = jobText.match(/Requirements?:[\s\S]{0,600}/i); + const dutiesMatch = jobText.match(/Role Duties?:[\s\S]{0,300}/i); + const titleMatch = jobText.match(/Title:[\s\S]{0,100}/i); + + if (requirementsMatch) { + // Prioritize: Title + Requirements (most important for students) + jobText = (titleMatch ? titleMatch[0] + "\n\n" : "") + + (requirementsMatch ? requirementsMatch[0] : "") + + (dutiesMatch ? "\n\n" + dutiesMatch[0] : ""); + } else { + // Fallback to truncation + jobText = jobText.substring(0, 1200) + "..."; + } + } else if (jobText.length > 1200) { + jobText = jobText.substring(0, 1200) + "..."; + } + + return ` +JOB ${i + 1}: +${jobText} +`; + } ) .join("")} REQUIRED FORMAT - Respond with EXACTLY ${posts.length} lines, one per post: -POST 1: YES | 0.8 | reason here -POST 2: NO | 0.2 | reason here -POST 3: YES | 0.9 | reason here +JOB 1: YES | 0.8 | reason here +JOB 2: NO | 0.2 | reason here +JOB 3: YES | 0.9 | reason here RULES: - Use YES or NO (uppercase) - Use pipe character | as separator - Confidence must be 0.0 to 1.0 (decimal number) - Keep reasoning brief (one sentence) -- MUST include all ${posts.length} posts in order +- MUST include all ${posts.length} jobs in order +${isStudentContext ? 
"- When analyzing requirements, explicitly mention if requirements are too advanced or appropriate for the student level" : ""} Examples: -POST 1: YES | 0.9 | mentions layoffs and job cuts -POST 2: NO | 0.1 | unrelated topic about vacation -POST 3: YES | 0.7 | discusses workforce reduction`; +JOB 1: YES | 0.9 | co-op position suitable for first year students +JOB 2: NO | 0.2 | requires 5+ years experience, too advanced +JOB 3: YES | 0.7 | entry-level role with basic requirements appropriate for students`; // Add timeout to prevent hanging (5 minutes max) const controller = new AbortController(); diff --git a/ai-analyzer/src/text-utils.js b/ai-analyzer/src/text-utils.js index 7635741..1cf4b8f 100644 --- a/ai-analyzer/src/text-utils.js +++ b/ai-analyzer/src/text-utils.js @@ -45,6 +45,43 @@ function containsAnyKeyword(text, keywords) { return keywords.some((keyword) => lowerText.includes(keyword.toLowerCase())); } +/** + * Check if text contains all of the specified keywords (case insensitive) + */ +function containsAllKeywords(text, keywords) { + if (!text || !Array.isArray(keywords)) { + return false; + } + + const lowerText = text.toLowerCase(); + return keywords.every((keyword) => lowerText.includes(keyword.toLowerCase())); +} + +/** + * Check if text matches keyword groups with AND logic between groups and OR logic within groups + * @param {string} text - Text to search in + * @param {Array<Array<string>>} keywordGroups - Array of keyword groups, each group is an array of OR keywords + * @returns {boolean} - True if text matches all groups (AND logic) and at least one keyword in each group (OR logic) + */ +function matchesKeywordGroups(text, keywordGroups) { + if (!text || !Array.isArray(keywordGroups) || keywordGroups.length === 0) { + return false; + } + + const lowerText = text.toLowerCase(); + + // All groups must match (AND logic) + return keywordGroups.every((group) => { + if (!Array.isArray(group) || group.length === 0) { + return false; + } + // At least one keyword in 
the group must match (OR logic) + return group.some((keyword) => + lowerText.includes(keyword.toLowerCase().trim()) + ); + }); +} + /** * Validate if text meets basic quality criteria */ @@ -101,6 +138,8 @@ function normalizeUrl(url) { module.exports = { cleanText, containsAnyKeyword, + containsAllKeywords, + matchesKeywordGroups, isValidText, extractDomain, normalizeUrl, diff --git a/job-search-parser/README.md b/job-search-parser/README.md index cb17ea8..ce470ef 100644 --- a/job-search-parser/README.md +++ b/job-search-parser/README.md @@ -92,10 +92,32 @@ node index.js --sites=linkedin --keywords="software engineer,developer" # Search with location filter node index.js --sites=linkedin --keywords="co-op" --location="Ontario" +# Search with date filter (jobs posted after specific date) +node index.js --sites=linkedin --keywords="co-op" --min-date="2025-12-01" + +# Combine filters +node index.js --sites=linkedin --keywords="co-op" --location="Ontario" --min-date="2025-12-01" + # Combine multiple sites node index.js --sites=linkedin,skipthedrive --keywords="intern,co-op" + +# Use AND logic - jobs must match ALL keywords (e.g., "co-op" AND "summer 2026") +node index.js --sites=linkedin --keywords="co-op,summer 2026" --and + +# Use grouped AND/OR logic - (co-op OR intern) AND (summer 2026) +# Use | (pipe) for OR within groups, , (comma) to separate AND groups +node index.js --sites=linkedin --keywords="co-op|intern,summer 2026" --and + +# Multiple AND groups - (co-op OR intern) AND (summer 2026) AND (remote) +node index.js --sites=linkedin --keywords="co-op|intern,summer 2026,remote" --and ``` +**Date Filter Notes:** +- The date filter uses LinkedIn's `f_TPR` parameter to filter at the LinkedIn level before parsing +- Format: `YYYY-MM-DD` (e.g., `2025-12-01`) +- LinkedIn supports relative timeframes up to ~30 days +- For dates older than 30 days, LinkedIn may limit results to the maximum supported timeframe + ### 🚧 Planned Parsers - **Indeed**: Comprehensive job 
aggregator @@ -128,6 +150,9 @@ Create a `.env` file in the parser directory: ```env # Job Search Configuration SEARCH_KEYWORDS=software engineer,developer,programmer +# For grouped AND/OR logic, use pipe (|) for OR within groups and comma (,) for AND groups: +# SEARCH_KEYWORDS=co-op|intern,summer 2026,remote # (co-op OR intern) AND (summer 2026) AND (remote) +USE_AND_LOGIC=false # Set to "true" to enable AND logic (required for grouped keywords) LOCATION_FILTER=Ontario,Canada MAX_PAGES=5 @@ -136,6 +161,9 @@ LINKEDIN_USERNAME=your_email@example.com LINKEDIN_PASSWORD=your_password LINKEDIN_JOB_LOCATION=Canada # Optional: LinkedIn location search +# Date Filter (LinkedIn only - filters at LinkedIn level before parsing) +MIN_DATE=2025-12-01 # Format: YYYY-MM-DD (jobs posted after this date) + # Analysis Configuration ENABLE_AI_ANALYSIS=false HEADLESS=true @@ -144,6 +172,22 @@ HEADLESS=true OUTPUT_FORMAT=json # Options: "json", "csv", or "both" ``` +**Keyword Examples in .env:** + +```env +# Simple OR logic (default) - matches ANY keyword +SEARCH_KEYWORDS=co-op,intern +USE_AND_LOGIC=false + +# Simple AND logic - matches ALL keywords +SEARCH_KEYWORDS=co-op,summer 2026 +USE_AND_LOGIC=true + +# Grouped AND/OR logic - (co-op OR intern) AND (summer 2026) AND (remote) +SEARCH_KEYWORDS=co-op|intern,summer 2026,remote +USE_AND_LOGIC=true +``` + ### Command Line Options ```bash @@ -168,16 +212,34 @@ node index.js --no-rejected # Output format (json, csv, or both) node index.js --output=csv node index.js --output=both + +# Date filter (LinkedIn only - filters at LinkedIn level) +node index.js --sites=linkedin --min-date="2025-12-01" + +# Use AND logic for keywords (all keywords must match) +node index.js --sites=linkedin --keywords="co-op,summer 2026" --and + +# Use grouped AND/OR logic: (co-op OR intern) AND (summer 2026) +# Use | (pipe) for OR within groups, , (comma) to separate AND groups +node index.js --sites=linkedin --keywords="co-op|intern,summer 2026" --and + +# 
Multiple AND groups: (co-op OR intern) AND (summer 2026) AND (remote) +node index.js --sites=linkedin --keywords="co-op|intern,summer 2026,remote" --and ``` **Available Options:** - `--sites="site1,site2"`: Job sites to parse (linkedin, skipthedrive) - `--keywords="keyword1,keyword2"`: Search keywords + - Use `|` (pipe) to separate OR keywords within a group: `"co-op|intern"` means "co-op" OR "intern" + - Use `,` (comma) to separate AND groups when using `--and`: `"co-op|intern,summer 2026"` means (co-op OR intern) AND (summer 2026) - `--location="LOCATION"`: Location filter - `--max-pages=NUMBER`: Maximum pages to parse (0 or "all" for unlimited) +- `--min-date="YYYY-MM-DD"`: Minimum posted date filter (LinkedIn only - filters at LinkedIn level before parsing) - `--no-rejected` or `--exclude-rejected`: Exclude rejected results from output - `--output=FORMAT` or `--format=FORMAT`: Output format - "json", "csv", or "both" (default: "json") +- `--and` or `--all-keywords`: Use AND logic for keywords (all keywords must match). 
Default is OR logic (any keyword matches) + - When combined with `|` (pipe) in keywords, enables grouped AND/OR logic ## šŸ“Š Keywords diff --git a/job-search-parser/index.js b/job-search-parser/index.js index 1be0efd..a39aecf 100644 --- a/job-search-parser/index.js +++ b/job-search-parser/index.js @@ -28,6 +28,8 @@ const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL; const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5; const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true"; const OUTPUT_FORMAT = process.env.OUTPUT_FORMAT || "json"; // "json", "csv", or "both" +const MIN_DATE = process.env.MIN_DATE; // Minimum posted date (format: YYYY-MM-DD) +const USE_AND_LOGIC = process.env.USE_AND_LOGIC === "true"; // Use AND logic for keywords // Available site strategies const SITE_STRATEGIES = { @@ -50,6 +52,8 @@ function parseArguments() { maxPages: MAX_PAGES, excludeRejected: EXCLUDE_REJECTED, outputFormat: OUTPUT_FORMAT, + minDate: MIN_DATE, + useAndLogic: USE_AND_LOGIC, // Use AND logic instead of OR logic for keywords (from env or CLI) }; args.forEach((arg) => { @@ -82,6 +86,10 @@ function parseArguments() { } else { logger.warning(`āš ļø Unknown output format: ${format}. 
Using default: json`); } + } else if (arg.startsWith("--min-date=")) { + options.minDate = arg.split("=")[1]; + } else if (arg === "--and" || arg === "--all-keywords") { + options.useAndLogic = true; // CLI flag overrides env variable } }); @@ -104,15 +112,35 @@ async function startJobSearchParser(options = {}) { logger.step("šŸš€ Job Search Parser Starting..."); // Parse keywords - const keywords = + let keywords = finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim()); + + // Parse keyword groups if AND logic is enabled and keywords contain pipe (|) separator + // Format: "co-op|intern,summer 2026" means (co-op OR intern) AND (summer 2026) + let keywordGroups = null; + if (finalOptions.useAndLogic && keywords.some(k => k.includes('|'))) { + keywordGroups = keywords.map(group => + group.split('|').map(k => k.trim()).filter(k => k.length > 0) + ); + logger.info(`šŸ” Keyword Groups: ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`); + } + const locationFilter = finalOptions.locationFilter || LOCATION_FILTER; const sites = finalOptions.sites; const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED; logger.info(`šŸ“¦ Selected job sites: ${sites.join(", ")}`); logger.info(`šŸ” Search Keywords: ${keywords.join(", ")}`); + if (keywordGroups) { + logger.info(`šŸ”— Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`); + } else { + logger.info(`šŸ”— Keyword Logic: ${finalOptions.useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`); + } logger.info(`šŸ“ Location Filter: ${locationFilter || "None"}`); + const minDate = finalOptions.minDate || MIN_DATE; + if (minDate) { + logger.info(`šŸ“… Min Date Filter: ${minDate} (jobs posted after this date)`); + } logger.info( `🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? 
"Enabled" : "Disabled"}` ); @@ -124,6 +152,96 @@ async function startJobSearchParser(options = {}) { const allResults = []; const allRejectedResults = []; const siteResults = {}; + let analysisResults = null; + + // Initialize results directory and file for incremental saving + const resultsDir = path.join(__dirname, "results"); + if (!fs.existsSync(resultsDir)) { + fs.mkdirSync(resultsDir, { recursive: true }); + } + + const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); + const outputFormat = finalOptions.outputFormat || OUTPUT_FORMAT; + let incrementalJsonFilepath = null; + let incrementalCsvFilepath = null; + + // Initialize incremental save files + if (outputFormat === "json" || outputFormat === "both") { + const jsonFilename = `job-search-results-${timestamp}.json`; + incrementalJsonFilepath = path.join(resultsDir, jsonFilename); + } + if (outputFormat === "csv" || outputFormat === "both") { + const csvFilename = `job-search-results-${timestamp}.csv`; + incrementalCsvFilepath = path.join(resultsDir, csvFilename); + } + + /** + * Save results incrementally as they're found + */ + const saveIncrementalResults = (currentResults, currentRejectedResults, currentSiteResults, currentAnalysisResults = null, isComplete = false) => { + try { + const outputData = { + metadata: { + extractedAt: new Date().toISOString(), + parser: "job-search-parser", + version: "2.0.0", + sites: sites, + keywords: keywords.join(", "), + locationFilter, + aiAnalysisEnabled: ENABLE_AI_ANALYSIS, + aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined, + aiModel: ENABLE_AI_ANALYSIS ? 
OLLAMA_MODEL : undefined, + analysisResults: currentAnalysisResults, + rejectedJobsExcluded: excludeRejected, + isComplete: isComplete, + lastUpdated: new Date().toISOString(), + }, + results: currentResults, + siteResults: currentSiteResults, + }; + + if (!excludeRejected) { + outputData.rejectedResults = currentRejectedResults; + } + + // Save JSON incrementally + if (incrementalJsonFilepath) { + fs.writeFileSync(incrementalJsonFilepath, JSON.stringify(outputData, null, 2)); + } + + // Save CSV incrementally (convert on each save) + if (incrementalCsvFilepath) { + const csvContent = convertResultsToCsv(outputData); + fs.writeFileSync(incrementalCsvFilepath, csvContent); + } + + if (!isComplete) { + logger.info(`šŸ’¾ Incremental save: ${currentResults.length} results saved to ${incrementalJsonFilepath || incrementalCsvFilepath}`); + } + } catch (error) { + logger.warning(`āš ļø Failed to save incremental results: ${error.message}`); + } + }; + + // Save initial empty state + saveIncrementalResults([], [], {}, null, false); + + // Set up signal handlers for graceful shutdown + let isShuttingDown = false; + const gracefulShutdown = async (signal) => { + if (isShuttingDown) return; + isShuttingDown = true; + + logger.warning(`\nāš ļø Received ${signal}, saving current results before exit...`); + saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false); + logger.info(`šŸ’¾ Saved ${allResults.length} results before shutdown`); + + await coreParser.cleanup(); + process.exit(0); + }; + + process.on('SIGINT', () => gracefulShutdown('SIGINT')); + process.on('SIGTERM', () => gracefulShutdown('SIGTERM')); // Process each selected site for (const site of sites) { @@ -140,8 +258,10 @@ async function startJobSearchParser(options = {}) { // Prepare strategy options const strategyOptions = { keywords, + keywordGroups, // Pass grouped keywords if available locationFilter, maxPages: finalOptions.maxPages, + useAndLogic: finalOptions.useAndLogic 
|| false, }; // Add credentials for LinkedIn @@ -165,6 +285,7 @@ async function startJobSearchParser(options = {}) { password: LINKEDIN_PASSWORD, }; strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || ""; + strategyOptions.minDate = minDate; // Add date filter for LinkedIn } const parseResult = await strategy(coreParser, strategyOptions); @@ -188,6 +309,9 @@ async function startJobSearchParser(options = {}) { logger.success( `āœ… ${site} completed in ${duration}s - Found ${results.length} jobs` ); + + // Save results incrementally after each site + saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false); } catch (error) { logger.error(`āŒ ${site} parsing failed: ${error.message}`); siteResults[site] = { @@ -196,23 +320,41 @@ async function startJobSearchParser(options = {}) { duration: "0s", error: error.message, }; + // Save even on error to preserve what we have + saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false); } } // AI Analysis if enabled - let analysisResults = null; + // Save results before AI analysis (in case AI analysis takes a long time) + if (allResults.length > 0) { + saveIncrementalResults(allResults, allRejectedResults, siteResults, null, false); + } + if (ENABLE_AI_ANALYSIS && allResults.length > 0) { logger.step("🧠 Running AI Analysis..."); const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL); if (ollamaAvailable) { // Prepare data for analysis (analyzeBatch expects objects with 'text' field) - const analysisData = allResults.map((job) => ({ - text: `${job.title || ""} at ${job.company || ""}. 
${job.description || ""}`.trim(), - location: job.location || "", - keyword: job.keyword || "", - timestamp: job.extractedAt || job.postedDate || "", - })); + const analysisData = allResults.map((job) => { + // Build comprehensive text including all available job information + const parts = []; + if (job.title) parts.push(`Title: ${job.title}`); + if (job.company) parts.push(`Company: ${job.company}`); + if (job.description) parts.push(`Description: ${job.description}`); + if (job.roleDuties) parts.push(`Role Duties: ${job.roleDuties}`); + if (job.jobRequirements) parts.push(`Requirements: ${job.jobRequirements}`); + + return { + text: parts.join("\n\n"), + location: job.location || "", + keyword: job.keyword || "", + timestamp: job.extractedAt || job.postedDate || "", + roleDuties: job.roleDuties || "", + jobRequirements: job.jobRequirements || "", + }; + }); // Process in smaller batches to avoid timeouts (5 jobs per batch) const BATCH_SIZE = parseInt(process.env.AI_BATCH_SIZE) || 5; @@ -263,68 +405,32 @@ async function startJobSearchParser(options = {}) { logger.success( `āœ… AI Analysis completed for ${allResults.length} jobs` ); + + // Save results after AI analysis completes + saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false); } else { logger.warning("āš ļø Ollama not available, skipping AI analysis"); } } - // Save results - logger.info(`šŸ’¾ Preparing to save: ${allResults.length} results, ${allRejectedResults.length} rejected`); + // Final save with complete flag + logger.info(`šŸ’¾ Preparing final save: ${allResults.length} results, ${allRejectedResults.length} rejected`); logger.info(`šŸ’¾ EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`); - const outputData = { - metadata: { - extractedAt: new Date().toISOString(), - parser: "job-search-parser", - version: "2.0.0", - sites: sites, - keywords: keywords.join(", "), - locationFilter, - aiAnalysisEnabled: 
ENABLE_AI_ANALYSIS, - aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined, - aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined, - analysisResults, - rejectedJobsExcluded: excludeRejected, - }, - results: allResults, - siteResults, - }; - - // Always include rejectedResults if not excluded (make it explicit, not using spread) if (!excludeRejected) { - outputData.rejectedResults = allRejectedResults; logger.info(`āœ… Including ${allRejectedResults.length} rejected results in output`); } else { logger.info(`ā­ļø Excluding rejected results (EXCLUDE_REJECTED=true)`); } - logger.info(`šŸ’¾ Final output: ${outputData.results.length} results, ${outputData.rejectedResults?.length || 0} rejected`); + logger.info(`šŸ’¾ Final output: ${allResults.length} results, ${allRejectedResults.length} rejected`); - const resultsDir = path.join(__dirname, "results"); - if (!fs.existsSync(resultsDir)) { - fs.mkdirSync(resultsDir, { recursive: true }); - } - - const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); - const outputFormat = finalOptions.outputFormat || OUTPUT_FORMAT; + // Final save with isComplete flag + saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, true); + const savedFiles = []; - - // Save JSON if format is "json" or "both" - if (outputFormat === "json" || outputFormat === "both") { - const jsonFilename = `job-search-results-${timestamp}.json`; - const jsonFilepath = path.join(resultsDir, jsonFilename); - fs.writeFileSync(jsonFilepath, JSON.stringify(outputData, null, 2)); - savedFiles.push(jsonFilepath); - } - - // Save CSV if format is "csv" or "both" - if (outputFormat === "csv" || outputFormat === "both") { - const csvFilename = `job-search-results-${timestamp}.csv`; - const csvFilepath = path.join(resultsDir, csvFilename); - const csvContent = convertResultsToCsv(outputData); - fs.writeFileSync(csvFilepath, csvContent); - savedFiles.push(csvFilepath); - } + if (incrementalJsonFilepath) 
savedFiles.push(incrementalJsonFilepath); + if (incrementalCsvFilepath) savedFiles.push(incrementalCsvFilepath); // Final summary logger.step("\nšŸ“Š Job Search Parser Summary"); @@ -348,6 +454,31 @@ async function startJobSearchParser(options = {}) { logger.success("\nāœ… Job Search Parser completed successfully!"); + // Construct output data for return + const outputData = { + metadata: { + extractedAt: new Date().toISOString(), + parser: "job-search-parser", + version: "2.0.0", + sites: sites, + keywords: keywords.join(", "), + locationFilter, + aiAnalysisEnabled: ENABLE_AI_ANALYSIS, + aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined, + aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined, + analysisResults: analysisResults, + rejectedJobsExcluded: excludeRejected, + isComplete: true, + lastUpdated: new Date().toISOString(), + }, + results: allResults, + siteResults: siteResults, + }; + + if (!excludeRejected) { + outputData.rejectedResults = allRejectedResults; + } + return outputData; } catch (error) { logger.error(`āŒ Job Search Parser failed: ${error.message}`); diff --git a/job-search-parser/parsers/skipthedrive.js b/job-search-parser/parsers/skipthedrive.js index 1328c94..e3913e9 100644 --- a/job-search-parser/parsers/skipthedrive.js +++ b/job-search-parser/parsers/skipthedrive.js @@ -13,6 +13,7 @@ const { logger, cleanText, containsAnyKeyword, + containsAllKeywords, parseLocationFilters, validateLocationAgainstFilters, extractLocationFromProfile, @@ -125,10 +126,12 @@ async function parseSkipTheDrive(options = {}) { headless = process.env.HEADLESS !== "false", enableAI = process.env.ENABLE_AI_ANALYSIS === "true", aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis", + useAndLogic = false, // Use AND logic instead of OR logic for keywords } = options; logger.step("Starting SkipTheDrive parser..."); logger.info(`šŸ” Keywords: ${keywords.join(", ")}`); + logger.info(`šŸ”— Keyword Logic: ${useAndLogic ? 
"AND (all keywords must match)" : "OR (any keyword matches)"}`); logger.info( `šŸ“‹ Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}` ); @@ -154,8 +157,12 @@ async function parseSkipTheDrive(options = {}) { const seenJobs = new Set(); try { - // Search for each keyword - for (const keyword of keywords) { + // For AND logic, combine all keywords into a single search query + // For OR logic, search each keyword separately + const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords; + + // Search for each keyword (or combined keyword for AND logic) + for (const keyword of searchKeywords) { logger.info(`\nšŸ” Searching for: ${keyword}`); const searchUrl = buildSearchUrl(keyword, "date", jobTypes); @@ -208,11 +215,17 @@ async function parseSkipTheDrive(options = {}) { // Validate job against keywords const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`; - if (!containsAnyKeyword(fullText, keywords)) { + const keywordMatch = useAndLogic + ? containsAllKeywords(fullText, keywords) + : containsAnyKeyword(fullText, keywords); + + if (!keywordMatch) { rejectedResults.push({ ...jobData, rejected: true, - reason: "Keywords not found in job listing", + reason: useAndLogic + ? 
"Not all keywords found in job listing" + : "Keywords not found in job listing", }); continue; } diff --git a/job-search-parser/src/csv-utils.js b/job-search-parser/src/csv-utils.js index 1689363..c6f12dd 100644 --- a/job-search-parser/src/csv-utils.js +++ b/job-search-parser/src/csv-utils.js @@ -44,6 +44,8 @@ function convertJobsToCsv(jobs, metadata = null) { "jobUrl", "postedDate", "description", + "roleDuties", + "jobRequirements", "jobType", "experienceLevel", "keyword", diff --git a/job-search-parser/strategies/linkedin-jobs-strategy.js b/job-search-parser/strategies/linkedin-jobs-strategy.js index 9cc4299..ec4bc25 100644 --- a/job-search-parser/strategies/linkedin-jobs-strategy.js +++ b/job-search-parser/strategies/linkedin-jobs-strategy.js @@ -10,6 +10,8 @@ const { validateLocationAgainstFilters, parseLocationFilters, containsAnyKeyword, + containsAllKeywords, + matchesKeywordGroups, } = require("ai-analyzer"); /** @@ -34,6 +36,28 @@ function buildJobSearchUrl(keyword, location = "", filters = {}) { params.append("location", location); } + // Add date filter if provided (f_TPR parameter) + // LinkedIn uses f_TPR=r<seconds> where <seconds> is the time range + if (filters.minDate) { + try { + const minDate = new Date(filters.minDate); + const now = new Date(); + const secondsDiff = Math.floor((now - minDate) / 1000); + + // LinkedIn supports relative timeframes (f_TPR parameter) + // If date is in the future, don't add filter + if (secondsDiff > 0) { + // LinkedIn typically supports up to ~30 days (2592000 seconds) + // For dates older than 30 days, we'll still add it but LinkedIn may limit results + const maxSeconds = 2592000; // 30 days + const timeRange = Math.min(secondsDiff, maxSeconds); + params.append("f_TPR", `r${timeRange}`); + } + } catch (error) { + logger.warning(`āš ļø Invalid date format for minDate: ${filters.minDate}. 
Expected format: YYYY-MM-DD`); + } + } + // Add additional filters if (filters.experienceLevel) { params.append("f_E", filters.experienceLevel); @@ -54,10 +78,13 @@ function buildJobSearchUrl(keyword, location = "", filters = {}) { async function linkedinJobsStrategy(coreParser, options = {}) { const { keywords = ["software engineer", "developer"], + keywordGroups = null, // Array of keyword groups for grouped AND/OR logic locationFilter = null, maxPages = 5, credentials = {}, location = "", // LinkedIn location search (e.g., "Canada", "Toronto, Ontario, Canada") + minDate = null, // Minimum posted date (format: YYYY-MM-DD) + useAndLogic = false, // Use AND logic instead of OR logic for keywords } = options; const results = []; @@ -79,15 +106,39 @@ async function linkedinJobsStrategy(coreParser, options = {}) { logger.info("šŸš€ Starting LinkedIn Jobs parser..."); logger.info(`šŸ” Keywords: ${keywords.join(", ")}`); + if (keywordGroups) { + logger.info(`šŸ”— Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`); + } else { + logger.info(`šŸ”— Keyword Logic: ${useAndLogic ? 
"AND (all keywords must match)" : "OR (any keyword matches)"}`); + } logger.info(`šŸ“ Location Filter: ${locationFilter || "None"}`); logger.info(`šŸŒ LinkedIn Location: ${location || "None"}`); logger.info(`šŸ“„ Max Pages: ${maxPages}`); + if (minDate) { + logger.info(`šŸ“… Min Date Filter: ${minDate} (jobs posted after this date)`); + } - // Search for each keyword - for (const keyword of keywords) { + // Determine search keywords based on logic type + let searchKeywords; + if (keywordGroups) { + // For grouped AND/OR logic, search each keyword in each group (OR within groups) + // We'll combine results and filter to ensure all groups match (AND between groups) + searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups + } else if (useAndLogic) { + // For simple AND logic, combine all keywords into a single search query + searchKeywords = [keywords.join(" ")]; + } else { + // For OR logic, search each keyword separately + searchKeywords = keywords; + } + + // Search for each keyword (or combined keyword for AND logic) + for (const keyword of searchKeywords) { logger.info(`\nšŸ” Searching LinkedIn Jobs for: "${keyword}"`); - const searchUrl = buildJobSearchUrl(keyword, location); + const searchUrl = buildJobSearchUrl(keyword, location, { + minDate: minDate, + }); logger.info(`šŸ”— Search URL: ${searchUrl}`); // Check if page is still valid before proceeding @@ -220,7 +271,7 @@ async function linkedinJobsStrategy(coreParser, options = {}) { await new Promise((resolve) => setTimeout(resolve, 2000)); // Extract jobs from current page - const pageJobs = await extractJobsFromPage(page, keyword, locationFilter); + const pageJobs = await extractJobsFromPage(page, keyword, locationFilter, coreParser); logger.info(`šŸ“‹ Extracted ${pageJobs.length} jobs from page ${currentPage}`); if (pageJobs.length === 0) { @@ -317,10 +368,35 @@ async function linkedinJobsStrategy(coreParser, options = {}) { } seenJobs.add(job.jobId); - // REMOVED: Keyword 
validation - LinkedIn already filtered by keyword in search results - // If LinkedIn returned this job in search results, it matches the keyword. - // The snippet might not contain the keyword, but the full description does. - // Trust LinkedIn's search algorithm rather than re-validating against snippets. + // Validate keywords based on logic type + if (keywordGroups) { + // Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR) + const fullText = `${job.title} ${job.description} ${job.company}`; + if (!matchesKeywordGroups(fullText, keywordGroups)) { + rejectedResults.push({ + ...job, + rejectionReason: "Job does not match all keyword groups", + }); + if (process.env.DEBUG === "true") { + logger.debug(`šŸ” Rejected (grouped logic): "${job.title}" - does not match all groups`); + } + continue; + } + } else if (useAndLogic) { + // Simple AND logic: all keywords must match + const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase(); + if (!containsAllKeywords(fullText, keywords)) { + rejectedResults.push({ + ...job, + rejectionReason: "Not all keywords found in job listing", + }); + if (process.env.DEBUG === "true") { + logger.debug(`šŸ” Rejected (AND logic): "${job.title}" - not all keywords found`); + } + continue; + } + } + // For OR logic, trust LinkedIn's search results (already filtered) // Validate location if filtering enabled if (locationFilter) { @@ -514,7 +590,7 @@ async function scrollToLoadJobs(page) { /** * Extract jobs from current page */ -async function extractJobsFromPage(page, keyword, locationFilter) { +async function extractJobsFromPage(page, keyword, locationFilter, coreParser = null) { const jobs = []; try { @@ -644,7 +720,7 @@ async function extractJobsFromPage(page, keyword, locationFilter) { logger.debug(`Could not scroll/hover job element ${i}: ${scrollError.message}`); } - const job = await extractJobData(jobElement, keyword); + const job = await extractJobData(jobElement, keyword, 
page, coreParser); if (job && (job.title || job.jobId)) { // Only add if we have at least a title or jobId jobs.push(job); @@ -671,10 +747,240 @@ async function extractJobsFromPage(page, keyword, locationFilter) { return jobs; } +/** + * Extract full job description from job detail page + */ +async function extractFullJobDescription(coreParser, jobUrl) { + try { + if (!jobUrl) { + return { fullDescription: "", roleDuties: "", jobRequirements: "" }; + } + + // Create a separate page for detail extraction to avoid disrupting search results + const detailPage = await coreParser.createPage(`linkedin-job-detail-${Date.now()}`); + + try { + // Navigate to job detail page + await detailPage.goto(jobUrl, { waitUntil: "networkidle2", timeout: 30000 }).catch(() => {}); + await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait for content to load + + const jobDetails = await detailPage.evaluate(() => { + const details = { + fullDescription: "", + roleDuties: "", + jobRequirements: "", + }; + + // Try multiple selectors for job description container + const descriptionSelectors = [ + ".description__text", + ".show-more-less-html__markup", + "[class*='description__text']", + "[class*='job-description']", + ".jobs-description__text", + ".jobs-box__html-content", + "[data-test-id='job-description']", + ".jobs-details__main-content", + ".jobs-description-content__text", + ]; + + let descriptionElement = null; + for (const selector of descriptionSelectors) { + descriptionElement = document.querySelector(selector); + if (descriptionElement) { + break; + } + } + + if (descriptionElement) { + details.fullDescription = descriptionElement.textContent?.trim() || + descriptionElement.innerText?.trim() || ""; + } + + // If we didn't find description, try to get from main content area + if (!details.fullDescription) { + const mainContent = document.querySelector("main") || + document.querySelector("[class*='jobs-details']") || + document.querySelector("[class*='job-details']"); + 
if (mainContent) { + details.fullDescription = mainContent.textContent?.trim() || + mainContent.innerText?.trim() || ""; + } + } + + return details; + }); + + // Parse duties and requirements from full description + const parsed = parseDutiesAndRequirements(jobDetails.fullDescription); + + return { + fullDescription: jobDetails.fullDescription, + roleDuties: parsed.duties, + jobRequirements: parsed.requirements, + }; + } finally { + // Close the detail page to free resources + try { + await detailPage.close(); + } catch (closeError) { + // Ignore close errors + } + } + } catch (error) { + logger.warning(`Failed to extract full job description from ${jobUrl}: ${error.message}`); + return { fullDescription: "", roleDuties: "", jobRequirements: "" }; + } +} + +/** + * Parse job description to separate role duties from job requirements + */ +function parseDutiesAndRequirements(description) { + if (!description || description.trim().length === 0) { + return { duties: "", requirements: "" }; + } + + const duties = []; + const requirements = []; + + // Common section headers that indicate duties/responsibilities + const dutiesKeywords = [ + /responsibilities?:/i, + /duties?:/i, + /what you['\u2019]ll do/i, + /key responsibilities/i, + /your role/i, + /position overview/i, + /about the role/i, + /role overview/i, + /what we need/i, + /you will:/i, + /you['\u2019]ll be responsible/i, + ]; + + // Common section headers that indicate requirements/qualifications + const requirementsKeywords = [ + /requirements?:/i, + /qualifications?:/i, + /must have/i, + /required:/i, + /what you['\u2019]ll bring/i, + /you have:/i, + /skills required/i, + /minimum requirements/i, + /preferred qualifications/i, + /education:/i, + /experience:/i, + /you must have/i, + /we['\u2019]re looking for/i, + ]; + + // Split description into sections (by common delimiters) + const sections = description.split(/\n\s*\n|\r\n\s*\r\n/).filter(s => s.trim().length > 0); + + let currentSection = "duties"; // 
Default to duties + let dutiesText = ""; + let requirementsText = ""; + + for (const section of sections) { + const sectionLower = section.toLowerCase(); + + // Check if this section is about requirements + let isRequirementsSection = false; + for (const keyword of requirementsKeywords) { + if (keyword.test(section)) { + isRequirementsSection = true; + currentSection = "requirements"; + break; + } + } + + // Check if this section is about duties/responsibilities + if (!isRequirementsSection) { + for (const keyword of dutiesKeywords) { + if (keyword.test(section)) { + currentSection = "duties"; + break; + } + } + } + + // Add to appropriate section + if (currentSection === "requirements") { + requirementsText += (requirementsText ? "\n\n" : "") + section.trim(); + } else { + dutiesText += (dutiesText ? "\n\n" : "") + section.trim(); + } + } + + // If we couldn't split by sections, try to find bullet points or numbered lists + if (!dutiesText && !requirementsText) { + const lines = description.split(/\n/); + let foundRequirementsHeader = false; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i].trim(); + if (line.length === 0) continue; + + // Check if this line is a requirements header + for (const keyword of requirementsKeywords) { + if (keyword.test(line)) { + foundRequirementsHeader = true; + break; + } + } + + if (foundRequirementsHeader) { + requirementsText += (requirementsText ? "\n" : "") + line; + } else { + // Check if it's a duties header + let isDutiesHeader = false; + for (const keyword of dutiesKeywords) { + if (keyword.test(line)) { + isDutiesHeader = true; + break; + } + } + + if (!isDutiesHeader) { + // Add to duties if we haven't found requirements header yet + if (!foundRequirementsHeader) { + dutiesText += (dutiesText ? "\n" : "") + line; + } else { + requirementsText += (requirementsText ? "\n" : "") + line; + } + } else { + dutiesText += (dutiesText ? 
"\n" : "") + line; + } + } + } + } + + // Fallback: if we still have nothing separated, put first 60% in duties, rest in requirements + if (!dutiesText && !requirementsText && description) { + const midPoint = Math.floor(description.length * 0.6); + const lastRequirementsKeyword = description.toLowerCase().lastIndexOf("requirement"); + const lastQualificationsKeyword = description.toLowerCase().lastIndexOf("qualification"); + const splitPoint = Math.max( + lastRequirementsKeyword > 0 ? lastRequirementsKeyword : midPoint, + lastQualificationsKeyword > 0 ? lastQualificationsKeyword : midPoint + ); + + dutiesText = description.substring(0, splitPoint).trim(); + requirementsText = description.substring(splitPoint).trim(); + } + + return { + duties: dutiesText.trim(), + requirements: requirementsText.trim(), + }; +} + /** * Extract data from individual job element */ -async function extractJobData(jobElement, keyword) { +async function extractJobData(jobElement, keyword, page = null, coreParser = null) { try { const jobData = await jobElement.evaluate((el) => { const data = { @@ -1191,6 +1497,20 @@ async function extractJobData(jobElement, keyword) { // Generate job ID if not found const jobId = jobData.jobId || `linkedin-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`; + // Extract full job details if coreParser and jobUrl are provided + let fullDetails = { fullDescription: "", roleDuties: "", jobRequirements: "" }; + if (coreParser && jobUrl) { + try { + fullDetails = await extractFullJobDescription(coreParser, jobUrl); + // If we got full description, update the description field + if (fullDetails.fullDescription) { + jobData.description = fullDetails.fullDescription; + } + } catch (error) { + logger.debug(`Could not extract full job details for ${jobUrl}: ${error.message}`); + } + } + return { jobId, title, @@ -1198,7 +1518,9 @@ async function extractJobData(jobElement, keyword) { location: cleanText(jobData.location), jobUrl, postedDate: 
jobData.postedDate, - description: cleanText(jobData.description), + description: cleanText(fullDetails.fullDescription || jobData.description), + roleDuties: cleanText(fullDetails.roleDuties), + jobRequirements: cleanText(fullDetails.jobRequirements), jobType: jobData.jobType, experienceLevel: jobData.experienceLevel, keyword, diff --git a/job-search-parser/strategies/skipthedrive-strategy.js b/job-search-parser/strategies/skipthedrive-strategy.js index 092ef09..71df176 100644 --- a/job-search-parser/strategies/skipthedrive-strategy.js +++ b/job-search-parser/strategies/skipthedrive-strategy.js @@ -8,6 +8,8 @@ const { logger, cleanText, containsAnyKeyword, + containsAllKeywords, + matchesKeywordGroups, validateLocationAgainstFilters, } = require("ai-analyzer"); @@ -34,9 +36,11 @@ function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) { async function skipthedriveStrategy(coreParser, options = {}) { const { keywords = ["software engineer", "developer", "programmer"], + keywordGroups = null, // Array of keyword groups for grouped AND/OR logic locationFilter = null, maxPages = 5, jobTypes = [], + useAndLogic = false, // Use AND logic instead of OR logic for keywords } = options; const results = []; @@ -49,11 +53,29 @@ async function skipthedriveStrategy(coreParser, options = {}) { logger.info("šŸš€ Starting SkipTheDrive parser..."); logger.info(`šŸ” Keywords: ${keywords.join(", ")}`); + if (keywordGroups) { + logger.info(`šŸ”— Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`); + } else { + logger.info(`šŸ”— Keyword Logic: ${useAndLogic ? 
"AND (all keywords must match)" : "OR (any keyword matches)"}`); + } logger.info(`šŸ“ Location Filter: ${locationFilter || "None"}`); logger.info(`šŸ“„ Max Pages: ${maxPages}`); - // Search for each keyword - for (const keyword of keywords) { + // Determine search keywords based on logic type + let searchKeywords; + if (keywordGroups) { + // For grouped AND/OR logic, search each keyword in each group (OR within groups) + searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups + } else if (useAndLogic) { + // For simple AND logic, combine all keywords into a single search query + searchKeywords = [keywords.join(" ")]; + } else { + // For OR logic, search each keyword separately + searchKeywords = keywords; + } + + // Search for each keyword (or combined keyword for AND logic) + for (const keyword of searchKeywords) { logger.info(`\nšŸ” Searching for: ${keyword}`); const searchUrl = buildSearchUrl(keyword, "date", jobTypes); @@ -92,7 +114,10 @@ async function skipthedriveStrategy(coreParser, options = {}) { const pageJobs = await extractJobsFromPage( page, keyword, - locationFilter + locationFilter, + keywords, + keywordGroups, + useAndLogic ); for (const job of pageJobs) { @@ -100,6 +125,29 @@ async function skipthedriveStrategy(coreParser, options = {}) { if (seenJobs.has(job.jobId)) continue; seenJobs.add(job.jobId); + // Validate keywords based on logic type + if (keywordGroups) { + // Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR) + const fullText = `${job.title} ${job.description} ${job.company}`; + if (!matchesKeywordGroups(fullText, keywordGroups)) { + rejectedResults.push({ + ...job, + rejectionReason: "Job does not match all keyword groups", + }); + continue; + } + } else if (useAndLogic) { + // Simple AND logic: all keywords must match + const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase(); + if (!containsAllKeywords(fullText, keywords)) { + rejectedResults.push({ + 
...job, + rejectionReason: "Not all keywords found in job listing", + }); + continue; + } + } + // Validate location if filtering enabled if (locationFilter) { const locationValid = validateLocationAgainstFilters( @@ -160,7 +208,7 @@ async function skipthedriveStrategy(coreParser, options = {}) { /** * Extract jobs from current page */ -async function extractJobsFromPage(page, keyword, locationFilter) { +async function extractJobsFromPage(page, keyword, locationFilter, allKeywords = [], keywordGroups = null, useAndLogic = false) { const jobs = []; try { @@ -184,6 +232,147 @@ async function extractJobsFromPage(page, keyword, locationFilter) { return jobs; } +/** + * Parse job description to separate role duties from job requirements + */ +function parseDutiesAndRequirements(description) { + if (!description || description.trim().length === 0) { + return { duties: "", requirements: "" }; + } + + // Common section headers that indicate duties/responsibilities + const dutiesKeywords = [ + /responsibilities?:/i, + /duties?:/i, + /what you['\u2019]ll do/i, + /key responsibilities/i, + /your role/i, + /position overview/i, + /about the role/i, + /role overview/i, + /what we need/i, + /you will:/i, + /you['\u2019]ll be responsible/i, + ]; + + // Common section headers that indicate requirements/qualifications + const requirementsKeywords = [ + /requirements?:/i, + /qualifications?:/i, + /must have/i, + /required:/i, + /what you['\u2019]ll bring/i, + /you have:/i, + /skills required/i, + /minimum requirements/i, + /preferred qualifications/i, + /education:/i, + /experience:/i, + /you must have/i, + /we['\u2019]re looking for/i, + ]; + + // Split description into sections (by common delimiters) + const sections = description.split(/\n\s*\n|\r\n\s*\r\n/).filter(s => s.trim().length > 0); + + let currentSection = "duties"; // Default to duties + let dutiesText = ""; + let requirementsText = ""; + + for (const section of sections) { + const sectionLower = section.toLowerCase(); 
+ + // Check if this section is about requirements + let isRequirementsSection = false; + for (const keyword of requirementsKeywords) { + if (keyword.test(section)) { + isRequirementsSection = true; + currentSection = "requirements"; + break; + } + } + + // Check if this section is about duties/responsibilities + if (!isRequirementsSection) { + for (const keyword of dutiesKeywords) { + if (keyword.test(section)) { + currentSection = "duties"; + break; + } + } + } + + // Add to appropriate section + if (currentSection === "requirements") { + requirementsText += (requirementsText ? "\n\n" : "") + section.trim(); + } else { + dutiesText += (dutiesText ? "\n\n" : "") + section.trim(); + } + } + + // If we couldn't split by sections, try to find bullet points or numbered lists + if (!dutiesText && !requirementsText) { + const lines = description.split(/\n/); + let foundRequirementsHeader = false; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i].trim(); + if (line.length === 0) continue; + + // Check if this line is a requirements header + for (const keyword of requirementsKeywords) { + if (keyword.test(line)) { + foundRequirementsHeader = true; + break; + } + } + + if (foundRequirementsHeader) { + requirementsText += (requirementsText ? "\n" : "") + line; + } else { + // Check if it's a duties header + let isDutiesHeader = false; + for (const keyword of dutiesKeywords) { + if (keyword.test(line)) { + isDutiesHeader = true; + break; + } + } + + if (!isDutiesHeader) { + // Add to duties if we haven't found requirements header yet + if (!foundRequirementsHeader) { + dutiesText += (dutiesText ? "\n" : "") + line; + } else { + requirementsText += (requirementsText ? "\n" : "") + line; + } + } else { + dutiesText += (dutiesText ? 
"\n" : "") + line; + } + } + } + } + + // Fallback: if we still have nothing separated, put first 60% in duties, rest in requirements + if (!dutiesText && !requirementsText && description) { + const midPoint = Math.floor(description.length * 0.6); + const lastRequirementsKeyword = description.toLowerCase().lastIndexOf("requirement"); + const lastQualificationsKeyword = description.toLowerCase().lastIndexOf("qualification"); + const splitPoint = Math.max( + lastRequirementsKeyword > 0 ? lastRequirementsKeyword : midPoint, + lastQualificationsKeyword > 0 ? lastQualificationsKeyword : midPoint + ); + + dutiesText = description.substring(0, splitPoint).trim(); + requirementsText = description.substring(splitPoint).trim(); + } + + return { + duties: dutiesText.trim(), + requirements: requirementsText.trim(), + }; +} + /** * Extract data from individual job element */ @@ -242,6 +431,9 @@ async function extractJobData(jobElement, keyword) { } } + // Parse duties and requirements from description if available + const parsed = parseDutiesAndRequirements(description); + return { jobId, title, @@ -252,6 +444,8 @@ async function extractJobData(jobElement, keyword) { dateText, daysAgo, description, + roleDuties: parsed.duties, + jobRequirements: parsed.requirements, isFeatured, keyword, extractedAt: new Date().toISOString(),