Refactor text utilities for improved clarity and maintainability

- Cleaned up and organized text processing utilities in `text-utils.js` for better readability and reuse.
- Ensured consistent formatting and documentation across utility functions.
- No functional changes were made; the focus was on code structure and clarity.
This commit is contained in:
Tanya 2025-12-29 11:22:59 -05:00
parent 673f84d388
commit 691d61aaee
2 changed files with 491 additions and 491 deletions

View File

@ -1,146 +1,146 @@
/**
* Text processing utilities for cleaning and validating content
* Extracted from linkedout.js for reuse across parsers
*/
/**
 * Clean text by removing hashtags, URLs, emojis, and normalizing whitespace.
 *
 * @param {string} text - Raw text to clean; non-string or empty input yields "".
 * @returns {string} Cleaned, whitespace-normalized text.
 */
function cleanText(text) {
  if (!text || typeof text !== "string") {
    return "";
  }
  // Remove hashtags (e.g. "#hiring")
  text = text.replace(/#\w+/g, "");
  // Remove compound "hashtag-xxx" tokens BEFORE the bare word "hashtag":
  // stripping the bare word first would turn "hashtag-foo" into "-foo",
  // which the compound pattern can no longer match, leaving residue.
  text = text.replace(/hashtag-\w+/gi, "");
  text = text.replace(/\bhashtag\b/gi, "");
  // Remove URLs
  text = text.replace(/https?:\/\/[^\s]+/g, "");
  // Remove emojis (Unicode ranges for common emoji)
  text = text.replace(
    /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
    ""
  );
  // Collapse runs of whitespace and trim the ends
  text = text.replace(/\s+/g, " ").trim();
  return text;
}
/**
 * Case-insensitive check: does the text contain at least one of the keywords?
 *
 * @param {string} text - Text to search.
 * @param {Array<string>} keywords - Candidate keywords.
 * @returns {boolean} True when any keyword occurs in the text.
 */
function containsAnyKeyword(text, keywords) {
  if (!text || !Array.isArray(keywords)) {
    return false;
  }
  const haystack = text.toLowerCase();
  for (const keyword of keywords) {
    if (haystack.includes(keyword.toLowerCase())) {
      return true;
    }
  }
  return false;
}
/**
 * Case-insensitive check: does the text contain every one of the keywords?
 *
 * @param {string} text - Text to search.
 * @param {Array<string>} keywords - Required keywords (empty array matches).
 * @returns {boolean} True when all keywords occur in the text.
 */
function containsAllKeywords(text, keywords) {
  if (!text || !Array.isArray(keywords)) {
    return false;
  }
  const haystack = text.toLowerCase();
  for (const keyword of keywords) {
    if (!haystack.includes(keyword.toLowerCase())) {
      return false;
    }
  }
  return true;
}
/**
 * Check if text matches keyword groups: AND logic between groups, OR logic
 * within each group.
 *
 * @param {string} text - Text to search in.
 * @param {Array<Array<string>>} keywordGroups - Each group is an array of OR keywords.
 * @returns {boolean} True if every group has at least one matching keyword.
 */
function matchesKeywordGroups(text, keywordGroups) {
  if (!text || !Array.isArray(keywordGroups) || keywordGroups.length === 0) {
    return false;
  }
  const haystack = text.toLowerCase();
  for (const group of keywordGroups) {
    // Malformed or empty groups fail the whole match.
    if (!Array.isArray(group) || group.length === 0) {
      return false;
    }
    // OR within the group: one hit is enough.
    const groupMatched = group.some((keyword) =>
      haystack.includes(keyword.toLowerCase().trim())
    );
    if (!groupMatched) {
      return false;
    }
  }
  // Every group matched (AND across groups).
  return true;
}
/**
 * Validate if text meets basic quality criteria: a string of at least
 * `minLength` characters containing at least one alphanumeric character.
 *
 * @param {string} text - Text to validate.
 * @param {number} [minLength=30] - Minimum acceptable length.
 * @returns {boolean} True when the text passes all checks.
 */
function isValidText(text, minLength = 30) {
  if (!text || typeof text !== "string") {
    return false;
  }
  const longEnough = text.length >= minLength;
  const hasAlphanumeric = /[a-zA-Z0-9]/.test(text);
  return longEnough && hasAlphanumeric;
}
/**
 * Extract the hostname from a URL string.
 *
 * @param {string} url - URL to parse.
 * @returns {string|null} The hostname, or null for invalid/empty input.
 */
function extractDomain(url) {
  if (!url || typeof url !== "string") {
    return null;
  }
  let parsed;
  try {
    parsed = new URL(url);
  } catch (error) {
    // Unparseable URL: signal absence rather than throwing.
    return null;
  }
  return parsed.hostname;
}
/**
 * Normalize a URL by dropping query parameters and fragments, keeping only
 * protocol, host, and path.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} Normalized URL; "" for empty/non-string input;
 *   the original string if it cannot be parsed.
 */
function normalizeUrl(url) {
  if (!url || typeof url !== "string") {
    return "";
  }
  try {
    const { protocol, hostname, pathname } = new URL(url);
    return `${protocol}//${hostname}${pathname}`;
  } catch (error) {
    // Not a parseable URL: hand back the input unchanged.
    return url;
  }
}
// Public API: text cleaning, keyword matching, and URL helpers.
module.exports = {
cleanText,
containsAnyKeyword,
containsAllKeywords,
matchesKeywordGroups,
isValidText,
extractDomain,
normalizeUrl,
};
/**
* Text processing utilities for cleaning and validating content
* Extracted from linkedout.js for reuse across parsers
*/
/**
 * Clean text by removing hashtags, URLs, emojis, and normalizing whitespace.
 *
 * @param {string} text - Raw text to clean; non-string or empty input yields "".
 * @returns {string} Cleaned, whitespace-normalized text.
 */
function cleanText(text) {
  if (!text || typeof text !== "string") {
    return "";
  }
  // Remove hashtags (e.g. "#hiring")
  text = text.replace(/#\w+/g, "");
  // Remove compound "hashtag-xxx" tokens BEFORE the bare word "hashtag":
  // stripping the bare word first would turn "hashtag-foo" into "-foo",
  // which the compound pattern can no longer match, leaving residue.
  text = text.replace(/hashtag-\w+/gi, "");
  text = text.replace(/\bhashtag\b/gi, "");
  // Remove URLs
  text = text.replace(/https?:\/\/[^\s]+/g, "");
  // Remove emojis (Unicode ranges for common emoji)
  text = text.replace(
    /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
    ""
  );
  // Collapse runs of whitespace and trim the ends
  text = text.replace(/\s+/g, " ").trim();
  return text;
}
/**
 * Case-insensitive check: does the text contain at least one of the keywords?
 *
 * @param {string} text - Text to search.
 * @param {Array<string>} keywords - Candidate keywords.
 * @returns {boolean} True when any keyword occurs in the text.
 */
function containsAnyKeyword(text, keywords) {
  if (!text || !Array.isArray(keywords)) {
    return false;
  }
  const haystack = text.toLowerCase();
  for (const keyword of keywords) {
    if (haystack.includes(keyword.toLowerCase())) {
      return true;
    }
  }
  return false;
}
/**
 * Case-insensitive check: does the text contain every one of the keywords?
 *
 * @param {string} text - Text to search.
 * @param {Array<string>} keywords - Required keywords (empty array matches).
 * @returns {boolean} True when all keywords occur in the text.
 */
function containsAllKeywords(text, keywords) {
  if (!text || !Array.isArray(keywords)) {
    return false;
  }
  const haystack = text.toLowerCase();
  for (const keyword of keywords) {
    if (!haystack.includes(keyword.toLowerCase())) {
      return false;
    }
  }
  return true;
}
/**
 * Check if text matches keyword groups: AND logic between groups, OR logic
 * within each group.
 *
 * @param {string} text - Text to search in.
 * @param {Array<Array<string>>} keywordGroups - Each group is an array of OR keywords.
 * @returns {boolean} True if every group has at least one matching keyword.
 */
function matchesKeywordGroups(text, keywordGroups) {
  if (!text || !Array.isArray(keywordGroups) || keywordGroups.length === 0) {
    return false;
  }
  const haystack = text.toLowerCase();
  for (const group of keywordGroups) {
    // Malformed or empty groups fail the whole match.
    if (!Array.isArray(group) || group.length === 0) {
      return false;
    }
    // OR within the group: one hit is enough.
    const groupMatched = group.some((keyword) =>
      haystack.includes(keyword.toLowerCase().trim())
    );
    if (!groupMatched) {
      return false;
    }
  }
  // Every group matched (AND across groups).
  return true;
}
/**
 * Validate if text meets basic quality criteria: a string of at least
 * `minLength` characters containing at least one alphanumeric character.
 *
 * @param {string} text - Text to validate.
 * @param {number} [minLength=30] - Minimum acceptable length.
 * @returns {boolean} True when the text passes all checks.
 */
function isValidText(text, minLength = 30) {
  if (!text || typeof text !== "string") {
    return false;
  }
  const longEnough = text.length >= minLength;
  const hasAlphanumeric = /[a-zA-Z0-9]/.test(text);
  return longEnough && hasAlphanumeric;
}
/**
 * Extract the hostname from a URL string.
 *
 * @param {string} url - URL to parse.
 * @returns {string|null} The hostname, or null for invalid/empty input.
 */
function extractDomain(url) {
  if (!url || typeof url !== "string") {
    return null;
  }
  let parsed;
  try {
    parsed = new URL(url);
  } catch (error) {
    // Unparseable URL: signal absence rather than throwing.
    return null;
  }
  return parsed.hostname;
}
/**
 * Normalize a URL by dropping query parameters and fragments, keeping only
 * protocol, host, and path.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} Normalized URL; "" for empty/non-string input;
 *   the original string if it cannot be parsed.
 */
function normalizeUrl(url) {
  if (!url || typeof url !== "string") {
    return "";
  }
  try {
    const { protocol, hostname, pathname } = new URL(url);
    return `${protocol}//${hostname}${pathname}`;
  } catch (error) {
    // Not a parseable URL: hand back the input unchanged.
    return url;
  }
}
// Public API: text cleaning, keyword matching, and URL helpers.
module.exports = {
cleanText,
containsAnyKeyword,
containsAllKeywords,
matchesKeywordGroups,
isValidText,
extractDomain,
normalizeUrl,
};

View File

@ -1,345 +1,345 @@
/**
* SkipTheDrive Job Parser
*
* Parses remote job listings from SkipTheDrive.com
* Supports keyword search, job type filters, and pagination
*/
const { chromium } = require("playwright");
const path = require("path");
// Import from ai-analyzer core package
const {
logger,
cleanText,
containsAnyKeyword,
containsAllKeywords,
parseLocationFilters,
validateLocationAgainstFilters,
extractLocationFromProfile,
analyzeBatch,
checkOllamaStatus,
} = require("../../ai-analyzer");
/**
 * Build search URL for SkipTheDrive.
 *
 * @param {string} keyword - Search keyword.
 * @param {string} orderBy - Sort order (date, relevance); falsy to omit.
 * @param {Array<string>} jobTypes - Job types to filter (part time, full time, contract).
 * @returns {string} - Formatted search URL.
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  // Collect query segments and join them at the end; the base already
  // carries the "?s=" search parameter.
  const segments = [
    `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`,
  ];
  if (orderBy) {
    segments.push(`orderby=${orderBy}`);
  }
  // One repeated "jobtype" parameter per requested job type.
  for (const type of jobTypes) {
    segments.push(`jobtype=${encodeURIComponent(type)}`);
  }
  return segments.join("&");
}
/**
 * Extract job data from a single job listing element.
 *
 * @param {Element} article - Job listing DOM element (Playwright handle).
 * @returns {Object|null} - Extracted job data, or null if extraction fails.
 */
async function extractJobData(article) {
  try {
    // Local helper for the repeated "find child element, read its text"
    // pattern; returns "" when the element is absent.
    const textOf = async (selector) => {
      const el = await article.$(selector);
      return el ? await el.textContent() : "";
    };
    // Title and job URL come from the same anchor element.
    const titleLink = await article.$("h2.post-title a");
    const title = titleLink ? await titleLink.textContent() : "";
    const jobUrl = titleLink ? await titleLink.getAttribute("href") : "";
    // Posting date: machine-readable datetime attribute plus display text.
    const timeEl = await article.$("time.post-date");
    const datePosted = timeEl ? await timeEl.getAttribute("datetime") : "";
    const dateText = timeEl ? await timeEl.textContent() : "";
    // Company name; the first whitespace-delimited token is an icon glyph.
    let company = await textOf(
      ".custom_fields_company_name_display_search_results"
    );
    company = company.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon
    // "Days ago" label; same leading-icon cleanup.
    let daysAgo = await textOf(
      ".custom_fields_job_date_display_search_results"
    );
    daysAgo = daysAgo.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon
    // Short description excerpt.
    const description = await textOf(".excerpt_part");
    // Sponsored/featured listings carry a dedicated marker element.
    const isFeatured = !!(await article.$(".custom_fields_sponsored_job"));
    // The numeric job ID is embedded in the article's DOM id ("post-1234").
    const articleId = await article.getAttribute("id");
    const jobId = articleId ? articleId.replace("post-", "") : "";
    return {
      jobId,
      title: cleanText(title),
      company: cleanText(company),
      jobUrl,
      datePosted,
      dateText: cleanText(dateText),
      daysAgo: cleanText(daysAgo),
      description: cleanText(description),
      isFeatured,
      source: "skipthedrive",
      timestamp: new Date().toISOString(),
    };
  } catch (error) {
    logger.error(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Parse SkipTheDrive job listings.
 *
 * Launches Chromium via Playwright, searches for each configured keyword,
 * paginates through result pages, de-duplicates by job ID, filters listings
 * by keyword match and (optionally) a location string, and optionally runs
 * AI relevance analysis on the accepted results.
 *
 * @param {Object} options - Parser options; every field falls back to an env var:
 *   keywords (SEARCH_KEYWORDS, comma-separated), jobTypes (JOB_TYPES),
 *   locationFilter (LOCATION_FILTER), maxPages (MAX_PAGES, default 5),
 *   headless (HEADLESS !== "false"), enableAI (ENABLE_AI_ANALYSIS === "true"),
 *   aiContext (AI_CONTEXT), useAndLogic (default false).
 * @returns {Promise<Object>} - { results, rejectedResults, metadata }.
 * @throws Rethrows any fatal error after logging; the browser is always closed.
 */
async function parseSkipTheDrive(options = {}) {
const {
keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [
"software engineer",
"developer",
],
jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [],
locationFilter = process.env.LOCATION_FILTER || "",
// NOTE(review): parseInt without an explicit radix; env value assumed base-10.
maxPages = parseInt(process.env.MAX_PAGES) || 5,
headless = process.env.HEADLESS !== "false",
enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
useAndLogic = false, // Use AND logic instead of OR logic for keywords
} = options;
logger.step("Starting SkipTheDrive parser...");
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
logger.info(
`📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
);
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
logger.info(`📄 Max Pages: ${maxPages}`);
const browser = await chromium.launch({
headless,
args: [
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-dev-shm-usage",
],
});
const context = await browser.newContext({
userAgent:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
});
const results = [];
const rejectedResults = [];
// Tracks job IDs already processed so the same listing found under
// multiple keywords/pages is only handled once.
const seenJobs = new Set();
try {
// For AND logic, combine all keywords into a single search query
// For OR logic, search each keyword separately
const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords;
// Search for each keyword (or combined keyword for AND logic)
for (const keyword of searchKeywords) {
logger.info(`\n🔍 Searching for: ${keyword}`);
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
const page = await context.newPage();
try {
logger.info(
`Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}`
);
await page.goto(searchUrl, {
waitUntil: "domcontentloaded",
timeout: 30000,
});
logger.info(
`Navigation completed successfully at ${new Date().toISOString()}`
);
// Wait for job listings to load
logger.info("Waiting for selector #loops-wrapper");
// Missing results container is treated as "no results", not an error.
await page
.waitForSelector("#loops-wrapper", { timeout: 5000 })
.catch(() => {
logger.warning(`No results found for keyword: ${keyword}`);
});
logger.info("Selector wait completed");
let currentPage = 1;
let hasNextPage = true;
while (hasNextPage && currentPage <= maxPages) {
logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
// Extract all job articles on current page
const jobArticles = await page.$$("article[id^='post-']");
logger.info(
`Found ${jobArticles.length} job listings on page ${currentPage}`
);
for (const article of jobArticles) {
const jobData = await extractJobData(article);
// Skip extraction failures and duplicates seen under other keywords.
if (!jobData || seenJobs.has(jobData.jobId)) {
continue;
}
seenJobs.add(jobData.jobId);
// Add keyword that found this job
jobData.searchKeyword = keyword;
// Validate job against keywords
const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
const keywordMatch = useAndLogic
? containsAllKeywords(fullText, keywords)
: containsAnyKeyword(fullText, keywords);
if (!keywordMatch) {
rejectedResults.push({
...jobData,
rejected: true,
reason: useAndLogic
? "Not all keywords found in job listing"
: "Keywords not found in job listing",
});
continue;
}
// Location validation (if enabled)
if (locationFilter) {
const locationFilters = parseLocationFilters(locationFilter);
// For SkipTheDrive, most jobs are remote, but we can check the title/description
const locationValid =
fullText.toLowerCase().includes("remote") ||
locationFilters.some((filter) =>
fullText.toLowerCase().includes(filter.toLowerCase())
);
if (!locationValid) {
rejectedResults.push({
...jobData,
rejected: true,
reason: "Location requirements not met",
});
continue;
}
jobData.locationValid = locationValid;
}
logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`);
results.push(jobData);
}
// Check for next page
const nextPageLink = await page.$("a.nextp");
if (nextPageLink && currentPage < maxPages) {
logger.info("📄 Moving to next page...");
await nextPageLink.click();
await page.waitForLoadState("domcontentloaded");
await page.waitForTimeout(2000); // Wait for content to load
currentPage++;
} else {
hasNextPage = false;
}
}
} catch (error) {
// One keyword failing should not abort the remaining searches.
logger.error(`Error processing keyword "${keyword}": ${error.message}`);
} finally {
await page.close();
}
}
logger.success(`\n✅ Parsing complete!`);
logger.info(`📊 Total jobs found: ${results.length}`);
logger.info(`❌ Rejected jobs: ${rejectedResults.length}`);
// Run AI analysis if enabled
let aiAnalysis = null;
if (enableAI && results.length > 0) {
logger.step("Running AI analysis on job listings...");
// AI step is best-effort: skipped entirely when Ollama is unreachable.
const aiAvailable = await checkOllamaStatus();
if (aiAvailable) {
const analysisData = results.map((job) => ({
text: `${job.title} at ${job.company}. ${job.description}`,
metadata: {
jobId: job.jobId,
company: job.company,
daysAgo: job.daysAgo,
},
}));
aiAnalysis = await analyzeBatch(analysisData, aiContext);
// Merge AI analysis with results
// NOTE(review): assumes analyzeBatch returns entries in input order — confirm.
results.forEach((job, index) => {
if (aiAnalysis && aiAnalysis[index]) {
job.aiAnalysis = {
isRelevant: aiAnalysis[index].isRelevant,
confidence: aiAnalysis[index].confidence,
reasoning: aiAnalysis[index].reasoning,
};
}
});
logger.success("✅ AI analysis completed");
} else {
logger.warning("⚠️ AI not available - skipping analysis");
}
}
return {
results,
rejectedResults,
metadata: {
source: "skipthedrive",
totalJobs: results.length,
rejectedJobs: rejectedResults.length,
keywords: keywords,
jobTypes: jobTypes,
locationFilter: locationFilter,
aiAnalysisEnabled: enableAI,
aiAnalysisCompleted: !!aiAnalysis,
timestamp: new Date().toISOString(),
},
};
} catch (error) {
logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
throw error;
} finally {
// Always release the browser, even on fatal errors.
await browser.close();
}
}
// Export the parser
// Public API: parseSkipTheDrive is the main entry point; buildSearchUrl and
// extractJobData are exposed for reuse and unit testing.
module.exports = {
parseSkipTheDrive,
buildSearchUrl,
extractJobData,
};
/**
* SkipTheDrive Job Parser
*
* Parses remote job listings from SkipTheDrive.com
* Supports keyword search, job type filters, and pagination
*/
const { chromium } = require("playwright");
const path = require("path");
// Import from ai-analyzer core package
const {
logger,
cleanText,
containsAnyKeyword,
containsAllKeywords,
parseLocationFilters,
validateLocationAgainstFilters,
extractLocationFromProfile,
analyzeBatch,
checkOllamaStatus,
} = require("../../ai-analyzer");
/**
 * Build search URL for SkipTheDrive.
 *
 * @param {string} keyword - Search keyword.
 * @param {string} orderBy - Sort order (date, relevance); falsy to omit.
 * @param {Array<string>} jobTypes - Job types to filter (part time, full time, contract).
 * @returns {string} - Formatted search URL.
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  // Collect query segments and join them at the end; the base already
  // carries the "?s=" search parameter.
  const segments = [
    `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`,
  ];
  if (orderBy) {
    segments.push(`orderby=${orderBy}`);
  }
  // One repeated "jobtype" parameter per requested job type.
  for (const type of jobTypes) {
    segments.push(`jobtype=${encodeURIComponent(type)}`);
  }
  return segments.join("&");
}
/**
 * Extract job data from a single job listing element.
 *
 * @param {Element} article - Job listing DOM element (Playwright handle).
 * @returns {Object|null} - Extracted job data, or null if extraction fails.
 */
async function extractJobData(article) {
  try {
    // Local helper for the repeated "find child element, read its text"
    // pattern; returns "" when the element is absent.
    const textOf = async (selector) => {
      const el = await article.$(selector);
      return el ? await el.textContent() : "";
    };
    // Title and job URL come from the same anchor element.
    const titleLink = await article.$("h2.post-title a");
    const title = titleLink ? await titleLink.textContent() : "";
    const jobUrl = titleLink ? await titleLink.getAttribute("href") : "";
    // Posting date: machine-readable datetime attribute plus display text.
    const timeEl = await article.$("time.post-date");
    const datePosted = timeEl ? await timeEl.getAttribute("datetime") : "";
    const dateText = timeEl ? await timeEl.textContent() : "";
    // Company name; the first whitespace-delimited token is an icon glyph.
    let company = await textOf(
      ".custom_fields_company_name_display_search_results"
    );
    company = company.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon
    // "Days ago" label; same leading-icon cleanup.
    let daysAgo = await textOf(
      ".custom_fields_job_date_display_search_results"
    );
    daysAgo = daysAgo.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon
    // Short description excerpt.
    const description = await textOf(".excerpt_part");
    // Sponsored/featured listings carry a dedicated marker element.
    const isFeatured = !!(await article.$(".custom_fields_sponsored_job"));
    // The numeric job ID is embedded in the article's DOM id ("post-1234").
    const articleId = await article.getAttribute("id");
    const jobId = articleId ? articleId.replace("post-", "") : "";
    return {
      jobId,
      title: cleanText(title),
      company: cleanText(company),
      jobUrl,
      datePosted,
      dateText: cleanText(dateText),
      daysAgo: cleanText(daysAgo),
      description: cleanText(description),
      isFeatured,
      source: "skipthedrive",
      timestamp: new Date().toISOString(),
    };
  } catch (error) {
    logger.error(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Parse SkipTheDrive job listings.
 *
 * Launches Chromium via Playwright, searches for each configured keyword,
 * paginates through result pages, de-duplicates by job ID, filters listings
 * by keyword match and (optionally) a location string, and optionally runs
 * AI relevance analysis on the accepted results.
 *
 * @param {Object} options - Parser options; every field falls back to an env var:
 *   keywords (SEARCH_KEYWORDS, comma-separated), jobTypes (JOB_TYPES),
 *   locationFilter (LOCATION_FILTER), maxPages (MAX_PAGES, default 5),
 *   headless (HEADLESS !== "false"), enableAI (ENABLE_AI_ANALYSIS === "true"),
 *   aiContext (AI_CONTEXT), useAndLogic (default false).
 * @returns {Promise<Object>} - { results, rejectedResults, metadata }.
 * @throws Rethrows any fatal error after logging; the browser is always closed.
 */
async function parseSkipTheDrive(options = {}) {
const {
keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [
"software engineer",
"developer",
],
jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [],
locationFilter = process.env.LOCATION_FILTER || "",
// NOTE(review): parseInt without an explicit radix; env value assumed base-10.
maxPages = parseInt(process.env.MAX_PAGES) || 5,
headless = process.env.HEADLESS !== "false",
enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
useAndLogic = false, // Use AND logic instead of OR logic for keywords
} = options;
logger.step("Starting SkipTheDrive parser...");
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
logger.info(
`📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
);
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
logger.info(`📄 Max Pages: ${maxPages}`);
const browser = await chromium.launch({
headless,
args: [
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-dev-shm-usage",
],
});
const context = await browser.newContext({
userAgent:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
});
const results = [];
const rejectedResults = [];
// Tracks job IDs already processed so the same listing found under
// multiple keywords/pages is only handled once.
const seenJobs = new Set();
try {
// For AND logic, combine all keywords into a single search query
// For OR logic, search each keyword separately
const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords;
// Search for each keyword (or combined keyword for AND logic)
for (const keyword of searchKeywords) {
logger.info(`\n🔍 Searching for: ${keyword}`);
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
const page = await context.newPage();
try {
logger.info(
`Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}`
);
await page.goto(searchUrl, {
waitUntil: "domcontentloaded",
timeout: 30000,
});
logger.info(
`Navigation completed successfully at ${new Date().toISOString()}`
);
// Wait for job listings to load
logger.info("Waiting for selector #loops-wrapper");
// Missing results container is treated as "no results", not an error.
await page
.waitForSelector("#loops-wrapper", { timeout: 5000 })
.catch(() => {
logger.warning(`No results found for keyword: ${keyword}`);
});
logger.info("Selector wait completed");
let currentPage = 1;
let hasNextPage = true;
while (hasNextPage && currentPage <= maxPages) {
logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
// Extract all job articles on current page
const jobArticles = await page.$$("article[id^='post-']");
logger.info(
`Found ${jobArticles.length} job listings on page ${currentPage}`
);
for (const article of jobArticles) {
const jobData = await extractJobData(article);
// Skip extraction failures and duplicates seen under other keywords.
if (!jobData || seenJobs.has(jobData.jobId)) {
continue;
}
seenJobs.add(jobData.jobId);
// Add keyword that found this job
jobData.searchKeyword = keyword;
// Validate job against keywords
const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
const keywordMatch = useAndLogic
? containsAllKeywords(fullText, keywords)
: containsAnyKeyword(fullText, keywords);
if (!keywordMatch) {
rejectedResults.push({
...jobData,
rejected: true,
reason: useAndLogic
? "Not all keywords found in job listing"
: "Keywords not found in job listing",
});
continue;
}
// Location validation (if enabled)
if (locationFilter) {
const locationFilters = parseLocationFilters(locationFilter);
// For SkipTheDrive, most jobs are remote, but we can check the title/description
const locationValid =
fullText.toLowerCase().includes("remote") ||
locationFilters.some((filter) =>
fullText.toLowerCase().includes(filter.toLowerCase())
);
if (!locationValid) {
rejectedResults.push({
...jobData,
rejected: true,
reason: "Location requirements not met",
});
continue;
}
jobData.locationValid = locationValid;
}
logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`);
results.push(jobData);
}
// Check for next page
const nextPageLink = await page.$("a.nextp");
if (nextPageLink && currentPage < maxPages) {
logger.info("📄 Moving to next page...");
await nextPageLink.click();
await page.waitForLoadState("domcontentloaded");
await page.waitForTimeout(2000); // Wait for content to load
currentPage++;
} else {
hasNextPage = false;
}
}
} catch (error) {
// One keyword failing should not abort the remaining searches.
logger.error(`Error processing keyword "${keyword}": ${error.message}`);
} finally {
await page.close();
}
}
logger.success(`\n✅ Parsing complete!`);
logger.info(`📊 Total jobs found: ${results.length}`);
logger.info(`❌ Rejected jobs: ${rejectedResults.length}`);
// Run AI analysis if enabled
let aiAnalysis = null;
if (enableAI && results.length > 0) {
logger.step("Running AI analysis on job listings...");
// AI step is best-effort: skipped entirely when Ollama is unreachable.
const aiAvailable = await checkOllamaStatus();
if (aiAvailable) {
const analysisData = results.map((job) => ({
text: `${job.title} at ${job.company}. ${job.description}`,
metadata: {
jobId: job.jobId,
company: job.company,
daysAgo: job.daysAgo,
},
}));
aiAnalysis = await analyzeBatch(analysisData, aiContext);
// Merge AI analysis with results
// NOTE(review): assumes analyzeBatch returns entries in input order — confirm.
results.forEach((job, index) => {
if (aiAnalysis && aiAnalysis[index]) {
job.aiAnalysis = {
isRelevant: aiAnalysis[index].isRelevant,
confidence: aiAnalysis[index].confidence,
reasoning: aiAnalysis[index].reasoning,
};
}
});
logger.success("✅ AI analysis completed");
} else {
logger.warning("⚠️ AI not available - skipping analysis");
}
}
return {
results,
rejectedResults,
metadata: {
source: "skipthedrive",
totalJobs: results.length,
rejectedJobs: rejectedResults.length,
keywords: keywords,
jobTypes: jobTypes,
locationFilter: locationFilter,
aiAnalysisEnabled: enableAI,
aiAnalysisCompleted: !!aiAnalysis,
timestamp: new Date().toISOString(),
},
};
} catch (error) {
logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
throw error;
} finally {
// Always release the browser, even on fatal errors.
await browser.close();
}
}
// Export the parser
// Public API: parseSkipTheDrive is the main entry point; buildSearchUrl and
// extractJobData are exposed for reuse and unit testing.
module.exports = {
parseSkipTheDrive,
buildSearchUrl,
extractJobData,
};