Refactor text utilities for improved clarity and maintainability
- Cleaned up and organized text processing utilities in `text-utils.js` for better readability and reuse. - Ensured consistent formatting and documentation across utility functions. - No functional changes were made; the focus was on code structure and clarity.
This commit is contained in:
parent
673f84d388
commit
691d61aaee
@ -1,146 +1,146 @@
|
||||
/**
|
||||
* Text processing utilities for cleaning and validating content
|
||||
* Extracted from linkedout.js for reuse across parsers
|
||||
*/
|
||||
|
||||
/**
 * Clean text by removing hashtags, URLs, emojis, and normalizing whitespace.
 * Non-string or empty input yields "".
 */
function cleanText(text) {
  if (!text || typeof text !== "string") {
    return "";
  }

  // Strip noise in order: #hashtags, literal "hashtag" tokens,
  // hashtag-xyz markers, URLs, then common emoji code points.
  const stripped = text
    .replace(/#\w+/g, "")
    .replace(/\bhashtag\b/gi, "")
    .replace(/hashtag-\w+/gi, "")
    .replace(/https?:\/\/[^\s]+/g, "")
    .replace(
      /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
      ""
    );

  // Collapse runs of whitespace into single spaces and trim the ends.
  return stripped.replace(/\s+/g, " ").trim();
}
|
||||
|
||||
/**
 * Case-insensitive check for whether ANY of the keywords occurs in text.
 * Returns false for empty text or a non-array keywords argument.
 */
function containsAnyKeyword(text, keywords) {
  if (!text || !Array.isArray(keywords)) {
    return false;
  }

  const haystack = text.toLowerCase();
  for (const keyword of keywords) {
    if (haystack.includes(keyword.toLowerCase())) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Case-insensitive check for whether ALL of the keywords occur in text.
 * Returns false for empty text or a non-array keywords argument.
 */
function containsAllKeywords(text, keywords) {
  if (!text || !Array.isArray(keywords)) {
    return false;
  }

  const haystack = text.toLowerCase();
  for (const keyword of keywords) {
    if (!haystack.includes(keyword.toLowerCase())) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Check if text matches keyword groups: AND logic between groups,
 * OR logic within each group.
 * @param {string} text - Text to search in
 * @param {Array<Array<string>>} keywordGroups - Each group is an array of OR keywords
 * @returns {boolean} - True only if every group contributes at least one match
 */
function matchesKeywordGroups(text, keywordGroups) {
  if (!text || !Array.isArray(keywordGroups) || keywordGroups.length === 0) {
    return false;
  }

  const haystack = text.toLowerCase();

  // A group matches when it is a non-empty array and at least one of its
  // keywords (trimmed, case-insensitive) appears in the text.
  const groupMatches = (group) =>
    Array.isArray(group) &&
    group.length > 0 &&
    group.some((keyword) => haystack.includes(keyword.toLowerCase().trim()));

  // AND across groups: every group must match.
  return keywordGroups.every(groupMatches);
}
|
||||
|
||||
/**
 * Validate if text meets basic quality criteria: it is a non-empty string,
 * at least `minLength` characters long, and contains at least one
 * alphanumeric character.
 */
function isValidText(text, minLength = 30) {
  if (!text || typeof text !== "string") {
    return false;
  }

  // Reject too-short text and text with no alphanumeric content
  // (e.g. strings made entirely of punctuation or symbols).
  return text.length >= minLength && /[a-zA-Z0-9]/.test(text);
}
|
||||
|
||||
/**
 * Extract the hostname from a URL string.
 * Returns null for non-string input or anything the URL parser rejects.
 */
function extractDomain(url) {
  if (!url || typeof url !== "string") {
    return null;
  }

  try {
    return new URL(url).hostname;
  } catch (error) {
    // Not a parseable absolute URL.
    return null;
  }
}
|
||||
|
||||
/**
 * Normalize URL by removing query parameters and fragments.
 *
 * Keeps protocol, host (including any explicit port), and path.
 * Returns "" for non-string input, and the input unchanged when it is not
 * a parseable absolute URL.
 */
function normalizeUrl(url) {
  if (!url || typeof url !== "string") {
    return "";
  }

  try {
    const urlObj = new URL(url);
    // Use `host` (hostname + optional port) rather than `hostname` so an
    // explicit port such as ":8080" survives normalization.
    return `${urlObj.protocol}//${urlObj.host}${urlObj.pathname}`;
  } catch (error) {
    // Not an absolute URL — return it as-is rather than losing data.
    return url;
  }
}
|
||||
|
||||
// Public API of the text utilities module.
module.exports = {
  cleanText,
  containsAnyKeyword,
  containsAllKeywords,
  matchesKeywordGroups,
  isValidText,
  extractDomain,
  normalizeUrl,
};
|
||||
/**
|
||||
* Text processing utilities for cleaning and validating content
|
||||
* Extracted from linkedout.js for reuse across parsers
|
||||
*/
|
||||
|
||||
/**
 * Clean text by removing hashtags, URLs, emojis, and normalizing whitespace.
 * Non-string or empty input yields "".
 */
function cleanText(text) {
  if (!text || typeof text !== "string") {
    return "";
  }

  // Strip noise in order: #hashtags, literal "hashtag" tokens,
  // hashtag-xyz markers, URLs, then common emoji code points.
  const stripped = text
    .replace(/#\w+/g, "")
    .replace(/\bhashtag\b/gi, "")
    .replace(/hashtag-\w+/gi, "")
    .replace(/https?:\/\/[^\s]+/g, "")
    .replace(
      /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
      ""
    );

  // Collapse runs of whitespace into single spaces and trim the ends.
  return stripped.replace(/\s+/g, " ").trim();
}
|
||||
|
||||
/**
 * Case-insensitive check for whether ANY of the keywords occurs in text.
 * Returns false for empty text or a non-array keywords argument.
 */
function containsAnyKeyword(text, keywords) {
  if (!text || !Array.isArray(keywords)) {
    return false;
  }

  const haystack = text.toLowerCase();
  for (const keyword of keywords) {
    if (haystack.includes(keyword.toLowerCase())) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Case-insensitive check for whether ALL of the keywords occur in text.
 * Returns false for empty text or a non-array keywords argument.
 */
function containsAllKeywords(text, keywords) {
  if (!text || !Array.isArray(keywords)) {
    return false;
  }

  const haystack = text.toLowerCase();
  for (const keyword of keywords) {
    if (!haystack.includes(keyword.toLowerCase())) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Check if text matches keyword groups: AND logic between groups,
 * OR logic within each group.
 * @param {string} text - Text to search in
 * @param {Array<Array<string>>} keywordGroups - Each group is an array of OR keywords
 * @returns {boolean} - True only if every group contributes at least one match
 */
function matchesKeywordGroups(text, keywordGroups) {
  if (!text || !Array.isArray(keywordGroups) || keywordGroups.length === 0) {
    return false;
  }

  const haystack = text.toLowerCase();

  // A group matches when it is a non-empty array and at least one of its
  // keywords (trimmed, case-insensitive) appears in the text.
  const groupMatches = (group) =>
    Array.isArray(group) &&
    group.length > 0 &&
    group.some((keyword) => haystack.includes(keyword.toLowerCase().trim()));

  // AND across groups: every group must match.
  return keywordGroups.every(groupMatches);
}
|
||||
|
||||
/**
 * Validate if text meets basic quality criteria: it is a non-empty string,
 * at least `minLength` characters long, and contains at least one
 * alphanumeric character.
 */
function isValidText(text, minLength = 30) {
  if (!text || typeof text !== "string") {
    return false;
  }

  // Reject too-short text and text with no alphanumeric content
  // (e.g. strings made entirely of punctuation or symbols).
  return text.length >= minLength && /[a-zA-Z0-9]/.test(text);
}
|
||||
|
||||
/**
 * Extract the hostname from a URL string.
 * Returns null for non-string input or anything the URL parser rejects.
 */
function extractDomain(url) {
  if (!url || typeof url !== "string") {
    return null;
  }

  try {
    return new URL(url).hostname;
  } catch (error) {
    // Not a parseable absolute URL.
    return null;
  }
}
|
||||
|
||||
/**
 * Normalize URL by removing query parameters and fragments.
 *
 * Keeps protocol, host (including any explicit port), and path.
 * Returns "" for non-string input, and the input unchanged when it is not
 * a parseable absolute URL.
 */
function normalizeUrl(url) {
  if (!url || typeof url !== "string") {
    return "";
  }

  try {
    const urlObj = new URL(url);
    // Use `host` (hostname + optional port) rather than `hostname` so an
    // explicit port such as ":8080" survives normalization.
    return `${urlObj.protocol}//${urlObj.host}${urlObj.pathname}`;
  } catch (error) {
    // Not an absolute URL — return it as-is rather than losing data.
    return url;
  }
}
|
||||
|
||||
// Public API of the text utilities module.
module.exports = {
  cleanText,
  containsAnyKeyword,
  containsAllKeywords,
  matchesKeywordGroups,
  isValidText,
  extractDomain,
  normalizeUrl,
};
|
||||
|
||||
@ -1,345 +1,345 @@
|
||||
/**
|
||||
* SkipTheDrive Job Parser
|
||||
*
|
||||
* Parses remote job listings from SkipTheDrive.com
|
||||
* Supports keyword search, job type filters, and pagination
|
||||
*/
|
||||
|
||||
const { chromium } = require("playwright");
|
||||
const path = require("path");
|
||||
|
||||
// Import from ai-analyzer core package
|
||||
const {
|
||||
logger,
|
||||
cleanText,
|
||||
containsAnyKeyword,
|
||||
containsAllKeywords,
|
||||
parseLocationFilters,
|
||||
validateLocationAgainstFilters,
|
||||
extractLocationFromProfile,
|
||||
analyzeBatch,
|
||||
checkOllamaStatus,
|
||||
} = require("../../ai-analyzer");
|
||||
|
||||
/**
 * Build search URL for SkipTheDrive
 * @param {string} keyword - Search keyword
 * @param {string} orderBy - Sort order (date, relevance); skipped when falsy
 * @param {Array<string>} jobTypes - Job types to filter (part time, full time, contract)
 * @returns {string} - Formatted search URL
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  // Collect the base URL and each query parameter, then join with "&".
  const parts = [
    `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`,
  ];

  if (orderBy) {
    parts.push(`orderby=${orderBy}`);
  }

  // One jobtype parameter per requested job type.
  for (const type of jobTypes) {
    parts.push(`jobtype=${encodeURIComponent(type)}`);
  }

  return parts.join("&");
}
|
||||
|
||||
/**
 * Extract job data from a single job listing element.
 *
 * Reads title, URL, dates, company, excerpt, and the featured flag from the
 * listing markup and returns a normalized record (text fields run through
 * cleanText). Returns null if any part of extraction throws, so one broken
 * listing never aborts a page scan.
 *
 * @param {Element} article - Job listing DOM element (Playwright element handle)
 * @returns {Object|null} - Extracted job data, or null on error
 */
async function extractJobData(article) {
  try {
    // Extract job title and URL
    const titleElement = await article.$("h2.post-title a");
    const title = titleElement ? await titleElement.textContent() : "";
    const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";

    // Extract date (machine-readable datetime attr plus the display text)
    const dateElement = await article.$("time.post-date");
    const datePosted = dateElement
      ? await dateElement.getAttribute("datetime")
      : "";
    const dateText = dateElement ? await dateElement.textContent() : "";

    // Extract company name
    const companyElement = await article.$(
      ".custom_fields_company_name_display_search_results"
    );
    let company = companyElement ? await companyElement.textContent() : "";
    company = company.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon
    // NOTE(review): the regex above strips the first whitespace-delimited
    // token, assuming it is an icon glyph; if the field ever starts with the
    // company name itself, the first word would be lost — confirm markup.

    // Extract days ago
    const daysAgoElement = await article.$(
      ".custom_fields_job_date_display_search_results"
    );
    let daysAgo = daysAgoElement ? await daysAgoElement.textContent() : "";
    daysAgo = daysAgo.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon

    // Extract job description excerpt
    const excerptElement = await article.$(".excerpt_part");
    const description = excerptElement
      ? await excerptElement.textContent()
      : "";

    // Check if featured/sponsored (presence of the element is the flag)
    const featuredElement = await article.$(".custom_fields_sponsored_job");
    const isFeatured = !!featuredElement;

    // Extract job ID from article ID (e.g. "post-12345" -> "12345")
    const articleId = await article.getAttribute("id");
    const jobId = articleId ? articleId.replace("post-", "") : "";

    return {
      jobId,
      title: cleanText(title),
      company: cleanText(company),
      jobUrl,
      datePosted,
      dateText: cleanText(dateText),
      daysAgo: cleanText(daysAgo),
      description: cleanText(description),
      isFeatured,
      source: "skipthedrive",
      timestamp: new Date().toISOString(),
    };
  } catch (error) {
    // A bad listing should not abort the whole page scan.
    logger.error(`Error extracting job data: ${error.message}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Parse SkipTheDrive job listings.
 *
 * Launches Chromium, runs one search per keyword (or one combined search
 * when useAndLogic is true), paginates through results, filters each
 * listing by keyword and optional location, and optionally runs AI
 * relevance analysis on the accepted jobs. Every option falls back to an
 * environment variable.
 *
 * @param {Object} options - Parser options
 * @param {Array<string>} [options.keywords] - Search keywords (SEARCH_KEYWORDS)
 * @param {Array<string>} [options.jobTypes] - Job type filters (JOB_TYPES)
 * @param {string} [options.locationFilter] - Location filter string (LOCATION_FILTER)
 * @param {number} [options.maxPages] - Max result pages per keyword (MAX_PAGES, default 5)
 * @param {boolean} [options.headless] - Run the browser headless (HEADLESS !== "false")
 * @param {boolean} [options.enableAI] - Enable AI analysis (ENABLE_AI_ANALYSIS === "true")
 * @param {string} [options.aiContext] - Context string for AI analysis (AI_CONTEXT)
 * @param {boolean} [options.useAndLogic] - AND (all keywords) vs OR (any keyword) matching
 * @returns {Promise<Object>} - { results, rejectedResults, metadata }
 * @throws Rethrows any fatal error after logging; the browser is always closed.
 */
async function parseSkipTheDrive(options = {}) {
  const {
    keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [
      "software engineer",
      "developer",
    ],
    jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [],
    locationFilter = process.env.LOCATION_FILTER || "",
    maxPages = parseInt(process.env.MAX_PAGES) || 5,
    headless = process.env.HEADLESS !== "false",
    enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
    aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
    useAndLogic = false, // Use AND logic instead of OR logic for keywords
  } = options;

  logger.step("Starting SkipTheDrive parser...");
  logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
  logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
  logger.info(
    `📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
  );
  logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
  logger.info(`📄 Max Pages: ${maxPages}`);

  const browser = await chromium.launch({
    headless,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
    ],
  });

  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  });

  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set(); // jobIds already handled, for cross-keyword de-duplication

  try {
    // For AND logic, combine all keywords into a single search query
    // For OR logic, search each keyword separately
    const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords;

    // Search for each keyword (or combined keyword for AND logic)
    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);

      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
      const page = await context.newPage();

      try {
        logger.info(
          `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}`
        );
        await page.goto(searchUrl, {
          waitUntil: "domcontentloaded",
          timeout: 30000,
        });
        logger.info(
          `Navigation completed successfully at ${new Date().toISOString()}`
        );

        // Wait for job listings to load
        logger.info("Waiting for selector #loops-wrapper");
        await page
          .waitForSelector("#loops-wrapper", { timeout: 5000 })
          .catch(() => {
            // A missing results container just means zero hits — continue.
            logger.warning(`No results found for keyword: ${keyword}`);
          });
        logger.info("Selector wait completed");

        let currentPage = 1;
        let hasNextPage = true;

        while (hasNextPage && currentPage <= maxPages) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);

          // Extract all job articles on current page
          const jobArticles = await page.$$("article[id^='post-']");
          logger.info(
            `Found ${jobArticles.length} job listings on page ${currentPage}`
          );

          for (const article of jobArticles) {
            const jobData = await extractJobData(article);

            // Skip extraction failures and duplicates seen on earlier pages/keywords
            if (!jobData || seenJobs.has(jobData.jobId)) {
              continue;
            }

            seenJobs.add(jobData.jobId);

            // Add keyword that found this job
            jobData.searchKeyword = keyword;

            // Validate job against keywords
            const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
            const keywordMatch = useAndLogic
              ? containsAllKeywords(fullText, keywords)
              : containsAnyKeyword(fullText, keywords);

            if (!keywordMatch) {
              rejectedResults.push({
                ...jobData,
                rejected: true,
                reason: useAndLogic
                  ? "Not all keywords found in job listing"
                  : "Keywords not found in job listing",
              });
              continue;
            }

            // Location validation (if enabled)
            if (locationFilter) {
              const locationFilters = parseLocationFilters(locationFilter);
              // For SkipTheDrive, most jobs are remote, but we can check the title/description
              const locationValid =
                fullText.toLowerCase().includes("remote") ||
                locationFilters.some((filter) =>
                  fullText.toLowerCase().includes(filter.toLowerCase())
                );

              if (!locationValid) {
                rejectedResults.push({
                  ...jobData,
                  rejected: true,
                  reason: "Location requirements not met",
                });
                continue;
              }

              jobData.locationValid = locationValid;
            }

            logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`);
            results.push(jobData);
          }

          // Check for next page
          const nextPageLink = await page.$("a.nextp");
          if (nextPageLink && currentPage < maxPages) {
            logger.info("📄 Moving to next page...");
            await nextPageLink.click();
            await page.waitForLoadState("domcontentloaded");
            await page.waitForTimeout(2000); // Wait for content to load
            currentPage++;
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        // One keyword failing (timeout, navigation error) should not stop
        // the remaining searches.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      } finally {
        await page.close();
      }
    }

    logger.success(`\n✅ Parsing complete!`);
    logger.info(`📊 Total jobs found: ${results.length}`);
    logger.info(`❌ Rejected jobs: ${rejectedResults.length}`);

    // Run AI analysis if enabled
    let aiAnalysis = null;
    if (enableAI && results.length > 0) {
      logger.step("Running AI analysis on job listings...");

      const aiAvailable = await checkOllamaStatus();
      if (aiAvailable) {
        const analysisData = results.map((job) => ({
          text: `${job.title} at ${job.company}. ${job.description}`,
          metadata: {
            jobId: job.jobId,
            company: job.company,
            daysAgo: job.daysAgo,
          },
        }));

        aiAnalysis = await analyzeBatch(analysisData, aiContext);

        // Merge AI analysis with results (positional: analysisData was built
        // from `results` in order, so index i corresponds to results[i])
        results.forEach((job, index) => {
          if (aiAnalysis && aiAnalysis[index]) {
            job.aiAnalysis = {
              isRelevant: aiAnalysis[index].isRelevant,
              confidence: aiAnalysis[index].confidence,
              reasoning: aiAnalysis[index].reasoning,
            };
          }
        });

        logger.success("✅ AI analysis completed");
      } else {
        logger.warning("⚠️ AI not available - skipping analysis");
      }
    }

    return {
      results,
      rejectedResults,
      metadata: {
        source: "skipthedrive",
        totalJobs: results.length,
        rejectedJobs: rejectedResults.length,
        keywords: keywords,
        jobTypes: jobTypes,
        locationFilter: locationFilter,
        aiAnalysisEnabled: enableAI,
        aiAnalysisCompleted: !!aiAnalysis,
        timestamp: new Date().toISOString(),
      },
    };
  } catch (error) {
    logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
    throw error;
  } finally {
    // Always release the browser, even on fatal errors.
    await browser.close();
  }
}
|
||||
|
||||
// Export the parser along with its URL-builder and extractor helpers.
module.exports = {
  parseSkipTheDrive,
  buildSearchUrl,
  extractJobData,
};
|
||||
/**
|
||||
* SkipTheDrive Job Parser
|
||||
*
|
||||
* Parses remote job listings from SkipTheDrive.com
|
||||
* Supports keyword search, job type filters, and pagination
|
||||
*/
|
||||
|
||||
const { chromium } = require("playwright");
|
||||
const path = require("path");
|
||||
|
||||
// Import from ai-analyzer core package
|
||||
const {
|
||||
logger,
|
||||
cleanText,
|
||||
containsAnyKeyword,
|
||||
containsAllKeywords,
|
||||
parseLocationFilters,
|
||||
validateLocationAgainstFilters,
|
||||
extractLocationFromProfile,
|
||||
analyzeBatch,
|
||||
checkOllamaStatus,
|
||||
} = require("../../ai-analyzer");
|
||||
|
||||
/**
 * Build search URL for SkipTheDrive
 * @param {string} keyword - Search keyword
 * @param {string} orderBy - Sort order (date, relevance); skipped when falsy
 * @param {Array<string>} jobTypes - Job types to filter (part time, full time, contract)
 * @returns {string} - Formatted search URL
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  // Collect the base URL and each query parameter, then join with "&".
  const parts = [
    `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`,
  ];

  if (orderBy) {
    parts.push(`orderby=${orderBy}`);
  }

  // One jobtype parameter per requested job type.
  for (const type of jobTypes) {
    parts.push(`jobtype=${encodeURIComponent(type)}`);
  }

  return parts.join("&");
}
|
||||
|
||||
/**
 * Extract job data from a single job listing element.
 *
 * Reads title, URL, dates, company, excerpt, and the featured flag from the
 * listing markup and returns a normalized record (text fields run through
 * cleanText). Returns null if any part of extraction throws, so one broken
 * listing never aborts a page scan.
 *
 * @param {Element} article - Job listing DOM element (Playwright element handle)
 * @returns {Object|null} - Extracted job data, or null on error
 */
async function extractJobData(article) {
  try {
    // Extract job title and URL
    const titleElement = await article.$("h2.post-title a");
    const title = titleElement ? await titleElement.textContent() : "";
    const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";

    // Extract date (machine-readable datetime attr plus the display text)
    const dateElement = await article.$("time.post-date");
    const datePosted = dateElement
      ? await dateElement.getAttribute("datetime")
      : "";
    const dateText = dateElement ? await dateElement.textContent() : "";

    // Extract company name
    const companyElement = await article.$(
      ".custom_fields_company_name_display_search_results"
    );
    let company = companyElement ? await companyElement.textContent() : "";
    company = company.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon
    // NOTE(review): the regex above strips the first whitespace-delimited
    // token, assuming it is an icon glyph; if the field ever starts with the
    // company name itself, the first word would be lost — confirm markup.

    // Extract days ago
    const daysAgoElement = await article.$(
      ".custom_fields_job_date_display_search_results"
    );
    let daysAgo = daysAgoElement ? await daysAgoElement.textContent() : "";
    daysAgo = daysAgo.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon

    // Extract job description excerpt
    const excerptElement = await article.$(".excerpt_part");
    const description = excerptElement
      ? await excerptElement.textContent()
      : "";

    // Check if featured/sponsored (presence of the element is the flag)
    const featuredElement = await article.$(".custom_fields_sponsored_job");
    const isFeatured = !!featuredElement;

    // Extract job ID from article ID (e.g. "post-12345" -> "12345")
    const articleId = await article.getAttribute("id");
    const jobId = articleId ? articleId.replace("post-", "") : "";

    return {
      jobId,
      title: cleanText(title),
      company: cleanText(company),
      jobUrl,
      datePosted,
      dateText: cleanText(dateText),
      daysAgo: cleanText(daysAgo),
      description: cleanText(description),
      isFeatured,
      source: "skipthedrive",
      timestamp: new Date().toISOString(),
    };
  } catch (error) {
    // A bad listing should not abort the whole page scan.
    logger.error(`Error extracting job data: ${error.message}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Parse SkipTheDrive job listings.
 *
 * Launches Chromium, runs one search per keyword (or one combined search
 * when useAndLogic is true), paginates through results, filters each
 * listing by keyword and optional location, and optionally runs AI
 * relevance analysis on the accepted jobs. Every option falls back to an
 * environment variable.
 *
 * @param {Object} options - Parser options
 * @param {Array<string>} [options.keywords] - Search keywords (SEARCH_KEYWORDS)
 * @param {Array<string>} [options.jobTypes] - Job type filters (JOB_TYPES)
 * @param {string} [options.locationFilter] - Location filter string (LOCATION_FILTER)
 * @param {number} [options.maxPages] - Max result pages per keyword (MAX_PAGES, default 5)
 * @param {boolean} [options.headless] - Run the browser headless (HEADLESS !== "false")
 * @param {boolean} [options.enableAI] - Enable AI analysis (ENABLE_AI_ANALYSIS === "true")
 * @param {string} [options.aiContext] - Context string for AI analysis (AI_CONTEXT)
 * @param {boolean} [options.useAndLogic] - AND (all keywords) vs OR (any keyword) matching
 * @returns {Promise<Object>} - { results, rejectedResults, metadata }
 * @throws Rethrows any fatal error after logging; the browser is always closed.
 */
async function parseSkipTheDrive(options = {}) {
  const {
    keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [
      "software engineer",
      "developer",
    ],
    jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [],
    locationFilter = process.env.LOCATION_FILTER || "",
    maxPages = parseInt(process.env.MAX_PAGES) || 5,
    headless = process.env.HEADLESS !== "false",
    enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
    aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
    useAndLogic = false, // Use AND logic instead of OR logic for keywords
  } = options;

  logger.step("Starting SkipTheDrive parser...");
  logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
  logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
  logger.info(
    `📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
  );
  logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
  logger.info(`📄 Max Pages: ${maxPages}`);

  const browser = await chromium.launch({
    headless,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
    ],
  });

  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  });

  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set(); // jobIds already handled, for cross-keyword de-duplication

  try {
    // For AND logic, combine all keywords into a single search query
    // For OR logic, search each keyword separately
    const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords;

    // Search for each keyword (or combined keyword for AND logic)
    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);

      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
      const page = await context.newPage();

      try {
        logger.info(
          `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}`
        );
        await page.goto(searchUrl, {
          waitUntil: "domcontentloaded",
          timeout: 30000,
        });
        logger.info(
          `Navigation completed successfully at ${new Date().toISOString()}`
        );

        // Wait for job listings to load
        logger.info("Waiting for selector #loops-wrapper");
        await page
          .waitForSelector("#loops-wrapper", { timeout: 5000 })
          .catch(() => {
            // A missing results container just means zero hits — continue.
            logger.warning(`No results found for keyword: ${keyword}`);
          });
        logger.info("Selector wait completed");

        let currentPage = 1;
        let hasNextPage = true;

        while (hasNextPage && currentPage <= maxPages) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);

          // Extract all job articles on current page
          const jobArticles = await page.$$("article[id^='post-']");
          logger.info(
            `Found ${jobArticles.length} job listings on page ${currentPage}`
          );

          for (const article of jobArticles) {
            const jobData = await extractJobData(article);

            // Skip extraction failures and duplicates seen on earlier pages/keywords
            if (!jobData || seenJobs.has(jobData.jobId)) {
              continue;
            }

            seenJobs.add(jobData.jobId);

            // Add keyword that found this job
            jobData.searchKeyword = keyword;

            // Validate job against keywords
            const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
            const keywordMatch = useAndLogic
              ? containsAllKeywords(fullText, keywords)
              : containsAnyKeyword(fullText, keywords);

            if (!keywordMatch) {
              rejectedResults.push({
                ...jobData,
                rejected: true,
                reason: useAndLogic
                  ? "Not all keywords found in job listing"
                  : "Keywords not found in job listing",
              });
              continue;
            }

            // Location validation (if enabled)
            if (locationFilter) {
              const locationFilters = parseLocationFilters(locationFilter);
              // For SkipTheDrive, most jobs are remote, but we can check the title/description
              const locationValid =
                fullText.toLowerCase().includes("remote") ||
                locationFilters.some((filter) =>
                  fullText.toLowerCase().includes(filter.toLowerCase())
                );

              if (!locationValid) {
                rejectedResults.push({
                  ...jobData,
                  rejected: true,
                  reason: "Location requirements not met",
                });
                continue;
              }

              jobData.locationValid = locationValid;
            }

            logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`);
            results.push(jobData);
          }

          // Check for next page
          const nextPageLink = await page.$("a.nextp");
          if (nextPageLink && currentPage < maxPages) {
            logger.info("📄 Moving to next page...");
            await nextPageLink.click();
            await page.waitForLoadState("domcontentloaded");
            await page.waitForTimeout(2000); // Wait for content to load
            currentPage++;
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        // One keyword failing (timeout, navigation error) should not stop
        // the remaining searches.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      } finally {
        await page.close();
      }
    }

    logger.success(`\n✅ Parsing complete!`);
    logger.info(`📊 Total jobs found: ${results.length}`);
    logger.info(`❌ Rejected jobs: ${rejectedResults.length}`);

    // Run AI analysis if enabled
    let aiAnalysis = null;
    if (enableAI && results.length > 0) {
      logger.step("Running AI analysis on job listings...");

      const aiAvailable = await checkOllamaStatus();
      if (aiAvailable) {
        const analysisData = results.map((job) => ({
          text: `${job.title} at ${job.company}. ${job.description}`,
          metadata: {
            jobId: job.jobId,
            company: job.company,
            daysAgo: job.daysAgo,
          },
        }));

        aiAnalysis = await analyzeBatch(analysisData, aiContext);

        // Merge AI analysis with results (positional: analysisData was built
        // from `results` in order, so index i corresponds to results[i])
        results.forEach((job, index) => {
          if (aiAnalysis && aiAnalysis[index]) {
            job.aiAnalysis = {
              isRelevant: aiAnalysis[index].isRelevant,
              confidence: aiAnalysis[index].confidence,
              reasoning: aiAnalysis[index].reasoning,
            };
          }
        });

        logger.success("✅ AI analysis completed");
      } else {
        logger.warning("⚠️ AI not available - skipping analysis");
      }
    }

    return {
      results,
      rejectedResults,
      metadata: {
        source: "skipthedrive",
        totalJobs: results.length,
        rejectedJobs: rejectedResults.length,
        keywords: keywords,
        jobTypes: jobTypes,
        locationFilter: locationFilter,
        aiAnalysisEnabled: enableAI,
        aiAnalysisCompleted: !!aiAnalysis,
        timestamp: new Date().toISOString(),
      },
    };
  } catch (error) {
    logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
    throw error;
  } finally {
    // Always release the browser, even on fatal errors.
    await browser.close();
  }
}
|
||||
|
||||
// Export the parser along with its URL-builder and extractor helpers.
module.exports = {
  parseSkipTheDrive,
  buildSearchUrl,
  extractJobData,
};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user