Refactor text utilities for improved clarity and maintainability
- Cleaned up and organized text processing utilities in `text-utils.js` for better readability and reuse.
- Ensured consistent formatting and documentation across utility functions.
- No functional changes were made; the focus was on code structure and clarity.
This commit is contained in:
parent
673f84d388
commit
691d61aaee
@ -1,146 +1,146 @@
|
|||||||
/**
|
/**
|
||||||
* Text processing utilities for cleaning and validating content
|
* Text processing utilities for cleaning and validating content
|
||||||
* Extracted from linkedout.js for reuse across parsers
|
* Extracted from linkedout.js for reuse across parsers
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
 * Clean text by removing hashtags, URLs, emojis, and normalizing whitespace.
 *
 * @param {string} text - Raw text to clean
 * @returns {string} The cleaned text, or "" when the input is empty or not a string
 */
function cleanText(text) {
  if (!text || typeof text !== "string") {
    return "";
  }

  return (
    text
      // Remove hashtags
      .replace(/#\w+/g, "")
      // Remove hashtag mentions. The compound "hashtag-foo" form must go first:
      // stripping the bare word first would leave a dangling "-foo" behind.
      .replace(/hashtag-\w+/gi, "")
      .replace(/\bhashtag\b/gi, "")
      // Remove URLs
      .replace(/https?:\/\/[^\s]+/g, "")
      // Remove emojis (Unicode ranges for common emoji)
      .replace(
        /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
        ""
      )
      // Normalize whitespace
      .replace(/\s+/g, " ")
      .trim()
  );
}
|
||||||
|
|
||||||
/**
 * Check if text contains any of the specified keywords (case insensitive).
 *
 * Keywords that are not strings, or that are empty/whitespace-only, are
 * ignored: an empty keyword would otherwise match every text, because
 * `"abc".includes("")` is always true.
 *
 * @param {string} text - Text to search in
 * @param {Array<string>} keywords - Candidate keywords
 * @returns {boolean} True if at least one usable keyword occurs in the text
 */
function containsAnyKeyword(text, keywords) {
  if (!text || typeof text !== "string" || !Array.isArray(keywords)) {
    return false;
  }

  const haystack = text.toLowerCase();
  return keywords.some(
    (keyword) =>
      typeof keyword === "string" &&
      keyword.trim() !== "" &&
      haystack.includes(keyword.toLowerCase())
  );
}
|
||||||
|
|
||||||
/**
 * Check if text contains all of the specified keywords (case insensitive).
 *
 * Non-string entries in `keywords` are skipped instead of throwing. An empty
 * keyword list (or one with no usable entries) is vacuously satisfied.
 *
 * @param {string} text - Text to search in
 * @param {Array<string>} keywords - Keywords that must all be present
 * @returns {boolean} True if every usable keyword occurs in the text
 */
function containsAllKeywords(text, keywords) {
  if (!text || typeof text !== "string" || !Array.isArray(keywords)) {
    return false;
  }

  const haystack = text.toLowerCase();
  return keywords.every(
    (keyword) =>
      typeof keyword !== "string" || haystack.includes(keyword.toLowerCase())
  );
}
|
||||||
|
|
||||||
/**
 * Check if text matches keyword groups with AND logic between groups and OR logic within groups.
 *
 * Keywords are trimmed and compared case-insensitively. Entries that are not
 * strings, or that are blank after trimming, never count as a match: a blank
 * keyword would otherwise satisfy its group for any text.
 *
 * @param {string} text - Text to search in
 * @param {Array<Array<string>>} keywordGroups - Array of keyword groups, each group is an array of OR keywords
 * @returns {boolean} - True if text matches all groups (AND logic) and at least one keyword in each group (OR logic)
 */
function matchesKeywordGroups(text, keywordGroups) {
  if (
    !text ||
    typeof text !== "string" ||
    !Array.isArray(keywordGroups) ||
    keywordGroups.length === 0
  ) {
    return false;
  }

  const haystack = text.toLowerCase();

  // At least one keyword in the group must match (OR logic)
  const groupMatches = (group) =>
    Array.isArray(group) &&
    group.some((keyword) => {
      if (typeof keyword !== "string") {
        return false;
      }
      const needle = keyword.toLowerCase().trim();
      return needle !== "" && haystack.includes(needle);
    });

  // All groups must match (AND logic)
  return keywordGroups.every(groupMatches);
}
|
||||||
|
|
||||||
/**
 * Validate if text meets basic quality criteria.
 *
 * The text must be a non-empty string of at least `minLength` UTF-16 code
 * units and contain at least one letter or digit in any script.
 *
 * @param {string} text - Text to validate
 * @param {number} [minLength=30] - Minimum accepted `text.length`
 * @returns {boolean} True if the text passes all checks
 */
function isValidText(text, minLength = 30) {
  if (!text || typeof text !== "string") {
    return false;
  }

  // Check minimum length
  if (text.length < minLength) {
    return false;
  }

  // Check for at least one letter or number. Unicode property escapes are
  // used so that non-Latin text (Cyrillic, CJK, ...) is not rejected.
  return /[\p{L}\p{N}]/u.test(text);
}
|
||||||
|
|
||||||
/**
 * Extract domain from URL.
 *
 * @param {string} url - Absolute URL to inspect
 * @returns {string|null} The URL's hostname, or null when the input is not a
 *   string, cannot be parsed as a URL, or has no host (e.g. `mailto:` URLs)
 */
function extractDomain(url) {
  if (!url || typeof url !== "string") {
    return null;
  }

  try {
    const { hostname } = new URL(url);
    // Host-less schemes parse fine but yield an empty hostname; report them
    // as "no domain" instead of returning an empty string.
    return hostname || null;
  } catch {
    return null;
  }
}
|
||||||
|
|
||||||
/**
 * Normalize URL by removing query parameters and fragments.
 *
 * @param {string} url - URL to normalize
 * @returns {string} `protocol//host/path` for parseable URLs, the input
 *   unchanged when it cannot be parsed, or "" when it is empty or not a string
 */
function normalizeUrl(url) {
  if (!url || typeof url !== "string") {
    return "";
  }

  try {
    const { protocol, host, pathname } = new URL(url);
    // `host` (unlike `hostname`) keeps an explicit non-default port, so
    // "http://localhost:3000/x?y" does not collapse to "http://localhost/x".
    return `${protocol}//${host}${pathname}`;
  } catch {
    return url;
  }
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
cleanText,
|
cleanText,
|
||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
containsAllKeywords,
|
containsAllKeywords,
|
||||||
matchesKeywordGroups,
|
matchesKeywordGroups,
|
||||||
isValidText,
|
isValidText,
|
||||||
extractDomain,
|
extractDomain,
|
||||||
normalizeUrl,
|
normalizeUrl,
|
||||||
};
|
};
|
||||||
|
|||||||
@ -1,345 +1,345 @@
|
|||||||
/**
|
/**
|
||||||
* SkipTheDrive Job Parser
|
* SkipTheDrive Job Parser
|
||||||
*
|
*
|
||||||
* Parses remote job listings from SkipTheDrive.com
|
* Parses remote job listings from SkipTheDrive.com
|
||||||
* Supports keyword search, job type filters, and pagination
|
* Supports keyword search, job type filters, and pagination
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const { chromium } = require("playwright");
|
const { chromium } = require("playwright");
|
||||||
const path = require("path");
|
const path = require("path");
|
||||||
|
|
||||||
// Import from ai-analyzer core package
|
// Import from ai-analyzer core package
|
||||||
const {
|
const {
|
||||||
logger,
|
logger,
|
||||||
cleanText,
|
cleanText,
|
||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
containsAllKeywords,
|
containsAllKeywords,
|
||||||
parseLocationFilters,
|
parseLocationFilters,
|
||||||
validateLocationAgainstFilters,
|
validateLocationAgainstFilters,
|
||||||
extractLocationFromProfile,
|
extractLocationFromProfile,
|
||||||
analyzeBatch,
|
analyzeBatch,
|
||||||
checkOllamaStatus,
|
checkOllamaStatus,
|
||||||
} = require("../../ai-analyzer");
|
} = require("../../ai-analyzer");
|
||||||
|
|
||||||
/**
 * Build search URL for SkipTheDrive.
 *
 * Every dynamic value is percent-encoded with `encodeURIComponent`, so a
 * keyword, sort order or job type can never inject extra query parameters.
 * Blank or missing job types are skipped rather than emitting an empty
 * `jobtype=` parameter.
 *
 * @param {string} keyword - Search keyword
 * @param {string} orderBy - Sort order (date, relevance); falsy omits the parameter
 * @param {Array<string>} jobTypes - Job types to filter (part time, full time, contract)
 * @returns {string} - Formatted search URL
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  const params = [`s=${encodeURIComponent(keyword)}`];

  if (orderBy) {
    params.push(`orderby=${encodeURIComponent(orderBy)}`);
  }

  // Add job type filters. A non-array value (e.g. an explicit null) is
  // treated as "no filters" instead of throwing.
  const types = Array.isArray(jobTypes) ? jobTypes : [];
  for (const type of types) {
    const value = type == null ? "" : String(type).trim();
    if (value !== "") {
      params.push(`jobtype=${encodeURIComponent(value)}`);
    }
  }

  return `https://www.skipthedrive.com/?${params.join("&")}`;
}
|
||||||
|
|
||||||
/**
 * Matches the first whitespace-delimited token of a field plus the whitespace
 * around it. The company and "days ago" fields are stripped with it to drop
 * their leading icon glyph.
 * NOTE(review): if the markup ever omits the icon, this removes the first
 * word of the value instead - confirm against the live page.
 */
const LEADING_ICON_PATTERN = /^\s*[^\s]+\s*/;

/**
 * Read the text content of an element handle.
 * @param {Object|null} handle - Element handle, or null when the element is missing
 * @returns {Promise<string>} The text, or "" when the handle or its text is null
 */
async function readHandleText(handle) {
  if (!handle) {
    return "";
  }
  // textContent() may resolve to null; normalise so callers can safely call
  // string methods on the result.
  return (await handle.textContent()) ?? "";
}

/**
 * Extract job data from a single job listing element.
 *
 * Missing child elements yield empty strings rather than errors. Any
 * unexpected failure is logged and reported as `null` so the caller can skip
 * the listing.
 *
 * @param {Object} article - Handle for the job listing element; must provide
 *   async `$(selector)` and `getAttribute(name)` (a Playwright ElementHandle
 *   in this parser)
 * @returns {Promise<Object|null>} - Extracted job data, or null on failure
 */
async function extractJobData(article) {
  try {
    // Extract job title and URL
    const titleElement = await article.$("h2.post-title a");
    const title = await readHandleText(titleElement);
    const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";

    // Extract date
    const dateElement = await article.$("time.post-date");
    const datePosted = dateElement
      ? await dateElement.getAttribute("datetime")
      : "";
    const dateText = await readHandleText(dateElement);

    // Extract company name
    const companyElement = await article.$(
      ".custom_fields_company_name_display_search_results"
    );
    const company = (await readHandleText(companyElement))
      .replace(LEADING_ICON_PATTERN, "")
      .trim();

    // Extract days ago
    const daysAgoElement = await article.$(
      ".custom_fields_job_date_display_search_results"
    );
    const daysAgo = (await readHandleText(daysAgoElement))
      .replace(LEADING_ICON_PATTERN, "")
      .trim();

    // Extract job description excerpt
    const excerptElement = await article.$(".excerpt_part");
    const description = await readHandleText(excerptElement);

    // Check if featured/sponsored
    const featuredElement = await article.$(".custom_fields_sponsored_job");
    const isFeatured = Boolean(featuredElement);

    // Extract job ID from article ID
    const articleId = await article.getAttribute("id");
    const jobId = articleId ? articleId.replace("post-", "") : "";

    return {
      jobId,
      title: cleanText(title),
      company: cleanText(company),
      jobUrl,
      datePosted,
      dateText: cleanText(dateText),
      daysAgo: cleanText(daysAgo),
      description: cleanText(description),
      isFeatured,
      source: "skipthedrive",
      timestamp: new Date().toISOString(),
    };
  } catch (error) {
    logger.error(`Error extracting job data: ${error.message}`);
    return null;
  }
}
|
||||||
|
|
||||||
/**
 * Parse SkipTheDrive job listings.
 *
 * Launches a Chromium browser, runs one search per keyword (or a single
 * combined search when `useAndLogic` is set), walks up to `maxPages` result
 * pages per search, filters listings by keyword and optional location, and
 * optionally runs AI analysis on the accepted listings.
 *
 * Options not passed explicitly fall back to environment variables:
 * SEARCH_KEYWORDS, JOB_TYPES, LOCATION_FILTER, MAX_PAGES, HEADLESS,
 * ENABLE_AI_ANALYSIS and AI_CONTEXT.
 *
 * @param {Object} options - Parser options
 * @param {Array<string>} [options.keywords] - Search keywords
 * @param {Array<string>} [options.jobTypes] - Job type filters
 * @param {string} [options.locationFilter] - Location filter expression
 * @param {number} [options.maxPages] - Maximum result pages per search
 * @param {boolean} [options.headless] - Run the browser headless
 * @param {boolean} [options.enableAI] - Run AI analysis on accepted jobs
 * @param {string} [options.aiContext] - Context string passed to the analyzer
 * @param {boolean} [options.useAndLogic=false] - Require all keywords instead of any
 * @returns {Promise<{results: Array, rejectedResults: Array, metadata: Object}>}
 *   Accepted jobs, rejected jobs (each with a `reason`), and run metadata
 * @throws Rethrows any error raised outside the per-keyword processing, after logging it
 */
async function parseSkipTheDrive(options = {}) {
  const {
    keywords = splitEnvList(process.env.SEARCH_KEYWORDS) ?? [
      "software engineer",
      "developer",
    ],
    jobTypes = splitEnvList(process.env.JOB_TYPES) ?? [],
    locationFilter = process.env.LOCATION_FILTER || "",
    maxPages = Number.parseInt(process.env.MAX_PAGES, 10) || 5,
    headless = process.env.HEADLESS !== "false",
    enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
    aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
    useAndLogic = false, // Use AND logic instead of OR logic for keywords
  } = options;

  logger.step("Starting SkipTheDrive parser...");
  logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
  logger.info(
    `🔗 Keyword Logic: ${
      useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"
    }`
  );
  logger.info(
    `📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
  );
  logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
  logger.info(`📄 Max Pages: ${maxPages}`);

  const browser = await chromium.launch({
    headless,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
    ],
  });

  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set();

  try {
    // Created inside the try so that a failure here still reaches the
    // finally block and closes the browser.
    const context = await browser.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    });

    // The filter expression does not change between jobs; parse it once.
    const locationFilters = locationFilter
      ? parseLocationFilters(locationFilter)
      : [];

    // For AND logic, combine all keywords into a single search query
    // For OR logic, search each keyword separately
    const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords;

    // Search for each keyword (or combined keyword for AND logic)
    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);

      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
      const page = await context.newPage();

      try {
        logger.info(
          `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}`
        );
        await page.goto(searchUrl, {
          waitUntil: "domcontentloaded",
          timeout: 30000,
        });
        logger.info(
          `Navigation completed successfully at ${new Date().toISOString()}`
        );

        // Wait for job listings to load. A timeout is only logged: the page
        // is still scanned below and simply yields no articles.
        logger.info("Waiting for selector #loops-wrapper");
        await page
          .waitForSelector("#loops-wrapper", { timeout: 5000 })
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
          });
        logger.info("Selector wait completed");

        let currentPage = 1;
        let hasNextPage = true;

        while (hasNextPage && currentPage <= maxPages) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);

          // Extract all job articles on current page
          const jobArticles = await page.$$("article[id^='post-']");
          logger.info(
            `Found ${jobArticles.length} job listings on page ${currentPage}`
          );

          for (const article of jobArticles) {
            const jobData = await extractJobData(article);
            if (!jobData) {
              continue;
            }

            // Deduplicate across keywords/pages. Fall back to the URL when the
            // listing has no id; with neither, the listing cannot be
            // identified and is kept rather than collapsed onto one key.
            const dedupeKey = jobData.jobId || jobData.jobUrl;
            if (dedupeKey) {
              if (seenJobs.has(dedupeKey)) {
                continue;
              }
              seenJobs.add(dedupeKey);
            }

            // Add keyword that found this job
            jobData.searchKeyword = keyword;

            // Validate job against keywords
            const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
            const keywordMatch = useAndLogic
              ? containsAllKeywords(fullText, keywords)
              : containsAnyKeyword(fullText, keywords);

            if (!keywordMatch) {
              rejectedResults.push({
                ...jobData,
                rejected: true,
                reason: useAndLogic
                  ? "Not all keywords found in job listing"
                  : "Keywords not found in job listing",
              });
              continue;
            }

            // Location validation (if enabled)
            if (locationFilter) {
              // For SkipTheDrive, most jobs are remote, but we can check the title/description
              const lowerFullText = fullText.toLowerCase();
              const locationValid =
                lowerFullText.includes("remote") ||
                locationFilters.some((filter) =>
                  lowerFullText.includes(filter.toLowerCase())
                );

              if (!locationValid) {
                rejectedResults.push({
                  ...jobData,
                  rejected: true,
                  reason: "Location requirements not met",
                });
                continue;
              }

              jobData.locationValid = locationValid;
            }

            logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`);
            results.push(jobData);
          }

          // Check for next page
          const nextPageLink = await page.$("a.nextp");
          if (nextPageLink && currentPage < maxPages) {
            logger.info("📄 Moving to next page...");
            await nextPageLink.click();
            await page.waitForLoadState("domcontentloaded");
            await page.waitForTimeout(2000); // Wait for content to load
            currentPage++;
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        // A failure for one keyword must not abort the remaining searches.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      } finally {
        await page.close();
      }
    }

    logger.success(`\n✅ Parsing complete!`);
    logger.info(`📊 Total jobs found: ${results.length}`);
    logger.info(`❌ Rejected jobs: ${rejectedResults.length}`);

    // Run AI analysis if enabled
    let aiAnalysis = null;
    if (enableAI && results.length > 0) {
      logger.step("Running AI analysis on job listings...");

      const aiAvailable = await checkOllamaStatus();
      if (aiAvailable) {
        const analysisData = results.map((job) => ({
          text: `${job.title} at ${job.company}. ${job.description}`,
          metadata: {
            jobId: job.jobId,
            company: job.company,
            daysAgo: job.daysAgo,
          },
        }));

        aiAnalysis = await analyzeBatch(analysisData, aiContext);

        // Merge AI analysis with results. Entries are paired by index, so
        // this relies on analyzeBatch preserving input order.
        results.forEach((job, index) => {
          const analysis = aiAnalysis && aiAnalysis[index];
          if (analysis) {
            job.aiAnalysis = {
              isRelevant: analysis.isRelevant,
              confidence: analysis.confidence,
              reasoning: analysis.reasoning,
            };
          }
        });

        logger.success("✅ AI analysis completed");
      } else {
        logger.warning("⚠️ AI not available - skipping analysis");
      }
    }

    return {
      results,
      rejectedResults,
      metadata: {
        source: "skipthedrive",
        totalJobs: results.length,
        rejectedJobs: rejectedResults.length,
        keywords: keywords,
        jobTypes: jobTypes,
        locationFilter: locationFilter,
        aiAnalysisEnabled: enableAI,
        aiAnalysisCompleted: !!aiAnalysis,
        timestamp: new Date().toISOString(),
      },
    };
  } catch (error) {
    logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
    throw error;
  } finally {
    await browser.close();
  }
}

/**
 * Split a comma-separated environment value into trimmed, non-blank entries.
 * @param {string|undefined} value - Raw environment variable value
 * @returns {Array<string>|null} The entries, or null when the variable is
 *   unset or contains no non-blank entry (so callers can apply a default)
 */
function splitEnvList(value) {
  if (typeof value !== "string") {
    return null;
  }
  const entries = value
    .split(",")
    .map((entry) => entry.trim())
    .filter((entry) => entry !== "");
  return entries.length > 0 ? entries : null;
}
|
||||||
|
|
||||||
// Export the parser
|
// Export the parser
|
||||||
module.exports = {
|
module.exports = {
|
||||||
parseSkipTheDrive,
|
parseSkipTheDrive,
|
||||||
buildSearchUrl,
|
buildSearchUrl,
|
||||||
extractJobData,
|
extractJobData,
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user