Enhance job search parser with LinkedIn strategy and configuration updates
- Added a LinkedIn jobs parsing strategy to support job extraction from LinkedIn.
- Updated the job search parser to include the new site strategy, and improved argument parsing for max pages and for excluding rejected results.
- Enhanced the README documentation to reflect the new features and usage examples.
- Refactored existing strategies for consistency and improved error handling.
This commit is contained in:
parent
bbfd3c84aa
commit
4099b23744
@ -62,3 +62,5 @@ class CoreParser {
|
|||||||
|
|
||||||
module.exports = CoreParser;
|
module.exports = CoreParser;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -10,6 +10,7 @@ const path = require("path");
|
|||||||
const fs = require("fs");
|
const fs = require("fs");
|
||||||
const CoreParser = require("../core-parser");
|
const CoreParser = require("../core-parser");
|
||||||
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
|
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
|
||||||
|
const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
|
||||||
const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer");
|
const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer");
|
||||||
|
|
||||||
// Load environment variables
|
// Load environment variables
|
||||||
@ -18,14 +19,16 @@ require("dotenv").config({ path: path.join(__dirname, ".env") });
|
|||||||
// Configuration from environment
|
// Configuration from environment
|
||||||
const HEADLESS = process.env.HEADLESS !== "false";
|
const HEADLESS = process.env.HEADLESS !== "false";
|
||||||
const SEARCH_KEYWORDS =
|
const SEARCH_KEYWORDS =
|
||||||
process.env.SEARCH_KEYWORDS || "software engineer,developer,programmer";
|
process.env.SEARCH_KEYWORDS || "co-op,intern";//"software engineer,developer,programmer";
|
||||||
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
||||||
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
|
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
|
||||||
const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
|
const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
|
||||||
|
const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
|
||||||
|
|
||||||
// Available site strategies
|
// Available site strategies
|
||||||
const SITE_STRATEGIES = {
|
const SITE_STRATEGIES = {
|
||||||
skipthedrive: skipthedriveStrategy,
|
skipthedrive: skipthedriveStrategy,
|
||||||
|
linkedin: linkedinJobsStrategy,
|
||||||
// Add more site strategies here
|
// Add more site strategies here
|
||||||
// indeed: indeedStrategy,
|
// indeed: indeedStrategy,
|
||||||
// glassdoor: glassdoorStrategy,
|
// glassdoor: glassdoorStrategy,
|
||||||
@ -41,6 +44,7 @@ function parseArguments() {
|
|||||||
keywords: null,
|
keywords: null,
|
||||||
locationFilter: null,
|
locationFilter: null,
|
||||||
maxPages: MAX_PAGES,
|
maxPages: MAX_PAGES,
|
||||||
|
excludeRejected: EXCLUDE_REJECTED,
|
||||||
};
|
};
|
||||||
|
|
||||||
args.forEach((arg) => {
|
args.forEach((arg) => {
|
||||||
@ -57,7 +61,15 @@ function parseArguments() {
|
|||||||
} else if (arg.startsWith("--location=")) {
|
} else if (arg.startsWith("--location=")) {
|
||||||
options.locationFilter = arg.split("=")[1];
|
options.locationFilter = arg.split("=")[1];
|
||||||
} else if (arg.startsWith("--max-pages=")) {
|
} else if (arg.startsWith("--max-pages=")) {
|
||||||
options.maxPages = parseInt(arg.split("=")[1]) || MAX_PAGES;
|
const value = arg.split("=")[1];
|
||||||
|
// Support "all" or "0" to mean unlimited pages
|
||||||
|
if (value === "all" || value === "0") {
|
||||||
|
options.maxPages = 0; // 0 means unlimited
|
||||||
|
} else {
|
||||||
|
options.maxPages = parseInt(value) || MAX_PAGES;
|
||||||
|
}
|
||||||
|
} else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
|
||||||
|
options.excludeRejected = true;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -84,6 +96,7 @@ async function startJobSearchParser(options = {}) {
|
|||||||
finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
|
finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
|
||||||
const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
|
const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
|
||||||
const sites = finalOptions.sites;
|
const sites = finalOptions.sites;
|
||||||
|
const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED;
|
||||||
|
|
||||||
logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
|
logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
|
||||||
logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
|
logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
|
||||||
@ -108,18 +121,46 @@ async function startJobSearchParser(options = {}) {
|
|||||||
logger.step(`\n🌐 Parsing ${site}...`);
|
logger.step(`\n🌐 Parsing ${site}...`);
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
|
|
||||||
const parseResult = await strategy(coreParser, {
|
// Prepare strategy options
|
||||||
|
const strategyOptions = {
|
||||||
keywords,
|
keywords,
|
||||||
locationFilter,
|
locationFilter,
|
||||||
maxPages: finalOptions.maxPages,
|
maxPages: finalOptions.maxPages,
|
||||||
});
|
};
|
||||||
|
|
||||||
|
// Add credentials for LinkedIn
|
||||||
|
if (site === "linkedin") {
|
||||||
|
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
|
||||||
|
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
|
||||||
|
|
||||||
|
if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
|
||||||
|
logger.error(`❌ LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`);
|
||||||
|
siteResults[site] = {
|
||||||
|
count: 0,
|
||||||
|
rejected: 0,
|
||||||
|
duration: "0s",
|
||||||
|
error: "LinkedIn credentials not found",
|
||||||
|
};
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
strategyOptions.credentials = {
|
||||||
|
username: LINKEDIN_USERNAME,
|
||||||
|
password: LINKEDIN_PASSWORD,
|
||||||
|
};
|
||||||
|
strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
|
||||||
|
}
|
||||||
|
|
||||||
|
const parseResult = await strategy(coreParser, strategyOptions);
|
||||||
|
|
||||||
const { results, rejectedResults, summary } = parseResult;
|
const { results, rejectedResults, summary } = parseResult;
|
||||||
const duration = ((Date.now() - startTime) / 1000).toFixed(2);
|
const duration = ((Date.now() - startTime) / 1000).toFixed(2);
|
||||||
|
|
||||||
// Collect results
|
// Collect results
|
||||||
|
logger.info(`📦 Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`);
|
||||||
allResults.push(...results);
|
allResults.push(...results);
|
||||||
allRejectedResults.push(...rejectedResults);
|
allRejectedResults.push(...rejectedResults);
|
||||||
|
logger.info(`📦 Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`);
|
||||||
|
|
||||||
siteResults[site] = {
|
siteResults[site] = {
|
||||||
count: results.length,
|
count: results.length,
|
||||||
@ -162,6 +203,9 @@ async function startJobSearchParser(options = {}) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Save results
|
// Save results
|
||||||
|
logger.info(`💾 Preparing to save: ${allResults.length} results, ${allRejectedResults.length} rejected`);
|
||||||
|
logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`);
|
||||||
|
|
||||||
const outputData = {
|
const outputData = {
|
||||||
metadata: {
|
metadata: {
|
||||||
extractedAt: new Date().toISOString(),
|
extractedAt: new Date().toISOString(),
|
||||||
@ -171,11 +215,21 @@ async function startJobSearchParser(options = {}) {
|
|||||||
keywords: keywords.join(", "),
|
keywords: keywords.join(", "),
|
||||||
locationFilter,
|
locationFilter,
|
||||||
analysisResults,
|
analysisResults,
|
||||||
|
rejectedJobsExcluded: excludeRejected,
|
||||||
},
|
},
|
||||||
results: allResults,
|
results: allResults,
|
||||||
rejectedResults: allRejectedResults,
|
|
||||||
siteResults,
|
siteResults,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Always include rejectedResults if not excluded (make it explicit, not using spread)
|
||||||
|
if (!excludeRejected) {
|
||||||
|
outputData.rejectedResults = allRejectedResults;
|
||||||
|
logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`);
|
||||||
|
} else {
|
||||||
|
logger.info(`⏭️ Excluding rejected results (EXCLUDE_REJECTED=true)`);
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(`💾 Final output: ${outputData.results.length} results, ${outputData.rejectedResults?.length || 0} rejected`);
|
||||||
|
|
||||||
const resultsDir = path.join(__dirname, "results");
|
const resultsDir = path.join(__dirname, "results");
|
||||||
if (!fs.existsSync(resultsDir)) {
|
if (!fs.existsSync(resultsDir)) {
|
||||||
|
|||||||
1360
job-search-parser/strategies/linkedin-jobs-strategy.js
Normal file
1360
job-search-parser/strategies/linkedin-jobs-strategy.js
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,302 +1,299 @@
|
|||||||
/**
|
/**
|
||||||
* SkipTheDrive Parsing Strategy
|
* SkipTheDrive Parsing Strategy
|
||||||
*
|
*
|
||||||
* Uses core-parser for browser management and ai-analyzer for utilities
|
* Uses core-parser for browser management and ai-analyzer for utilities
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const {
|
const {
|
||||||
logger,
|
logger,
|
||||||
cleanText,
|
cleanText,
|
||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
validateLocationAgainstFilters,
|
validateLocationAgainstFilters,
|
||||||
} = require("ai-analyzer");
|
} = require("ai-analyzer");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* SkipTheDrive URL builder
|
* SkipTheDrive URL builder
|
||||||
*/
|
*/
|
||||||
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
|
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
|
||||||
const baseUrl = "https://www.skipthedrive.com/";
|
const baseUrl = "https://www.skipthedrive.com/";
|
||||||
const params = new URLSearchParams({
|
const params = new URLSearchParams({
|
||||||
s: keyword,
|
s: keyword,
|
||||||
orderby: orderBy,
|
orderby: orderBy,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (jobTypes && jobTypes.length > 0) {
|
if (jobTypes && jobTypes.length > 0) {
|
||||||
params.append("job_type", jobTypes.join(","));
|
params.append("job_type", jobTypes.join(","));
|
||||||
}
|
}
|
||||||
|
|
||||||
return `${baseUrl}?${params.toString()}`;
|
return `${baseUrl}?${params.toString()}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* SkipTheDrive parsing strategy function
|
* SkipTheDrive parsing strategy function
|
||||||
*/
|
*/
|
||||||
async function skipthedriveStrategy(coreParser, options = {}) {
|
async function skipthedriveStrategy(coreParser, options = {}) {
|
||||||
const {
|
const {
|
||||||
keywords = ["software engineer", "developer", "programmer"],
|
keywords = ["software engineer", "developer", "programmer"],
|
||||||
locationFilter = null,
|
locationFilter = null,
|
||||||
maxPages = 5,
|
maxPages = 5,
|
||||||
jobTypes = [],
|
jobTypes = [],
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
const results = [];
|
const results = [];
|
||||||
const rejectedResults = [];
|
const rejectedResults = [];
|
||||||
const seenJobs = new Set();
|
const seenJobs = new Set();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Create main page
|
// Create main page
|
||||||
const page = await coreParser.createPage("skipthedrive-main");
|
const page = await coreParser.createPage("skipthedrive-main");
|
||||||
|
|
||||||
logger.info("🚀 Starting SkipTheDrive parser...");
|
logger.info("🚀 Starting SkipTheDrive parser...");
|
||||||
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
||||||
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
||||||
logger.info(`📄 Max Pages: ${maxPages}`);
|
logger.info(`📄 Max Pages: ${maxPages}`);
|
||||||
|
|
||||||
// Search for each keyword
|
// Search for each keyword
|
||||||
for (const keyword of keywords) {
|
for (const keyword of keywords) {
|
||||||
logger.info(`\n🔍 Searching for: ${keyword}`);
|
logger.info(`\n🔍 Searching for: ${keyword}`);
|
||||||
|
|
||||||
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
|
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Navigate to search results
|
// Navigate to search results
|
||||||
await coreParser.navigateTo(searchUrl, {
|
await coreParser.navigateTo(searchUrl, {
|
||||||
pageId: "skipthedrive-main",
|
pageId: "skipthedrive-main",
|
||||||
retries: 2,
|
retries: 2,
|
||||||
timeout: 30000,
|
timeout: 30000,
|
||||||
});
|
});
|
||||||
|
|
||||||
// Wait for job listings to load
|
// Wait for job listings to load
|
||||||
const hasResults = await coreParser
|
const hasResults = await page
|
||||||
.waitForSelector(
|
.waitForSelector("#loops-wrapper", {
|
||||||
"#loops-wrapper",
|
timeout: 5000,
|
||||||
{
|
})
|
||||||
timeout: 5000,
|
.then(() => true)
|
||||||
},
|
.catch(() => {
|
||||||
"skipthedrive-main"
|
logger.warning(`No results found for keyword: ${keyword}`);
|
||||||
)
|
return false;
|
||||||
.catch(() => {
|
});
|
||||||
logger.warning(`No results found for keyword: ${keyword}`);
|
|
||||||
return false;
|
if (!hasResults) {
|
||||||
});
|
continue;
|
||||||
|
}
|
||||||
if (!hasResults) {
|
|
||||||
continue;
|
// Process multiple pages
|
||||||
}
|
let currentPage = 1;
|
||||||
|
let hasNextPage = true;
|
||||||
// Process multiple pages
|
|
||||||
let currentPage = 1;
|
while (hasNextPage && currentPage <= maxPages) {
|
||||||
let hasNextPage = true;
|
logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
|
||||||
|
|
||||||
while (hasNextPage && currentPage <= maxPages) {
|
// Extract jobs from current page
|
||||||
logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
|
const pageJobs = await extractJobsFromPage(
|
||||||
|
page,
|
||||||
// Extract jobs from current page
|
keyword,
|
||||||
const pageJobs = await extractJobsFromPage(
|
locationFilter
|
||||||
page,
|
);
|
||||||
keyword,
|
|
||||||
locationFilter
|
for (const job of pageJobs) {
|
||||||
);
|
// Skip duplicates
|
||||||
|
if (seenJobs.has(job.jobId)) continue;
|
||||||
for (const job of pageJobs) {
|
seenJobs.add(job.jobId);
|
||||||
// Skip duplicates
|
|
||||||
if (seenJobs.has(job.jobId)) continue;
|
// Validate location if filtering enabled
|
||||||
seenJobs.add(job.jobId);
|
if (locationFilter) {
|
||||||
|
const locationValid = validateLocationAgainstFilters(
|
||||||
// Validate location if filtering enabled
|
job.location,
|
||||||
if (locationFilter) {
|
locationFilter
|
||||||
const locationValid = validateLocationAgainstFilters(
|
);
|
||||||
job.location,
|
|
||||||
locationFilter
|
if (!locationValid) {
|
||||||
);
|
rejectedResults.push({
|
||||||
|
...job,
|
||||||
if (!locationValid) {
|
rejectionReason: "Location filter mismatch",
|
||||||
rejectedResults.push({
|
});
|
||||||
...job,
|
continue;
|
||||||
rejectionReason: "Location filter mismatch",
|
}
|
||||||
});
|
}
|
||||||
continue;
|
|
||||||
}
|
results.push(job);
|
||||||
}
|
}
|
||||||
|
|
||||||
results.push(job);
|
// Check for next page
|
||||||
}
|
hasNextPage = await hasNextPageAvailable(page);
|
||||||
|
if (hasNextPage && currentPage < maxPages) {
|
||||||
// Check for next page
|
await navigateToNextPage(page, currentPage + 1);
|
||||||
hasNextPage = await hasNextPageAvailable(page);
|
currentPage++;
|
||||||
if (hasNextPage && currentPage < maxPages) {
|
|
||||||
await navigateToNextPage(page, currentPage + 1);
|
// Wait for new page to load
|
||||||
currentPage++;
|
await page.waitForTimeout(2000);
|
||||||
|
} else {
|
||||||
// Wait for new page to load
|
hasNextPage = false;
|
||||||
await page.waitForTimeout(2000);
|
}
|
||||||
} else {
|
}
|
||||||
hasNextPage = false;
|
} catch (error) {
|
||||||
}
|
logger.error(`Error processing keyword "${keyword}": ${error.message}`);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
}
|
||||||
logger.error(`Error processing keyword "${keyword}": ${error.message}`);
|
|
||||||
}
|
logger.info(
|
||||||
}
|
`🎯 SkipTheDrive parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
|
||||||
|
);
|
||||||
logger.info(
|
|
||||||
`🎯 SkipTheDrive parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
|
return {
|
||||||
);
|
results,
|
||||||
|
rejectedResults,
|
||||||
return {
|
summary: {
|
||||||
results,
|
totalJobs: results.length,
|
||||||
rejectedResults,
|
totalRejected: rejectedResults.length,
|
||||||
summary: {
|
keywords: keywords.join(", "),
|
||||||
totalJobs: results.length,
|
locationFilter,
|
||||||
totalRejected: rejectedResults.length,
|
source: "skipthedrive",
|
||||||
keywords: keywords.join(", "),
|
},
|
||||||
locationFilter,
|
};
|
||||||
source: "skipthedrive",
|
} catch (error) {
|
||||||
},
|
logger.error(`❌ SkipTheDrive parsing failed: ${error.message}`);
|
||||||
};
|
throw error;
|
||||||
} catch (error) {
|
}
|
||||||
logger.error(`❌ SkipTheDrive parsing failed: ${error.message}`);
|
}
|
||||||
throw error;
|
|
||||||
}
|
/**
|
||||||
}
|
* Extract jobs from current page
|
||||||
|
*/
|
||||||
/**
|
async function extractJobsFromPage(page, keyword, locationFilter) {
|
||||||
* Extract jobs from current page
|
const jobs = [];
|
||||||
*/
|
|
||||||
async function extractJobsFromPage(page, keyword, locationFilter) {
|
try {
|
||||||
const jobs = [];
|
// Get all job article elements
|
||||||
|
const jobElements = await page.$$("article.job_listing");
|
||||||
try {
|
|
||||||
// Get all job article elements
|
for (const jobElement of jobElements) {
|
||||||
const jobElements = await page.$$("article.job_listing");
|
try {
|
||||||
|
const job = await extractJobData(jobElement, keyword);
|
||||||
for (const jobElement of jobElements) {
|
if (job) {
|
||||||
try {
|
jobs.push(job);
|
||||||
const job = await extractJobData(jobElement, keyword);
|
}
|
||||||
if (job) {
|
} catch (error) {
|
||||||
jobs.push(job);
|
logger.warning(`Failed to extract job data: ${error.message}`);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
}
|
||||||
logger.warning(`Failed to extract job data: ${error.message}`);
|
} catch (error) {
|
||||||
}
|
logger.error(`Failed to extract jobs from page: ${error.message}`);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
|
||||||
logger.error(`Failed to extract jobs from page: ${error.message}`);
|
return jobs;
|
||||||
}
|
}
|
||||||
|
|
||||||
return jobs;
|
/**
|
||||||
}
|
* Extract data from individual job element
|
||||||
|
*/
|
||||||
/**
|
async function extractJobData(jobElement, keyword) {
|
||||||
* Extract data from individual job element
|
try {
|
||||||
*/
|
// Extract job ID
|
||||||
async function extractJobData(jobElement, keyword) {
|
const articleId = (await jobElement.getAttribute("id")) || "";
|
||||||
try {
|
const jobId = articleId ? articleId.replace("post-", "") : "";
|
||||||
// Extract job ID
|
|
||||||
const articleId = (await jobElement.getAttribute("id")) || "";
|
// Extract title
|
||||||
const jobId = articleId ? articleId.replace("post-", "") : "";
|
const titleElement = await jobElement.$(".job_listing-title a");
|
||||||
|
const title = titleElement
|
||||||
// Extract title
|
? cleanText(await titleElement.textContent())
|
||||||
const titleElement = await jobElement.$(".job_listing-title a");
|
: "";
|
||||||
const title = titleElement
|
const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";
|
||||||
? cleanText(await titleElement.textContent())
|
|
||||||
: "";
|
// Extract company
|
||||||
const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";
|
const companyElement = await jobElement.$(".company");
|
||||||
|
const company = companyElement
|
||||||
// Extract company
|
? cleanText(await companyElement.textContent())
|
||||||
const companyElement = await jobElement.$(".company");
|
: "";
|
||||||
const company = companyElement
|
|
||||||
? cleanText(await companyElement.textContent())
|
// Extract location
|
||||||
: "";
|
const locationElement = await jobElement.$(".location");
|
||||||
|
const location = locationElement
|
||||||
// Extract location
|
? cleanText(await locationElement.textContent())
|
||||||
const locationElement = await jobElement.$(".location");
|
: "";
|
||||||
const location = locationElement
|
|
||||||
? cleanText(await locationElement.textContent())
|
// Extract date posted
|
||||||
: "";
|
const dateElement = await jobElement.$(".job-date");
|
||||||
|
const dateText = dateElement
|
||||||
// Extract date posted
|
? cleanText(await dateElement.textContent())
|
||||||
const dateElement = await jobElement.$(".job-date");
|
: "";
|
||||||
const dateText = dateElement
|
|
||||||
? cleanText(await dateElement.textContent())
|
// Extract description
|
||||||
: "";
|
const descElement = await jobElement.$(".job_listing-description");
|
||||||
|
const description = descElement
|
||||||
// Extract description
|
? cleanText(await descElement.textContent())
|
||||||
const descElement = await jobElement.$(".job_listing-description");
|
: "";
|
||||||
const description = descElement
|
|
||||||
? cleanText(await descElement.textContent())
|
// Check if featured
|
||||||
: "";
|
const featuredElement = await jobElement.$(".featured");
|
||||||
|
const isFeatured = featuredElement !== null;
|
||||||
// Check if featured
|
|
||||||
const featuredElement = await jobElement.$(".featured");
|
// Parse date
|
||||||
const isFeatured = featuredElement !== null;
|
let datePosted = null;
|
||||||
|
let daysAgo = null;
|
||||||
// Parse date
|
|
||||||
let datePosted = null;
|
if (dateText) {
|
||||||
let daysAgo = null;
|
const match = dateText.match(/(\d+)\s+days?\s+ago/);
|
||||||
|
if (match) {
|
||||||
if (dateText) {
|
daysAgo = parseInt(match[1]);
|
||||||
const match = dateText.match(/(\d+)\s+days?\s+ago/);
|
const date = new Date();
|
||||||
if (match) {
|
date.setDate(date.getDate() - daysAgo);
|
||||||
daysAgo = parseInt(match[1]);
|
datePosted = date.toISOString().split("T")[0];
|
||||||
const date = new Date();
|
}
|
||||||
date.setDate(date.getDate() - daysAgo);
|
}
|
||||||
datePosted = date.toISOString().split("T")[0];
|
|
||||||
}
|
return {
|
||||||
}
|
jobId,
|
||||||
|
title,
|
||||||
return {
|
company,
|
||||||
jobId,
|
location,
|
||||||
title,
|
jobUrl,
|
||||||
company,
|
datePosted,
|
||||||
location,
|
dateText,
|
||||||
jobUrl,
|
daysAgo,
|
||||||
datePosted,
|
description,
|
||||||
dateText,
|
isFeatured,
|
||||||
daysAgo,
|
keyword,
|
||||||
description,
|
extractedAt: new Date().toISOString(),
|
||||||
isFeatured,
|
source: "skipthedrive",
|
||||||
keyword,
|
};
|
||||||
extractedAt: new Date().toISOString(),
|
} catch (error) {
|
||||||
source: "skipthedrive",
|
logger.warning(`Error extracting job data: ${error.message}`);
|
||||||
};
|
return null;
|
||||||
} catch (error) {
|
}
|
||||||
logger.warning(`Error extracting job data: ${error.message}`);
|
}
|
||||||
return null;
|
|
||||||
}
|
/**
|
||||||
}
|
* Check if next page is available
|
||||||
|
*/
|
||||||
/**
|
async function hasNextPageAvailable(page) {
|
||||||
* Check if next page is available
|
try {
|
||||||
*/
|
const nextButton = await page.$(".next-page");
|
||||||
async function hasNextPageAvailable(page) {
|
return nextButton !== null;
|
||||||
try {
|
} catch {
|
||||||
const nextButton = await page.$(".next-page");
|
return false;
|
||||||
return nextButton !== null;
|
}
|
||||||
} catch {
|
}
|
||||||
return false;
|
|
||||||
}
|
/**
|
||||||
}
|
* Navigate to next page
|
||||||
|
*/
|
||||||
/**
|
async function navigateToNextPage(page, pageNumber) {
|
||||||
* Navigate to next page
|
try {
|
||||||
*/
|
const nextButton = await page.$(".next-page");
|
||||||
async function navigateToNextPage(page, pageNumber) {
|
if (nextButton) {
|
||||||
try {
|
await nextButton.click();
|
||||||
const nextButton = await page.$(".next-page");
|
}
|
||||||
if (nextButton) {
|
} catch (error) {
|
||||||
await nextButton.click();
|
logger.warning(
|
||||||
}
|
`Failed to navigate to page ${pageNumber}: ${error.message}`
|
||||||
} catch (error) {
|
);
|
||||||
logger.warning(
|
}
|
||||||
`Failed to navigate to page ${pageNumber}: ${error.message}`
|
}
|
||||||
);
|
|
||||||
}
|
module.exports = {
|
||||||
}
|
skipthedriveStrategy,
|
||||||
|
buildSearchUrl,
|
||||||
module.exports = {
|
extractJobsFromPage,
|
||||||
skipthedriveStrategy,
|
extractJobData,
|
||||||
buildSearchUrl,
|
};
|
||||||
extractJobsFromPage,
|
|
||||||
extractJobData,
|
|
||||||
};
|
|
||||||
|
|||||||
@ -31,12 +31,13 @@ const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
|
|||||||
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
|
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
|
||||||
const HEADLESS = process.env.HEADLESS !== "false";
|
const HEADLESS = process.env.HEADLESS !== "false";
|
||||||
const SEARCH_KEYWORDS =
|
const SEARCH_KEYWORDS =
|
||||||
process.env.SEARCH_KEYWORDS || "layoff,downsizing";//,job cuts";
|
process.env.SEARCH_KEYWORDS || "layoff";//,downsizing";//,job cuts";
|
||||||
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
||||||
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false";
|
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false";
|
||||||
const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends";
|
const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends";
|
||||||
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
|
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
|
||||||
const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50;
|
const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50;
|
||||||
|
const EXTRACT_LOCATION_FROM_PROFILE = process.env.EXTRACT_LOCATION_FROM_PROFILE === "true";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Main LinkedIn parser function
|
* Main LinkedIn parser function
|
||||||
@ -71,6 +72,7 @@ async function startLinkedInParser(options = {}) {
|
|||||||
keywords,
|
keywords,
|
||||||
locationFilter: LOCATION_FILTER,
|
locationFilter: LOCATION_FILTER,
|
||||||
maxResults: MAX_RESULTS,
|
maxResults: MAX_RESULTS,
|
||||||
|
extractLocationFromProfile: EXTRACT_LOCATION_FROM_PROFILE,
|
||||||
credentials: {
|
credentials: {
|
||||||
username: LINKEDIN_USERNAME,
|
username: LINKEDIN_USERNAME,
|
||||||
password: LINKEDIN_PASSWORD,
|
password: LINKEDIN_PASSWORD,
|
||||||
|
|||||||
@ -21,6 +21,7 @@ async function linkedinStrategy(coreParser, options = {}) {
|
|||||||
keywords = ["layoff", "downsizing", "job cuts"],
|
keywords = ["layoff", "downsizing", "job cuts"],
|
||||||
locationFilter = null,
|
locationFilter = null,
|
||||||
maxResults = 50,
|
maxResults = 50,
|
||||||
|
extractLocationFromProfile = false,
|
||||||
credentials = {},
|
credentials = {},
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
@ -106,7 +107,7 @@ async function linkedinStrategy(coreParser, options = {}) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Extract posts from current page
|
// Extract posts from current page
|
||||||
const posts = await extractPostsFromPage(page, keyword);
|
const posts = await extractPostsFromPage(page, keyword, extractLocationFromProfile);
|
||||||
logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);
|
logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);
|
||||||
|
|
||||||
for (const post of posts) {
|
for (const post of posts) {
|
||||||
@ -172,7 +173,7 @@ async function linkedinStrategy(coreParser, options = {}) {
|
|||||||
/**
|
/**
|
||||||
* Extract posts from current search results page
|
* Extract posts from current search results page
|
||||||
*/
|
*/
|
||||||
async function extractPostsFromPage(page, keyword) {
|
async function extractPostsFromPage(page, keyword, extractLocationFromProfile = false) {
|
||||||
const posts = [];
|
const posts = [];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -254,10 +255,26 @@ async function extractPostsFromPage(page, keyword) {
|
|||||||
|
|
||||||
const post = await extractPostData(postElements[i], keyword);
|
const post = await extractPostData(postElements[i], keyword);
|
||||||
if (post) {
|
if (post) {
|
||||||
|
// If location is missing and we're enabled to extract from profile, try to get it
|
||||||
|
if (!post.location && extractLocationFromProfile && post.authorUrl) {
|
||||||
|
try {
|
||||||
|
logger.debug(`📍 Location missing for post ${i + 1}, attempting to extract from profile...`);
|
||||||
|
const profileLocation = await extractLocationFromProfilePage(page, post.authorUrl);
|
||||||
|
if (profileLocation) {
|
||||||
|
post.location = profileLocation;
|
||||||
|
post.profileLocation = profileLocation;
|
||||||
|
logger.debug(`✅ Extracted location from profile: ${profileLocation}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.debug(`⚠️ Could not extract location from profile: ${error.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
posts.push(post);
|
posts.push(post);
|
||||||
const hasContent = post.content && post.content.length > 0;
|
const hasContent = post.content && post.content.length > 0;
|
||||||
const hasAuthor = post.authorName && post.authorName.length > 0;
|
const hasAuthor = post.authorName && post.authorName.length > 0;
|
||||||
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'})`);
|
const hasLocation = post.location && post.location.length > 0;
|
||||||
|
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'}, location: ${hasLocation ? 'yes' : 'no'})`);
|
||||||
} else {
|
} else {
|
||||||
logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
|
logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
|
||||||
}
|
}
|
||||||
@ -626,6 +643,42 @@ async function extractPostData(postElement, keyword) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Try to extract from data attributes or hidden elements
|
||||||
|
if (!data.location) {
|
||||||
|
// Check for data attributes that might contain location
|
||||||
|
const actorSection = el.querySelector(".feed-shared-actor");
|
||||||
|
if (actorSection) {
|
||||||
|
// Check all data attributes
|
||||||
|
for (const attr of actorSection.attributes) {
|
||||||
|
if (attr.name.startsWith("data-") && attr.value) {
|
||||||
|
const value = attr.value.toLowerCase();
|
||||||
|
// Look for location-like patterns in data attributes
|
||||||
|
if (/(ontario|alberta|british columbia|quebec|toronto|vancouver|calgary|ottawa|montreal)/i.test(value)) {
|
||||||
|
// Try to extract the actual location text
|
||||||
|
const locationMatch = attr.value.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
|
||||||
|
if (locationMatch) {
|
||||||
|
data.location = locationMatch[0];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for hidden spans or divs with location info
|
||||||
|
const hiddenElements = actorSection.querySelectorAll("span[style*='display: none'], div[style*='display: none'], [aria-hidden='true']");
|
||||||
|
for (const hiddenElem of hiddenElements) {
|
||||||
|
const text = hiddenElem.textContent || hiddenElem.getAttribute("aria-label") || "";
|
||||||
|
if (text && /(ontario|alberta|british columbia|quebec|toronto|vancouver)/i.test(text)) {
|
||||||
|
const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
|
||||||
|
if (locationMatch) {
|
||||||
|
data.location = locationMatch[0].trim();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Extract engagement metrics - try multiple approaches
|
// Extract engagement metrics - try multiple approaches
|
||||||
const likesSelectors = [
|
const likesSelectors = [
|
||||||
@ -799,6 +852,48 @@ async function extractPostData(postElement, keyword) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extract a location by opening a LinkedIn profile in a temporary tab.
 *
 * The profile URL is resolved against https://www.linkedin.com (so absolute,
 * root-relative and slash-less relative URLs all work), its query string and
 * fragment are dropped, and the resulting page is handed to
 * `extractLocationFromProfile`. The temporary tab is always closed, and a
 * failure to close it never discards a location that was already extracted.
 *
 * This function never throws: every failure is logged at debug level and
 * reported as an empty string.
 *
 * NOTE(review): `extractLocationFromProfile` and `logger` must be in scope in
 * this module; the original comment says the former comes from ai-analyzer.
 *
 * @param {object} page - Page whose `context().newPage()` opens the profile tab.
 * @param {string} profileUrl - Absolute or linkedin.com-relative profile URL.
 * @returns {Promise<string>} The value produced by `extractLocationFromProfile`,
 *   or "" when the URL is missing or anything fails.
 */
async function extractLocationFromProfilePage(page, profileUrl) {
  let profilePage = null;

  try {
    // An empty URL would otherwise resolve to the LinkedIn home page.
    if (typeof profileUrl !== "string" || profileUrl.trim() === "") {
      throw new TypeError("profile URL is missing");
    }

    // Resolve relative URLs properly instead of concatenating strings.
    const target = new URL(profileUrl.trim(), "https://www.linkedin.com");
    // Remove query parameters and fragments that might cause issues.
    target.search = "";
    target.hash = "";

    // Open the profile in a new tab of the same browser context.
    profilePage = await page.context().newPage();
    await profilePage.goto(target.href, {
      waitUntil: "domcontentloaded",
      timeout: 15000,
    });

    // Wait a bit for content to load.
    await new Promise((resolve) => setTimeout(resolve, 2000));

    return await extractLocationFromProfile(profilePage);
  } catch (error) {
    logger.debug(`Failed to extract location from profile ${profileUrl}: ${error.message}`);
    return "";
  } finally {
    if (profilePage) {
      try {
        await profilePage.close();
      } catch (closeError) {
        // Best-effort cleanup: a tab that cannot be closed must not mask the result.
        logger.debug(`Failed to close profile tab for ${profileUrl}: ${closeError.message}`);
      }
    }
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract numbers from text (e.g., "15 likes" -> 15)
|
* Extract numbers from text (e.g., "15 likes" -> 15)
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -1,80 +1,80 @@
|
|||||||
/**
 * AI analyzer smoke test.
 *
 * Loads the "positive" posts from test-data.json, runs each one through
 * `analyzeSinglePost`, and asserts that the result has the expected shape:
 * a boolean `isRelevant`, a numeric `confidence` in [0, 1] and a non-empty
 * string `reasoning`. When `checkOllamaStatus` reports the model as
 * unavailable the checks are skipped and the script reports a pass.
 *
 * Any failure is reported as "FAIL: ..." and sets a non-zero exit code.
 */
const fs = require("fs");
const path = require("path");
const assert = require("assert");
const { analyzeSinglePost, checkOllamaStatus } = require("../ai-analyzer");

console.log("AI Analyzer logic tests");

const testData = JSON.parse(
  fs.readFileSync(path.join(__dirname, "test-data.json"), "utf-8")
);
const aiResults = testData.positive;
const context = "job layoffs and workforce reduction";
const model = process.env.OLLAMA_MODEL || "mistral"; // Use OLLAMA_MODEL from env or default to mistral

(async () => {
  // A missing or empty fixture list would otherwise "pass" without testing anything.
  assert(
    Array.isArray(aiResults) && aiResults.length > 0,
    "test-data.json must contain a non-empty \"positive\" array"
  );

  // Check if Ollama is available
  const ollamaAvailable = await checkOllamaStatus(model);
  if (!ollamaAvailable) {
    console.log("SKIP: Ollama not available - skipping AI analyzer tests");
    console.log("PASS: AI analyzer tests skipped (Ollama not running)");
    return;
  }

  console.log(`Testing AI analyzer with ${aiResults.length} posts...`);

  for (let i = 0; i < aiResults.length; i++) {
    const post = aiResults[i];
    // One-based label, shared by progress logs and assertion messages.
    const label = `Post ${i + 1}`;
    console.log(`Testing post ${i + 1}: "${post.text.substring(0, 50)}..."`);

    const aiOutput = await analyzeSinglePost(post.text, context, model);

    // Test that the function returns the expected structure
    assert(
      typeof aiOutput === "object" && aiOutput !== null,
      `${label} output is not an object`
    );
    assert(
      typeof aiOutput.isRelevant === "boolean",
      `${label} isRelevant is not a boolean: ${typeof aiOutput.isRelevant}`
    );
    assert(
      typeof aiOutput.confidence === "number",
      `${label} confidence is not a number: ${typeof aiOutput.confidence}`
    );
    assert(
      typeof aiOutput.reasoning === "string",
      `${label} reasoning is not a string: ${typeof aiOutput.reasoning}`
    );

    // Test that confidence is within valid range (this also rejects NaN)
    assert(
      aiOutput.confidence >= 0 && aiOutput.confidence <= 1,
      `${label} confidence out of range: ${aiOutput.confidence} (should be 0-1)`
    );

    // Test that reasoning exists and is not empty
    assert(aiOutput.reasoning.length > 0, `${label} missing or empty reasoning`);

    console.log(
      `  ✓ Post ${i + 1}: relevant=${aiOutput.isRelevant}, confidence=${
        aiOutput.confidence
      }`
    );
  }

  console.log(
    "PASS: AI analyzer returns valid structure and values for all test posts."
  );
})().catch((error) => {
  // Make failures visible to CI regardless of the Node unhandled-rejection mode.
  console.error(`FAIL: ${error.message}`);
  process.exitCode = 1;
});
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user