Enhance job search parser with LinkedIn strategy and configuration updates

- Added LinkedIn jobs parsing strategy to support job extraction from LinkedIn.
- Updated job search parser to include new site strategy and improved argument parsing for max pages and exclusion of rejected results.
- Enhanced README documentation to reflect new features and usage examples.
- Refactored existing strategies for consistency and improved error handling.
This commit is contained in:
tanyar09 2025-12-16 23:17:12 -05:00
parent bbfd3c84aa
commit 4099b23744
8 changed files with 2431 additions and 888 deletions

View File

@ -62,3 +62,5 @@ class CoreParser {
module.exports = CoreParser; module.exports = CoreParser;

File diff suppressed because it is too large Load Diff

View File

@ -10,6 +10,7 @@ const path = require("path");
const fs = require("fs"); const fs = require("fs");
const CoreParser = require("../core-parser"); const CoreParser = require("../core-parser");
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy"); const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer"); const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer");
// Load environment variables // Load environment variables
@ -18,14 +19,16 @@ require("dotenv").config({ path: path.join(__dirname, ".env") });
// Configuration from environment // Configuration from environment
const HEADLESS = process.env.HEADLESS !== "false"; const HEADLESS = process.env.HEADLESS !== "false";
const SEARCH_KEYWORDS = const SEARCH_KEYWORDS =
process.env.SEARCH_KEYWORDS || "software engineer,developer,programmer"; process.env.SEARCH_KEYWORDS || "co-op,intern";//"software engineer,developer,programmer";
const LOCATION_FILTER = process.env.LOCATION_FILTER; const LOCATION_FILTER = process.env.LOCATION_FILTER;
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true"; const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5; const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
// Available site strategies // Available site strategies
const SITE_STRATEGIES = { const SITE_STRATEGIES = {
skipthedrive: skipthedriveStrategy, skipthedrive: skipthedriveStrategy,
linkedin: linkedinJobsStrategy,
// Add more site strategies here // Add more site strategies here
// indeed: indeedStrategy, // indeed: indeedStrategy,
// glassdoor: glassdoorStrategy, // glassdoor: glassdoorStrategy,
@ -41,6 +44,7 @@ function parseArguments() {
keywords: null, keywords: null,
locationFilter: null, locationFilter: null,
maxPages: MAX_PAGES, maxPages: MAX_PAGES,
excludeRejected: EXCLUDE_REJECTED,
}; };
args.forEach((arg) => { args.forEach((arg) => {
@ -57,7 +61,15 @@ function parseArguments() {
} else if (arg.startsWith("--location=")) { } else if (arg.startsWith("--location=")) {
options.locationFilter = arg.split("=")[1]; options.locationFilter = arg.split("=")[1];
} else if (arg.startsWith("--max-pages=")) { } else if (arg.startsWith("--max-pages=")) {
options.maxPages = parseInt(arg.split("=")[1]) || MAX_PAGES; const value = arg.split("=")[1];
// Support "all" or "0" to mean unlimited pages
if (value === "all" || value === "0") {
options.maxPages = 0; // 0 means unlimited
} else {
options.maxPages = parseInt(value) || MAX_PAGES;
}
} else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
options.excludeRejected = true;
} }
}); });
@ -84,6 +96,7 @@ async function startJobSearchParser(options = {}) {
finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim()); finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
const locationFilter = finalOptions.locationFilter || LOCATION_FILTER; const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
const sites = finalOptions.sites; const sites = finalOptions.sites;
const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED;
logger.info(`📦 Selected job sites: ${sites.join(", ")}`); logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`); logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
@ -108,18 +121,46 @@ async function startJobSearchParser(options = {}) {
logger.step(`\n🌐 Parsing ${site}...`); logger.step(`\n🌐 Parsing ${site}...`);
const startTime = Date.now(); const startTime = Date.now();
const parseResult = await strategy(coreParser, { // Prepare strategy options
const strategyOptions = {
keywords, keywords,
locationFilter, locationFilter,
maxPages: finalOptions.maxPages, maxPages: finalOptions.maxPages,
}); };
// Add credentials for LinkedIn
if (site === "linkedin") {
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
logger.error(`❌ LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`);
siteResults[site] = {
count: 0,
rejected: 0,
duration: "0s",
error: "LinkedIn credentials not found",
};
continue;
}
strategyOptions.credentials = {
username: LINKEDIN_USERNAME,
password: LINKEDIN_PASSWORD,
};
strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
}
const parseResult = await strategy(coreParser, strategyOptions);
const { results, rejectedResults, summary } = parseResult; const { results, rejectedResults, summary } = parseResult;
const duration = ((Date.now() - startTime) / 1000).toFixed(2); const duration = ((Date.now() - startTime) / 1000).toFixed(2);
// Collect results // Collect results
logger.info(`📦 Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`);
allResults.push(...results); allResults.push(...results);
allRejectedResults.push(...rejectedResults); allRejectedResults.push(...rejectedResults);
logger.info(`📦 Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`);
siteResults[site] = { siteResults[site] = {
count: results.length, count: results.length,
@ -162,6 +203,9 @@ async function startJobSearchParser(options = {}) {
} }
// Save results // Save results
logger.info(`💾 Preparing to save: ${allResults.length} results, ${allRejectedResults.length} rejected`);
logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`);
const outputData = { const outputData = {
metadata: { metadata: {
extractedAt: new Date().toISOString(), extractedAt: new Date().toISOString(),
@ -171,11 +215,21 @@ async function startJobSearchParser(options = {}) {
keywords: keywords.join(", "), keywords: keywords.join(", "),
locationFilter, locationFilter,
analysisResults, analysisResults,
rejectedJobsExcluded: excludeRejected,
}, },
results: allResults, results: allResults,
rejectedResults: allRejectedResults,
siteResults, siteResults,
}; };
// Always include rejectedResults if not excluded (make it explicit, not using spread)
if (!excludeRejected) {
outputData.rejectedResults = allRejectedResults;
logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`);
} else {
logger.info(`⏭️ Excluding rejected results (EXCLUDE_REJECTED=true)`);
}
logger.info(`💾 Final output: ${outputData.results.length} results, ${outputData.rejectedResults?.length || 0} rejected`);
const resultsDir = path.join(__dirname, "results"); const resultsDir = path.join(__dirname, "results");
if (!fs.existsSync(resultsDir)) { if (!fs.existsSync(resultsDir)) {

File diff suppressed because it is too large Load Diff

View File

@ -1,302 +1,299 @@
/** /**
* SkipTheDrive Parsing Strategy * SkipTheDrive Parsing Strategy
* *
* Uses core-parser for browser management and ai-analyzer for utilities * Uses core-parser for browser management and ai-analyzer for utilities
*/ */
const { const {
logger, logger,
cleanText, cleanText,
containsAnyKeyword, containsAnyKeyword,
validateLocationAgainstFilters, validateLocationAgainstFilters,
} = require("ai-analyzer"); } = require("ai-analyzer");
/**
 * Build a SkipTheDrive search URL for a keyword.
 *
 * @param {string} keyword - Search term placed in the `s` query parameter.
 * @param {string} [orderBy="date"] - Sort order for results.
 * @param {string[]} [jobTypes=[]] - Optional job-type filters, comma-joined
 *   into a single `job_type` parameter.
 * @returns {string} Fully-qualified search URL.
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  const url = new URL("https://www.skipthedrive.com/");
  url.searchParams.set("s", keyword);
  url.searchParams.set("orderby", orderBy);

  if (jobTypes && jobTypes.length > 0) {
    url.searchParams.set("job_type", jobTypes.join(","));
  }

  return url.toString();
}
/**
 * SkipTheDrive parsing strategy.
 *
 * Drives a core-parser managed browser page through SkipTheDrive search
 * results for each keyword, extracts job listings, de-duplicates them by
 * job ID, and (optionally) partitions them by a location filter.
 *
 * @param {object} coreParser - Browser manager exposing createPage/navigateTo.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - Search terms; one search per keyword.
 * @param {string|null} [options.locationFilter] - When set, jobs failing
 *   validateLocationAgainstFilters go to rejectedResults instead of results.
 * @param {number} [options.maxPages=5] - Result pages per keyword; 0 means
 *   unlimited (matches the runner's `--max-pages=all` / `0` convention).
 * @param {string[]} [options.jobTypes] - Optional job-type URL filters.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 * @throws Re-throws any top-level failure after logging it; per-keyword
 *   failures are logged and skipped.
 */
async function skipthedriveStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer", "programmer"],
    locationFilter = null,
    maxPages = 5,
    jobTypes = [],
  } = options;

  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set();

  try {
    // Create main page
    const page = await coreParser.createPage("skipthedrive-main");

    logger.info("🚀 Starting SkipTheDrive parser...");
    logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(`📄 Max Pages: ${maxPages}`);

    // Search for each keyword
    for (const keyword of keywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);

      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);

      try {
        // Navigate to search results
        await coreParser.navigateTo(searchUrl, {
          pageId: "skipthedrive-main",
          retries: 2,
          timeout: 30000,
        });

        // Wait for job listings to load
        const hasResults = await page
          .waitForSelector("#loops-wrapper", {
            timeout: 5000,
          })
          .then(() => true)
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
            return false;
          });

        if (!hasResults) {
          continue;
        }

        // Process multiple pages.
        // FIX: maxPages === 0 now means "unlimited", matching the CLI's
        // `--max-pages=all`/`--max-pages=0` handling in the job-search
        // runner; previously 0 made `currentPage <= maxPages` false and
        // silently skipped every page.
        const unlimited = maxPages === 0;
        let currentPage = 1;
        let hasNextPage = true;

        while (hasNextPage && (unlimited || currentPage <= maxPages)) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);

          // Extract jobs from current page
          const pageJobs = await extractJobsFromPage(
            page,
            keyword,
            locationFilter
          );

          for (const job of pageJobs) {
            // Skip duplicates
            if (seenJobs.has(job.jobId)) continue;
            seenJobs.add(job.jobId);

            // Validate location if filtering enabled
            if (locationFilter) {
              const locationValid = validateLocationAgainstFilters(
                job.location,
                locationFilter
              );

              if (!locationValid) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: "Location filter mismatch",
                });
                continue;
              }
            }

            results.push(job);
          }

          // Check for next page
          hasNextPage = await hasNextPageAvailable(page);
          if (hasNextPage && (unlimited || currentPage < maxPages)) {
            await navigateToNextPage(page, currentPage + 1);
            currentPage++;

            // Wait for new page to load
            await page.waitForTimeout(2000);
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      }
    }

    logger.info(
      `🎯 SkipTheDrive parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
    );

    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "skipthedrive",
      },
    };
  } catch (error) {
    logger.error(`❌ SkipTheDrive parsing failed: ${error.message}`);
    throw error;
  }
}
/**
 * Collect job records from the currently loaded search-results page.
 *
 * @param {object} page - Playwright-style page handle.
 * @param {string} keyword - Keyword the page was searched with (tagged onto jobs).
 * @param {string|null} locationFilter - Accepted for interface parity; filtering
 *   itself happens in the caller.
 * @returns {Promise<object[]>} Successfully extracted jobs; per-listing
 *   failures are logged and skipped.
 */
async function extractJobsFromPage(page, keyword, locationFilter) {
  const jobs = [];

  try {
    // Every listing is rendered as an <article class="job_listing"> element.
    const listings = await page.$$("article.job_listing");

    for (const listing of listings) {
      try {
        const job = await extractJobData(listing, keyword);
        if (job) {
          jobs.push(job);
        }
      } catch (error) {
        logger.warning(`Failed to extract job data: ${error.message}`);
      }
    }
  } catch (error) {
    logger.error(`Failed to extract jobs from page: ${error.message}`);
  }

  return jobs;
}
/**
 * Extract a single job record from one `article.job_listing` element.
 *
 * Missing sub-elements degrade to empty strings rather than failing the
 * whole listing; a null return means extraction failed entirely.
 *
 * @param {object} jobElement - Element handle for the listing article.
 * @param {string} keyword - Search keyword to tag the record with.
 * @returns {Promise<object|null>} Job record, or null on extraction error.
 */
async function extractJobData(jobElement, keyword) {
  try {
    // Job ID comes from the article's DOM id, e.g. "post-12345".
    const articleId = (await jobElement.getAttribute("id")) || "";
    const jobId = articleId ? articleId.replace("post-", "") : "";

    // Title and canonical job URL
    const titleElement = await jobElement.$(".job_listing-title a");
    const title = titleElement
      ? cleanText(await titleElement.textContent())
      : "";
    const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";

    // Company
    const companyElement = await jobElement.$(".company");
    const company = companyElement
      ? cleanText(await companyElement.textContent())
      : "";

    // Location
    const locationElement = await jobElement.$(".location");
    const location = locationElement
      ? cleanText(await locationElement.textContent())
      : "";

    // Relative posting date, e.g. "3 days ago"
    const dateElement = await jobElement.$(".job-date");
    const dateText = dateElement
      ? cleanText(await dateElement.textContent())
      : "";

    // Description snippet
    const descElement = await jobElement.$(".job_listing-description");
    const description = descElement
      ? cleanText(await descElement.textContent())
      : "";

    // Featured flag
    const featuredElement = await jobElement.$(".featured");
    const isFeatured = featuredElement !== null;

    // Convert "N days ago" into an ISO date string.
    let datePosted = null;
    let daysAgo = null;

    if (dateText) {
      const match = dateText.match(/(\d+)\s+days?\s+ago/);
      if (match) {
        // FIX: always pass an explicit radix to parseInt.
        daysAgo = Number.parseInt(match[1], 10);
        const date = new Date();
        date.setDate(date.getDate() - daysAgo);
        datePosted = date.toISOString().split("T")[0];
      }
    }

    return {
      jobId,
      title,
      company,
      location,
      jobUrl,
      datePosted,
      dateText,
      daysAgo,
      description,
      isFeatured,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "skipthedrive",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Determine whether a ".next-page" pagination control exists on the page.
 *
 * @param {object} page - Playwright-style page handle.
 * @returns {Promise<boolean>} True when a next-page link is present; false
 *   when absent or when the lookup itself throws.
 */
async function hasNextPageAvailable(page) {
  try {
    return (await page.$(".next-page")) !== null;
  } catch {
    return false;
  }
}
/**
 * Click the ".next-page" pagination control, if present.
 *
 * Failures are logged rather than thrown so pagination degrades gracefully.
 *
 * @param {object} page - Playwright-style page handle.
 * @param {number} pageNumber - Target page number (used only in the warning).
 */
async function navigateToNextPage(page, pageNumber) {
  try {
    const nextButton = await page.$(".next-page");
    await nextButton?.click();
  } catch (error) {
    logger.warning(
      `Failed to navigate to page ${pageNumber}: ${error.message}`
    );
  }
}
module.exports = { extractJobsFromPage,
skipthedriveStrategy, extractJobData,
buildSearchUrl, };
extractJobsFromPage,
extractJobData,
};

View File

@ -31,12 +31,13 @@ const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD; const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
const HEADLESS = process.env.HEADLESS !== "false"; const HEADLESS = process.env.HEADLESS !== "false";
const SEARCH_KEYWORDS = const SEARCH_KEYWORDS =
process.env.SEARCH_KEYWORDS || "layoff,downsizing";//,job cuts"; process.env.SEARCH_KEYWORDS || "layoff";//,downsizing";//,job cuts";
const LOCATION_FILTER = process.env.LOCATION_FILTER; const LOCATION_FILTER = process.env.LOCATION_FILTER;
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false"; const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false";
const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends"; const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends";
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL; const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50; const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50;
const EXTRACT_LOCATION_FROM_PROFILE = process.env.EXTRACT_LOCATION_FROM_PROFILE === "true";
/** /**
* Main LinkedIn parser function * Main LinkedIn parser function
@ -71,6 +72,7 @@ async function startLinkedInParser(options = {}) {
keywords, keywords,
locationFilter: LOCATION_FILTER, locationFilter: LOCATION_FILTER,
maxResults: MAX_RESULTS, maxResults: MAX_RESULTS,
extractLocationFromProfile: EXTRACT_LOCATION_FROM_PROFILE,
credentials: { credentials: {
username: LINKEDIN_USERNAME, username: LINKEDIN_USERNAME,
password: LINKEDIN_PASSWORD, password: LINKEDIN_PASSWORD,

View File

@ -21,6 +21,7 @@ async function linkedinStrategy(coreParser, options = {}) {
keywords = ["layoff", "downsizing", "job cuts"], keywords = ["layoff", "downsizing", "job cuts"],
locationFilter = null, locationFilter = null,
maxResults = 50, maxResults = 50,
extractLocationFromProfile = false,
credentials = {}, credentials = {},
} = options; } = options;
@ -106,7 +107,7 @@ async function linkedinStrategy(coreParser, options = {}) {
} }
// Extract posts from current page // Extract posts from current page
const posts = await extractPostsFromPage(page, keyword); const posts = await extractPostsFromPage(page, keyword, extractLocationFromProfile);
logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`); logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);
for (const post of posts) { for (const post of posts) {
@ -172,7 +173,7 @@ async function linkedinStrategy(coreParser, options = {}) {
/** /**
* Extract posts from current search results page * Extract posts from current search results page
*/ */
async function extractPostsFromPage(page, keyword) { async function extractPostsFromPage(page, keyword, extractLocationFromProfile = false) {
const posts = []; const posts = [];
try { try {
@ -254,10 +255,26 @@ async function extractPostsFromPage(page, keyword) {
const post = await extractPostData(postElements[i], keyword); const post = await extractPostData(postElements[i], keyword);
if (post) { if (post) {
// If location is missing and we're enabled to extract from profile, try to get it
if (!post.location && extractLocationFromProfile && post.authorUrl) {
try {
logger.debug(`📍 Location missing for post ${i + 1}, attempting to extract from profile...`);
const profileLocation = await extractLocationFromProfilePage(page, post.authorUrl);
if (profileLocation) {
post.location = profileLocation;
post.profileLocation = profileLocation;
logger.debug(`✅ Extracted location from profile: ${profileLocation}`);
}
} catch (error) {
logger.debug(`⚠️ Could not extract location from profile: ${error.message}`);
}
}
posts.push(post); posts.push(post);
const hasContent = post.content && post.content.length > 0; const hasContent = post.content && post.content.length > 0;
const hasAuthor = post.authorName && post.authorName.length > 0; const hasAuthor = post.authorName && post.authorName.length > 0;
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'})`); const hasLocation = post.location && post.location.length > 0;
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'}, location: ${hasLocation ? 'yes' : 'no'})`);
} else { } else {
logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`); logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
} }
@ -626,6 +643,42 @@ async function extractPostData(postElement, keyword) {
} }
} }
} }
// Try to extract from data attributes or hidden elements
if (!data.location) {
// Check for data attributes that might contain location
const actorSection = el.querySelector(".feed-shared-actor");
if (actorSection) {
// Check all data attributes
for (const attr of actorSection.attributes) {
if (attr.name.startsWith("data-") && attr.value) {
const value = attr.value.toLowerCase();
// Look for location-like patterns in data attributes
if (/(ontario|alberta|british columbia|quebec|toronto|vancouver|calgary|ottawa|montreal)/i.test(value)) {
// Try to extract the actual location text
const locationMatch = attr.value.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
if (locationMatch) {
data.location = locationMatch[0];
break;
}
}
}
}
// Check for hidden spans or divs with location info
const hiddenElements = actorSection.querySelectorAll("span[style*='display: none'], div[style*='display: none'], [aria-hidden='true']");
for (const hiddenElem of hiddenElements) {
const text = hiddenElem.textContent || hiddenElem.getAttribute("aria-label") || "";
if (text && /(ontario|alberta|british columbia|quebec|toronto|vancouver)/i.test(text)) {
const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
if (locationMatch) {
data.location = locationMatch[0].trim();
break;
}
}
}
}
}
// Extract engagement metrics - try multiple approaches // Extract engagement metrics - try multiple approaches
const likesSelectors = [ const likesSelectors = [
@ -799,6 +852,48 @@ async function extractPostData(postElement, keyword) {
} }
} }
/**
 * Visit a post author's LinkedIn profile in a throwaway tab and read their
 * location via the ai-analyzer `extractLocationFromProfile` helper.
 *
 * @param {object} page - Existing page whose browser context is reused for the tab.
 * @param {string} profileUrl - Absolute or site-relative profile URL.
 * @returns {Promise<string>} The location text, or "" when it cannot be read.
 */
async function extractLocationFromProfilePage(page, profileUrl) {
  try {
    // Ensure URL is complete
    let fullUrl = profileUrl;
    if (!fullUrl.startsWith("http")) {
      fullUrl = `https://www.linkedin.com${fullUrl}`;
    }

    // Remove query parameters that might cause issues
    fullUrl = fullUrl.split("?")[0];

    // Open profile in new tab
    const profilePage = await page.context().newPage();

    // FIX: close the tab in `finally` so it is released exactly once on both
    // the success and failure paths (was: two separate close() calls, with
    // the error-path close able to mask the original error).
    try {
      await profilePage.goto(fullUrl, {
        waitUntil: "domcontentloaded",
        timeout: 15000,
      });

      // Wait a bit for content to load
      await new Promise((resolve) => setTimeout(resolve, 2000));

      // Use the extractLocationFromProfile utility from ai-analyzer
      return await extractLocationFromProfile(profilePage);
    } finally {
      await profilePage.close();
    }
  } catch (error) {
    logger.debug(`Failed to extract location from profile ${profileUrl}: ${error.message}`);
    return "";
  }
}
/** /**
* Extract numbers from text (e.g., "15 likes" -> 15) * Extract numbers from text (e.g., "15 likes" -> 15)
*/ */

View File

@ -1,80 +1,80 @@
/**
 * AI Analyzer logic tests.
 *
 * Runs analyzeSinglePost over the "positive" fixtures in test-data.json and
 * asserts the result shape and value ranges; skips cleanly when the Ollama
 * backend is not running.
 */
const fs = require("fs");
const assert = require("assert");
const { analyzeSinglePost, checkOllamaStatus } = require("../ai-analyzer");

console.log("AI Analyzer logic tests");

// Fixture posts known to be relevant ("positive") live next to this file.
const testData = JSON.parse(
  fs.readFileSync(__dirname + "/test-data.json", "utf-8")
);
const aiResults = testData.positive;
const context = "job layoffs and workforce reduction";
const model = process.env.OLLAMA_MODEL || "mistral"; // Use OLLAMA_MODEL from env or default to mistral

(async () => {
  // Check if Ollama is available
  const ollamaAvailable = await checkOllamaStatus(model);
  if (!ollamaAvailable) {
    console.log("SKIP: Ollama not available - skipping AI analyzer tests");
    console.log("PASS: AI analyzer tests skipped (Ollama not running)");
    return;
  }

  console.log(`Testing AI analyzer with ${aiResults.length} posts...`);

  for (const [i, post] of aiResults.entries()) {
    console.log(`Testing post ${i + 1}: "${post.text.substring(0, 50)}..."`);

    const aiOutput = await analyzeSinglePost(post.text, context, model);

    // Shape checks: the analyzer must always return this exact structure.
    assert(
      typeof aiOutput === "object" && aiOutput !== null,
      `Post ${i} output is not an object`
    );
    assert(
      typeof aiOutput.isRelevant === "boolean",
      `Post ${i} isRelevant is not a boolean: ${typeof aiOutput.isRelevant}`
    );
    assert(
      typeof aiOutput.confidence === "number",
      `Post ${i} confidence is not a number: ${typeof aiOutput.confidence}`
    );
    assert(
      typeof aiOutput.reasoning === "string",
      `Post ${i} reasoning is not a string: ${typeof aiOutput.reasoning}`
    );

    // Value checks: confidence within [0, 1], reasoning present, relevance boolean.
    assert(
      aiOutput.confidence >= 0 && aiOutput.confidence <= 1,
      `Post ${i} confidence out of range: ${aiOutput.confidence} (should be 0-1)`
    );
    assert(
      aiOutput.reasoning && aiOutput.reasoning.length > 0,
      `Post ${i} missing or empty reasoning`
    );
    assert(
      aiOutput.isRelevant === true || aiOutput.isRelevant === false,
      `Post ${i} isRelevant is not a valid boolean: ${aiOutput.isRelevant}`
    );

    console.log(
      ` ✓ Post ${i + 1}: relevant=${aiOutput.isRelevant}, confidence=${
        aiOutput.confidence
      }`
    );
  }

  console.log(
    "PASS: AI analyzer returns valid structure and values for all test posts."
  );
})();