update CoreParser to increase default timeout and change navigation waitUntil option to networkidle

This commit is contained in:
ilia 2025-12-12 12:18:48 -05:00
parent ef9720abf2
commit 83ed86668e
24 changed files with 9137 additions and 9094 deletions

View File

@@ -1,250 +1,250 @@
#!/usr/bin/env node #!/usr/bin/env node
/** /**
* AI Analyzer CLI * AI Analyzer CLI
* *
* Command-line interface for the ai-analyzer package * Command-line interface for the ai-analyzer package
* Can be used by any parser to analyze JSON files * Can be used by any parser to analyze JSON files
*/ */
const fs = require("fs"); const fs = require("fs");
const path = require("path"); const path = require("path");
// Import AI utilities from this package // Import AI utilities from this package
const { const {
logger, logger,
analyzeBatch, analyzeBatch,
checkOllamaStatus, checkOllamaStatus,
findLatestResultsFile, findLatestResultsFile,
} = require("./index"); } = require("./index");
// Default configuration // Default configuration
const DEFAULT_CONTEXT = const DEFAULT_CONTEXT =
process.env.AI_CONTEXT || "job market analysis and trends"; process.env.AI_CONTEXT || "job market analysis and trends";
const DEFAULT_MODEL = process.env.OLLAMA_MODEL || "mistral"; const DEFAULT_MODEL = process.env.OLLAMA_MODEL || "mistral";
const DEFAULT_RESULTS_DIR = "results"; const DEFAULT_RESULTS_DIR = "results";
// Parse command line arguments // Parse command line arguments
const args = process.argv.slice(2); const args = process.argv.slice(2);
let inputFile = null; let inputFile = null;
let outputFile = null; let outputFile = null;
let context = DEFAULT_CONTEXT; let context = DEFAULT_CONTEXT;
let model = DEFAULT_MODEL; let model = DEFAULT_MODEL;
let findLatest = false; let findLatest = false;
let resultsDir = DEFAULT_RESULTS_DIR; let resultsDir = DEFAULT_RESULTS_DIR;
// Parse command-line flags.
// BUG FIX: the value is everything after the FIRST "=", so values that
// themselves contain "=" (e.g. --context="supply = demand") are kept
// intact; the previous arg.split("=")[1] silently truncated them.
const flagValue = (arg) => arg.slice(arg.indexOf("=") + 1);
for (const arg of args) {
  if (arg.startsWith("--input=")) {
    inputFile = flagValue(arg);
  } else if (arg.startsWith("--output=")) {
    outputFile = flagValue(arg);
  } else if (arg.startsWith("--context=")) {
    context = flagValue(arg);
  } else if (arg.startsWith("--model=")) {
    model = flagValue(arg);
  } else if (arg.startsWith("--dir=")) {
    resultsDir = flagValue(arg);
  } else if (arg === "--latest") {
    findLatest = true;
  } else if (arg === "--help" || arg === "-h") {
    console.log(`
AI Analyzer CLI
Usage: node cli.js [options]
Options:
  --input=FILE Input JSON file
  --output=FILE Output file (default: ai-analysis-{timestamp}.json)
  --context="description" Analysis context (default: "${DEFAULT_CONTEXT}")
  --model=MODEL Ollama model (default: ${DEFAULT_MODEL})
  --latest Use latest results file from results directory
  --dir=PATH Directory to look for results (default: 'results')
  --help, -h Show this help
Examples:
  node cli.js --input=results.json
  node cli.js --latest --dir=results
  node cli.js --input=results.json --context="job trends" --model=mistral
Environment Variables:
  AI_CONTEXT Default analysis context
  OLLAMA_MODEL Default Ollama model
`);
    process.exit(0);
  }
}
/**
 * CLI entry point.
 *
 * Pipeline: resolve the input file (--input, or the latest file in --dir
 * when --latest is set), load and normalize its posts, verify that the
 * Ollama model is available, run the batch AI analysis, merge per-post
 * results back into the original structure, and write the merged output.
 * Exits the process with code 1 on any failure.
 */
async function main() {
  try {
    // Determine input file
    if (findLatest) {
      try {
        inputFile = findLatestResultsFile(resultsDir);
        logger.info(`Found latest results file: ${inputFile}`);
      } catch (error) {
        logger.error(
          `❌ No results files found in '${resultsDir}': ${error.message}`
        );
        logger.info(`💡 To create results files:`);
        logger.info(
          ` 1. Run a parser first (e.g., npm start in linkedin-parser)`
        );
        logger.info(` 2. Or provide a specific file with --input=FILE`);
        logger.info(` 3. Or create a sample JSON file to test with`);
        process.exit(1);
      }
    }
    // If inputFile is a relative path that doesn't exist as given, try
    // resolving it inside the results directory before giving up.
    if (inputFile && !path.isAbsolute(inputFile) && !fs.existsSync(inputFile)) {
      const candidate = path.join(resultsDir, inputFile);
      if (fs.existsSync(candidate)) {
        inputFile = candidate;
      }
    }
    if (!inputFile) {
      logger.error("❌ Input file required. Use --input=FILE or --latest");
      logger.info(`💡 Examples:`);
      logger.info(` node cli.js --input=results.json`);
      logger.info(` node cli.js --latest --dir=results`);
      logger.info(` node cli.js --help`);
      process.exit(1);
    }
    // Load input file
    logger.step(`Loading input file: ${inputFile}`);
    if (!fs.existsSync(inputFile)) {
      throw new Error(`Input file not found: ${inputFile}`);
    }
    // BUG FIX: must be `let`, not `const` — `data` is reassigned below when
    // the input is a bare array and gets wrapped in {metadata, results}.
    // With `const`, that branch threw "Assignment to constant variable".
    let data = JSON.parse(fs.readFileSync(inputFile, "utf-8"));
    // Extract posts from the two supported formats: {results: [...]} or [...]
    let posts = [];
    if (data.results && Array.isArray(data.results)) {
      posts = data.results;
      logger.info(`Found ${posts.length} items in results array`);
    } else if (Array.isArray(data)) {
      posts = data;
      logger.info(`Found ${posts.length} items in array`);
    } else {
      throw new Error("Invalid JSON format - need array or {results: [...]}");
    }
    if (posts.length === 0) {
      throw new Error("No items found to analyze");
    }
    // Check AI availability before doing any real work
    logger.step("Checking AI availability");
    const aiAvailable = await checkOllamaStatus(model);
    if (!aiAvailable) {
      throw new Error(
        `AI not available. Make sure Ollama is running and model '${model}' is installed.`
      );
    }
    // Warn when results already carry AI analysis; it will be overwritten.
    const hasExistingAI = posts.some((post) => post.aiAnalysis);
    if (hasExistingAI) {
      logger.info(
        `📋 Results already contain AI analysis - will update with new context`
      );
    }
    // Normalize posts into the shape analyzeBatch expects; tolerate the
    // different field names various parsers use for the post body.
    const analysisData = posts.map((post) => ({
      text: post.text || post.content || post.post || "",
      location: post.location || "Unknown",
      keyword: post.keyword || "Unknown",
      timestamp: post.timestamp || new Date().toISOString(),
    }));
    // Run analysis
    logger.step(`Running AI analysis with context: "${context}"`);
    const analysis = await analyzeBatch(analysisData, context, model);
    // Integrate AI analysis back into the original results (index-aligned;
    // assumes analyzeBatch returns one result per input item — TODO confirm).
    const updatedPosts = posts.map((post, index) => {
      const aiResult = analysis[index];
      return {
        ...post,
        aiAnalysis: {
          isRelevant: aiResult.isRelevant,
          confidence: aiResult.confidence,
          reasoning: aiResult.reasoning,
          context: context,
          model: model,
          analyzedAt: new Date().toISOString(),
        },
      };
    });
    // Update the original data structure
    if (data.results && Array.isArray(data.results)) {
      data.results = updatedPosts;
      // Record how/when the analysis was produced
      data.metadata = data.metadata || {};
      data.metadata.aiAnalysisUpdated = new Date().toISOString();
      data.metadata.aiContext = context;
      data.metadata.aiModel = model;
    } else {
      // Bare-array input: wrap it in a proper {metadata, results} structure
      data = {
        metadata: {
          timestamp: new Date().toISOString(),
          totalItems: updatedPosts.length,
          aiContext: context,
          aiModel: model,
          analysisType: "cli",
        },
        results: updatedPosts,
      };
    }
    // Default output name: original filename with an -ai suffix, same dir
    if (!outputFile) {
      const originalName = path.basename(inputFile, path.extname(inputFile));
      outputFile = path.join(
        path.dirname(inputFile),
        `${originalName}-ai.json`
      );
    }
    // Save updated results back to file
    fs.writeFileSync(outputFile, JSON.stringify(data, null, 2));
    // Show summary
    const relevant = analysis.filter((a) => a.isRelevant).length;
    const irrelevant = analysis.filter((a) => !a.isRelevant).length;
    const avgConfidence =
      analysis.reduce((sum, a) => sum + a.confidence, 0) / analysis.length;
    logger.success("✅ AI analysis completed and integrated");
    logger.info(`📊 Context: "${context}"`);
    logger.info(`📈 Total items analyzed: ${analysis.length}`);
    logger.info(
      `✅ Relevant items: ${relevant} (${(
        (relevant / analysis.length) *
        100
      ).toFixed(1)}%)`
    );
    logger.info(
      `❌ Irrelevant items: ${irrelevant} (${(
        (irrelevant / analysis.length) *
        100
      ).toFixed(1)}%)`
    );
    logger.info(`🎯 Average confidence: ${avgConfidence.toFixed(2)}`);
    logger.file(`🧠 Updated results saved to: ${outputFile}`);
  } catch (error) {
    logger.error(`❌ Analysis failed: ${error.message}`);
    process.exit(1);
  }
}
// Run the CLI
main();

View File

@@ -1,346 +1,346 @@
/**
 * AI Analyzer Demo
 *
 * Walks through the core utilities provided by the ai-analyzer package:
 * - Logger functionality
 * - Text processing utilities
 * - Location validation
 * - AI analysis capabilities
 * - Test utilities
 */
const {
  logger,
  Logger,
  cleanText,
  containsAnyKeyword,
  parseLocationFilters,
  validateLocationAgainstFilters,
  extractLocationFromProfile,
  analyzeBatch,
} = require("./index");
// Terminal ANSI escape codes used for demo output.
const colors = {
  reset: "\x1b[0m",
  bright: "\x1b[1m",
  cyan: "\x1b[36m",
  green: "\x1b[32m",
  yellow: "\x1b[33m",
  blue: "\x1b[34m",
  magenta: "\x1b[35m",
  red: "\x1b[31m",
};

// Writes one line to stdout: all prefixes verbatim, then the text,
// then a color reset.
const paint = (text, ...prefixes) =>
  console.log(`${prefixes.join("")}${text}${colors.reset}`);

// Colorized console printers used throughout the demo.
const demo = {
  title: (text) => paint(text, "\n", colors.bright, colors.cyan),
  section: (text) => paint(text, "\n", colors.bright, colors.magenta),
  success: (text) => paint(text, colors.green),
  info: (text) => paint(text, colors.blue, " "),
  warning: (text) => paint(text, colors.yellow, "⚠️ "),
  error: (text) => paint(text, colors.red),
  code: (text) => paint(text, colors.cyan),
};
/**
 * Runs every demo section in order, pausing for Enter between sections.
 */
async function runDemo() {
  demo.title("=== AI Analyzer Demo ===");
  demo.info(
    "This demo showcases all the core utilities provided by the ai-analyzer package."
  );
  demo.info("Press Enter to continue through each section...\n");
  await waitForEnter();

  // Sections run strictly in sequence; each one waits for Enter itself.
  const sections = [
    demonstrateLogger, // 1. Logger
    demonstrateTextProcessing, // 2. Text processing
    demonstrateLocationValidation, // 3. Location validation
    demonstrateAIAnalysis, // 4. AI analysis
    demonstrateIntegration, // 5. Integration
  ];
  for (const section of sections) {
    await section();
  }

  demo.title("=== Demo Complete ===");
  demo.success("All ai-analyzer utilities demonstrated successfully!");
  demo.info("Check the README.md for detailed API documentation.");
}
/**
 * Shows the default logger, its emoji convenience methods, custom
 * configuration, and silent mode.
 */
async function demonstrateLogger() {
  demo.section("1. Logger Utilities");
  demo.info(
    "The logger provides consistent logging across all parsers with configurable levels and color support."
  );

  demo.code("// Using default logger");
  logger.info("This is an info message");
  logger.warning("This is a warning message");
  logger.error("This is an error message");
  logger.success("This is a success message");
  logger.debug("This is a debug message (if enabled)");

  demo.code("// Convenience methods with emoji prefixes");
  logger.step("Starting demo process");
  logger.search("Searching for keywords");
  logger.ai("Running AI analysis");
  logger.location("Validating location");
  logger.file("Saving results");

  demo.code("// Custom logger configuration");
  const quietLogger = new Logger({ debug: false, colors: true });
  quietLogger.info("Custom logger with debug disabled");
  quietLogger.debug("This won't show");

  demo.code("// Silent mode");
  const mutedLogger = new Logger();
  mutedLogger.silent();
  mutedLogger.info("This won't show");
  mutedLogger.verbose(); // Re-enable all levels

  await waitForEnter();
}
/**
 * Shows text cleaning and keyword matching on a handful of sample posts.
 */
async function demonstrateTextProcessing() {
  demo.section("2. Text Processing Utilities");
  demo.info(
    "Text utilities provide content cleaning and keyword matching capabilities."
  );

  const sampleTexts = [
    "Check out this #awesome post! https://example.com 🚀",
    "Just got #laidoff from my job. Looking for new opportunities!",
    "Company is #downsizing and I'm affected. #RIF #layoff",
    "Great news! We're #hiring new developers! 🎉",
  ];

  demo.code("// Text cleaning examples:");
  for (const text of sampleTexts) {
    const cleaned = cleanText(text);
    demo.info(`Original: ${text}`);
    demo.success(`Cleaned: ${cleaned}`);
    console.log();
  }

  demo.code("// Keyword matching:");
  const keywords = ["layoff", "downsizing", "RIF", "hiring"];
  for (const [index, text] of sampleTexts.entries()) {
    const hasMatch = containsAnyKeyword(text, keywords);
    // Case-insensitive listing of which keywords actually matched.
    const hits = keywords.filter((keyword) =>
      text.toLowerCase().includes(keyword.toLowerCase())
    );
    demo.info(
      `Text ${index + 1}: ${hasMatch ? "✅" : "❌"} ${
        hits.join(", ") || "No matches"
      }`
    );
  }

  await waitForEnter();
}
/**
 * Shows location-filter parsing, validation of locations against filters,
 * and location extraction from free-form profile text.
 */
async function demonstrateLocationValidation() {
  demo.section("3. Location Validation Utilities");
  demo.info(
    "Location utilities provide geographic filtering and validation capabilities."
  );

  demo.code("// Location filter parsing:");
  const filterStrings = [
    "Ontario,Manitoba",
    "Toronto,Vancouver",
    "British Columbia,Alberta",
    "Canada",
  ];
  for (const filterString of filterStrings) {
    const filters = parseLocationFilters(filterString);
    demo.info(`Filter: "${filterString}"`);
    demo.success(`Parsed: [${filters.join(", ")}]`);
    console.log();
  }

  demo.code("// Location validation examples:");
  const testLocations = [
    { location: "Toronto, Ontario, Canada", filters: ["Ontario"] },
    { location: "Vancouver, BC", filters: ["British Columbia"] },
    { location: "Calgary, Alberta", filters: ["Ontario"] },
    { location: "Montreal, Quebec", filters: ["Ontario", "Manitoba"] },
    { location: "New York, NY", filters: ["Ontario"] },
  ];
  for (const { location, filters } of testLocations) {
    const isValid = validateLocationAgainstFilters(location, filters);
    demo.info(`Location: "${location}"`);
    demo.info(`Filters: [${filters.join(", ")}]`);
    demo.success(`Valid: ${isValid ? "✅ Yes" : "❌ No"}`);
    console.log();
  }

  demo.code("// Profile location extraction:");
  const profileTexts = [
    "Software Engineer at Tech Corp • Toronto, Ontario",
    "Product Manager • Vancouver, BC",
    "Data Scientist • Remote",
    "CEO at Startup Inc • Montreal, Quebec, Canada",
  ];
  for (const profileText of profileTexts) {
    const location = extractLocationFromProfile(profileText);
    demo.info(`Profile: "${profileText}"`);
    demo.success(`Extracted: "${location || "No location found"}"`);
    console.log();
  }

  await waitForEnter();
}
/**
 * Simulates AI content analysis over mock posts (no real model calls).
 */
async function demonstrateAIAnalysis() {
  demo.section("4. AI Analysis Utilities");
  demo.info(
    "AI utilities provide content analysis using OpenAI or local Ollama models."
  );

  // Mock posts for demo
  const mockPosts = [
    {
      id: "1",
      content:
        "Just got laid off from my software engineering role. Looking for new opportunities in Toronto.",
      author: "John Doe",
      location: "Toronto, Ontario",
    },
    {
      id: "2",
      content:
        "Our company is downsizing and I'm affected. This is really tough news.",
      author: "Jane Smith",
      location: "Vancouver, BC",
    },
    {
      id: "3",
      content:
        "We're hiring! Looking for talented developers to join our team.",
      author: "Bob Wilson",
      location: "Calgary, Alberta",
    },
  ];

  demo.code("// Mock AI analysis (simulated):");
  demo.info("In a real scenario, this would call Ollama or OpenAI API");
  for (const [index, post] of mockPosts.entries()) {
    demo.info(`Post ${index + 1}: ${post.content.substring(0, 50)}...`);
    // Fake a rising confidence score: 0.85, 0.90, 0.95
    demo.success(
      `Analysis: Relevant to job layoffs (confidence: 0.${85 + index * 5})`
    );
    console.log();
  }

  demo.code("// Batch analysis simulation:");
  demo.info("Processing batch of 3 posts...");
  await simulateProcessing();
  demo.success("Batch analysis completed!");

  await waitForEnter();
}
/**
 * End-to-end example: clean text → keyword match → location filter →
 * (simulated) AI analysis, using one sample post.
 */
async function demonstrateIntegration() {
  demo.section("5. Integration Example");
  demo.info("Here's how all utilities work together in a real scenario:");

  const samplePost = {
    id: "demo-1",
    content:
      "Just got #laidoff from my job at TechCorp! Looking for new opportunities in #Toronto. This is really tough but I'm staying positive! 🚀",
    author: "Demo User",
    location: "Toronto, Ontario, Canada",
  };

  demo.code("// Processing pipeline:");
  logger.step("Processing new post");

  // Clean the raw content before any matching
  const cleanedContent = cleanText(samplePost.content);
  logger.info(`Cleaned content: ${cleanedContent}`);

  // Keyword gate
  const keywords = ["layoff", "downsizing", "RIF"];
  const hasKeywords = containsAnyKeyword(cleanedContent, keywords);
  logger.search(`Keyword match: ${hasKeywords ? "Found" : "Not found"}`);

  // Location gate
  const locationFilters = parseLocationFilters("Ontario,Manitoba");
  const isValidLocation = validateLocationAgainstFilters(
    samplePost.location,
    locationFilters
  );
  logger.location(`Location valid: ${isValidLocation ? "Yes" : "No"}`);

  // Only posts passing both gates get (simulated) AI analysis
  if (hasKeywords && isValidLocation) {
    logger.ai("Running AI analysis...");
    await simulateProcessing();
    logger.success("Post accepted and analyzed!");
  } else {
    logger.warning("Post rejected - doesn't meet criteria");
  }

  await waitForEnter();
}
// Helper functions

// Resolves once the user presses Enter on stdin.
function waitForEnter() {
  const readline = require("readline");
  return new Promise((resolve) => {
    const prompt = readline.createInterface({
      input: process.stdin,
      output: process.stdout,
    });
    prompt.question("\nPress Enter to continue...", () => {
      prompt.close();
      resolve();
    });
  });
}
/**
 * Renders an animated "Processing." / ".." / "..." line on stdout for
 * ~2 seconds, then clears it and resolves.
 *
 * Returns a Promise<void>, same as before — the redundant `async` keyword
 * was removed because the function explicitly returns a `new Promise` and
 * never awaits (double-wrapping the promise added nothing).
 */
function simulateProcessing() {
  return new Promise((resolve) => {
    const dots = [".", "..", "..."];
    let frame = 0;
    // Cycle the dot animation twice per second.
    const interval = setInterval(() => {
      process.stdout.write(`\rProcessing${dots[frame]}`);
      frame = (frame + 1) % dots.length;
    }, 500);
    // Stop after 2 seconds and clear the line.
    setTimeout(() => {
      clearInterval(interval);
      process.stdout.write("\r");
      resolve();
    }, 2000);
  });
}
// Run the demo only when this file is executed directly; importing it as a
// module merely exposes runDemo.
if (require.main === module) {
  runDemo().catch((err) => {
    demo.error(`Demo failed: ${err.message}`);
    process.exit(1);
  });
}

module.exports = { runDemo };

View File

@@ -1,22 +1,22 @@
/**
 * ai-analyzer - Core utilities for parsers
 * Main entry point: re-exports every submodule under one flat namespace.
 */
const modules = [
  "./src/logger", // Logger utilities
  "./src/ai-utils", // AI analysis utilities
  "./src/text-utils", // Text processing utilities
  "./src/location-utils", // Location validation utilities
  "./src/test-utils", // Test utilities
];

// Same merge order as spreading each module in sequence: later modules
// win on key collisions.
module.exports = Object.assign({}, ...modules.map((m) => require(m)));

File diff suppressed because it is too large Load Diff

View File

@ -1,301 +1,301 @@
const { logger } = require("./logger"); const { logger } = require("./logger");
/** /**
* AI Analysis utilities for post processing with Ollama * AI Analysis utilities for post processing with Ollama
* Extracted from ai-analyzer-local.js for reuse across parsers * Extracted from ai-analyzer-local.js for reuse across parsers
*/ */
/**
 * Verify that an Ollama server is reachable and that the requested model
 * has been pulled.
 *
 * @param {string} [model="mistral"] - model name prefix to look for
 * @param {string} [ollamaHost="http://localhost:11434"] - Ollama base URL
 * @returns {Promise<boolean>} true when the server responds and the model exists
 */
async function checkOllamaStatus(
  model = "mistral",
  ollamaHost = "http://localhost:11434"
) {
  try {
    // A successful /api/tags call doubles as a liveness probe.
    const tagsResponse = await fetch(`${ollamaHost}/api/tags`);
    if (!tagsResponse.ok) {
      throw new Error(`Ollama not running on ${ollamaHost}`);
    }
    const payload = await tagsResponse.json();
    const availableModels = payload.models.map((entry) => entry.name);
    const shortNames = availableModels
      .map((name) => name.split(":")[0])
      .join(", ");
    logger.ai("Ollama is running");
    logger.info(`📦 Available models: ${shortNames}`);
    // Installed names carry a tag suffix (e.g. "mistral:latest"), so a
    // prefix match is used rather than strict equality.
    if (!availableModels.some((name) => name.startsWith(model))) {
      logger.error(`Model "${model}" not found`);
      logger.error(`💡 Install it with: ollama pull ${model}`);
      logger.error(`💡 Or choose from: ${shortNames}`);
      return false;
    }
    logger.success(`Using model: ${model}`);
    return true;
  } catch (error) {
    logger.error(`Error connecting to Ollama: ${error.message}`);
    logger.error("💡 Make sure Ollama is installed and running:");
    logger.error("   1. Install: https://ollama.ai/");
    logger.error("   2. Start: ollama serve");
    logger.error(`   3. Install model: ollama pull ${model}`);
    return false;
  }
}
/**
 * Analyze multiple posts using local Ollama.
 *
 * Sends one batched prompt to the /api/generate endpoint and parses the
 * line-oriented "POST n: YES/NO | confidence | reason" reply.
 *
 * Fix: a reported confidence of 0 (e.g. "0.0") used to be replaced by 0.5
 * because `parseFloat(x) || 0.5` treats 0 as falsy; the 0.5 fallback now
 * applies only when the value is not a finite number.
 *
 * @param {Array<{text: string}>} posts - posts to classify (only `.text` is read)
 * @param {string} context - relevance criterion embedded in the prompt
 * @param {string} [model="mistral"] - Ollama model name
 * @param {string} [ollamaHost="http://localhost:11434"] - Ollama base URL
 * @returns {Promise<Array<{postIndex: number, isRelevant: boolean, confidence: number, reasoning: string}>>}
 *   one entry per post; if the request fails, every post is marked relevant
 *   with confidence 0.3 so downstream filtering errs on the side of keeping data
 */
async function analyzeBatch(
  posts,
  context,
  model = "mistral",
  ollamaHost = "http://localhost:11434"
) {
  logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`);
  try {
    // Posts are truncated to 400 chars to keep the prompt size bounded.
    const prompt = `You are an expert at analyzing LinkedIn posts for relevance to specific contexts.
CONTEXT TO MATCH: "${context}"
Analyze these ${posts.length} LinkedIn posts and determine if each relates to the context above.
POSTS:
${posts
  .map(
    (post, i) => `
POST ${i + 1}:
"${post.text.substring(0, 400)}${post.text.length > 400 ? "..." : ""}"
`
  )
  .join("")}
For each post, provide:
- Is it relevant to "${context}"? (YES/NO)
- Confidence level (0.0 to 1.0)
- Brief reasoning
Respond in this EXACT format for each post:
POST 1: YES/NO | 0.X | brief reason
POST 2: YES/NO | 0.X | brief reason
POST 3: YES/NO | 0.X | brief reason
Examples:
- For layoff context: "laid off 50 employees" = YES | 0.9 | mentions layoffs
- For hiring context: "we're hiring developers" = YES | 0.8 | job posting
- Unrelated content = NO | 0.1 | not relevant to context`;
    const response = await fetch(`${ollamaHost}/api/generate`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model: model,
        prompt: prompt,
        stream: false,
        options: {
          temperature: 0.3, // low temperature keeps the reply format stable
          top_p: 0.9,
        },
      }),
    });
    if (!response.ok) {
      throw new Error(
        `Ollama API error: ${response.status} ${response.statusText}`
      );
    }
    const data = await response.json();
    const aiResponse = data.response.trim();
    // Parse the response line by line.
    const analyses = [];
    const lines = aiResponse.split("\n").filter((line) => line.trim());
    for (let i = 0; i < posts.length; i++) {
      let analysis = {
        postIndex: i + 1,
        isRelevant: false,
        confidence: 0.5,
        reasoning: "Could not parse AI response",
      };
      // Look for lines that match the "POST X:" pattern.
      const postPattern = new RegExp(`POST\\s*${i + 1}:?\\s*(.+)`, "i");
      for (const line of lines) {
        const match = line.match(postPattern);
        if (match) {
          const content = match[1].trim();
          // Parse: YES/NO | 0.X | reasoning
          const parts = content.split("|").map((p) => p.trim());
          if (parts.length >= 3) {
            analysis.isRelevant = parts[0].toUpperCase().includes("YES");
            // Clamp to [0, 1]; fall back to 0.5 only when the value is not
            // a finite number (0 is a legitimate confidence).
            const rawConfidence = Number.parseFloat(parts[1]);
            analysis.confidence = Number.isFinite(rawConfidence)
              ? Math.max(0, Math.min(1, rawConfidence))
              : 0.5;
            analysis.reasoning = parts[2] || "No reasoning provided";
          } else {
            // Fallback parsing when the pipe-delimited format is missing.
            analysis.isRelevant =
              content.toUpperCase().includes("YES") ||
              content.toLowerCase().includes("relevant");
            analysis.confidence = 0.6;
            analysis.reasoning = content.substring(0, 100);
          }
          break;
        }
      }
      analyses.push(analysis);
    }
    // If we didn't get enough analyses, fill in defaults.
    while (analyses.length < posts.length) {
      analyses.push({
        postIndex: analyses.length + 1,
        isRelevant: false,
        confidence: 0.3,
        reasoning: "AI response parsing failed",
      });
    }
    return analyses;
  } catch (error) {
    logger.error(`Error in batch AI analysis: ${error.message}`);
    // Fallback: mark all as relevant with low confidence
    return posts.map((_, i) => ({
      postIndex: i + 1,
      isRelevant: true,
      confidence: 0.3,
      reasoning: `Analysis failed: ${error.message}`,
    }));
  }
}
/**
 * Analyze a single post using local Ollama (fallback path for batch failures).
 *
 * Fix: same confidence-parsing bug as analyzeBatch — a parsed value of 0 was
 * replaced by 0.5 via `parseFloat(x) || 0.5`; the fallback now applies only
 * when the value is not a finite number.
 *
 * @param {string} text - post text to classify
 * @param {string} context - relevance criterion embedded in the prompt
 * @param {string} [model="mistral"] - Ollama model name
 * @param {string} [ollamaHost="http://localhost:11434"] - Ollama base URL
 * @returns {Promise<{isRelevant: boolean, confidence: number, reasoning: string}>}
 *   on request failure the post is kept (isRelevant: true) with confidence 0.3
 */
async function analyzeSinglePost(
  text,
  context,
  model = "mistral",
  ollamaHost = "http://localhost:11434"
) {
  const prompt = `Analyze this LinkedIn post for relevance to: "${context}"
Post: "${text}"
Is this post relevant to "${context}"? Provide:
1. YES or NO
2. Confidence (0.0 to 1.0)
3. Brief reason
Format: YES/NO | 0.X | reason`;
  try {
    const response = await fetch(`${ollamaHost}/api/generate`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model: model,
        prompt: prompt,
        stream: false,
        options: {
          temperature: 0.3, // low temperature keeps the reply format stable
        },
      }),
    });
    if (!response.ok) {
      throw new Error(`Ollama API error: ${response.status}`);
    }
    const data = await response.json();
    const aiResponse = data.response.trim();
    // Parse response: "YES/NO | confidence | reason"
    const parts = aiResponse.split("|").map((p) => p.trim());
    if (parts.length >= 3) {
      // 0 is a legitimate confidence; only fall back when unparseable.
      const rawConfidence = Number.parseFloat(parts[1]);
      return {
        isRelevant: parts[0].toUpperCase().includes("YES"),
        confidence: Number.isFinite(rawConfidence)
          ? Math.max(0, Math.min(1, rawConfidence))
          : 0.5,
        reasoning: parts[2],
      };
    } else {
      // Fallback parsing when the pipe-delimited format is missing.
      return {
        isRelevant:
          aiResponse.toLowerCase().includes("yes") ||
          aiResponse.toLowerCase().includes("relevant"),
        confidence: 0.6,
        reasoning: aiResponse.substring(0, 100),
      };
    }
  } catch (error) {
    return {
      isRelevant: true, // Default to include on error
      confidence: 0.3,
      reasoning: `Analysis failed: ${error.message}`,
    };
  }
}
/**
 * Locate the newest raw results JSON file in a directory.
 *
 * Relies on the timestamped file names sorting lexicographically, so the
 * last name in ascending order is the most recent run. AI-annotated outputs
 * (names containing "-ai-") are skipped.
 *
 * @param {string} [resultsDir="results"] - directory to scan
 * @returns {string} path to the newest matching file
 * @throws {Error} when the directory or a matching file does not exist
 */
function findLatestResultsFile(resultsDir = "results") {
  const fs = require("fs");
  const path = require("path");
  if (!fs.existsSync(resultsDir)) {
    throw new Error("Results directory not found. Run the scraper first.");
  }
  // Raw scraper output: results-*.json or linkedin-results-*.json,
  // excluding AI-annotated derivatives.
  const isRawResults = (name) =>
    (name.startsWith("results-") || name.startsWith("linkedin-results-")) &&
    name.endsWith(".json") &&
    !name.includes("-ai-");
  const candidates = fs.readdirSync(resultsDir).filter(isRawResults).sort();
  if (candidates.length === 0) {
    throw new Error("No results files found. Run the scraper first.");
  }
  return path.join(resultsDir, candidates[candidates.length - 1]);
}
// Public API of the ai-utils module.
module.exports = {
  checkOllamaStatus,
  analyzeBatch,
  analyzeSinglePost,
  findLatestResultsFile,
};

File diff suppressed because it is too large Load Diff

View File

@ -1,123 +1,123 @@
const chalk = require("chalk"); const chalk = require("chalk");
/**
 * Configurable logger with color support and level controls.
 * Can enable/disable different log levels: debug, info, warning, error, success.
 */
class Logger {
  /**
   * @param {Object} [options]
   * @param {boolean} [options.debug=true]   - enable debug output
   * @param {boolean} [options.info=true]    - enable info output
   * @param {boolean} [options.warning=true] - enable warning output
   * @param {boolean} [options.error=true]   - enable error output
   * @param {boolean} [options.success=true] - enable success output
   * @param {boolean} [options.colors=true]  - colorize output via chalk
   */
  constructor(options = {}) {
    // Each level defaults to on; only an explicit `false` disables it.
    this.levels = {};
    for (const level of ["debug", "info", "warning", "error", "success"]) {
      this.levels[level] = options[level] !== false;
    }
    this.colors = options.colors !== false;
  }

  /**
   * Build the final log line: timestamp, upper-cased level tag, message.
   * Applies a chalk color per level when colors are enabled.
   */
  _formatMessage(level, message, prefix = "") {
    const timestamp = new Date().toLocaleTimeString();
    const line = `[${timestamp}] [${level.toUpperCase()}] ${prefix}${message}`;
    if (!this.colors) {
      return line;
    }
    const paint = {
      debug: chalk.gray,
      info: chalk.blue,
      warning: chalk.yellow,
      error: chalk.red,
      success: chalk.green,
    }[level];
    // Unknown levels fall through uncolored, matching the old default case.
    return paint ? paint(line) : line;
  }

  debug(message) {
    if (this.levels.debug) {
      console.log(this._formatMessage("debug", message));
    }
  }

  info(message) {
    if (this.levels.info) {
      console.log(this._formatMessage("info", message));
    }
  }

  warning(message) {
    if (this.levels.warning) {
      console.warn(this._formatMessage("warning", message));
    }
  }

  error(message) {
    if (this.levels.error) {
      console.error(this._formatMessage("error", message));
    }
  }

  success(message) {
    if (this.levels.success) {
      console.log(this._formatMessage("success", message));
    }
  }

  // Convenience methods with emoji prefixes for better UX
  step(message) {
    this.info(`🚀 ${message}`);
  }

  search(message) {
    this.info(`🔍 ${message}`);
  }

  ai(message) {
    this.info(`🧠 ${message}`);
  }

  location(message) {
    this.info(`📍 ${message}`);
  }

  file(message) {
    this.info(`📄 ${message}`);
  }

  /** Toggle a single level at runtime; unknown level names are ignored. */
  setLevel(level, enabled) {
    if (Object.hasOwn(this.levels, level)) {
      this.levels[level] = enabled;
    }
  }

  /** Disable all logging. */
  silent() {
    this._setAll(false);
  }

  /** Enable all logging. */
  verbose() {
    this._setAll(true);
  }

  // Shared toggle used by silent()/verbose().
  _setAll(enabled) {
    for (const level of Object.keys(this.levels)) {
      this.levels[level] = enabled;
    }
  }
}
// Create default logger instance
// Shared singleton: every module that requires this file sees the same
// instance, so runtime level changes apply process-wide.
const logger = new Logger();
// Export both the class and default instance
module.exports = {
  Logger,
  logger,
};

View File

@ -1,124 +1,124 @@
/** /**
* Shared test utilities for parsers * Shared test utilities for parsers
* Common mocks, helpers, and test data * Common mocks, helpers, and test data
*/ */
/**
 * Mock Playwright page object for testing.
 * Every method is a jest mock resolving to a neutral value of the shape the
 * real API returns (array for $$, null for $, empty string for textContent).
 * @returns {Object} stand-in for a Playwright `Page`
 */
function createMockPage() {
  const resolved = (value) => jest.fn().mockResolvedValue(value);
  return {
    goto: resolved(undefined),
    waitForSelector: resolved(undefined),
    $$: resolved([]),
    $: resolved(null),
    textContent: resolved(""),
    close: resolved(undefined),
  };
}
/**
 * Mock fetch for AI API calls.
 * @param {Object} [response={}] - value resolved by `json()`; its own keys
 *   also override the defaults on the Response-like object (e.g. `ok`, `status`).
 * @returns {Function} jest mock resolving to a Response-like object
 */
function createMockFetch(response = {}) {
  const fakeResponse = {
    ok: true,
    status: 200,
    json: jest.fn().mockResolvedValue(response),
    ...response,
  };
  return jest.fn().mockResolvedValue(fakeResponse);
}
/**
 * Sample test data for posts
 * Two representative scraped posts — one layoff-related, one hiring-related —
 * each carrying the text/keyword/profileLink fields the parsers emit.
 */
const samplePosts = [
  {
    text: "We are laying off 100 employees due to economic downturn.",
    keyword: "layoff",
    profileLink: "https://linkedin.com/in/test-user-1",
  },
  {
    text: "Exciting opportunity! We are hiring senior developers for our team.",
    keyword: "hiring",
    profileLink: "https://linkedin.com/in/test-user-2",
  },
];
/**
 * Sample location test data
 * Canadian city/province strings in the mixed formats (full province name,
 * abbreviation, with/without country) that the scraper encounters.
 */
const sampleLocations = [
  "Toronto, Ontario, Canada",
  "Vancouver, BC",
  "Calgary, Alberta",
  "Montreal, Quebec",
  "Halifax, Nova Scotia",
];
/**
 * Common test assertions
 */
/**
 * Assert that a scraped post object carries the required fields and a
 * non-empty string body. Uses jest's global `expect`.
 * @param {Object} post - candidate post object
 */
function expectValidPost(post) {
  for (const field of ["text", "keyword", "profileLink"]) {
    expect(post).toHaveProperty(field);
  }
  expect(typeof post.text).toBe("string");
  expect(post.text.length).toBeGreaterThan(0);
}
/**
 * Assert that an AI analysis result has the expected shape: a boolean
 * relevance flag and a confidence bounded to [0, 1]. Uses jest's `expect`.
 * @param {Object} analysis - candidate analysis object
 */
function expectValidAIAnalysis(analysis) {
  for (const field of ["isRelevant", "confidence", "reasoning"]) {
    expect(analysis).toHaveProperty(field);
  }
  expect(typeof analysis.isRelevant).toBe("boolean");
  expect(analysis.confidence).toBeGreaterThanOrEqual(0);
  expect(analysis.confidence).toBeLessThanOrEqual(1);
}
function expectValidLocation(location) { function expectValidLocation(location) {
expect(typeof location).toBe("string"); expect(typeof location).toBe("string");
expect(location.length).toBeGreaterThan(0); expect(location.length).toBeGreaterThan(0);
} }
/**
 * Test environment setup.
 * Sets the env vars the parsers read and silences console output via jest
 * spies; pair every call with teardownTestEnv() to undo both.
 */
function setupTestEnv() {
  // Mock environment variables
  Object.assign(process.env, {
    NODE_ENV: "test",
    OLLAMA_HOST: "http://localhost:11434",
    AI_CONTEXT: "test context",
  });
  // Suppress console output during tests
  for (const method of ["log", "error", "warn"]) {
    jest.spyOn(console, method).mockImplementation(() => {});
  }
}
/**
 * Clean up test environment.
 * Restores the console spies installed by setupTestEnv() and removes the
 * env vars it set.
 */
function teardownTestEnv() {
  // Restore console
  for (const method of ["log", "error", "warn"]) {
    console[method].mockRestore();
  }
  // Clear environment
  for (const key of ["NODE_ENV", "OLLAMA_HOST", "AI_CONTEXT"]) {
    delete process.env[key];
  }
}
// Public API: mocks, fixtures, shared assertions, and env setup/teardown.
module.exports = {
  createMockPage,
  createMockFetch,
  samplePosts,
  sampleLocations,
  expectValidPost,
  expectValidAIAnalysis,
  expectValidLocation,
  setupTestEnv,
  teardownTestEnv,
};

View File

@ -1,107 +1,107 @@
/** /**
* Text processing utilities for cleaning and validating content * Text processing utilities for cleaning and validating content
* Extracted from linkedout.js for reuse across parsers * Extracted from linkedout.js for reuse across parsers
*/ */
/**
 * Clean text by removing hashtags, URLs, emojis, and normalizing whitespace.
 * @param {string} text - raw post text; non-string or empty input yields ""
 * @returns {string} cleaned, single-spaced, trimmed text
 */
function cleanText(text) {
  if (typeof text !== "string" || !text) {
    return "";
  }
  // Common emoji Unicode blocks (emoticons, symbols, transport, flags).
  const emojiPattern =
    /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu;
  // Strip in the same order as before: #tags, literal "hashtag" tokens,
  // URLs, emoji — then collapse runs of whitespace.
  return text
    .replace(/#\w+/g, "")
    .replace(/\bhashtag\b/gi, "")
    .replace(/hashtag-\w+/gi, "")
    .replace(/https?:\/\/[^\s]+/g, "")
    .replace(emojiPattern, "")
    .replace(/\s+/g, " ")
    .trim();
}
/**
 * Check if text contains any of the specified keywords (case insensitive).
 * @param {string} text - text to search
 * @param {string[]} keywords - candidate keywords
 * @returns {boolean} false for missing text or a non-array keyword list
 */
function containsAnyKeyword(text, keywords) {
  if (!text || !Array.isArray(keywords)) {
    return false;
  }
  const haystack = text.toLowerCase();
  for (const keyword of keywords) {
    if (haystack.includes(keyword.toLowerCase())) {
      return true;
    }
  }
  return false;
}
/**
 * Validate if text meets basic quality criteria.
 * @param {string} text - candidate text
 * @param {number} [minLength=30] - minimum acceptable character count
 * @returns {boolean} true when text is a string of at least minLength chars
 *   containing at least one alphanumeric character
 */
function isValidText(text, minLength = 30) {
  const isNonEmptyString = typeof text === "string" && text.length > 0;
  return (
    isNonEmptyString && text.length >= minLength && /[a-zA-Z0-9]/.test(text)
  );
}
/**
 * Extract domain from URL.
 * @param {string} url - absolute URL string
 * @returns {?string} hostname, or null for non-string / unparsable input
 */
function extractDomain(url) {
  if (typeof url !== "string" || url.length === 0) {
    return null;
  }
  try {
    return new URL(url).hostname;
  } catch {
    // Not a parseable absolute URL.
    return null;
  }
}
/**
 * Normalize URL by removing query parameters and fragments.
 * @param {string} url - URL to normalize
 * @returns {string} protocol + host + path; the original string when it is
 *   not parseable as an absolute URL; "" for non-string input
 */
function normalizeUrl(url) {
  if (typeof url !== "string" || url.length === 0) {
    return "";
  }
  try {
    const { protocol, hostname, pathname } = new URL(url);
    return `${protocol}//${hostname}${pathname}`;
  } catch {
    // Leave unparseable values untouched so callers can still dedupe on them.
    return url;
  }
}
// Public API of the text-utils module.
module.exports = {
  cleanText,
  containsAnyKeyword,
  isValidText,
  extractDomain,
  normalizeUrl,
};

View File

@ -1,194 +1,194 @@
/** /**
* Test file for logger functionality * Test file for logger functionality
*/ */
const { Logger, logger } = require("../src/logger"); const { Logger, logger } = require("../src/logger");
describe("Logger", () => { describe("Logger", () => {
let consoleSpy; let consoleSpy;
let consoleWarnSpy; let consoleWarnSpy;
let consoleErrorSpy; let consoleErrorSpy;
beforeEach(() => { beforeEach(() => {
consoleSpy = jest.spyOn(console, "log").mockImplementation(); consoleSpy = jest.spyOn(console, "log").mockImplementation();
consoleWarnSpy = jest.spyOn(console, "warn").mockImplementation(); consoleWarnSpy = jest.spyOn(console, "warn").mockImplementation();
consoleErrorSpy = jest.spyOn(console, "error").mockImplementation(); consoleErrorSpy = jest.spyOn(console, "error").mockImplementation();
}); });
afterEach(() => { afterEach(() => {
consoleSpy.mockRestore(); consoleSpy.mockRestore();
consoleWarnSpy.mockRestore(); consoleWarnSpy.mockRestore();
consoleErrorSpy.mockRestore(); consoleErrorSpy.mockRestore();
}); });
test("should create default logger instance", () => { test("should create default logger instance", () => {
expect(logger).toBeDefined(); expect(logger).toBeDefined();
expect(logger).toBeInstanceOf(Logger); expect(logger).toBeInstanceOf(Logger);
}); });
test("should log info messages", () => { test("should log info messages", () => {
logger.info("Test message"); logger.info("Test message");
expect(consoleSpy).toHaveBeenCalled(); expect(consoleSpy).toHaveBeenCalled();
}); });
test("should create custom logger with disabled levels", () => { test("should create custom logger with disabled levels", () => {
const customLogger = new Logger({ debug: false }); const customLogger = new Logger({ debug: false });
customLogger.debug("This should not log"); customLogger.debug("This should not log");
expect(consoleSpy).not.toHaveBeenCalled(); expect(consoleSpy).not.toHaveBeenCalled();
}); });
test("should use emoji prefixes for convenience methods", () => { test("should use emoji prefixes for convenience methods", () => {
logger.step("Test step"); logger.step("Test step");
logger.ai("Test AI"); logger.ai("Test AI");
logger.location("Test location"); logger.location("Test location");
expect(consoleSpy).toHaveBeenCalledTimes(3); expect(consoleSpy).toHaveBeenCalledTimes(3);
}); });
test("should configure levels at runtime", () => { test("should configure levels at runtime", () => {
const customLogger = new Logger(); const customLogger = new Logger();
customLogger.setLevel("debug", false); customLogger.setLevel("debug", false);
customLogger.debug("This should not log"); customLogger.debug("This should not log");
expect(consoleSpy).not.toHaveBeenCalled(); expect(consoleSpy).not.toHaveBeenCalled();
}); });
test("should go silent when requested", () => { test("should go silent when requested", () => {
const customLogger = new Logger(); const customLogger = new Logger();
customLogger.silent(); customLogger.silent();
customLogger.info("This should not log"); customLogger.info("This should not log");
customLogger.error("This should not log"); customLogger.error("This should not log");
expect(consoleSpy).not.toHaveBeenCalled(); expect(consoleSpy).not.toHaveBeenCalled();
expect(consoleErrorSpy).not.toHaveBeenCalled(); expect(consoleErrorSpy).not.toHaveBeenCalled();
}); });
// Additional test cases for comprehensive coverage // Additional test cases for comprehensive coverage
test("should log warning messages", () => { test("should log warning messages", () => {
logger.warning("Test warning"); logger.warning("Test warning");
expect(consoleWarnSpy).toHaveBeenCalled(); expect(consoleWarnSpy).toHaveBeenCalled();
}); });
test("should log error messages", () => { test("should log error messages", () => {
logger.error("Test error"); logger.error("Test error");
expect(consoleErrorSpy).toHaveBeenCalled(); expect(consoleErrorSpy).toHaveBeenCalled();
}); });
test("should log success messages", () => { test("should log success messages", () => {
logger.success("Test success"); logger.success("Test success");
expect(consoleSpy).toHaveBeenCalled(); expect(consoleSpy).toHaveBeenCalled();
}); });
test("should log debug messages", () => { test("should log debug messages", () => {
logger.debug("Test debug"); logger.debug("Test debug");
expect(consoleSpy).toHaveBeenCalled(); expect(consoleSpy).toHaveBeenCalled();
}); });
test("should respect disabled warning level", () => { test("should respect disabled warning level", () => {
const customLogger = new Logger({ warning: false }); const customLogger = new Logger({ warning: false });
customLogger.warning("This should not log"); customLogger.warning("This should not log");
expect(consoleWarnSpy).not.toHaveBeenCalled(); expect(consoleWarnSpy).not.toHaveBeenCalled();
}); });
test("should respect disabled error level", () => { test("should respect disabled error level", () => {
const customLogger = new Logger({ error: false }); const customLogger = new Logger({ error: false });
customLogger.error("This should not log"); customLogger.error("This should not log");
expect(consoleErrorSpy).not.toHaveBeenCalled(); expect(consoleErrorSpy).not.toHaveBeenCalled();
}); });
test("should respect disabled success level", () => { test("should respect disabled success level", () => {
const customLogger = new Logger({ success: false }); const customLogger = new Logger({ success: false });
customLogger.success("This should not log"); customLogger.success("This should not log");
expect(consoleSpy).not.toHaveBeenCalled(); expect(consoleSpy).not.toHaveBeenCalled();
}); });
test("should respect disabled info level", () => { test("should respect disabled info level", () => {
const customLogger = new Logger({ info: false }); const customLogger = new Logger({ info: false });
customLogger.info("This should not log"); customLogger.info("This should not log");
expect(consoleSpy).not.toHaveBeenCalled(); expect(consoleSpy).not.toHaveBeenCalled();
}); });
test("should test all convenience methods", () => { test("should test all convenience methods", () => {
logger.step("Test step"); logger.step("Test step");
logger.search("Test search"); logger.search("Test search");
logger.ai("Test AI"); logger.ai("Test AI");
logger.location("Test location"); logger.location("Test location");
logger.file("Test file"); logger.file("Test file");
expect(consoleSpy).toHaveBeenCalledTimes(5); expect(consoleSpy).toHaveBeenCalledTimes(5);
}); });
test("should enable all levels with verbose method", () => { test("should enable all levels with verbose method", () => {
const customLogger = new Logger({ debug: false, info: false }); const customLogger = new Logger({ debug: false, info: false });
customLogger.verbose(); customLogger.verbose();
customLogger.debug("This should log"); customLogger.debug("This should log");
customLogger.info("This should log"); customLogger.info("This should log");
expect(consoleSpy).toHaveBeenCalledTimes(2); expect(consoleSpy).toHaveBeenCalledTimes(2);
}); });
test("should handle setLevel with invalid level gracefully", () => { test("should handle setLevel with invalid level gracefully", () => {
const customLogger = new Logger(); const customLogger = new Logger();
expect(() => { expect(() => {
customLogger.setLevel("invalid", false); customLogger.setLevel("invalid", false);
}).not.toThrow(); }).not.toThrow();
}); });
test("should format messages with timestamps", () => { test("should format messages with timestamps", () => {
logger.info("Test message"); logger.info("Test message");
const loggedMessage = consoleSpy.mock.calls[0][0]; const loggedMessage = consoleSpy.mock.calls[0][0];
expect(loggedMessage).toMatch(/\[\d{1,2}:\d{2}:\d{2}\]/); expect(loggedMessage).toMatch(/\[\d{1,2}:\d{2}:\d{2}\]/);
}); });
test("should include level in formatted messages", () => { test("should include level in formatted messages", () => {
logger.info("Test message"); logger.info("Test message");
const loggedMessage = consoleSpy.mock.calls[0][0]; const loggedMessage = consoleSpy.mock.calls[0][0];
expect(loggedMessage).toContain("[INFO]"); expect(loggedMessage).toContain("[INFO]");
}); });
test("should disable colors when colors option is false", () => { test("should disable colors when colors option is false", () => {
const customLogger = new Logger({ colors: false }); const customLogger = new Logger({ colors: false });
customLogger.info("Test message"); customLogger.info("Test message");
const loggedMessage = consoleSpy.mock.calls[0][0]; const loggedMessage = consoleSpy.mock.calls[0][0];
// Should not contain ANSI color codes // Should not contain ANSI color codes
expect(loggedMessage).not.toMatch(/\u001b\[/); expect(loggedMessage).not.toMatch(/\u001b\[/);
}); });
test("should enable colors by default", () => { test("should enable colors by default", () => {
logger.info("Test message"); logger.info("Test message");
const loggedMessage = consoleSpy.mock.calls[0][0]; const loggedMessage = consoleSpy.mock.calls[0][0];
// Should contain ANSI color codes // Should contain ANSI color codes
expect(loggedMessage).toMatch(/\u001b\[/); expect(loggedMessage).toMatch(/\u001b\[/);
}); });
test("should handle multiple level configurations", () => { test("should handle multiple level configurations", () => {
const customLogger = new Logger({ const customLogger = new Logger({
debug: false, debug: false,
info: true, info: true,
warning: false, warning: false,
error: true, error: true,
success: false, success: false,
}); });
customLogger.debug("Should not log"); customLogger.debug("Should not log");
customLogger.info("Should log"); customLogger.info("Should log");
customLogger.warning("Should not log"); customLogger.warning("Should not log");
customLogger.error("Should log"); customLogger.error("Should log");
customLogger.success("Should not log"); customLogger.success("Should not log");
expect(consoleSpy).toHaveBeenCalledTimes(1); expect(consoleSpy).toHaveBeenCalledTimes(1);
expect(consoleErrorSpy).toHaveBeenCalledTimes(1); expect(consoleErrorSpy).toHaveBeenCalledTimes(1);
expect(consoleWarnSpy).not.toHaveBeenCalled(); expect(consoleWarnSpy).not.toHaveBeenCalled();
}); });
test("should handle empty or undefined messages", () => { test("should handle empty or undefined messages", () => {
expect(() => { expect(() => {
logger.info(""); logger.info("");
logger.info(undefined); logger.info(undefined);
logger.info(null); logger.info(null);
}).not.toThrow(); }).not.toThrow();
}); });
test("should handle complex message objects", () => { test("should handle complex message objects", () => {
const testObj = { key: "value", nested: { data: "test" } }; const testObj = { key: "value", nested: { data: "test" } };
expect(() => { expect(() => {
logger.info(testObj); logger.info(testObj);
}).not.toThrow(); }).not.toThrow();
}); });
}); });

View File

@ -1,94 +1,94 @@
/**
 * Authentication Manager
 *
 * Handles login/authentication for different sites, driving pages owned by
 * a shared CoreParser instance. Each supported site gets its own strategy
 * method; unsupported sites raise an error.
 */
class AuthManager {
  /**
   * @param {Object} coreParser - CoreParser instance providing getPage()
   *   and navigateTo(); this class does not create pages itself.
   */
  constructor(coreParser) {
    this.coreParser = coreParser;
  }
  /**
   * Authenticate to a specific site.
   *
   * @param {string} site - Site key, matched case-insensitively (e.g. "linkedin").
   * @param {Object} credentials - Site-specific credentials object.
   * @param {string} [pageId="default"] - ID of the page to authenticate on.
   * @returns {Promise<boolean>} Resolves true on success.
   * @throws {Error} If no strategy is registered for the site.
   */
  async authenticate(site, credentials, pageId = "default") {
    // Dispatch table: site key -> bound strategy method.
    const strategies = {
      linkedin: this.authenticateLinkedIn.bind(this),
      // Add more auth strategies as needed
    };
    const strategy = strategies[site.toLowerCase()];
    if (!strategy) {
      throw new Error(`No authentication strategy found for site: ${site}`);
    }
    return await strategy(credentials, pageId);
  }
  /**
   * LinkedIn authentication strategy
   *
   * Fills the login form and waits for the nav profile photo as the
   * logged-in signal.
   *
   * @param {{username: string, password: string}} credentials
   * @param {string} [pageId="default"] - ID of an already-created page.
   * @returns {Promise<boolean>} Resolves true on success.
   * @throws {Error} On missing credentials, unknown pageId, or login failure.
   */
  async authenticateLinkedIn(credentials, pageId = "default") {
    const { username, password } = credentials;
    if (!username || !password) {
      throw new Error("LinkedIn authentication requires username and password");
    }
    const page = this.coreParser.getPage(pageId);
    if (!page) {
      throw new Error(`Page with ID '${pageId}' not found`);
    }
    try {
      // Navigate to LinkedIn login
      await this.coreParser.navigateTo("https://www.linkedin.com/login", {
        pageId,
      });
      // Fill credentials
      await page.fill('input[name="session_key"]', username);
      await page.fill('input[name="session_password"]', password);
      // Submit form
      await page.click('button[type="submit"]');
      // Wait for successful login (profile image appears)
      await page.waitForSelector("img.global-nav__me-photo", {
        timeout: 15000,
      });
      return true;
    } catch (error) {
      // Wraps any navigation/selector failure; original stack is not preserved.
      throw new Error(`LinkedIn authentication failed: ${error.message}`);
    }
  }
  /**
   * Check if currently authenticated to a site.
   *
   * Non-throwing: returns false for unknown pages, unknown sites, or when
   * the logged-in marker does not appear within the short timeout.
   *
   * @param {string} site - Site key, matched case-insensitively.
   * @param {string} [pageId="default"] - ID of the page to inspect.
   * @returns {Promise<boolean>}
   */
  async isAuthenticated(site, pageId = "default") {
    const page = this.coreParser.getPage(pageId);
    if (!page) {
      return false;
    }
    // Per-site probes; each resolves true/false and never throws.
    const checkers = {
      linkedin: async () => {
        try {
          // Short timeout: this is a quick probe, not a login wait.
          await page.waitForSelector("img.global-nav__me-photo", {
            timeout: 2000,
          });
          return true;
        } catch {
          return false;
        }
      },
    };
    const checker = checkers[site.toLowerCase()];
    return checker ? await checker() : false;
  }
}
module.exports = AuthManager;

63
core-parser/index.js Normal file
View File

@ -0,0 +1,63 @@
const playwright = require('playwright');
const AuthManager = require('./auth-manager');
const NavigationManager = require('./navigation');
/**
 * Core browser automation engine shared by all parsers.
 *
 * Owns a single Playwright Chromium browser/context, a registry of named
 * pages, and delegates authentication and navigation to dedicated managers.
 */
class CoreParser {
  /**
   * @param {Object} [config] - Parser configuration.
   * @param {boolean} [config.headless=true] - Launch the browser headless.
   * @param {number} [config.timeout=60000] - Default navigation timeout (ms).
   */
  constructor(config = {}) {
    this.config = {
      headless: true,
      timeout: 60000, // Increased default timeout
      ...config,
    };
    this.browser = null;
    this.context = null;
    this.pages = {};
    this.authManager = new AuthManager(this);
    this.navigationManager = new NavigationManager(this);
  }

  /** Launch Chromium and create a fresh browsing context. */
  async init() {
    this.browser = await playwright.chromium.launch({
      headless: this.config.headless,
    });
    this.context = await this.browser.newContext();
  }

  /**
   * Create a new page registered under `id`, lazily initializing the
   * browser on first use. Re-using an id overwrites the previous entry.
   * @param {string} id - Page identifier for later getPage() lookups.
   * @returns {Promise<import('playwright').Page>}
   */
  async createPage(id) {
    if (!this.browser) await this.init();
    const page = await this.context.newPage();
    this.pages[id] = page;
    return page;
  }

  /**
   * Look up a previously created page.
   * @param {string} id
   * @returns {import('playwright').Page|undefined}
   */
  getPage(id) {
    return this.pages[id];
  }

  /** Delegate to AuthManager's per-site authentication strategy. */
  async authenticate(site, credentials, pageId) {
    return this.authManager.authenticate(site, credentials, pageId);
  }

  /**
   * Navigate a page to `url` with this parser's defaults applied.
   *
   * Fix: the previous version destructured the defaults (waitUntil
   * "networkidle", the configured timeout, retries, retryDelay) but then
   * forwarded the raw `options` object, so those defaults were dead code and
   * NavigationManager's own defaults (e.g. waitUntil "domcontentloaded")
   * silently won. We now forward the resolved values; explicit caller
   * options still take precedence over the defaults.
   *
   * @param {string} url - Destination URL.
   * @param {Object} [options] - Navigation options (see NavigationManager).
   * @returns {Promise<boolean>} Resolves true on successful navigation.
   */
  async navigateTo(url, options = {}) {
    const {
      pageId = "default",
      waitUntil = "networkidle", // Changed default to networkidle
      retries = 1,
      retryDelay = 2000,
      timeout = this.config.timeout,
    } = options;
    return this.navigationManager.navigateTo(url, {
      ...options,
      pageId,
      waitUntil,
      retries,
      retryDelay,
      timeout,
    });
  }

  /** Close the browser and reset all state. Safe to call repeatedly. */
  async cleanup() {
    if (this.browser) {
      await this.browser.close();
      this.browser = null;
      this.context = null;
      this.pages = {};
    }
  }
}
module.exports = CoreParser;

View File

@ -1,131 +1,131 @@
/**
 * Navigation Manager
 *
 * Handles page navigation with error handling, retries, and logging.
 * Operates on pages owned by a CoreParser instance.
 */
class NavigationManager {
  /**
   * @param {Object} coreParser - CoreParser providing getPage() and
   *   config.timeout used as the default navigation timeout.
   */
  constructor(coreParser) {
    this.coreParser = coreParser;
  }
  /**
   * Navigate to URL with comprehensive error handling.
   *
   * @param {string} url - Destination URL.
   * @param {Object} [options]
   * @param {string} [options.pageId="default"] - Page to navigate.
   * @param {string} [options.waitUntil="domcontentloaded"] - Playwright
   *   goto() lifecycle event to wait for.
   * @param {number} [options.retries=1] - Additional attempts after the first.
   * @param {number} [options.retryDelay=2000] - Delay between attempts (ms).
   * @param {number} [options.timeout] - Per-attempt timeout (ms); defaults
   *   to the CoreParser's configured timeout.
   * @returns {Promise<boolean>} Resolves true on success.
   * @throws {Error} If the page is unknown or every attempt fails.
   */
  async navigateTo(url, options = {}) {
    const {
      pageId = "default",
      waitUntil = "domcontentloaded",
      retries = 1,
      retryDelay = 2000,
      timeout = this.coreParser.config.timeout,
    } = options;
    const page = this.coreParser.getPage(pageId);
    if (!page) {
      throw new Error(`Page with ID '${pageId}' not found`);
    }
    let lastError;
    // retries + 1 total attempts (the first try plus `retries` retries).
    for (let attempt = 0; attempt <= retries; attempt++) {
      try {
        console.log(
          `🌐 Navigating to: ${url} (attempt ${attempt + 1}/${retries + 1})`
        );
        await page.goto(url, {
          waitUntil,
          timeout,
        });
        console.log(`✅ Navigation successful: ${url}`);
        return true;
      } catch (error) {
        lastError = error;
        console.warn(
          `⚠️ Navigation attempt ${attempt + 1} failed: ${error.message}`
        );
        if (attempt < retries) {
          console.log(`🔄 Retrying in ${retryDelay}ms...`);
          await this.delay(retryDelay);
        }
      }
    }
    // All attempts failed
    const errorMessage = `Navigation failed after ${retries + 1} attempts: ${
      lastError.message
    }`;
    console.error(`${errorMessage}`);
    throw new Error(errorMessage);
  }
  /**
   * Navigate and wait for specific selector.
   *
   * Navigation errors propagate from navigateTo(); a missing selector is
   * reported via the return value rather than a throw.
   *
   * @returns {Promise<boolean>} true if the selector appeared in time.
   */
  async navigateAndWaitFor(url, selector, options = {}) {
    await this.navigateTo(url, options);
    const { pageId = "default", timeout = this.coreParser.config.timeout } =
      options;
    const page = this.coreParser.getPage(pageId);
    try {
      await page.waitForSelector(selector, { timeout });
      console.log(`✅ Selector found: ${selector}`);
      return true;
    } catch (error) {
      console.warn(`⚠️ Selector not found: ${selector} - ${error.message}`);
      return false;
    }
  }
  /**
   * Check if current page has specific content.
   *
   * Polls document.body.innerText in the page context; never throws —
   * returns false on timeout or evaluation failure.
   *
   * @param {string} content - Substring to look for in the page text.
   * @param {Object} [options] - { pageId = "default", timeout = 5000 }.
   * @returns {Promise<boolean>}
   */
  async hasContent(content, options = {}) {
    const { pageId = "default", timeout = 5000 } = options;
    const page = this.coreParser.getPage(pageId);
    try {
      await page.waitForFunction(
        (text) => document.body.innerText.includes(text),
        content,
        { timeout }
      );
      return true;
    } catch {
      return false;
    }
  }
  /**
   * Utility delay function.
   * @param {number} ms - Milliseconds to sleep.
   */
  async delay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
  /**
   * Get current page URL.
   * @returns {string|null} null if the page id is unknown.
   */
  getCurrentUrl(pageId = "default") {
    const page = this.coreParser.getPage(pageId);
    return page ? page.url() : null;
  }
  /**
   * Take screenshot for debugging. Silently does nothing for an unknown
   * page id.
   * @param {string} filepath - Destination path for the PNG.
   */
  async screenshot(filepath, pageId = "default") {
    const page = this.coreParser.getPage(pageId);
    if (page) {
      await page.screenshot({ path: filepath });
      console.log(`📸 Screenshot saved: ${filepath}`);
    }
  }
}
module.exports = NavigationManager;

View File

@ -1,27 +1,7 @@
{ {
"name": "core-parser", "name": "core-parser",
"version": "1.0.0", "version": "1.0.0",
"description": "Core browser automation and parsing engine for all parsers", "main": "index.js",
"main": "index.js", "description": "Core parser utilities for browser management",
"scripts": { "dependencies": {}
"test": "jest", }
"install:browsers": "npx playwright install chromium"
},
"keywords": [
"parser",
"playwright",
"browser",
"automation",
"core"
],
"author": "Job Market Intelligence Team",
"license": "ISC",
"type": "commonjs",
"dependencies": {
"playwright": "^1.53.2",
"dotenv": "^17.0.0"
},
"devDependencies": {
"jest": "^29.7.0"
}
}

View File

@ -1,497 +1,497 @@
# Job Search Parser - Job Market Intelligence # Job Search Parser - Job Market Intelligence
Specialized parser for job market intelligence, tracking job postings, market trends, and competitive analysis. Focuses on tech roles and industry insights. Specialized parser for job market intelligence, tracking job postings, market trends, and competitive analysis. Focuses on tech roles and industry insights.
## 🎯 Purpose ## 🎯 Purpose
The Job Search Parser is designed to: The Job Search Parser is designed to:
- **Track Job Market Trends**: Monitor demand for specific roles and skills - **Track Job Market Trends**: Monitor demand for specific roles and skills
- **Competitive Intelligence**: Analyze salary ranges and requirements - **Competitive Intelligence**: Analyze salary ranges and requirements
- **Industry Insights**: Track hiring patterns across different sectors - **Industry Insights**: Track hiring patterns across different sectors
- **Skill Gap Analysis**: Identify in-demand technologies and frameworks - **Skill Gap Analysis**: Identify in-demand technologies and frameworks
- **Market Demand Forecasting**: Predict job market trends - **Market Demand Forecasting**: Predict job market trends
## 🚀 Features ## 🚀 Features
### Core Functionality ### Core Functionality
- **Multi-Source Aggregation**: Collect job data from multiple platforms - **Multi-Source Aggregation**: Collect job data from multiple platforms
- **Role-Specific Tracking**: Focus on tech roles and emerging positions - **Role-Specific Tracking**: Focus on tech roles and emerging positions
- **Skill Analysis**: Extract and categorize required skills - **Skill Analysis**: Extract and categorize required skills
- **Salary Intelligence**: Track compensation ranges and trends - **Salary Intelligence**: Track compensation ranges and trends
- **Company Intelligence**: Monitor hiring companies and patterns - **Company Intelligence**: Monitor hiring companies and patterns
### Advanced Features ### Advanced Features
- **Market Trend Analysis**: Identify growing and declining job categories - **Market Trend Analysis**: Identify growing and declining job categories
- **Geographic Distribution**: Track job distribution by location - **Geographic Distribution**: Track job distribution by location
- **Experience Level Analysis**: Entry, mid, senior level tracking - **Experience Level Analysis**: Entry, mid, senior level tracking
- **Remote Work Trends**: Monitor remote/hybrid work patterns - **Remote Work Trends**: Monitor remote/hybrid work patterns
- **Technology Stack Tracking**: Framework and tool popularity - **Technology Stack Tracking**: Framework and tool popularity
## 🌐 Supported Job Sites ## 🌐 Supported Job Sites
### ✅ Implemented Parsers ### ✅ Implemented Parsers
#### SkipTheDrive Parser #### SkipTheDrive Parser
Remote job board specializing in work-from-home positions. Remote job board specializing in work-from-home positions.
**Features:** **Features:**
- Keyword-based job search with relevance sorting - Keyword-based job search with relevance sorting
- Job type filtering (full-time, part-time, contract) - Job type filtering (full-time, part-time, contract)
- Multi-page result parsing with pagination - Multi-page result parsing with pagination
- Featured/sponsored job identification - Featured/sponsored job identification
- AI-powered job relevance analysis - AI-powered job relevance analysis
- Automatic duplicate detection - Automatic duplicate detection
**Usage:** **Usage:**
```bash ```bash
# Parse SkipTheDrive for QA automation jobs # Parse SkipTheDrive for QA automation jobs
node index.js --sites=skipthedrive --keywords="automation qa,qa engineer" node index.js --sites=skipthedrive --keywords="automation qa,qa engineer"
# Filter by job type # Filter by job type
JOB_TYPES="full time,contract" node index.js --sites=skipthedrive JOB_TYPES="full time,contract" node index.js --sites=skipthedrive
# Run demo with limited results # Run demo with limited results
node index.js --sites=skipthedrive --demo node index.js --sites=skipthedrive --demo
``` ```
### 🚧 Planned Parsers ### 🚧 Planned Parsers
- **Indeed**: Comprehensive job aggregator - **Indeed**: Comprehensive job aggregator
- **Glassdoor**: Jobs with company reviews and salary data - **Glassdoor**: Jobs with company reviews and salary data
- **Monster**: Traditional job board - **Monster**: Traditional job board
- **SimplyHired**: Job aggregator with salary estimates - **SimplyHired**: Job aggregator with salary estimates
- **LinkedIn Jobs**: Professional network job postings - **LinkedIn Jobs**: Professional network job postings
- **AngelList**: Startup and tech jobs - **AngelList**: Startup and tech jobs
- **Remote.co**: Dedicated remote work jobs - **Remote.co**: Dedicated remote work jobs
- **FlexJobs**: Flexible and remote positions - **FlexJobs**: Flexible and remote positions
## 📦 Installation ## 📦 Installation
```bash ```bash
# Install dependencies # Install dependencies
npm install npm install
# Run tests # Run tests
npm test npm test
# Run demo # Run demo
node demo.js node demo.js
``` ```
## 🔧 Configuration ## 🔧 Configuration
### Environment Variables ### Environment Variables
Create a `.env` file in the parser directory: Create a `.env` file in the parser directory:
```env ```env
# Job Search Configuration # Job Search Configuration
SEARCH_SOURCES=linkedin,indeed,glassdoor SEARCH_SOURCES=linkedin,indeed,glassdoor
TARGET_ROLES=software engineer,data scientist,product manager TARGET_ROLES=software engineer,data scientist,product manager
LOCATION_FILTER=Toronto,Vancouver,Calgary LOCATION_FILTER=Toronto,Vancouver,Calgary
EXPERIENCE_LEVELS=entry,mid,senior EXPERIENCE_LEVELS=entry,mid,senior
REMOTE_PREFERENCE=remote,hybrid,onsite REMOTE_PREFERENCE=remote,hybrid,onsite
# Analysis Configuration # Analysis Configuration
ENABLE_SALARY_ANALYSIS=true ENABLE_SALARY_ANALYSIS=true
ENABLE_SKILL_ANALYSIS=true ENABLE_SKILL_ANALYSIS=true
ENABLE_TREND_ANALYSIS=true ENABLE_TREND_ANALYSIS=true
MIN_SALARY=50000 MIN_SALARY=50000
MAX_SALARY=200000 MAX_SALARY=200000
# Output Configuration # Output Configuration
OUTPUT_FORMAT=json,csv OUTPUT_FORMAT=json,csv
SAVE_RAW_DATA=true SAVE_RAW_DATA=true
ANALYSIS_INTERVAL=daily ANALYSIS_INTERVAL=daily
``` ```
### Command Line Options ### Command Line Options
```bash ```bash
# Basic usage # Basic usage
node index.js node index.js
# Specific roles # Specific roles
node index.js --roles="frontend developer,backend developer" node index.js --roles="frontend developer,backend developer"
# Geographic focus # Geographic focus
node index.js --locations="Toronto,Vancouver" node index.js --locations="Toronto,Vancouver"
# Experience level # Experience level
node index.js --experience="senior" node index.js --experience="senior"
# Output format # Output format
node index.js --output=results/job-market-analysis.json node index.js --output=results/job-market-analysis.json
``` ```
**Available Options:** **Available Options:**
- `--roles="role1,role2"`: Target job roles - `--roles="role1,role2"`: Target job roles
- `--locations="city1,city2"`: Geographic focus - `--locations="city1,city2"`: Geographic focus
- `--experience="entry|mid|senior"`: Experience level - `--experience="entry|mid|senior"`: Experience level
- `--remote="remote|hybrid|onsite"`: Remote work preference - `--remote="remote|hybrid|onsite"`: Remote work preference
- `--salary-min=NUMBER`: Minimum salary filter - `--salary-min=NUMBER`: Minimum salary filter
- `--salary-max=NUMBER`: Maximum salary filter - `--salary-max=NUMBER`: Maximum salary filter
- `--output=FILE`: Output filename - `--output=FILE`: Output filename
- `--format=json|csv`: Output format - `--format=json|csv`: Output format
- `--trends`: Enable trend analysis - `--trends`: Enable trend analysis
- `--skills`: Enable skill analysis - `--skills`: Enable skill analysis
## 📊 Keywords ## 📊 Keywords
### Role-Specific Keywords ### Role-Specific Keywords
Place keyword CSV files in the `keywords/` directory: Place keyword CSV files in the `keywords/` directory:
``` ```
job-search-parser/ job-search-parser/
├── keywords/ ├── keywords/
│ ├── job-search-keywords.csv # General job search terms │ ├── job-search-keywords.csv # General job search terms
│ ├── tech-roles.csv # Technology roles │ ├── tech-roles.csv # Technology roles
│ ├── data-roles.csv # Data science roles │ ├── data-roles.csv # Data science roles
│ ├── management-roles.csv # Management positions │ ├── management-roles.csv # Management positions
│ └── emerging-roles.csv # Emerging job categories │ └── emerging-roles.csv # Emerging job categories
└── index.js └── index.js
``` ```
### Tech Roles Keywords ### Tech Roles Keywords
```csv ```csv
keyword keyword
software engineer software engineer
frontend developer frontend developer
backend developer backend developer
full stack developer full stack developer
data scientist data scientist
machine learning engineer machine learning engineer
devops engineer devops engineer
site reliability engineer site reliability engineer
cloud architect cloud architect
security engineer security engineer
mobile developer mobile developer
iOS developer iOS developer
Android developer Android developer
react developer react developer
vue developer vue developer
angular developer angular developer
node.js developer node.js developer
python developer python developer
java developer java developer
golang developer golang developer
rust developer rust developer
data engineer data engineer
analytics engineer analytics engineer
``` ```
### Data Science Keywords ### Data Science Keywords
```csv ```csv
keyword keyword
data scientist data scientist
machine learning engineer machine learning engineer
data analyst data analyst
business analyst business analyst
data engineer data engineer
analytics engineer analytics engineer
ML engineer ML engineer
AI engineer AI engineer
statistician statistician
quantitative analyst quantitative analyst
research scientist research scientist
data architect data architect
BI developer BI developer
ETL developer ETL developer
``` ```
## 📈 Usage Examples ## 📈 Usage Examples
### Basic Job Search ### Basic Job Search
```bash ```bash
# Standard job market analysis # Standard job market analysis
node index.js node index.js
# Specific tech roles # Specific tech roles
node index.js --roles="software engineer,data scientist" node index.js --roles="software engineer,data scientist"
# Geographic focus # Geographic focus
node index.js --locations="Toronto,Vancouver,Calgary" node index.js --locations="Toronto,Vancouver,Calgary"
``` ```
### Advanced Analysis ### Advanced Analysis
```bash ```bash
# Senior level positions # Senior level positions
node index.js --experience="senior" --salary-min=100000 node index.js --experience="senior" --salary-min=100000
# Remote work opportunities # Remote work opportunities
node index.js --remote="remote" --roles="frontend developer" node index.js --remote="remote" --roles="frontend developer"
# Trend analysis # Trend analysis
node index.js --trends --skills --output=results/trends.json node index.js --trends --skills --output=results/trends.json
``` ```
### Market Intelligence ### Market Intelligence
```bash ```bash
# Salary analysis # Salary analysis
node index.js --salary-min=80000 --salary-max=150000 node index.js --salary-min=80000 --salary-max=150000
# Skill gap analysis # Skill gap analysis
node index.js --skills --roles="machine learning engineer" node index.js --skills --roles="machine learning engineer"
# Competitive intelligence # Competitive intelligence
node index.js --companies="Google,Microsoft,Amazon" node index.js --companies="Google,Microsoft,Amazon"
``` ```
## 📊 Output Format ## 📊 Output Format
### JSON Structure ### JSON Structure
```json ```json
{ {
"metadata": { "metadata": {
"timestamp": "2024-01-15T10:30:00Z", "timestamp": "2024-01-15T10:30:00Z",
"search_parameters": { "search_parameters": {
"roles": ["software engineer", "data scientist"], "roles": ["software engineer", "data scientist"],
"locations": ["Toronto", "Vancouver"], "locations": ["Toronto", "Vancouver"],
"experience_levels": ["mid", "senior"], "experience_levels": ["mid", "senior"],
"remote_preference": ["remote", "hybrid"] "remote_preference": ["remote", "hybrid"]
}, },
"total_jobs_found": 1250, "total_jobs_found": 1250,
"analysis_duration_seconds": 45 "analysis_duration_seconds": 45
}, },
"market_overview": { "market_overview": {
"total_jobs": 1250, "total_jobs": 1250,
"average_salary": 95000, "average_salary": 95000,
"salary_range": { "salary_range": {
"min": 65000, "min": 65000,
"max": 180000, "max": 180000,
"median": 92000 "median": 92000
}, },
"remote_distribution": { "remote_distribution": {
"remote": 45, "remote": 45,
"hybrid": 35, "hybrid": 35,
"onsite": 20 "onsite": 20
}, },
"experience_distribution": { "experience_distribution": {
"entry": 15, "entry": 15,
"mid": 45, "mid": 45,
"senior": 40 "senior": 40
} }
}, },
"trends": { "trends": {
"growing_skills": [ "growing_skills": [
{ "skill": "React", "growth_rate": 25 }, { "skill": "React", "growth_rate": 25 },
{ "skill": "Python", "growth_rate": 18 }, { "skill": "Python", "growth_rate": 18 },
{ "skill": "AWS", "growth_rate": 22 } { "skill": "AWS", "growth_rate": 22 }
], ],
"declining_skills": [ "declining_skills": [
{ "skill": "jQuery", "growth_rate": -12 }, { "skill": "jQuery", "growth_rate": -12 },
{ "skill": "PHP", "growth_rate": -8 } { "skill": "PHP", "growth_rate": -8 }
], ],
"emerging_roles": ["AI Engineer", "DevSecOps Engineer", "Data Engineer"] "emerging_roles": ["AI Engineer", "DevSecOps Engineer", "Data Engineer"]
}, },
"jobs": [ "jobs": [
{ {
"id": "job_1", "id": "job_1",
"title": "Senior Software Engineer", "title": "Senior Software Engineer",
"company": "TechCorp", "company": "TechCorp",
"location": "Toronto, Ontario", "location": "Toronto, Ontario",
"remote_type": "hybrid", "remote_type": "hybrid",
"salary": { "salary": {
"min": 100000, "min": 100000,
"max": 140000, "max": 140000,
"currency": "CAD" "currency": "CAD"
}, },
"required_skills": ["React", "Node.js", "TypeScript", "AWS"], "required_skills": ["React", "Node.js", "TypeScript", "AWS"],
"preferred_skills": ["GraphQL", "Docker", "Kubernetes"], "preferred_skills": ["GraphQL", "Docker", "Kubernetes"],
"experience_level": "senior", "experience_level": "senior",
"job_url": "https://example.com/job/1", "job_url": "https://example.com/job/1",
"posted_date": "2024-01-10T09:00:00Z", "posted_date": "2024-01-10T09:00:00Z",
"scraped_at": "2024-01-15T10:30:00Z" "scraped_at": "2024-01-15T10:30:00Z"
} }
], ],
"analysis": { "analysis": {
"skill_demand": { "skill_demand": {
"React": { "count": 45, "avg_salary": 98000 }, "React": { "count": 45, "avg_salary": 98000 },
"Python": { "count": 38, "avg_salary": 102000 }, "Python": { "count": 38, "avg_salary": 102000 },
"AWS": { "count": 32, "avg_salary": 105000 } "AWS": { "count": 32, "avg_salary": 105000 }
}, },
"company_insights": { "company_insights": {
"top_hirers": [ "top_hirers": [
{ "company": "TechCorp", "jobs": 25 }, { "company": "TechCorp", "jobs": 25 },
{ "company": "StartupXYZ", "jobs": 18 } { "company": "StartupXYZ", "jobs": 18 }
], ],
"salary_leaders": [ "salary_leaders": [
{ "company": "BigTech", "avg_salary": 120000 }, { "company": "BigTech", "avg_salary": 120000 },
{ "company": "FinTech", "avg_salary": 115000 } { "company": "FinTech", "avg_salary": 115000 }
] ]
} }
} }
} }
``` ```
### CSV Output ### CSV Output
The parser can also generate CSV files for easy analysis: The parser can also generate CSV files for easy analysis:
```csv ```csv
job_id,title,company,location,remote_type,salary_min,salary_max,required_skills,experience_level,posted_date job_id,title,company,location,remote_type,salary_min,salary_max,required_skills,experience_level,posted_date
job_1,Senior Software Engineer,TechCorp,Toronto,hybrid,100000,140000,"React,Node.js,TypeScript",senior,2024-01-10 job_1,Senior Software Engineer,TechCorp,Toronto,hybrid,100000,140000,"React,Node.js,TypeScript",senior,2024-01-10
job_2,Data Scientist,DataCorp,Vancouver,remote,90000,130000,"Python,SQL,ML",mid,2024-01-09 job_2,Data Scientist,DataCorp,Vancouver,remote,90000,130000,"Python,SQL,ML",mid,2024-01-09
``` ```
## 🔒 Security & Best Practices ## 🔒 Security & Best Practices
### Data Privacy ### Data Privacy
- Respect job site terms of service - Respect job site terms of service
- Implement appropriate rate limiting - Implement appropriate rate limiting
- Store data securely and responsibly - Store data securely and responsibly
- Anonymize sensitive information - Anonymize sensitive information
### Rate Limiting ### Rate Limiting
- Implement delays between requests - Implement delays between requests
- Respect API rate limits - Respect API rate limits
- Use multiple data sources - Use multiple data sources
- Monitor for blocking/detection - Monitor for blocking/detection
### Legal Compliance ### Legal Compliance
- Educational and research purposes only - Educational and research purposes only
- Respect website terms of service - Respect website terms of service
- Implement data retention policies - Implement data retention policies
- Monitor for legal changes - Monitor for legal changes
## 🧪 Testing ## 🧪 Testing
### Run Tests ### Run Tests
```bash ```bash
# All tests # All tests
npm test npm test
# Specific test suites # Specific test suites
npm test -- --testNamePattern="JobSearch" npm test -- --testNamePattern="JobSearch"
npm test -- --testNamePattern="Analysis" npm test -- --testNamePattern="Analysis"
npm test -- --testNamePattern="Trends" npm test -- --testNamePattern="Trends"
``` ```
### Test Coverage ### Test Coverage
```bash ```bash
npm run test:coverage npm run test:coverage
``` ```
## 🚀 Performance Optimization ## 🚀 Performance Optimization
### Recommended Settings ### Recommended Settings
#### Fast Analysis #### Fast Analysis
```bash ```bash
node index.js --roles="software engineer" --locations="Toronto" node index.js --roles="software engineer" --locations="Toronto"
``` ```
#### Comprehensive Analysis #### Comprehensive Analysis
```bash ```bash
node index.js --trends --skills --experience="all" node index.js --trends --skills --experience="all"
``` ```
#### Focused Intelligence #### Focused Intelligence
```bash ```bash
node index.js --salary-min=80000 --remote="remote" --trends node index.js --salary-min=80000 --remote="remote" --trends
``` ```
### Performance Tips ### Performance Tips
- Use specific role filters to reduce data volume - Use specific role filters to reduce data volume
- Implement caching for repeated searches - Implement caching for repeated searches
- Use parallel processing for multiple sources - Use parallel processing for multiple sources
- Optimize data storage and retrieval - Optimize data storage and retrieval
## 🔧 Troubleshooting ## 🔧 Troubleshooting
### Common Issues ### Common Issues
#### Rate Limiting #### Rate Limiting
```bash ```bash
# Reduce request frequency # Reduce request frequency
export REQUEST_DELAY=2000 export REQUEST_DELAY=2000
node index.js node index.js
``` ```
#### Data Source Issues #### Data Source Issues
```bash ```bash
# Use specific sources # Use specific sources
node index.js --sources="linkedin,indeed" node index.js --sources="linkedin,indeed"
# Check source availability # Check source availability
node index.js --test-sources node index.js --test-sources
``` ```
#### Output Issues #### Output Issues
```bash ```bash
# Check output directory # Check output directory
mkdir -p results mkdir -p results
node index.js --output=results/analysis.json node index.js --output=results/analysis.json
# Verify file permissions # Verify file permissions
chmod 755 results/ chmod 755 results/
``` ```
## 📈 Monitoring & Analytics ## 📈 Monitoring & Analytics
### Key Metrics ### Key Metrics
- **Job Volume**: Total jobs found per search - **Job Volume**: Total jobs found per search
- **Salary Trends**: Average and median salary changes - **Salary Trends**: Average and median salary changes
- **Skill Demand**: Most requested skills - **Skill Demand**: Most requested skills
- **Remote Adoption**: Remote work trend analysis - **Remote Adoption**: Remote work trend analysis
- **Market Velocity**: Job posting frequency - **Market Velocity**: Job posting frequency
### Dashboard Integration ### Dashboard Integration
- Real-time market monitoring - Real-time market monitoring
- Trend visualization - Trend visualization
- Salary benchmarking - Salary benchmarking
- Skill gap analysis - Skill gap analysis
- Competitive intelligence - Competitive intelligence
## 🤝 Contributing ## 🤝 Contributing
### Development Setup ### Development Setup
1. Fork the repository 1. Fork the repository
2. Create feature branch 2. Create feature branch
3. Add tests for new functionality 3. Add tests for new functionality
4. Ensure all tests pass 4. Ensure all tests pass
5. Submit pull request 5. Submit pull request
### Code Standards ### Code Standards
- Follow existing code style - Follow existing code style
- Add JSDoc comments - Add JSDoc comments
- Maintain test coverage - Maintain test coverage
- Update documentation - Update documentation
## 📄 License ## 📄 License
This parser is part of the LinkedOut platform and follows the same licensing terms. This parser is part of the LinkedOut platform and follows the same licensing terms.
--- ---
**Note**: This tool is designed for educational and research purposes. Always respect website terms of service and implement appropriate rate limiting and ethical usage practices. **Note**: This tool is designed for educational and research purposes. Always respect website terms of service and implement appropriate rate limiting and ethical usage practices.

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +1,9 @@
keyword
qa automation
automation test
sdet
qa lead
automation lead
playwright
cypress
quality assurance engineer

View File

@ -1,129 +1,129 @@
#!/usr/bin/env node #!/usr/bin/env node
/** /**
* SkipTheDrive Parser Demo * SkipTheDrive Parser Demo
* *
* Demonstrates the SkipTheDrive job parser functionality * Demonstrates the SkipTheDrive job parser functionality
*/ */
const { parseSkipTheDrive } = require("./skipthedrive"); const { parseSkipTheDrive } = require("./skipthedrive");
const fs = require("fs"); const fs = require("fs");
const path = require("path"); const path = require("path");
const { logger } = require("../../ai-analyzer"); const { logger } = require("../../ai-analyzer");
// Load environment variables // Load environment variables
require("dotenv").config({ path: path.join(__dirname, "..", ".env") }); require("dotenv").config({ path: path.join(__dirname, "..", ".env") });
/**
 * Run the SkipTheDrive parser demo.
 *
 * Builds parser options from environment variables (with demo defaults),
 * runs the parser, saves the raw results to a timestamped JSON file under
 * ../results, and prints a human-readable summary.
 *
 * Exits the process with code 1 if the parser throws.
 *
 * @returns {Promise<void>}
 */
async function runDemo() {
  logger.step("🚀 SkipTheDrive Parser Demo");

  // Demo configuration — every value can be overridden via .env
  const options = {
    // Search for QA automation jobs (from your example)
    keywords: process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [
      "automation qa",
      "qa engineer",
      "test automation",
    ],
    // Job type filters - can be: "part time", "full time", "contract"
    jobTypes: process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [],
    // Location filter (optional)
    locationFilter: process.env.LOCATION_FILTER || "",
    // Maximum pages to parse (explicit radix 10 avoids surprises)
    maxPages: Number.parseInt(process.env.MAX_PAGES, 10) || 3,
    // Browser headless mode
    headless: process.env.HEADLESS !== "false",
    // AI analysis
    enableAI: process.env.ENABLE_AI_ANALYSIS !== "false",
    aiContext: "remote QA and test automation job opportunities",
  };

  logConfiguration(options);

  try {
    const startTime = Date.now();
    const results = await parseSkipTheDrive(options);
    const duration = ((Date.now() - startTime) / 1000).toFixed(2);

    const resultsFile = saveResults(results);

    // Display summary
    logger.step("\n📊 Parsing Summary:");
    logger.info(`- Duration: ${duration} seconds`);
    logger.info(`- Jobs Found: ${results.results.length}`);
    logger.info(`- Jobs Rejected: ${results.rejectedResults.length}`);
    logger.file(`- Results saved to: ${resultsFile}`);

    printSampleJobs(results.results);
    printRejectionReasons(results.rejectedResults);
  } catch (error) {
    logger.error("\n❌ Demo failed:", error.message);
    process.exit(1);
  }
}

/**
 * Log the effective parser configuration.
 * @param {Object} options - Options object built in runDemo.
 */
function logConfiguration(options) {
  logger.info("Configuration:");
  logger.info(`- Keywords: ${options.keywords.join(", ")}`);
  logger.info(
    `- Job Types: ${
      options.jobTypes.length > 0 ? options.jobTypes.join(", ") : "All types"
    }`
  );
  logger.info(`- Location Filter: ${options.locationFilter || "None"}`);
  logger.info(`- Max Pages: ${options.maxPages}`);
  logger.info(`- Headless: ${options.headless}`);
  logger.info(`- AI Analysis: ${options.enableAI}`);
  logger.info("\nStarting parser...");
}

/**
 * Write parser results to a timestamped JSON file, creating the results
 * directory if needed.
 * @param {Object} results - Full parser output object.
 * @returns {string} Absolute path of the written file.
 */
function saveResults(results) {
  const timestamp = new Date()
    .toISOString()
    .replace(/[:.]/g, "-")
    .slice(0, -5);
  const resultsDir = path.join(__dirname, "..", "results");
  if (!fs.existsSync(resultsDir)) {
    fs.mkdirSync(resultsDir, { recursive: true });
  }
  const resultsFile = path.join(
    resultsDir,
    `skipthedrive-results-${timestamp}.json`
  );
  fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2));
  return resultsFile;
}

/**
 * Print up to five accepted jobs with their key fields.
 * @param {Array<Object>} jobs - Accepted job listings.
 */
function printSampleJobs(jobs) {
  if (jobs.length === 0) return;
  logger.info("\n🔍 Sample Jobs Found:");
  jobs.slice(0, 5).forEach((job, index) => {
    logger.info(`\n${index + 1}. ${job.title}`);
    logger.info(`   Company: ${job.company}`);
    logger.info(`   Posted: ${job.daysAgo}`);
    logger.info(`   Featured: ${job.isFeatured ? "Yes" : "No"}`);
    logger.info(`   URL: ${job.jobUrl}`);
    if (job.aiAnalysis) {
      logger.ai(
        `   AI Relevant: ${job.aiAnalysis.isRelevant ? "Yes" : "No"} (${(
          job.aiAnalysis.confidence * 100
        ).toFixed(0)}% confidence)`
      );
    }
  });
}

/**
 * Aggregate rejected jobs by reason and print the counts.
 * @param {Array<Object>} rejected - Rejected job listings (each has `reason`).
 */
function printRejectionReasons(rejected) {
  if (rejected.length === 0) return;
  const rejectionReasons = {};
  rejected.forEach((job) => {
    rejectionReasons[job.reason] = (rejectionReasons[job.reason] || 0) + 1;
  });
  logger.info("\n❌ Rejection Reasons:");
  Object.entries(rejectionReasons).forEach(([reason, count]) => {
    logger.info(`   ${reason}: ${count}`);
  });
}
// Run the demo // Run the demo
runDemo().catch((err) => { runDemo().catch((err) => {
logger.error("Fatal error:", err); logger.error("Fatal error:", err);
process.exit(1); process.exit(1);
}); });

View File

@ -1,332 +1,332 @@
/** /**
* SkipTheDrive Job Parser * SkipTheDrive Job Parser
* *
* Parses remote job listings from SkipTheDrive.com * Parses remote job listings from SkipTheDrive.com
* Supports keyword search, job type filters, and pagination * Supports keyword search, job type filters, and pagination
*/ */
const { chromium } = require("playwright"); const { chromium } = require("playwright");
const path = require("path"); const path = require("path");
// Import from ai-analyzer core package // Import from ai-analyzer core package
const { const {
logger, logger,
cleanText, cleanText,
containsAnyKeyword, containsAnyKeyword,
parseLocationFilters, parseLocationFilters,
validateLocationAgainstFilters, validateLocationAgainstFilters,
extractLocationFromProfile, extractLocationFromProfile,
analyzeBatch, analyzeBatch,
checkOllamaStatus, checkOllamaStatus,
} = require("../../ai-analyzer"); } = require("../../ai-analyzer");
/**
 * Build a SkipTheDrive search URL for a keyword.
 *
 * @param {string} keyword - Search keyword (URL-encoded into the `s` param).
 * @param {string} [orderBy="date"] - Sort order (e.g. "date", "relevance");
 *   falsy values omit the `orderby` parameter.
 * @param {Array<string>} [jobTypes=[]] - Job type filters, each appended as a
 *   separate `jobtype` parameter.
 * @returns {string} Fully formatted search URL.
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  const parts = [
    `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`,
  ];
  if (orderBy) {
    parts.push(`orderby=${orderBy}`);
  }
  for (const jobType of jobTypes) {
    parts.push(`jobtype=${encodeURIComponent(jobType)}`);
  }
  return parts.join("&");
}
/**
 * Extract job data from a single job listing element.
 *
 * @param {Object} article - Playwright element handle for one job `<article>`.
 * @returns {Promise<Object|null>} Normalized job record, or null when
 *   extraction throws (the error is logged, not re-thrown).
 */
async function extractJobData(article) {
  try {
    // Text content of the first match inside the article, or "" when absent.
    const textOf = async (selector) => {
      const el = await article.$(selector);
      return el ? await el.textContent() : "";
    };
    // Custom-field text starts with an icon glyph; drop it.
    const stripIcon = (text) => text.replace(/^\s*[^\s]+\s*/, "").trim();

    // Title anchor carries both the visible title and the job URL.
    const titleLink = await article.$("h2.post-title a");
    const title = titleLink ? await titleLink.textContent() : "";
    const jobUrl = titleLink ? await titleLink.getAttribute("href") : "";

    // Posting date: machine-readable datetime attribute plus display text.
    const timeEl = await article.$("time.post-date");
    const datePosted = timeEl ? await timeEl.getAttribute("datetime") : "";
    const dateText = timeEl ? await timeEl.textContent() : "";

    const company = stripIcon(
      await textOf(".custom_fields_company_name_display_search_results")
    );
    const daysAgo = stripIcon(
      await textOf(".custom_fields_job_date_display_search_results")
    );
    const description = await textOf(".excerpt_part");

    // Sponsored/featured listings carry a dedicated marker element.
    const isFeatured = Boolean(
      await article.$(".custom_fields_sponsored_job")
    );

    // Numeric job id comes from the article's DOM id ("post-12345").
    const articleId = await article.getAttribute("id");
    const jobId = articleId ? articleId.replace("post-", "") : "";

    return {
      jobId,
      title: cleanText(title),
      company: cleanText(company),
      jobUrl,
      datePosted,
      dateText: cleanText(dateText),
      daysAgo: cleanText(daysAgo),
      description: cleanText(description),
      isFeatured,
      source: "skipthedrive",
      timestamp: new Date().toISOString(),
    };
  } catch (error) {
    logger.error(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Parse SkipTheDrive job listings.
 *
 * Launches Chromium, runs one search per keyword, walks paginated results,
 * filters listings by keyword/location, and optionally annotates accepted
 * jobs with AI relevance analysis via Ollama.
 *
 * @param {Object} options - Parser options; each falls back to an env var.
 * @param {Array<string>} [options.keywords] - Search keywords (SEARCH_KEYWORDS).
 * @param {Array<string>} [options.jobTypes] - Job type filters (JOB_TYPES).
 * @param {string} [options.locationFilter] - Comma-separated locations (LOCATION_FILTER).
 * @param {number} [options.maxPages] - Max result pages per keyword (MAX_PAGES).
 * @param {boolean} [options.headless] - Headless browser mode (HEADLESS).
 * @param {boolean} [options.enableAI] - Run AI analysis (ENABLE_AI_ANALYSIS).
 * @param {string} [options.aiContext] - Context string for the AI analyzer (AI_CONTEXT).
 * @returns {Promise<{results: Array, rejectedResults: Array, metadata: Object}>}
 *   Accepted jobs, rejected jobs (each with a `reason`), and run metadata.
 * @throws Re-throws fatal browser/navigation errors after logging; the
 *   browser is always closed via the outer finally.
 */
async function parseSkipTheDrive(options = {}) {
  const {
    keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [
      "software engineer",
      "developer",
    ],
    jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [],
    locationFilter = process.env.LOCATION_FILTER || "",
    maxPages = parseInt(process.env.MAX_PAGES) || 5,
    headless = process.env.HEADLESS !== "false",
    enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
    aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
  } = options;
  logger.step("Starting SkipTheDrive parser...");
  logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
  logger.info(
    `📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
  );
  logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
  logger.info(`📄 Max Pages: ${maxPages}`);
  const browser = await chromium.launch({
    headless,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
    ],
  });
  // Fixed desktop Chrome user agent so the site serves its normal markup.
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  });
  const results = [];
  const rejectedResults = [];
  // De-duplicates across keywords: the same posting can match several searches.
  const seenJobs = new Set();
  try {
    // Search for each keyword
    for (const keyword of keywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);
      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
      const page = await context.newPage();
      try {
        logger.info(
          `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}`
        );
        await page.goto(searchUrl, {
          waitUntil: "domcontentloaded",
          timeout: 30000,
        });
        logger.info(
          `Navigation completed successfully at ${new Date().toISOString()}`
        );
        // Wait for job listings to load; a timeout here is non-fatal — the
        // page simply has no results for this keyword.
        logger.info("Waiting for selector #loops-wrapper");
        await page
          .waitForSelector("#loops-wrapper", { timeout: 5000 })
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
          });
        logger.info("Selector wait completed");
        let currentPage = 1;
        let hasNextPage = true;
        while (hasNextPage && currentPage <= maxPages) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
          // Extract all job articles on current page
          const jobArticles = await page.$$("article[id^='post-']");
          logger.info(
            `Found ${jobArticles.length} job listings on page ${currentPage}`
          );
          for (const article of jobArticles) {
            const jobData = await extractJobData(article);
            // Skip failed extractions and jobs already seen under another keyword.
            if (!jobData || seenJobs.has(jobData.jobId)) {
              continue;
            }
            seenJobs.add(jobData.jobId);
            // Add keyword that found this job
            jobData.searchKeyword = keyword;
            // Validate job against keywords (title + description + company).
            const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
            if (!containsAnyKeyword(fullText, keywords)) {
              rejectedResults.push({
                ...jobData,
                rejected: true,
                reason: "Keywords not found in job listing",
              });
              continue;
            }
            // Location validation (if enabled)
            if (locationFilter) {
              const locationFilters = parseLocationFilters(locationFilter);
              // For SkipTheDrive, most jobs are remote, but we can check the title/description
              const locationValid =
                fullText.toLowerCase().includes("remote") ||
                locationFilters.some((filter) =>
                  fullText.toLowerCase().includes(filter.toLowerCase())
                );
              if (!locationValid) {
                rejectedResults.push({
                  ...jobData,
                  rejected: true,
                  reason: "Location requirements not met",
                });
                continue;
              }
              jobData.locationValid = locationValid;
            }
            logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`);
            results.push(jobData);
          }
          // Check for next page; stop when no link or page budget is spent.
          const nextPageLink = await page.$("a.nextp");
          if (nextPageLink && currentPage < maxPages) {
            logger.info("📄 Moving to next page...");
            await nextPageLink.click();
            await page.waitForLoadState("domcontentloaded");
            await page.waitForTimeout(2000); // Wait for content to load
            currentPage++;
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        // One keyword failing (navigation/timeout) must not abort the others.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      } finally {
        await page.close();
      }
    }
    logger.success(`\n✅ Parsing complete!`);
    logger.info(`📊 Total jobs found: ${results.length}`);
    logger.info(`❌ Rejected jobs: ${rejectedResults.length}`);
    // Run AI analysis if enabled and there is anything to analyze.
    let aiAnalysis = null;
    if (enableAI && results.length > 0) {
      logger.step("Running AI analysis on job listings...");
      const aiAvailable = await checkOllamaStatus();
      if (aiAvailable) {
        const analysisData = results.map((job) => ({
          text: `${job.title} at ${job.company}. ${job.description}`,
          metadata: {
            jobId: job.jobId,
            company: job.company,
            daysAgo: job.daysAgo,
          },
        }));
        aiAnalysis = await analyzeBatch(analysisData, aiContext);
        // Merge AI analysis with results — analyzeBatch output is assumed to
        // be index-aligned with the input array.
        results.forEach((job, index) => {
          if (aiAnalysis && aiAnalysis[index]) {
            job.aiAnalysis = {
              isRelevant: aiAnalysis[index].isRelevant,
              confidence: aiAnalysis[index].confidence,
              reasoning: aiAnalysis[index].reasoning,
            };
          }
        });
        logger.success("✅ AI analysis completed");
      } else {
        // Best-effort: missing Ollama downgrades gracefully to no analysis.
        logger.warning("⚠️ AI not available - skipping analysis");
      }
    }
    return {
      results,
      rejectedResults,
      metadata: {
        source: "skipthedrive",
        totalJobs: results.length,
        rejectedJobs: rejectedResults.length,
        keywords: keywords,
        jobTypes: jobTypes,
        locationFilter: locationFilter,
        aiAnalysisEnabled: enableAI,
        aiAnalysisCompleted: !!aiAnalysis,
        timestamp: new Date().toISOString(),
      },
    };
  } catch (error) {
    logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
    throw error;
  } finally {
    await browser.close();
  }
}
// Export the parser // Export the parser
module.exports = { module.exports = {
parseSkipTheDrive, parseSkipTheDrive,
buildSearchUrl, buildSearchUrl,
extractJobData, extractJobData,
}; };

View File

@ -1,302 +1,302 @@
/** /**
* SkipTheDrive Parsing Strategy * SkipTheDrive Parsing Strategy
* *
* Uses core-parser for browser management and ai-analyzer for utilities * Uses core-parser for browser management and ai-analyzer for utilities
*/ */
const { const {
logger, logger,
cleanText, cleanText,
containsAnyKeyword, containsAnyKeyword,
validateLocationAgainstFilters, validateLocationAgainstFilters,
} = require("ai-analyzer"); } = require("ai-analyzer");
/**
 * Build a SkipTheDrive search URL for the given keyword.
 *
 * @param {string} keyword - Search term placed in the `s` query parameter.
 * @param {string} [orderBy="date"] - Result ordering (`orderby` parameter).
 * @param {string[]} [jobTypes=[]] - Optional job-type slugs, comma-joined
 *   into a single `job_type` parameter when non-empty.
 * @returns {string} Fully encoded search URL.
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  const query = new URLSearchParams({
    s: keyword,
    orderby: orderBy,
  });
  if (jobTypes && jobTypes.length > 0) {
    query.append("job_type", jobTypes.join(","));
  }
  return `https://www.skipthedrive.com/?${query.toString()}`;
}
/**
 * SkipTheDrive parsing strategy.
 *
 * Searches skipthedrive.com for each configured keyword, walks result pages,
 * extracts job listings, de-duplicates them by job id, and optionally rejects
 * jobs whose location fails the configured filter.
 *
 * @param {object} coreParser - Browser manager exposing createPage,
 *   navigateTo, and waitForSelector.
 * @param {object} [options={}]
 * @param {string[]} [options.keywords] - Search terms to query.
 * @param {string|null} [options.locationFilter] - Filter forwarded to
 *   validateLocationAgainstFilters; null disables location checks.
 * @param {number} [options.maxPages=5] - Maximum result pages per keyword.
 * @param {string[]} [options.jobTypes=[]] - Job-type slugs for buildSearchUrl.
 * @returns {Promise<{results: Array, rejectedResults: Array, summary: object}>}
 * @throws Rethrows any error raised outside the per-keyword loop.
 */
async function skipthedriveStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer", "programmer"],
    locationFilter = null,
    maxPages = 5,
    jobTypes = [],
  } = options;
  const accepted = [];
  const rejected = [];
  const seenJobIds = new Set();
  try {
    // One shared page is reused for every keyword search.
    const mainPage = await coreParser.createPage("skipthedrive-main");
    logger.info("🚀 Starting SkipTheDrive parser...");
    logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(`📄 Max Pages: ${maxPages}`);
    for (const keyword of keywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);
      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
      try {
        await coreParser.navigateTo(searchUrl, {
          pageId: "skipthedrive-main",
          retries: 2,
          timeout: 30000,
        });
        // Wait for the listings container; a timeout means no results
        // for this keyword, which is logged but not fatal.
        const hasResults = await coreParser
          .waitForSelector(
            "#loops-wrapper",
            { timeout: 5000 },
            "skipthedrive-main"
          )
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
            return false;
          });
        if (!hasResults) {
          continue;
        }
        let currentPage = 1;
        let hasNextPage = true;
        while (hasNextPage && currentPage <= maxPages) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
          const jobsOnPage = await extractJobsFromPage(
            mainPage,
            keyword,
            locationFilter
          );
          for (const job of jobsOnPage) {
            // Skip jobs already collected under any keyword.
            if (seenJobIds.has(job.jobId)) continue;
            seenJobIds.add(job.jobId);
            // Guard clause: reject jobs failing the optional location filter.
            if (
              locationFilter &&
              !validateLocationAgainstFilters(job.location, locationFilter)
            ) {
              rejected.push({
                ...job,
                rejectionReason: "Location filter mismatch",
              });
              continue;
            }
            accepted.push(job);
          }
          hasNextPage = await hasNextPageAvailable(mainPage);
          if (hasNextPage && currentPage < maxPages) {
            await navigateToNextPage(mainPage, currentPage + 1);
            currentPage++;
            // Give the next page a moment to render before extraction.
            await mainPage.waitForTimeout(2000);
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        // One keyword failing must not abort the remaining keywords.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      }
    }
    logger.info(
      `🎯 SkipTheDrive parsing completed: ${accepted.length} jobs found, ${rejected.length} rejected`
    );
    return {
      results: accepted,
      rejectedResults: rejected,
      summary: {
        totalJobs: accepted.length,
        totalRejected: rejected.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "skipthedrive",
      },
    };
  } catch (error) {
    logger.error(`❌ SkipTheDrive parsing failed: ${error.message}`);
    throw error;
  }
}
/**
 * Collect job records from all listings rendered on the current page.
 * Extraction failures for individual listings are logged and skipped;
 * a page-level failure yields whatever was collected so far.
 *
 * @param {object} page - Page handle exposing `$$()`.
 * @param {string} keyword - Keyword that produced this result page.
 * @param {string|null} locationFilter - Unused here; kept for call-site symmetry.
 * @returns {Promise<object[]>} Successfully extracted job objects.
 */
async function extractJobsFromPage(page, keyword, locationFilter) {
  const collected = [];
  try {
    const listings = await page.$$("article.job_listing");
    for (const listing of listings) {
      try {
        const record = await extractJobData(listing, keyword);
        if (record) {
          collected.push(record);
        }
      } catch (error) {
        logger.warning(`Failed to extract job data: ${error.message}`);
      }
    }
  } catch (error) {
    logger.error(`Failed to extract jobs from page: ${error.message}`);
  }
  return collected;
}
/**
 * Extract a normalized job record from a single SkipTheDrive listing element.
 *
 * @param {object} jobElement - Element handle for an `article.job_listing` node.
 * @param {string} keyword - Search keyword that surfaced this listing.
 * @returns {Promise<object|null>} Job record, or null when extraction fails.
 */
async function extractJobData(jobElement, keyword) {
  try {
    // Job id is the numeric suffix of the article's "post-<id>" attribute.
    const articleId = (await jobElement.getAttribute("id")) || "";
    const jobId = articleId ? articleId.replace("post-", "") : "";
    // Title and outbound job link.
    const titleElement = await jobElement.$(".job_listing-title a");
    const title = titleElement
      ? cleanText(await titleElement.textContent())
      : "";
    const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";
    // Company name.
    const companyElement = await jobElement.$(".company");
    const company = companyElement
      ? cleanText(await companyElement.textContent())
      : "";
    // Location string as rendered on the listing.
    const locationElement = await jobElement.$(".location");
    const location = locationElement
      ? cleanText(await locationElement.textContent())
      : "";
    // Raw posting-age text, e.g. "3 days ago".
    const dateElement = await jobElement.$(".job-date");
    const dateText = dateElement
      ? cleanText(await dateElement.textContent())
      : "";
    // Short description snippet.
    const descElement = await jobElement.$(".job_listing-description");
    const description = descElement
      ? cleanText(await descElement.textContent())
      : "";
    // Featured listings carry a ".featured" marker element.
    const featuredElement = await jobElement.$(".featured");
    const isFeatured = featuredElement !== null;
    // Derive an ISO posting date (YYYY-MM-DD) from "N days ago" text.
    let datePosted = null;
    let daysAgo = null;
    if (dateText) {
      const match = dateText.match(/(\d+)\s+days?\s+ago/);
      if (match) {
        // Fix: parseInt now receives an explicit base-10 radix.
        daysAgo = Number.parseInt(match[1], 10);
        const date = new Date();
        date.setDate(date.getDate() - daysAgo);
        datePosted = date.toISOString().split("T")[0];
      }
    }
    return {
      jobId,
      title,
      company,
      location,
      jobUrl,
      datePosted,
      dateText,
      daysAgo,
      description,
      isFeatured,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "skipthedrive",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Report whether a "next page" control exists on the current page.
 * Any lookup error is treated as "no next page".
 *
 * @param {object} page - Page handle exposing `$()`.
 * @returns {Promise<boolean>} True when a `.next-page` element is present.
 */
async function hasNextPageAvailable(page) {
  try {
    return (await page.$(".next-page")) !== null;
  } catch {
    return false;
  }
}
/**
 * Click the "next page" control if present; failures are logged, not thrown.
 *
 * @param {object} page - Page handle exposing `$()`.
 * @param {number} pageNumber - Target page number (used only for logging).
 * @returns {Promise<void>}
 */
async function navigateToNextPage(page, pageNumber) {
  try {
    const nextButton = await page.$(".next-page");
    // Optional chaining: a missing button is a silent no-op.
    await nextButton?.click();
  } catch (error) {
    logger.warning(
      `Failed to navigate to page ${pageNumber}: ${error.message}`
    );
  }
}
module.exports = { module.exports = {
skipthedriveStrategy, skipthedriveStrategy,
buildSearchUrl, buildSearchUrl,
extractJobsFromPage, extractJobsFromPage,
extractJobData, extractJobData,
}; };

View File

@ -1,412 +1,412 @@
/** /**
* LinkedIn Parser Demo * LinkedIn Parser Demo
* *
* Demonstrates the LinkedIn Parser's capabilities for scraping LinkedIn content * Demonstrates the LinkedIn Parser's capabilities for scraping LinkedIn content
* with keyword-based searching, location filtering, and AI analysis. * with keyword-based searching, location filtering, and AI analysis.
* *
* This demo uses simulated data for safety and demonstration purposes. * This demo uses simulated data for safety and demonstration purposes.
*/ */
const { logger } = require("../ai-analyzer"); const { logger } = require("../ai-analyzer");
const fs = require("fs"); const fs = require("fs");
const path = require("path"); const path = require("path");
// ANSI escape sequences used to colorize demo terminal output.
const colors = {
  reset: "\x1b[0m",
  bright: "\x1b[1m",
  cyan: "\x1b[36m",
  green: "\x1b[32m",
  yellow: "\x1b[33m",
  blue: "\x1b[34m",
  magenta: "\x1b[35m",
  red: "\x1b[31m",
};
// Console helpers that wrap demo text in the ANSI colors above.
const demo = {
  title: (text) =>
    console.log(`\n${colors.bright}${colors.cyan}${text}${colors.reset}`),
  section: (text) =>
    console.log(`\n${colors.bright}${colors.magenta}${text}${colors.reset}`),
  success: (text) => console.log(`${colors.green}${text}${colors.reset}`),
  info: (text) => console.log(`${colors.blue} ${text}${colors.reset}`),
  warning: (text) => console.log(`${colors.yellow}⚠️ ${text}${colors.reset}`),
  error: (text) => console.log(`${colors.red}${text}${colors.reset}`),
  code: (text) => console.log(`${colors.cyan}${text}${colors.reset}`),
};
// Mock data for demonstration — simulated LinkedIn posts; no real scraping.
const mockPosts = [
  {
    id: "post_1",
    content:
      "Just got laid off from my software engineering role at TechCorp. Looking for new opportunities in Toronto. This is really tough but I'm staying positive!",
    original_content:
      "Just got #laidoff from my software engineering role at TechCorp! Looking for new opportunities in #Toronto. This is really tough but I'm staying positive! 🚀",
    author: {
      name: "John Doe",
      title: "Software Engineer",
      company: "TechCorp",
      location: "Toronto, Ontario, Canada",
      profile_url: "https://linkedin.com/in/johndoe",
    },
    engagement: { likes: 45, comments: 12, shares: 3 },
    metadata: {
      post_date: "2024-01-10T14:30:00Z",
      scraped_at: "2024-01-15T10:30:00Z",
      search_keyword: "layoff",
      location_validated: true,
    },
  },
  {
    id: "post_2",
    content:
      "Our company is downsizing and I'm affected. This is really tough news but I'm grateful for the time I had here.",
    original_content:
      "Our company is #downsizing and I'm affected. This is really tough news but I'm grateful for the time I had here. #RIF #layoff",
    author: {
      name: "Jane Smith",
      title: "Product Manager",
      company: "StartupXYZ",
      location: "Vancouver, British Columbia, Canada",
      profile_url: "https://linkedin.com/in/janesmith",
    },
    engagement: { likes: 23, comments: 8, shares: 1 },
    metadata: {
      post_date: "2024-01-09T16:45:00Z",
      scraped_at: "2024-01-15T10:30:00Z",
      search_keyword: "downsizing",
      location_validated: true,
    },
  },
  {
    id: "post_3",
    content:
      "Open to work! Looking for new opportunities in software development. I have 5 years of experience in React, Node.js, and cloud technologies.",
    original_content:
      "Open to work! Looking for new opportunities in software development. I have 5 years of experience in #React, #NodeJS, and #cloud technologies. #opentowork #jobsearch",
    author: {
      name: "Bob Wilson",
      title: "Full Stack Developer",
      company: "Freelance",
      location: "Calgary, Alberta, Canada",
      profile_url: "https://linkedin.com/in/bobwilson",
    },
    engagement: { likes: 67, comments: 15, shares: 8 },
    metadata: {
      post_date: "2024-01-08T11:20:00Z",
      scraped_at: "2024-01-15T10:30:00Z",
      search_keyword: "open to work",
      location_validated: true,
    },
  },
];
/**
 * Walk through every demo section in order, pausing for Enter after the
 * intro, then print the closing summary.
 */
async function runDemo() {
  demo.title("=== LinkedIn Parser Demo ===");
  demo.info(
    "This demo showcases the LinkedIn Parser's capabilities for scraping LinkedIn content."
  );
  demo.info("All data shown is simulated for demonstration purposes.");
  demo.info("Press Enter to continue through each section...\n");
  await waitForEnter();
  // Sections run sequentially; the order matters for the narrative.
  const sections = [
    demonstrateConfiguration, // 1. Configuration
    demonstrateKeywordLoading, // 2. Keyword loading
    demonstrateSearchProcess, // 3. Search process
    demonstrateLocationFiltering, // 4. Location filtering
    demonstrateAIAnalysis, // 5. AI analysis
    demonstrateOutputGeneration, // 6. Output generation
  ];
  for (const section of sections) {
    await section();
  }
  demo.title("=== Demo Complete ===");
  demo.success("LinkedIn Parser demo completed successfully!");
  demo.info("Check the README.md for detailed usage instructions.");
}
/**
 * Section 1: show how the parser is configured via environment variables
 * and command-line options.
 */
async function demonstrateConfiguration() {
  demo.section("1. Configuration Setup");
  demo.info(
    "The LinkedIn Parser uses environment variables and command-line options for configuration."
  );
  demo.code("// Environment Variables (.env file)");
  const envLines = [
    "LINKEDIN_USERNAME=your_email@example.com",
    "LINKEDIN_PASSWORD=your_password",
    "CITY=Toronto",
    "DATE_POSTED=past-week",
    "SORT_BY=date_posted",
    "WHEELS=5",
    "LOCATION_FILTER=Ontario,Manitoba",
    "ENABLE_LOCATION_CHECK=true",
    "ENABLE_LOCAL_AI=true",
    'AI_CONTEXT="job layoffs and workforce reduction"',
    "OLLAMA_MODEL=mistral",
  ];
  envLines.forEach((line) => demo.info(line));
  demo.code("// Command Line Options");
  const cliLines = [
    'node index.js --keyword="layoff,downsizing" --city="Vancouver"',
    "node index.js --no-location --no-ai",
    "node index.js --output=results/my-results.json",
    "node index.js --ai-after",
  ];
  cliLines.forEach((line) => demo.info(line));
  await waitForEnter();
}
/**
 * Section 2: demonstrate loading keywords from a CSV file and overriding
 * them via command-line flags.
 */
async function demonstrateKeywordLoading() {
  demo.section("2. Keyword Loading");
  demo.info(
    "Keywords can be loaded from CSV files or specified via command line."
  );
  // Simulate loading keywords from the bundled CSV.
  demo.code("// Loading keywords from CSV file");
  logger.step("Loading keywords from keywords/linkedin-keywords.csv");
  const keywords = [
    "layoff",
    "downsizing",
    "reduction in force",
    "RIF",
    "termination",
    "job loss",
    "workforce reduction",
    "open to work",
    "actively seeking",
    "job search",
  ];
  demo.success(`Loaded ${keywords.length} keywords from CSV file`);
  demo.info(`Keywords: ${keywords.slice(0, 5).join(", ")}...`);
  demo.code("// Command line keyword override");
  demo.info('node index.js --keyword="layoff,downsizing"');
  demo.info('node index.js --add-keyword="hiring freeze"');
  await waitForEnter();
}
/**
 * Section 3: simulate the automated search flow for a few keywords,
 * printing randomized found/accepted counts for demo flavor.
 */
async function demonstrateSearchProcess() {
  demo.section("3. Search Process Simulation");
  demo.info(
    "The parser performs automated LinkedIn searches for each keyword."
  );
  for (const keyword of ["layoff", "downsizing", "open to work"]) {
    demo.code(`// Searching for keyword: "${keyword}"`);
    logger.search(`Searching for "${keyword}" in Toronto`);
    // Simulate the search delay.
    await simulateSearch();
    // Randomized counts purely for demonstration: 10-59 found, ~30% accepted.
    const foundCount = Math.floor(Math.random() * 50) + 10;
    const acceptedCount = Math.floor(foundCount * 0.3);
    logger.info(`Found ${foundCount} posts, checking profiles for location...`);
    logger.success(`Accepted ${acceptedCount} posts after location validation`);
    console.log();
  }
  await waitForEnter();
}
/**
 * Section 4: show geographic validation of post authors against the
 * configured location filter using canned accept/reject examples.
 */
async function demonstrateLocationFiltering() {
  demo.section("4. Location Filtering");
  demo.info(
    "Posts are filtered based on author location using geographic validation."
  );
  demo.code("// Location filter configuration");
  demo.info("LOCATION_FILTER=Ontario,Manitoba");
  demo.info("ENABLE_LOCATION_CHECK=true");
  demo.code("// Location validation examples");
  const testLocations = [
    { location: "Toronto, Ontario, Canada", valid: true },
    { location: "Vancouver, British Columbia, Canada", valid: false },
    { location: "Calgary, Alberta, Canada", valid: false },
    { location: "Winnipeg, Manitoba, Canada", valid: true },
    { location: "New York, NY, USA", valid: false },
  ];
  for (const { location, valid } of testLocations) {
    logger.location(`Checking location: ${location}`);
    valid
      ? logger.success(`✅ Location valid - post accepted`)
      : logger.warning(`❌ Location invalid - post rejected`);
  }
  await waitForEnter();
}
/**
 * Section 5: simulate AI relevance analysis of the accepted mock posts and
 * attach synthetic scores to each one.
 */
async function demonstrateAIAnalysis() {
  demo.section("5. AI Analysis");
  demo.info(
    "Posts can be analyzed using local Ollama or OpenAI for relevance scoring."
  );
  demo.code("// AI analysis configuration");
  demo.info("ENABLE_LOCAL_AI=true");
  demo.info('AI_CONTEXT="job layoffs and workforce reduction"');
  demo.info("OLLAMA_MODEL=mistral");
  demo.code("// Analyzing posts with AI");
  logger.ai("Starting AI analysis of accepted posts...");
  for (const [i, post] of mockPosts.entries()) {
    logger.info(`Analyzing post ${i + 1}: ${post.content.substring(0, 50)}...`);
    // Simulate the per-post analysis delay.
    await simulateProcessing();
    // Synthetic scores: relevance in [0.7, 1.0), confidence in [0.8, 1.0).
    const relevanceScore = 0.7 + Math.random() * 0.3;
    const confidence = 0.8 + Math.random() * 0.2;
    logger.success(
      `Relevance: ${relevanceScore.toFixed(2)}, Confidence: ${confidence.toFixed(2)}`
    );
    // Attach the simulated analysis to the post record.
    post.ai_analysis = {
      relevance_score: relevanceScore,
      confidence: confidence,
      context_match: relevanceScore > 0.7,
      analysis_text: `This post discusses ${post.metadata.search_keyword} and is relevant to the search context.`,
    };
  }
  await waitForEnter();
}
/**
 * Demo step 6: persist the mock results to a JSON file and display its layout.
 *
 * Writes demo-results.json next to this script, then prints the summary
 * counts and a tree view of the output structure before waiting for Enter.
 *
 * @returns {Promise<void>}
 */
async function demonstrateOutputGeneration() {
  demo.section("6. Output Generation");
  demo.info("Results are saved to JSON files with comprehensive metadata.");
  demo.code("// Generating output file");
  logger.file("Saving results to JSON file...");

  // Metadata mirrors what a real run would record about the search.
  const metadata = {
    timestamp: new Date().toISOString(),
    keywords: ["layoff", "downsizing", "open to work"],
    city: "Toronto",
    date_posted: "past-week",
    sort_by: "date_posted",
    total_posts_found: 150,
    accepted_posts: mockPosts.length,
    rejected_posts: 147,
    processing_time_seconds: 180,
  };
  const outputData = { metadata, posts: mockPosts };

  // Save next to this script so the demo stays self-contained.
  const outputPath = path.join(__dirname, "demo-results.json");
  fs.writeFileSync(outputPath, JSON.stringify(outputData, null, 2));

  demo.success(`Results saved to: ${outputPath}`);
  demo.info(`Total posts processed: ${metadata.total_posts_found}`);
  demo.info(`Posts accepted: ${metadata.accepted_posts}`);
  demo.info(`Posts rejected: ${metadata.rejected_posts}`);

  demo.code("// Output file structure");
  const treeLines = [
    "📁 demo-results.json",
    " ├── metadata",
    " │ ├── timestamp",
    " │ ├── keywords",
    " │ ├── city",
    " │ ├── total_posts_found",
    " │ ├── accepted_posts",
    " │ └── processing_time_seconds",
    " └── posts[]",
    " ├── id",
    " ├── content",
    " ├── author",
    " ├── engagement",
    " ├── ai_analysis",
    " └── metadata",
  ];
  for (const line of treeLines) {
    demo.info(line);
  }

  await waitForEnter();
}
// Helper functions // Helper functions
/**
 * Pause the demo until the user presses Enter on stdin.
 *
 * @returns {Promise<void>} resolves once a line has been read.
 */
function waitForEnter() {
  const readline = require("readline");
  return new Promise((resolve) => {
    const prompt = readline.createInterface({
      input: process.stdin,
      output: process.stdout,
    });
    prompt.question("\nPress Enter to continue...", () => {
      prompt.close();
      resolve();
    });
  });
}
/**
 * Simulate the browser search phase by logging one step every 800ms.
 * Resolves one interval tick after the final step has been logged.
 *
 * @returns {Promise<void>}
 */
async function simulateSearch() {
  const steps = [
    "Launching browser",
    "Logging in",
    "Navigating to search",
    "Loading results",
  ];
  return new Promise((resolve) => {
    let next = 0;
    const timer = setInterval(() => {
      if (next >= steps.length) {
        // All steps shown; stop the timer and finish.
        clearInterval(timer);
        resolve();
        return;
      }
      logger.info(steps[next]);
      next += 1;
    }, 800);
  });
}
/**
 * Show an animated "Processing..." indicator on stdout for a fixed duration.
 *
 * Generalized from the original hard-coded timings: callers may now shorten
 * or lengthen the animation, while the zero-argument call keeps the original
 * behavior (1500ms total, one frame every 500ms).
 *
 * @param {number} [durationMs=1500] - total time before the promise resolves.
 * @param {number} [frameMs=500] - delay between successive spinner frames.
 * @returns {Promise<void>} resolves after durationMs with the line cleared.
 */
async function simulateProcessing(durationMs = 1500, frameMs = 500) {
  return new Promise((resolve) => {
    const dots = [".", "..", "..."];
    let frame = 0;
    // Redraw the same line in place using a carriage return.
    const interval = setInterval(() => {
      process.stdout.write(`\rProcessing${dots[frame]}`);
      frame = (frame + 1) % dots.length;
    }, frameMs);
    setTimeout(() => {
      clearInterval(interval); // stop animating before clearing the line
      process.stdout.write("\r");
      resolve();
    }, durationMs);
  });
}
// When executed directly (node <script>), run the interactive demo; when
// loaded via require(), only expose the entry point.
const isDirectRun = require.main === module;
if (isDirectRun) {
  runDemo().catch((err) => {
    demo.error(`Demo failed: ${err.message}`);
    process.exit(1);
  });
}

module.exports = { runDemo };

View File

@ -1,51 +1,51 @@
keyword keyword
acquisition acquisition
actively seeking actively seeking
bankruptcy bankruptcy
business realignment business realignment
career transition career transition
company closure company closure
company reorganization company reorganization
cost cutting cost cutting
department closure department closure
downsizing downsizing
furlough furlough
headcount reduction headcount reduction
hiring hiring
hiring freeze hiring freeze
involuntary separation involuntary separation
job cuts job cuts
job elimination job elimination
job loss job loss
job opportunity job opportunity
job search job search
layoff layoff
looking for opportunities looking for opportunities
mass layoff mass layoff
merger merger
new position new position
new role new role
office closure office closure
open to work open to work
organizational change organizational change
outplacement outplacement
plant closure plant closure
position elimination position elimination
recruiting recruiting
reduction in force reduction in force
redundancies redundancies
redundancy redundancy
restructuring restructuring
rightsizing rightsizing
RIF RIF
role elimination role elimination
separation separation
site closure site closure
staff reduction staff reduction
terminated terminated
termination termination
voluntary separation voluntary separation
workforce adjustment workforce adjustment
workforce optimization workforce optimization
workforce reduction workforce reduction
workforce transition workforce transition

1 keyword
2 acquisition
3 actively seeking
4 bankruptcy
5 business realignment
6 career transition
7 company closure
8 company reorganization
9 cost cutting
10 department closure
11 downsizing
12 furlough
13 headcount reduction
14 hiring
15 hiring freeze
16 involuntary separation
17 job cuts
18 job elimination
19 job loss
20 job opportunity
21 job search
22 layoff
23 looking for opportunities
24 mass layoff
25 merger
26 new position
27 new role
28 office closure
29 open to work
30 organizational change
31 outplacement
32 plant closure
33 position elimination
34 recruiting
35 reduction in force
36 redundancies
37 redundancy
38 restructuring
39 rightsizing
40 RIF
41 role elimination
42 separation
43 site closure
44 staff reduction
45 terminated
46 termination
47 voluntary separation
48 workforce adjustment
49 workforce optimization
50 workforce reduction
51 workforce transition

View File

@ -1,230 +1,230 @@
/** /**
* LinkedIn Parsing Strategy * LinkedIn Parsing Strategy
* *
* Uses core-parser for browser management and ai-analyzer for utilities * Uses core-parser for browser management and ai-analyzer for utilities
*/ */
const { const {
logger, logger,
cleanText, cleanText,
containsAnyKeyword, containsAnyKeyword,
validateLocationAgainstFilters, validateLocationAgainstFilters,
extractLocationFromProfile, extractLocationFromProfile,
} = require("ai-analyzer"); } = require("ai-analyzer");
/** /**
* LinkedIn parsing strategy function * LinkedIn parsing strategy function
*/ */
/**
 * LinkedIn parsing strategy entry point.
 *
 * Authenticates through the supplied coreParser, runs a content search for
 * each keyword, extracts matching posts from the results page, de-duplicates
 * them by post ID, optionally rejects posts whose location fails the filter,
 * and stops once maxResults accepted posts have been collected.
 *
 * @param {object} coreParser - browser/session manager providing createPage,
 *   authenticate, navigateTo and navigationManager.navigateAndWaitFor.
 * @param {object} [options]
 * @param {string[]} [options.keywords] - search terms, one search per term.
 * @param {*} [options.locationFilter=null] - passed to
 *   validateLocationAgainstFilters when truthy.
 * @param {number} [options.maxResults=50] - cap on accepted posts.
 * @param {object} [options.credentials] - forwarded to coreParser.authenticate.
 * @returns {Promise<{results: object[], rejectedResults: object[], summary: object}>}
 * @throws re-throws any error from authentication, navigation or extraction.
 */
async function linkedinStrategy(coreParser, options = {}) {
  const {
    keywords = ["layoff", "downsizing", "job cuts"],
    locationFilter = null,
    maxResults = 50,
    credentials = {},
  } = options;
  const results = [];
  const rejectedResults = [];
  // De-duplication across keywords: a post seen under one keyword is skipped
  // if it reappears under another.
  const seenPosts = new Set();
  // NOTE(review): seenProfiles is never read or written in this function —
  // confirm whether profile-level de-duplication was intended.
  const seenProfiles = new Set();
  try {
    // Create main page
    const page = await coreParser.createPage("linkedin-main");
    // Authenticate to LinkedIn
    logger.info("🔐 Authenticating to LinkedIn...");
    await coreParser.authenticate("linkedin", credentials, "linkedin-main");
    logger.info("✅ LinkedIn authentication successful");
    // Search for posts with each keyword
    for (const keyword of keywords) {
      logger.info(`🔍 Searching LinkedIn for: "${keyword}"`);
      const searchUrl = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
        keyword
      )}&sortBy=date_posted`;
      await coreParser.navigateTo(searchUrl, {
        pageId: "linkedin-main",
        retries: 2,
      });
      // Wait for search results
      // NOTE(review): navigateTo above already loads searchUrl and
      // navigateAndWaitFor appears to navigate to it again — confirm the
      // double navigation is intentional and not just a wait-for-selector.
      const hasResults = await coreParser.navigationManager.navigateAndWaitFor(
        searchUrl,
        ".search-results-container",
        { pageId: "linkedin-main", timeout: 10000 }
      );
      if (!hasResults) {
        logger.warning(`No search results found for keyword: ${keyword}`);
        continue;
      }
      // Extract posts from current page
      const posts = await extractPostsFromPage(page, keyword);
      for (const post of posts) {
        // Skip duplicates
        if (seenPosts.has(post.postId)) continue;
        seenPosts.add(post.postId);
        // Validate location if filtering enabled
        if (locationFilter) {
          // NOTE(review): extractPostData (below) does not set a location or
          // profileLocation field, so this expression is undefined for posts
          // it produces — verify what validateLocationAgainstFilters does
          // with an undefined location.
          const locationValid = validateLocationAgainstFilters(
            post.location || post.profileLocation,
            locationFilter
          );
          if (!locationValid) {
            // Keep rejected posts with the reason attached for reporting.
            rejectedResults.push({
              ...post,
              rejectionReason: "Location filter mismatch",
            });
            continue;
          }
        }
        results.push(post);
        if (results.length >= maxResults) {
          logger.info(`📊 Reached maximum results limit: ${maxResults}`);
          break;
        }
      }
      // Also stop iterating keywords once the cap has been reached.
      if (results.length >= maxResults) break;
    }
    logger.info(
      `🎯 LinkedIn parsing completed: ${results.length} posts found, ${rejectedResults.length} rejected`
    );
    return {
      results,
      rejectedResults,
      summary: {
        totalPosts: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
      },
    };
  } catch (error) {
    // Log with context, then propagate so the caller can handle the failure.
    logger.error(`❌ LinkedIn parsing failed: ${error.message}`);
    throw error;
  }
}
/** /**
* Extract posts from current search results page * Extract posts from current search results page
*/ */
/**
 * Collect post objects from every LinkedIn feed card on the current page.
 *
 * Extraction failures for individual cards are logged and skipped; a failure
 * to query the page at all is logged and yields an empty array.
 *
 * @param {object} page - page handle exposing $$() element queries.
 * @param {string} keyword - search keyword forwarded to extractPostData.
 * @returns {Promise<object[]>} successfully extracted, relevant posts.
 */
async function extractPostsFromPage(page, keyword) {
  const collected = [];
  try {
    const cards = await page.$$(".feed-shared-update-v2");
    for (const card of cards) {
      try {
        const extracted = await extractPostData(card, keyword);
        // extractPostData returns null for irrelevant posts; keep the rest.
        if (extracted) {
          collected.push(extracted);
        }
      } catch (error) {
        logger.warning(`Failed to extract post data: ${error.message}`);
      }
    }
  } catch (error) {
    logger.error(`Failed to extract posts from page: ${error.message}`);
  }
  return collected;
}
/** /**
* Extract data from individual post element * Extract data from individual post element
*/ */
/**
 * Pull structured fields out of a single LinkedIn post element.
 *
 * Returns null when the post content does not mention the search keyword,
 * or when extraction fails for any reason (logged as a warning).
 *
 * @param {object} postElement - element handle exposing $(), getAttribute().
 * @param {string} keyword - the keyword this post was found under.
 * @returns {Promise<object|null>} normalized post record, or null.
 */
async function extractPostData(postElement, keyword) {
  // Cleaned text content of the first match for selector, or a fallback.
  const textOf = async (selector, fallback = "") => {
    const el = await postElement.$(selector);
    return el ? cleanText(await el.textContent()) : fallback;
  };
  // Attribute value of the first match for selector, or "".
  const attrOf = async (selector, attr) => {
    const el = await postElement.$(selector);
    return el ? await el.getAttribute(attr) : "";
  };

  try {
    const postId = (await postElement.getAttribute("data-urn")) || "";
    const authorName = await textOf(".feed-shared-actor__name");
    const authorUrl = await attrOf(".feed-shared-actor__name a", "href");
    const content = await textOf(".feed-shared-text");
    const timestamp = await attrOf(
      ".feed-shared-actor__sub-description time",
      "datetime"
    );
    const likesText = await textOf(".social-counts-reactions__count", "0");
    const commentsText = await textOf(".social-counts-comments__count", "0");

    // Drop posts that never mention the keyword being searched.
    if (!containsAnyKeyword(content, [keyword])) {
      return null;
    }

    return {
      postId: cleanText(postId),
      authorName,
      authorUrl,
      content,
      timestamp,
      keyword,
      likes: extractNumber(likesText),
      comments: extractNumber(commentsText),
      extractedAt: new Date().toISOString(),
      source: "linkedin",
    };
  } catch (error) {
    logger.warning(`Error extracting post data: ${error.message}`);
    return null;
  }
}
/** /**
* Extract numbers from text (e.g., "15 likes" -> 15) * Extract numbers from text (e.g., "15 likes" -> 15)
*/ */
/**
 * Extract the first integer found in a free-form count string.
 *
 * Handles engagement labels such as "15 likes" and comma-grouped counts
 * such as "1,234" (LinkedIn formats large counts with thousands separators;
 * the previous implementation truncated those at the first comma). Also
 * tolerates non-string input, which previously threw.
 *
 * @param {string} text - text that may contain a number.
 * @returns {number} the parsed integer, or 0 when no digits are present.
 */
function extractNumber(text) {
  if (typeof text !== "string") return 0; // defensive: null/undefined -> 0
  // Remove thousands separators before matching the first digit run.
  const match = text.replace(/,/g, "").match(/\d+/);
  return match ? Number.parseInt(match[0], 10) : 0;
}
// Public API: the strategy entry point plus the extraction helpers.
module.exports = { linkedinStrategy, extractPostsFromPage, extractPostData };

View File

@ -1,34 +1,34 @@
{ {
"results": [ "results": [
{ {
"text": "Just got laid off from my software engineering role. Looking for new opportunities in the Toronto area.", "text": "Just got laid off from my software engineering role. Looking for new opportunities in the Toronto area.",
"location": "Toronto, Ontario, Canada", "location": "Toronto, Ontario, Canada",
"keyword": "layoff", "keyword": "layoff",
"timestamp": "2024-01-15T10:30:00Z" "timestamp": "2024-01-15T10:30:00Z"
}, },
{ {
"text": "Excited to share that I'm starting a new position as a Senior Developer at TechCorp!", "text": "Excited to share that I'm starting a new position as a Senior Developer at TechCorp!",
"location": "Vancouver, BC, Canada", "location": "Vancouver, BC, Canada",
"keyword": "hiring", "keyword": "hiring",
"timestamp": "2024-01-15T11:00:00Z" "timestamp": "2024-01-15T11:00:00Z"
}, },
{ {
"text": "Our company is going through a restructuring and unfortunately had to let go of 50 employees.", "text": "Our company is going through a restructuring and unfortunately had to let go of 50 employees.",
"location": "Montreal, Quebec, Canada", "location": "Montreal, Quebec, Canada",
"keyword": "layoff", "keyword": "layoff",
"timestamp": "2024-01-15T11:30:00Z" "timestamp": "2024-01-15T11:30:00Z"
}, },
{ {
"text": "Beautiful weather today! Perfect for a walk in the park.", "text": "Beautiful weather today! Perfect for a walk in the park.",
"location": "Calgary, Alberta, Canada", "location": "Calgary, Alberta, Canada",
"keyword": "weather", "keyword": "weather",
"timestamp": "2024-01-15T12:00:00Z" "timestamp": "2024-01-15T12:00:00Z"
}, },
{ {
"text": "We're hiring! Looking for talented developers to join our growing team.", "text": "We're hiring! Looking for talented developers to join our growing team.",
"location": "Ottawa, Ontario, Canada", "location": "Ottawa, Ontario, Canada",
"keyword": "hiring", "keyword": "hiring",
"timestamp": "2024-01-15T12:30:00Z" "timestamp": "2024-01-15T12:30:00Z"
} }
] ]
} }