- Updated `ai-utils.js` to improve AI response parsing and added timeout handling for API requests.
- Modified `linkedin-parser` to refine search keyword handling and improve post extraction reliability.
- Enhanced location filtering logic and added more robust selectors for extracting post data.
- Improved logging for debugging purposes, including detailed extraction results and fallback mechanisms.
217 lines
7.1 KiB
JavaScript
217 lines
7.1 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
/**
|
|
* LinkedIn Parser - Refactored
|
|
*
|
|
* Uses core-parser for browser management and linkedin-strategy for parsing logic
|
|
*/
|
|
|
|
const path = require("path");
|
|
const fs = require("fs");
|
|
const CoreParser = require("../core-parser");
|
|
const { linkedinStrategy } = require("./strategies/linkedin-strategy");
|
|
const { logger, analyzeBatch, checkOllamaStatus, DEFAULT_MODEL } = require("ai-analyzer");
|
|
|
|
// Load environment variables - check both linkedin-parser/.env and root .env
const localEnvPath = path.join(__dirname, ".env");
const rootEnvPath = path.join(__dirname, "..", ".env");

// The first candidate that exists wins, so the local .env takes precedence
// over the root one.
const envFilePath = [localEnvPath, rootEnvPath].find((candidate) =>
  fs.existsSync(candidate)
);

if (envFilePath) {
  require("dotenv").config({ path: envFilePath });
} else {
  // Neither file exists: fall back to dotenv's default lookup (documented as
  // a `.env` file in the current working directory).
  require("dotenv").config();
}
|
|
|
|
// Configuration from environment
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
// Headless unless explicitly switched off with HEADLESS=false.
const HEADLESS = process.env.HEADLESS !== "false";
// Comma-separated search terms (a third default term, "job cuts", is currently disabled).
const SEARCH_KEYWORDS = process.env.SEARCH_KEYWORDS || "layoff,downsizing";
const LOCATION_FILTER = process.env.LOCATION_FILTER;
// AI analysis is on unless explicitly switched off with ENABLE_AI_ANALYSIS=false.
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false";
const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends";
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
// Always parse as base 10; a missing, non-numeric, zero or negative value
// falls back to 50 so the result cap is always a positive integer.
const parsedMaxResults = Number.parseInt(process.env.MAX_RESULTS, 10);
const MAX_RESULTS = parsedMaxResults > 0 ? parsedMaxResults : 50;
|
|
|
|
/**
 * Main LinkedIn parser function.
 *
 * Validates credentials, runs `linkedinStrategy` with the configured keywords,
 * location filter and result cap, optionally embeds AI analysis into each
 * accepted post, then writes accepted and rejected posts to timestamped JSON
 * files in the `results` directory next to this file.
 *
 * @param {object} [options={}] - Parsed CLI flags. NOTE(review): accepted for
 *   interface compatibility but not read here; all configuration comes from
 *   the environment-derived module constants.
 * @returns {Promise<{metadata: object, results: object[]}>} The same payload
 *   that is written to the results file.
 * @throws {Error} When credentials are missing, or when any later step fails;
 *   the error is logged and rethrown after browser cleanup.
 */
async function startLinkedInParser(options = {}) {
  const coreParser = new CoreParser({
    headless: HEADLESS,
    timeout: 30000,
  });

  try {
    logger.step("🚀 LinkedIn Parser Starting...");

    // Validate credentials
    if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
      throw new Error(
        "LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file"
      );
    }

    // Parse keywords; blank entries (stray or trailing commas) are dropped so
    // they never reach the strategy as empty searches.
    const keywords = SEARCH_KEYWORDS.split(",")
      .map((keyword) => keyword.trim())
      .filter(Boolean);
    logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
    logger.info(`📍 Location Filter: ${LOCATION_FILTER || "None"}`);
    logger.info(
      `🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? "Enabled" : "Disabled"}`
    );
    logger.info(`📊 Max Results: ${MAX_RESULTS}`);

    // Run LinkedIn parsing strategy
    const parseResult = await linkedinStrategy(coreParser, {
      keywords,
      locationFilter: LOCATION_FILTER,
      maxResults: MAX_RESULTS,
      credentials: {
        username: LINKEDIN_USERNAME,
        password: LINKEDIN_PASSWORD,
      },
    });

    // Default both lists so a strategy result that omits one cannot crash the
    // run on a `.length` / `.map` access further down.
    const { results = [], rejectedResults = [] } = parseResult || {};

    // AI Analysis if enabled - embed results into each post
    let resultsWithAI = results;
    let aiAnalysisCompleted = false;
    if (ENABLE_AI_ANALYSIS && results.length > 0) {
      logger.step("🧠 Running AI Analysis...");

      const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL);
      if (ollamaAvailable) {
        // Prepare data for analysis (analyzeBatch expects posts with 'text' field)
        const analysisData = results.map((post) => ({
          text: post.text || post.content || "",
          location: post.location || "",
          keyword: post.keyword || "",
          timestamp: post.timestamp || post.extractedAt || "",
        }));

        const analysisResults = await analyzeBatch(
          analysisData,
          AI_CONTEXT,
          OLLAMA_MODEL
        );

        const enriched = attachAiAnalysis(results, analysisResults);
        resultsWithAI = enriched.posts;
        // Only report completion when every post received a verdict, so
        // consumers can rely on `aiAnalysis` being present when the flag is set.
        aiAnalysisCompleted = enriched.analyzedCount === results.length;

        if (aiAnalysisCompleted) {
          logger.success(`✅ AI Analysis completed for ${results.length} posts`);
        } else {
          logger.warning(
            `⚠️ AI analysis returned results for ${enriched.analyzedCount} of ${results.length} posts`
          );
        }
      } else {
        logger.warning("⚠️ Ollama not available, skipping AI analysis");
      }
    }

    // Prepare results with embedded AI analysis
    const outputData = {
      metadata: {
        timestamp: new Date().toISOString(),
        totalPosts: resultsWithAI.length,
        rejectedPosts: rejectedResults.length,
        aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
        aiAnalysisCompleted: aiAnalysisCompleted,
        aiContext: aiAnalysisCompleted ? AI_CONTEXT : undefined,
        aiModel: aiAnalysisCompleted ? OLLAMA_MODEL : undefined,
        locationFilter: LOCATION_FILTER || undefined,
        parser: "linkedin-parser",
        version: "2.0.0",
      },
      results: resultsWithAI,
    };

    // Prepare rejected posts file
    const rejectedData = rejectedResults.map(toRejectedRecord);

    const resultsDir = path.join(__dirname, "results");
    if (!fs.existsSync(resultsDir)) {
      fs.mkdirSync(resultsDir, { recursive: true });
    }

    // ":" and "." are replaced so the ISO timestamp is safe inside a file name.
    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    const resultsFilename = `linkedin-results-${timestamp}.json`;
    const rejectedFilename = `linkedin-rejected-${timestamp}.json`;
    const resultsFilepath = path.join(resultsDir, resultsFilename);
    const rejectedFilepath = path.join(resultsDir, rejectedFilename);

    // Save results with AI analysis
    fs.writeFileSync(resultsFilepath, JSON.stringify(outputData, null, 2));

    // Save rejected posts separately
    if (rejectedData.length > 0) {
      fs.writeFileSync(
        rejectedFilepath,
        JSON.stringify(rejectedData, null, 2)
      );
    }

    // Final summary
    logger.success("✅ LinkedIn parsing completed successfully!");
    logger.info(`📊 Total posts found: ${resultsWithAI.length}`);
    logger.info(`❌ Total rejected: ${rejectedResults.length}`);
    logger.info(`📁 Results saved to: ${resultsFilepath}`);
    if (rejectedData.length > 0) {
      logger.info(`📁 Rejected posts saved to: ${rejectedFilepath}`);
    }

    return outputData;
  } catch (error) {
    logger.error(`❌ LinkedIn parser failed: ${error.message}`);
    throw error;
  } finally {
    // Always release the browser, on success and on failure alike.
    await coreParser.cleanup();
  }
}

/**
 * Pair each post with the AI verdict found at the same index.
 *
 * A post whose verdict is missing is returned unchanged rather than causing a
 * TypeError that would discard the whole run.
 *
 * @param {object[]} posts - Accepted posts, in the order sent for analysis.
 * @param {object[]} analyses - Value returned by `analyzeBatch`; anything that
 *   is not an array is treated as "no verdicts".
 * @returns {{posts: object[], analyzedCount: number}} The posts (enriched
 *   where possible) and how many of them received a verdict.
 */
function attachAiAnalysis(posts, analyses) {
  const verdicts = Array.isArray(analyses) ? analyses : [];

  const enrichedPosts = posts.map((post, index) => {
    const verdict = verdicts[index];
    if (!verdict) {
      return post;
    }
    return {
      ...post,
      aiAnalysis: {
        isRelevant: verdict.isRelevant,
        confidence: verdict.confidence,
        reasoning: verdict.reasoning,
        context: AI_CONTEXT,
        model: OLLAMA_MODEL,
        analyzedAt: new Date().toISOString(),
      },
    };
  });

  const analyzedCount = posts.filter((_, index) =>
    Boolean(verdicts[index])
  ).length;

  return { posts: enrichedPosts, analyzedCount };
}

/**
 * Convert a rejected post into the flat record stored in the rejected file.
 *
 * @param {object} post - A post from the strategy's `rejectedResults` list.
 * @returns {object} Record with the rejection reason and the identifying fields.
 */
function toRejectedRecord(post) {
  return {
    rejected: true,
    reason:
      post.rejectionReason || "Location filter failed: Location not in filter",
    keyword: post.keyword,
    text: post.text || post.content,
    profileLink: post.profileLink || post.authorUrl,
    location: post.location || post.profileLocation,
    timestamp: post.timestamp || post.extractedAt,
  };
}
|
|
|
|
// CLI handling: runs only when this file is executed directly, not when it is
// loaded through require().
if (require.main === module) {
  // Collect `--key=value` and bare `--flag` arguments into an options object;
  // arguments that do not start with "--" are ignored.
  const options = {};
  for (const arg of process.argv.slice(2)) {
    if (!arg.startsWith("--")) {
      continue;
    }

    const flag = arg.slice(2);
    // Split on the FIRST "=" only, so a value that itself contains "="
    // (e.g. --filter=a=b) is kept whole instead of being truncated.
    const separatorIndex = flag.indexOf("=");
    const key = separatorIndex === -1 ? flag : flag.slice(0, separatorIndex);
    const value = separatorIndex === -1 ? "" : flag.slice(separatorIndex + 1);

    // A bare flag, or a flag with an empty value, is recorded as boolean true.
    options[key] = value || true;
  }

  startLinkedInParser(options)
    .then(() => process.exit(0))
    .catch((error) => {
      console.error("Fatal error:", error.message);
      process.exit(1);
    });
}

module.exports = { startLinkedInParser };
|