Compare commits

...

No commits in common. "indeed" and "master" have entirely different histories.

18 changed files with 1931 additions and 13699 deletions

7
.gitignore vendored
View File

@ -8,10 +8,3 @@ zip*
*.7z *.7z
*obfuscated.js *obfuscated.js
.history .history
# Debug files
debug-*.js
debug-*.png
*.png
*.log
# Install scripts (optional - remove if you want to commit)
install-ollama.sh

2
ai-analyzer/cli.js Executable file → Normal file
View File

@ -1,4 +1,4 @@
#!/usr/bin/env node #!/usr/bin/env node
/** /**
* AI Analyzer CLI * AI Analyzer CLI

View File

@ -1,491 +1,301 @@
const { logger } = require("./logger"); const { logger } = require("./logger");
/** /**
* AI Analysis utilities for post processing with Ollama * AI Analysis utilities for post processing with Ollama
* Extracted from ai-analyzer-local.js for reuse across parsers * Extracted from ai-analyzer-local.js for reuse across parsers
*/ */
// Default model from environment variable or fallback to "mistral" /**
const DEFAULT_MODEL = process.env.OLLAMA_MODEL || "mistral"; * Check if Ollama is running and the model is available
*/
/** async function checkOllamaStatus(
* Check if Ollama is running and the model is available model = "mistral",
*/ ollamaHost = "http://localhost:11434"
async function checkOllamaStatus( ) {
model = DEFAULT_MODEL, try {
ollamaHost = "http://localhost:11434" // Check if Ollama is running
) { const response = await fetch(`${ollamaHost}/api/tags`);
try { if (!response.ok) {
// Check if Ollama is running throw new Error(`Ollama not running on ${ollamaHost}`);
const response = await fetch(`${ollamaHost}/api/tags`); }
if (!response.ok) {
throw new Error(`Ollama not running on ${ollamaHost}`); const data = await response.json();
} const availableModels = data.models.map((m) => m.name);
const data = await response.json(); logger.ai("Ollama is running");
const availableModels = data.models.map((m) => m.name); logger.info(
`📦 Available models: ${availableModels
logger.ai("Ollama is running"); .map((m) => m.split(":")[0])
logger.info( .join(", ")}`
`📦 Available models: ${availableModels );
.map((m) => m.split(":")[0])
.join(", ")}` // Check if requested model is available
); const modelExists = availableModels.some((m) => m.startsWith(model));
if (!modelExists) {
// Check if requested model is available logger.error(`Model "${model}" not found`);
const modelExists = availableModels.some((m) => m.startsWith(model)); logger.error(`💡 Install it with: ollama pull ${model}`);
if (!modelExists) { logger.error(
logger.error(`Model "${model}" not found`); `💡 Or choose from: ${availableModels
logger.error(`💡 Install it with: ollama pull ${model}`); .map((m) => m.split(":")[0])
logger.error( .join(", ")}`
`💡 Or choose from: ${availableModels );
.map((m) => m.split(":")[0]) return false;
.join(", ")}` }
);
return false; logger.success(`Using model: ${model}`);
} return true;
} catch (error) {
logger.success(`Using model: ${model}`); logger.error(`Error connecting to Ollama: ${error.message}`);
return true; logger.error("💡 Make sure Ollama is installed and running:");
} catch (error) { logger.error(" 1. Install: https://ollama.ai/");
logger.error(`Error connecting to Ollama: ${error.message}`); logger.error(" 2. Start: ollama serve");
logger.error("💡 Make sure Ollama is installed and running:"); logger.error(` 3. Install model: ollama pull ${model}`);
logger.error(" 1. Install: https://ollama.ai/"); return false;
logger.error(" 2. Start: ollama serve"); }
logger.error(` 3. Install model: ollama pull ${model}`); }
return false;
} /**
} * Analyze multiple posts using local Ollama
*/
/** async function analyzeBatch(
* Analyze multiple posts using local Ollama posts,
*/ context,
async function analyzeBatch( model = "mistral",
posts, ollamaHost = "http://localhost:11434"
context, ) {
model = DEFAULT_MODEL, logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`);
ollamaHost = "http://localhost:11434"
) { try {
logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`); const prompt = `You are an expert at analyzing LinkedIn posts for relevance to specific contexts.
try { CONTEXT TO MATCH: "${context}"
// Detect if context is about a student profile
const isStudentContext = /student|undergraduate|first year|second year|third year|fourth year|freshman|sophomore|junior|senior|co-op|internship/i.test(context); Analyze these ${
posts.length
// Build enhanced prompt based on context type } LinkedIn posts and determine if each relates to the context above.
let analysisInstructions = "";
if (isStudentContext) { POSTS:
analysisInstructions = ` ${posts
ANALYSIS FOCUS (Student Context Detected): .map(
- Pay special attention to the "Requirements" section (post, i) => `
- Evaluate if the job requirements match the student's level (${context}) POST ${i + 1}:
- Consider: Are requirements too advanced? Are they appropriate for entry-level/co-op/internship? "${post.text.substring(0, 400)}${post.text.length > 400 ? "..." : ""}"
- Check if the role duties are suitable for a student's skill level `
- Look for keywords like "co-op", "internship", "entry-level", "student", "junior" )
- If requirements mention "years of experience", "senior", "expert", "PhD", etc., this may not be suitable .join("")}
- If requirements are reasonable for a student (basic skills, willingness to learn), mark as relevant`;
} else { For each post, provide:
analysisInstructions = ` - Is it relevant to "${context}"? (YES/NO)
ANALYSIS FOCUS: - Confidence level (0.0 to 1.0)
- Evaluate overall relevance to: "${context}" - Brief reasoning
- Consider job title, description, duties, and requirements
- Assess if the job matches the specified criteria`; Respond in this EXACT format for each post:
} POST 1: YES/NO | 0.X | brief reason
POST 2: YES/NO | 0.X | brief reason
const prompt = `Analyze ${posts.length} job postings for relevance to: "${context}" POST 3: YES/NO | 0.X | brief reason
${analysisInstructions} Examples:
- For layoff context: "laid off 50 employees" = YES | 0.9 | mentions layoffs
JOB POSTINGS: - For hiring context: "we're hiring developers" = YES | 0.8 | job posting
${posts - Unrelated content = NO | 0.1 | not relevant to context`;
.map(
(post, i) => { const response = await fetch(`${ollamaHost}/api/generate`, {
// For student contexts, prioritize Requirements section if text is too long method: "POST",
let jobText = post.text; headers: {
if (isStudentContext && jobText.length > 1200) { "Content-Type": "application/json",
// Try to extract Requirements section if present },
const requirementsMatch = jobText.match(/Requirements?:[\s\S]{0,600}/i); body: JSON.stringify({
const dutiesMatch = jobText.match(/Role Duties?:[\s\S]{0,300}/i); model: model,
const titleMatch = jobText.match(/Title:[\s\S]{0,100}/i); prompt: prompt,
stream: false,
if (requirementsMatch) { options: {
// Prioritize: Title + Requirements (most important for students) temperature: 0.3,
jobText = (titleMatch ? titleMatch[0] + "\n\n" : "") + top_p: 0.9,
(requirementsMatch ? requirementsMatch[0] : "") + },
(dutiesMatch ? "\n\n" + dutiesMatch[0] : ""); }),
} else { });
// Fallback to truncation
jobText = jobText.substring(0, 1200) + "..."; if (!response.ok) {
} throw new Error(
} else if (jobText.length > 1200) { `Ollama API error: ${response.status} ${response.statusText}`
jobText = jobText.substring(0, 1200) + "..."; );
} }
return ` const data = await response.json();
JOB ${i + 1}: const aiResponse = data.response.trim();
${jobText}
`; // Parse the response
} const analyses = [];
) const lines = aiResponse.split("\n").filter((line) => line.trim());
.join("")}
for (let i = 0; i < posts.length; i++) {
REQUIRED FORMAT - Respond with EXACTLY ${posts.length} lines, one per post: let analysis = {
JOB 1: YES | 0.8 | reason here postIndex: i + 1,
JOB 2: NO | 0.2 | reason here isRelevant: false,
JOB 3: YES | 0.9 | reason here confidence: 0.5,
reasoning: "Could not parse AI response",
RULES: };
- Use YES or NO (uppercase)
- Use pipe character | as separator // Look for lines that match "POST X:" pattern
- Confidence must be 0.0 to 1.0 (decimal number) const postPattern = new RegExp(`POST\\s*${i + 1}:?\\s*(.+)`, "i");
- Keep reasoning brief (one sentence)
- MUST include all ${posts.length} jobs in order for (const line of lines) {
${isStudentContext ? "- When analyzing requirements, explicitly mention if requirements are too advanced or appropriate for the student level" : ""} const match = line.match(postPattern);
if (match) {
Examples: const content = match[1].trim();
JOB 1: YES | 0.9 | co-op position suitable for first year students
JOB 2: NO | 0.2 | requires 5+ years experience, too advanced // Parse: YES/NO | 0.X | reasoning
JOB 3: YES | 0.7 | entry-level role with basic requirements appropriate for students`; const parts = content.split("|").map((p) => p.trim());
// Add timeout to prevent hanging (5 minutes max) if (parts.length >= 3) {
const controller = new AbortController(); analysis.isRelevant = parts[0].toUpperCase().includes("YES");
const timeoutId = setTimeout(() => controller.abort(), 5 * 60 * 1000); // 5 minutes analysis.confidence = Math.max(
0,
try { Math.min(1, parseFloat(parts[1]) || 0.5)
const response = await fetch(`${ollamaHost}/api/generate`, { );
method: "POST", analysis.reasoning = parts[2] || "No reasoning provided";
headers: { } else {
"Content-Type": "application/json", // Fallback parsing
}, analysis.isRelevant =
body: JSON.stringify({ content.toUpperCase().includes("YES") ||
model: model, content.toLowerCase().includes("relevant");
prompt: prompt, analysis.confidence = 0.6;
stream: false, analysis.reasoning = content.substring(0, 100);
options: { }
temperature: 0.3, break;
top_p: 0.9, }
}, }
}),
signal: controller.signal, analyses.push(analysis);
}); }
clearTimeout(timeoutId); // If we didn't get enough analyses, fill in defaults
while (analyses.length < posts.length) {
if (!response.ok) { analyses.push({
throw new Error( postIndex: analyses.length + 1,
`Ollama API error: ${response.status} ${response.statusText}` isRelevant: false,
); confidence: 0.3,
} reasoning: "AI response parsing failed",
});
const data = await response.json(); }
const aiResponse = data.response.trim();
return analyses;
// Parse the response } catch (error) {
const analyses = []; logger.error(`Error in batch AI analysis: ${error.message}`);
const lines = aiResponse.split("\n").filter((line) => line.trim());
// Fallback: mark all as relevant with low confidence
// Log the raw response for debugging return posts.map((_, i) => ({
logger.debug(`AI Response length: ${aiResponse.length} chars`); postIndex: i + 1,
if (aiResponse.length > 0) { isRelevant: true,
logger.debug(`AI Response (first 1000 chars):\n${aiResponse.substring(0, 1000)}`); confidence: 0.3,
} else { reasoning: `Analysis failed: ${error.message}`,
logger.warning("⚠️ AI response is empty!"); }));
} }
}
for (let i = 0; i < posts.length; i++) {
let analysis = { /**
postIndex: i + 1, * Analyze a single post using local Ollama (fallback)
isRelevant: false, */
confidence: 0.5, async function analyzeSinglePost(
reasoning: "Could not parse AI response", text,
}; context,
model = "mistral",
// Try multiple patterns to find the post analysis ollamaHost = "http://localhost:11434"
// IMPORTANT: Try numbered patterns first, only use generic pattern as last resort ) {
const numberedPatterns = [ const prompt = `Analyze this LinkedIn post for relevance to: "${context}"
// Exact format: POST 1: YES | 0.8 | reason
new RegExp(`POST\\s*${i + 1}:?\\s*(.+)`, "i"), Post: "${text}"
// Numbered list: 1. YES | 0.8 | reason
new RegExp(`^\\s*${i + 1}[.)]\\s*(.+)`, "i"), Is this post relevant to "${context}"? Provide:
// Just the number: 1: YES | 0.8 | reason 1. YES or NO
new RegExp(`^\\s*${i + 1}:\\s*(.+)`, "i"), 2. Confidence (0.0 to 1.0)
]; 3. Brief reason
let found = false; Format: YES/NO | 0.X | reason`;
let matchedContent = null;
try {
// First, try to find a line with the specific post number const response = await fetch(`${ollamaHost}/api/generate`, {
for (const line of lines) { method: "POST",
for (const pattern of numberedPatterns) { headers: {
const match = line.match(pattern); "Content-Type": "application/json",
if (match) { },
matchedContent = match[1].trim(); body: JSON.stringify({
found = true; model: model,
break; prompt: prompt,
} stream: false,
} options: {
if (found) break; temperature: 0.3,
} },
}),
// If not found with numbered patterns, try position-based matching as fallback });
if (!found && lines.length > i) {
const targetLine = lines[i]; if (!response.ok) {
if (targetLine) { throw new Error(`Ollama API error: ${response.status}`);
// Try to parse the line even without post number }
const genericMatch = targetLine.match(/^(?:POST\s*\d+:?\s*)?(.+)$/i);
if (genericMatch) { const data = await response.json();
matchedContent = genericMatch[1].trim(); const aiResponse = data.response.trim();
found = true;
} // Parse response
} const parts = aiResponse.split("|").map((p) => p.trim());
}
if (parts.length >= 3) {
if (found && matchedContent) { return {
const content = matchedContent; isRelevant: parts[0].toUpperCase().includes("YES"),
confidence: Math.max(0, Math.min(1, parseFloat(parts[1]) || 0.5)),
// Try to parse: YES/NO | 0.X | reasoning reasoning: parts[2],
let parts = content.split("|").map((p) => p.trim()); };
} else {
// If no pipe separator, try other separators // Fallback parsing
if (parts.length < 2) { return {
// Try colon separator: YES: 0.8: reason isRelevant:
parts = content.split(":").map((p) => p.trim()); aiResponse.toLowerCase().includes("yes") ||
} aiResponse.toLowerCase().includes("relevant"),
if (parts.length < 2) { confidence: 0.6,
// Try dash separator: YES - 0.8 - reason reasoning: aiResponse.substring(0, 100),
parts = content.split("-").map((p) => p.trim()); };
} }
} catch (error) {
// Extract YES/NO return {
const relevanceText = parts[0] || content; isRelevant: true, // Default to include on error
analysis.isRelevant = confidence: 0.3,
relevanceText.toUpperCase().includes("YES") || reasoning: `Analysis failed: ${error.message}`,
relevanceText.toLowerCase().includes("relevant") || };
relevanceText.toLowerCase().includes("yes"); }
}
// Extract confidence (look for number between 0 and 1)
if (parts.length >= 2) { /**
const confidenceMatch = parts[1].match(/(0?\.\d+|1\.0|0|1)/); * Find the most recent results file if none specified
if (confidenceMatch) { */
analysis.confidence = Math.max( function findLatestResultsFile(resultsDir = "results") {
0, const fs = require("fs");
Math.min(1, parseFloat(confidenceMatch[1]) || 0.5) const path = require("path");
);
} if (!fs.existsSync(resultsDir)) {
} else { throw new Error("Results directory not found. Run the scraper first.");
// Try to find confidence in the whole content }
const confidenceMatch = content.match(/(0?\.\d+|1\.0|0|1)/);
if (confidenceMatch) { const files = fs
analysis.confidence = Math.max( .readdirSync(resultsDir)
0, .filter(
Math.min(1, parseFloat(confidenceMatch[1]) || 0.5) (f) =>
); (f.startsWith("results-") || f.startsWith("linkedin-results-")) &&
} f.endsWith(".json") &&
} !f.includes("-ai-")
)
// Extract reasoning (everything after confidence, or whole content if no structure) .sort()
if (parts.length >= 3) { .reverse();
analysis.reasoning = parts.slice(2).join(" ").trim() || parts[2] || "No reasoning provided";
} else if (parts.length === 2) { if (files.length === 0) {
// If only 2 parts, second part might be reasoning throw new Error("No results files found. Run the scraper first.");
analysis.reasoning = parts[1].substring(0, 200); }
} else {
// Use the whole content as reasoning, but remove YES/NO and confidence return path.join(resultsDir, files[0]);
let reasoning = content }
.replace(/YES|NO/gi, "")
.replace(/0?\.\d+|1\.0/g, "") module.exports = {
.replace(/\|/g, "") checkOllamaStatus,
.trim(); analyzeBatch,
analysis.reasoning = reasoning || "Analysis provided but format unclear"; analyzeSinglePost,
} findLatestResultsFile,
} };
// If still not found, try to extract from the entire response by position
if (!found && lines.length > 0) {
// Try to get the line at position i (allowing for some variance)
const targetLine = lines[Math.min(i, lines.length - 1)];
if (targetLine) {
// Extract any YES/NO indication
analysis.isRelevant =
targetLine.toUpperCase().includes("YES") ||
targetLine.toLowerCase().includes("relevant");
// Extract confidence
const confidenceMatch = targetLine.match(/(0?\.\d+|1\.0|0|1)/);
if (confidenceMatch) {
analysis.confidence = Math.max(
0,
Math.min(1, parseFloat(confidenceMatch[1]) || 0.5)
);
}
// Use the line as reasoning
analysis.reasoning = targetLine.substring(0, 200).trim() || "Parsed from unstructured response";
found = true;
}
}
// Last resort: if still not found, try to extract from the entire response text
if (!found && aiResponse.length > 0) {
// Look for any mention of relevance in the response
const responseLower = aiResponse.toLowerCase();
const hasRelevant = responseLower.includes("relevant") || responseLower.includes("yes");
analysis.isRelevant = hasRelevant;
// Try to find any confidence number
const allConfidenceMatches = aiResponse.match(/(0?\.\d+|1\.0|0|1)/g);
if (allConfidenceMatches && allConfidenceMatches.length > i) {
analysis.confidence = Math.max(
0,
Math.min(1, parseFloat(allConfidenceMatches[i]) || 0.5)
);
}
// Use a portion of the response as reasoning
const responseSnippet = aiResponse.substring(i * 100, (i + 1) * 200).trim();
analysis.reasoning = responseSnippet || "Could not parse structured response, using fallback";
logger.warning(`⚠️ Post ${i + 1}: Using fallback parsing - AI response format unclear`);
}
analyses.push(analysis);
}
// If we didn't get enough analyses, fill in defaults
while (analyses.length < posts.length) {
analyses.push({
postIndex: analyses.length + 1,
isRelevant: false,
confidence: 0.3,
reasoning: "AI response parsing failed",
});
}
return analyses;
} catch (error) {
clearTimeout(timeoutId);
if (error.name === 'AbortError') {
throw new Error('Request timeout: AI analysis took longer than 5 minutes');
}
throw error;
}
} catch (error) {
logger.error(`Error in batch AI analysis: ${error.message}`);
// Fallback: mark all as relevant with low confidence
return posts.map((_, i) => ({
postIndex: i + 1,
isRelevant: true,
confidence: 0.3,
reasoning: `Analysis failed: ${error.message}`,
}));
}
}
/**
* Analyze a single post using local Ollama (fallback)
*/
async function analyzeSinglePost(
text,
context,
model = DEFAULT_MODEL,
ollamaHost = "http://localhost:11434"
) {
const prompt = `Analyze this LinkedIn post for relevance to: "${context}"
Post: "${text}"
Is this post relevant to "${context}"? Provide:
1. YES or NO
2. Confidence (0.0 to 1.0)
3. Brief reason
Format: YES/NO | 0.X | reason`;
try {
const response = await fetch(`${ollamaHost}/api/generate`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
model: model,
prompt: prompt,
stream: false,
options: {
temperature: 0.3,
},
}),
});
if (!response.ok) {
throw new Error(`Ollama API error: ${response.status}`);
}
const data = await response.json();
const aiResponse = data.response.trim();
// Parse response
const parts = aiResponse.split("|").map((p) => p.trim());
if (parts.length >= 3) {
return {
isRelevant: parts[0].toUpperCase().includes("YES"),
confidence: Math.max(0, Math.min(1, parseFloat(parts[1]) || 0.5)),
reasoning: parts[2],
};
} else {
// Fallback parsing
return {
isRelevant:
aiResponse.toLowerCase().includes("yes") ||
aiResponse.toLowerCase().includes("relevant"),
confidence: 0.6,
reasoning: aiResponse.substring(0, 100),
};
}
} catch (error) {
return {
isRelevant: true, // Default to include on error
confidence: 0.3,
reasoning: `Analysis failed: ${error.message}`,
};
}
}
/**
* Find the most recent results file if none specified
*/
function findLatestResultsFile(resultsDir = "results") {
const fs = require("fs");
const path = require("path");
if (!fs.existsSync(resultsDir)) {
throw new Error("Results directory not found. Run the scraper first.");
}
const files = fs
.readdirSync(resultsDir)
.filter(
(f) =>
(f.startsWith("results-") || f.startsWith("linkedin-results-")) &&
f.endsWith(".json") &&
!f.includes("-ai-")
)
.sort()
.reverse();
if (files.length === 0) {
throw new Error("No results files found. Run the scraper first.");
}
return path.join(resultsDir, files[0]);
}
module.exports = {
checkOllamaStatus,
analyzeBatch,
analyzeSinglePost,
findLatestResultsFile,
DEFAULT_MODEL, // Export so other modules can use it
};

View File

@ -1,146 +1,107 @@
/** /**
* Text processing utilities for cleaning and validating content * Text processing utilities for cleaning and validating content
* Extracted from linkedout.js for reuse across parsers * Extracted from linkedout.js for reuse across parsers
*/ */
/** /**
* Clean text by removing hashtags, URLs, emojis, and normalizing whitespace * Clean text by removing hashtags, URLs, emojis, and normalizing whitespace
*/ */
function cleanText(text) { function cleanText(text) {
if (!text || typeof text !== "string") { if (!text || typeof text !== "string") {
return ""; return "";
} }
// Remove hashtags // Remove hashtags
text = text.replace(/#\w+/g, ""); text = text.replace(/#\w+/g, "");
// Remove hashtag mentions // Remove hashtag mentions
text = text.replace(/\bhashtag\b/gi, ""); text = text.replace(/\bhashtag\b/gi, "");
text = text.replace(/hashtag-\w+/gi, ""); text = text.replace(/hashtag-\w+/gi, "");
// Remove URLs // Remove URLs
text = text.replace(/https?:\/\/[^\s]+/g, ""); text = text.replace(/https?:\/\/[^\s]+/g, "");
// Remove emojis (Unicode ranges for common emoji) // Remove emojis (Unicode ranges for common emoji)
text = text.replace( text = text.replace(
/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu, /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
"" ""
); );
// Normalize whitespace // Normalize whitespace
text = text.replace(/\s+/g, " ").trim(); text = text.replace(/\s+/g, " ").trim();
return text; return text;
} }
/** /**
* Check if text contains any of the specified keywords (case insensitive) * Check if text contains any of the specified keywords (case insensitive)
*/ */
function containsAnyKeyword(text, keywords) { function containsAnyKeyword(text, keywords) {
if (!text || !Array.isArray(keywords)) { if (!text || !Array.isArray(keywords)) {
return false; return false;
} }
const lowerText = text.toLowerCase(); const lowerText = text.toLowerCase();
return keywords.some((keyword) => lowerText.includes(keyword.toLowerCase())); return keywords.some((keyword) => lowerText.includes(keyword.toLowerCase()));
} }
/** /**
* Check if text contains all of the specified keywords (case insensitive) * Validate if text meets basic quality criteria
*/ */
function containsAllKeywords(text, keywords) { function isValidText(text, minLength = 30) {
if (!text || !Array.isArray(keywords)) { if (!text || typeof text !== "string") {
return false; return false;
} }
const lowerText = text.toLowerCase(); // Check minimum length
return keywords.every((keyword) => lowerText.includes(keyword.toLowerCase())); if (text.length < minLength) {
} return false;
}
/**
* Check if text matches keyword groups with AND logic between groups and OR logic within groups // Check if text contains alphanumeric characters
* @param {string} text - Text to search in if (!/[a-zA-Z0-9]/.test(text)) {
* @param {Array<Array<string>>} keywordGroups - Array of keyword groups, each group is an array of OR keywords return false;
* @returns {boolean} - True if text matches all groups (AND logic) and at least one keyword in each group (OR logic) }
*/
function matchesKeywordGroups(text, keywordGroups) { return true;
if (!text || !Array.isArray(keywordGroups) || keywordGroups.length === 0) { }
return false;
} /**
* Extract domain from URL
const lowerText = text.toLowerCase(); */
function extractDomain(url) {
// All groups must match (AND logic) if (!url || typeof url !== "string") {
return keywordGroups.every((group) => { return null;
if (!Array.isArray(group) || group.length === 0) { }
return false;
} try {
// At least one keyword in the group must match (OR logic) const urlObj = new URL(url);
return group.some((keyword) => return urlObj.hostname;
lowerText.includes(keyword.toLowerCase().trim()) } catch (error) {
); return null;
}); }
} }
/** /**
* Validate if text meets basic quality criteria * Normalize URL by removing query parameters and fragments
*/ */
function isValidText(text, minLength = 30) { function normalizeUrl(url) {
if (!text || typeof text !== "string") { if (!url || typeof url !== "string") {
return false; return "";
} }
// Check minimum length try {
if (text.length < minLength) { const urlObj = new URL(url);
return false; return `${urlObj.protocol}//${urlObj.hostname}${urlObj.pathname}`;
} } catch (error) {
return url;
// Check if text contains alphanumeric characters }
if (!/[a-zA-Z0-9]/.test(text)) { }
return false;
} module.exports = {
cleanText,
return true; containsAnyKeyword,
} isValidText,
extractDomain,
/** normalizeUrl,
* Extract domain from URL };
*/
function extractDomain(url) {
if (!url || typeof url !== "string") {
return null;
}
try {
const urlObj = new URL(url);
return urlObj.hostname;
} catch (error) {
return null;
}
}
/**
* Normalize URL by removing query parameters and fragments
*/
function normalizeUrl(url) {
if (!url || typeof url !== "string") {
return "";
}
try {
const urlObj = new URL(url);
return `${urlObj.protocol}//${urlObj.hostname}${urlObj.pathname}`;
} catch (error) {
return url;
}
}
module.exports = {
cleanText,
containsAnyKeyword,
containsAllKeywords,
matchesKeywordGroups,
isValidText,
extractDomain,
normalizeUrl,
};

View File

@ -20,26 +20,7 @@ class CoreParser {
this.browser = await playwright.chromium.launch({ this.browser = await playwright.chromium.launch({
headless: this.config.headless headless: this.config.headless
}); });
this.context = await this.browser.newContext();
// Create context with user agent to appear more like a real browser
const contextOptions = {
userAgent: this.config.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
viewport: { width: 1920, height: 1080 },
locale: 'en-US',
timezoneId: 'America/New_York',
};
// Add extra HTTP headers to appear more legitimate
contextOptions.extraHTTPHeaders = {
'Accept-Language': 'en-US,en;q=0.9',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
};
this.context = await this.browser.newContext(contextOptions);
} }
async createPage(id) { async createPage(id) {
@ -80,7 +61,3 @@ class CoreParser {
} }
module.exports = CoreParser; module.exports = CoreParser;

View File

@ -1,9 +1,7 @@
{ {
"name": "core-parser", "name": "core-parser",
"version": "1.0.0", "version": "1.0.0",
"main": "index.js", "main": "index.js",
"description": "Core parser utilities for browser management", "description": "Core parser utilities for browser management",
"dependencies": { "dependencies": {}
"playwright": "^1.40.0" }
}
}

File diff suppressed because it is too large Load Diff

View File

@ -10,10 +10,7 @@ const path = require("path");
const fs = require("fs"); const fs = require("fs");
const CoreParser = require("../core-parser"); const CoreParser = require("../core-parser");
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy"); const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy"); const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer");
const { indeedStrategy } = require("./strategies/indeed-strategy");
const { logger, analyzeBatch, checkOllamaStatus, DEFAULT_MODEL } = require("ai-analyzer");
const { convertResultsToCsv } = require("./src/csv-utils");
// Load environment variables // Load environment variables
require("dotenv").config({ path: path.join(__dirname, ".env") }); require("dotenv").config({ path: path.join(__dirname, ".env") });
@ -21,23 +18,16 @@ require("dotenv").config({ path: path.join(__dirname, ".env") });
// Configuration from environment // Configuration from environment
const HEADLESS = process.env.HEADLESS !== "false"; const HEADLESS = process.env.HEADLESS !== "false";
const SEARCH_KEYWORDS = const SEARCH_KEYWORDS =
process.env.SEARCH_KEYWORDS || "co-op,intern";//"software engineer,developer,programmer"; process.env.SEARCH_KEYWORDS || "software engineer,developer,programmer";
const LOCATION_FILTER = process.env.LOCATION_FILTER; const LOCATION_FILTER = process.env.LOCATION_FILTER;
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true"; const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
const AI_CONTEXT = process.env.AI_CONTEXT || "Job market analysis focusing on job postings, skills, and trends";
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5; const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
const OUTPUT_FORMAT = process.env.OUTPUT_FORMAT || "json"; // "json", "csv", or "both"
const MIN_DATE = process.env.MIN_DATE; // Minimum posted date (format: YYYY-MM-DD)
const USE_AND_LOGIC = process.env.USE_AND_LOGIC === "true"; // Use AND logic for keywords
// Available site strategies // Available site strategies
const SITE_STRATEGIES = { const SITE_STRATEGIES = {
skipthedrive: skipthedriveStrategy, skipthedrive: skipthedriveStrategy,
linkedin: linkedinJobsStrategy,
indeed: indeedStrategy,
// Add more site strategies here // Add more site strategies here
// indeed: indeedStrategy,
// glassdoor: glassdoorStrategy, // glassdoor: glassdoorStrategy,
}; };
@ -51,10 +41,6 @@ function parseArguments() {
keywords: null, keywords: null,
locationFilter: null, locationFilter: null,
maxPages: MAX_PAGES, maxPages: MAX_PAGES,
excludeRejected: EXCLUDE_REJECTED,
outputFormat: OUTPUT_FORMAT,
minDate: MIN_DATE,
useAndLogic: USE_AND_LOGIC, // Use AND logic instead of OR logic for keywords (from env or CLI)
}; };
args.forEach((arg) => { args.forEach((arg) => {
@ -71,26 +57,7 @@ function parseArguments() {
} else if (arg.startsWith("--location=")) { } else if (arg.startsWith("--location=")) {
options.locationFilter = arg.split("=")[1]; options.locationFilter = arg.split("=")[1];
} else if (arg.startsWith("--max-pages=")) { } else if (arg.startsWith("--max-pages=")) {
const value = arg.split("=")[1]; options.maxPages = parseInt(arg.split("=")[1]) || MAX_PAGES;
// Support "all" or "0" to mean unlimited pages
if (value === "all" || value === "0") {
options.maxPages = 0; // 0 means unlimited
} else {
options.maxPages = parseInt(value) || MAX_PAGES;
}
} else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
options.excludeRejected = true;
} else if (arg.startsWith("--output=") || arg.startsWith("--format=")) {
const format = arg.split("=")[1].toLowerCase();
if (["json", "csv", "both"].includes(format)) {
options.outputFormat = format;
} else {
logger.warning(`⚠️ Unknown output format: ${format}. Using default: json`);
}
} else if (arg.startsWith("--min-date=")) {
options.minDate = arg.split("=")[1];
} else if (arg === "--and" || arg === "--all-keywords") {
options.useAndLogic = true; // CLI flag overrides env variable
} }
}); });
@ -113,136 +80,21 @@ async function startJobSearchParser(options = {}) {
logger.step("🚀 Job Search Parser Starting..."); logger.step("🚀 Job Search Parser Starting...");
// Parse keywords // Parse keywords
let keywords = const keywords =
finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim()); finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
// Parse keyword groups if AND logic is enabled and keywords contain pipe (|) separator
// Format: "co-op|intern,summer 2026" means (co-op OR intern) AND (summer 2026)
let keywordGroups = null;
if (finalOptions.useAndLogic && keywords.some(k => k.includes('|'))) {
keywordGroups = keywords.map(group =>
group.split('|').map(k => k.trim()).filter(k => k.length > 0)
);
logger.info(`🔍 Keyword Groups: ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
}
const locationFilter = finalOptions.locationFilter || LOCATION_FILTER; const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
const sites = finalOptions.sites; const sites = finalOptions.sites;
const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED;
logger.info(`📦 Selected job sites: ${sites.join(", ")}`); logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`); logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
if (keywordGroups) {
logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
} else {
logger.info(`🔗 Keyword Logic: ${finalOptions.useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
}
logger.info(`📍 Location Filter: ${locationFilter || "None"}`); logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
const minDate = finalOptions.minDate || MIN_DATE;
if (minDate) {
logger.info(`📅 Min Date Filter: ${minDate} (jobs posted after this date)`);
}
logger.info( logger.info(
`🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? "Enabled" : "Disabled"}` `🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? "Enabled" : "Disabled"}`
); );
if (ENABLE_AI_ANALYSIS) {
logger.info(` Context: "${AI_CONTEXT}"`);
logger.info(` Model: ${OLLAMA_MODEL}`);
}
const allResults = []; const allResults = [];
const allRejectedResults = []; const allRejectedResults = [];
const siteResults = {}; const siteResults = {};
let analysisResults = null;
// Initialize results directory and file for incremental saving
const resultsDir = path.join(__dirname, "results");
if (!fs.existsSync(resultsDir)) {
fs.mkdirSync(resultsDir, { recursive: true });
}
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const outputFormat = finalOptions.outputFormat || OUTPUT_FORMAT;
let incrementalJsonFilepath = null;
let incrementalCsvFilepath = null;
// Initialize incremental save files
if (outputFormat === "json" || outputFormat === "both") {
const jsonFilename = `job-search-results-${timestamp}.json`;
incrementalJsonFilepath = path.join(resultsDir, jsonFilename);
}
if (outputFormat === "csv" || outputFormat === "both") {
const csvFilename = `job-search-results-${timestamp}.csv`;
incrementalCsvFilepath = path.join(resultsDir, csvFilename);
}
/**
* Save results incrementally as they're found
*/
const saveIncrementalResults = (currentResults, currentRejectedResults, currentSiteResults, currentAnalysisResults = null, isComplete = false) => {
try {
const outputData = {
metadata: {
extractedAt: new Date().toISOString(),
parser: "job-search-parser",
version: "2.0.0",
sites: sites,
keywords: keywords.join(", "),
locationFilter,
aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined,
aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined,
analysisResults: currentAnalysisResults,
rejectedJobsExcluded: excludeRejected,
isComplete: isComplete,
lastUpdated: new Date().toISOString(),
},
results: currentResults,
siteResults: currentSiteResults,
};
if (!excludeRejected) {
outputData.rejectedResults = currentRejectedResults;
}
// Save JSON incrementally
if (incrementalJsonFilepath) {
fs.writeFileSync(incrementalJsonFilepath, JSON.stringify(outputData, null, 2));
}
// Save CSV incrementally (convert on each save)
if (incrementalCsvFilepath) {
const csvContent = convertResultsToCsv(outputData);
fs.writeFileSync(incrementalCsvFilepath, csvContent);
}
if (!isComplete) {
logger.info(`💾 Incremental save: ${currentResults.length} results saved to ${incrementalJsonFilepath || incrementalCsvFilepath}`);
}
} catch (error) {
logger.warning(`⚠️ Failed to save incremental results: ${error.message}`);
}
};
// Save initial empty state
saveIncrementalResults([], [], {}, null, false);
// Set up signal handlers for graceful shutdown
let isShuttingDown = false;
const gracefulShutdown = async (signal) => {
if (isShuttingDown) return;
isShuttingDown = true;
logger.warning(`\n⚠️ Received ${signal}, saving current results before exit...`);
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
logger.info(`💾 Saved ${allResults.length} results before shutdown`);
await coreParser.cleanup();
process.exit(0);
};
process.on('SIGINT', () => gracefulShutdown('SIGINT'));
process.on('SIGTERM', () => gracefulShutdown('SIGTERM'));
// Process each selected site // Process each selected site
for (const site of sites) { for (const site of sites) {
@ -256,49 +108,18 @@ async function startJobSearchParser(options = {}) {
logger.step(`\n🌐 Parsing ${site}...`); logger.step(`\n🌐 Parsing ${site}...`);
const startTime = Date.now(); const startTime = Date.now();
// Prepare strategy options const parseResult = await strategy(coreParser, {
const strategyOptions = {
keywords, keywords,
keywordGroups, // Pass grouped keywords if available
locationFilter, locationFilter,
maxPages: finalOptions.maxPages, maxPages: finalOptions.maxPages,
useAndLogic: finalOptions.useAndLogic || false, });
};
// Add credentials for LinkedIn
if (site === "linkedin") {
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
logger.error(`❌ LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`);
siteResults[site] = {
count: 0,
rejected: 0,
duration: "0s",
error: "LinkedIn credentials not found",
};
continue;
}
strategyOptions.credentials = {
username: LINKEDIN_USERNAME,
password: LINKEDIN_PASSWORD,
};
strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
strategyOptions.minDate = minDate; // Add date filter for LinkedIn
}
const parseResult = await strategy(coreParser, strategyOptions);
const { results, rejectedResults, summary } = parseResult; const { results, rejectedResults, summary } = parseResult;
const duration = ((Date.now() - startTime) / 1000).toFixed(2); const duration = ((Date.now() - startTime) / 1000).toFixed(2);
// Collect results // Collect results
logger.info(`📦 Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`);
allResults.push(...results); allResults.push(...results);
allRejectedResults.push(...rejectedResults); allRejectedResults.push(...rejectedResults);
logger.info(`📦 Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`);
siteResults[site] = { siteResults[site] = {
count: results.length, count: results.length,
@ -310,9 +131,6 @@ async function startJobSearchParser(options = {}) {
logger.success( logger.success(
`${site} completed in ${duration}s - Found ${results.length} jobs` `${site} completed in ${duration}s - Found ${results.length} jobs`
); );
// Save results incrementally after each site
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
} catch (error) { } catch (error) {
logger.error(`${site} parsing failed: ${error.message}`); logger.error(`${site} parsing failed: ${error.message}`);
siteResults[site] = { siteResults[site] = {
@ -321,126 +139,60 @@ async function startJobSearchParser(options = {}) {
duration: "0s", duration: "0s",
error: error.message, error: error.message,
}; };
// Save even on error to preserve what we have
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
} }
} }
// AI Analysis if enabled // AI Analysis if enabled
// Save results before AI analysis (in case AI analysis takes a long time) let analysisResults = null;
if (allResults.length > 0) {
saveIncrementalResults(allResults, allRejectedResults, siteResults, null, false);
}
if (ENABLE_AI_ANALYSIS && allResults.length > 0) { if (ENABLE_AI_ANALYSIS && allResults.length > 0) {
logger.step("🧠 Running AI Analysis..."); logger.step("🧠 Running AI Analysis...");
const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL); const ollamaStatus = await checkOllamaStatus();
if (ollamaAvailable) { if (ollamaStatus.available) {
// Prepare data for analysis (analyzeBatch expects objects with 'text' field) analysisResults = await analyzeBatch(allResults, {
const analysisData = allResults.map((job) => { context:
// Build comprehensive text including all available job information "Job market analysis focusing on job postings, skills, and trends",
const parts = [];
if (job.title) parts.push(`Title: ${job.title}`);
if (job.company) parts.push(`Company: ${job.company}`);
if (job.description) parts.push(`Description: ${job.description}`);
if (job.roleDuties) parts.push(`Role Duties: ${job.roleDuties}`);
if (job.jobRequirements) parts.push(`Requirements: ${job.jobRequirements}`);
return {
text: parts.join("\n\n"),
location: job.location || "",
keyword: job.keyword || "",
timestamp: job.extractedAt || job.postedDate || "",
roleDuties: job.roleDuties || "",
jobRequirements: job.jobRequirements || "",
};
}); });
// Process in smaller batches to avoid timeouts (5 jobs per batch)
const BATCH_SIZE = parseInt(process.env.AI_BATCH_SIZE) || 5;
analysisResults = [];
for (let i = 0; i < analysisData.length; i += BATCH_SIZE) {
const batch = analysisData.slice(i, i + BATCH_SIZE);
const batchNumber = Math.floor(i / BATCH_SIZE) + 1;
const totalBatches = Math.ceil(analysisData.length / BATCH_SIZE);
logger.info(` Processing batch ${batchNumber}/${totalBatches} (${batch.length} jobs)...`);
try {
const batchResults = await analyzeBatch(
batch,
AI_CONTEXT,
OLLAMA_MODEL
);
analysisResults.push(...batchResults);
logger.success(` ✅ Batch ${batchNumber} completed`);
} catch (error) {
logger.error(` ❌ Batch ${batchNumber} failed: ${error.message}`);
// Add fallback results for this batch
const fallbackResults = batch.map((_, idx) => ({
postIndex: i + idx + 1,
isRelevant: true,
confidence: 0.3,
reasoning: `Analysis failed: ${error.message}`,
}));
analysisResults.push(...fallbackResults);
}
}
// Embed AI analysis into each job result
allResults.forEach((job, index) => {
if (analysisResults && analysisResults[index]) {
job.aiAnalysis = {
isRelevant: analysisResults[index].isRelevant,
confidence: analysisResults[index].confidence,
reasoning: analysisResults[index].reasoning,
context: AI_CONTEXT,
model: OLLAMA_MODEL,
analyzedAt: new Date().toISOString(),
};
}
});
logger.success( logger.success(
`✅ AI Analysis completed for ${allResults.length} jobs` `✅ AI Analysis completed for ${allResults.length} jobs`
); );
// Save results after AI analysis completes
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
} else { } else {
logger.warning("⚠️ Ollama not available, skipping AI analysis"); logger.warning("⚠️ Ollama not available, skipping AI analysis");
} }
} }
// Final save with complete flag // Save results
logger.info(`💾 Preparing final save: ${allResults.length} results, ${allRejectedResults.length} rejected`); const outputData = {
logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`); metadata: {
extractedAt: new Date().toISOString(),
if (!excludeRejected) { parser: "job-search-parser",
logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`); version: "2.0.0",
} else { sites: sites,
logger.info(`⏭️ Excluding rejected results (EXCLUDE_REJECTED=true)`); keywords: keywords.join(", "),
} locationFilter,
analysisResults,
logger.info(`💾 Final output: ${allResults.length} results, ${allRejectedResults.length} rejected`); },
results: allResults,
rejectedResults: allRejectedResults,
siteResults,
};
// Final save with isComplete flag const resultsDir = path.join(__dirname, "results");
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, true); if (!fs.existsSync(resultsDir)) {
fs.mkdirSync(resultsDir, { recursive: true });
const savedFiles = []; }
if (incrementalJsonFilepath) savedFiles.push(incrementalJsonFilepath);
if (incrementalCsvFilepath) savedFiles.push(incrementalCsvFilepath); const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const filename = `job-search-results-${timestamp}.json`;
const filepath = path.join(resultsDir, filename);
fs.writeFileSync(filepath, JSON.stringify(outputData, null, 2));
// Final summary // Final summary
logger.step("\n📊 Job Search Parser Summary"); logger.step("\n📊 Job Search Parser Summary");
logger.success(`✅ Total jobs found: ${allResults.length}`); logger.success(`✅ Total jobs found: ${allResults.length}`);
logger.info(`❌ Total rejected: ${allRejectedResults.length}`); logger.info(`❌ Total rejected: ${allRejectedResults.length}`);
logger.info(`📁 Results saved to:`); logger.info(`📁 Results saved to: ${filepath}`);
savedFiles.forEach(filepath => {
logger.info(` ${filepath}`);
});
logger.info("\n📈 Results by site:"); logger.info("\n📈 Results by site:");
for (const [site, stats] of Object.entries(siteResults)) { for (const [site, stats] of Object.entries(siteResults)) {
@ -455,31 +207,6 @@ async function startJobSearchParser(options = {}) {
logger.success("\n✅ Job Search Parser completed successfully!"); logger.success("\n✅ Job Search Parser completed successfully!");
// Construct output data for return
const outputData = {
metadata: {
extractedAt: new Date().toISOString(),
parser: "job-search-parser",
version: "2.0.0",
sites: sites,
keywords: keywords.join(", "),
locationFilter,
aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined,
aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined,
analysisResults: analysisResults,
rejectedJobsExcluded: excludeRejected,
isComplete: true,
lastUpdated: new Date().toISOString(),
},
results: allResults,
siteResults: siteResults,
};
if (!excludeRejected) {
outputData.rejectedResults = allRejectedResults;
}
return outputData; return outputData;
} catch (error) { } catch (error) {
logger.error(`❌ Job Search Parser failed: ${error.message}`); logger.error(`❌ Job Search Parser failed: ${error.message}`);

View File

@ -1,345 +1,332 @@
/** /**
* SkipTheDrive Job Parser * SkipTheDrive Job Parser
* *
* Parses remote job listings from SkipTheDrive.com * Parses remote job listings from SkipTheDrive.com
* Supports keyword search, job type filters, and pagination * Supports keyword search, job type filters, and pagination
*/ */
const { chromium } = require("playwright"); const { chromium } = require("playwright");
const path = require("path"); const path = require("path");
// Import from ai-analyzer core package // Import from ai-analyzer core package
const { const {
logger, logger,
cleanText, cleanText,
containsAnyKeyword, containsAnyKeyword,
containsAllKeywords, parseLocationFilters,
parseLocationFilters, validateLocationAgainstFilters,
validateLocationAgainstFilters, extractLocationFromProfile,
extractLocationFromProfile, analyzeBatch,
analyzeBatch, checkOllamaStatus,
checkOllamaStatus, } = require("../../ai-analyzer");
} = require("../../ai-analyzer");
/**
/** * Build search URL for SkipTheDrive
* Build search URL for SkipTheDrive * @param {string} keyword - Search keyword
* @param {string} keyword - Search keyword * @param {string} orderBy - Sort order (date, relevance)
* @param {string} orderBy - Sort order (date, relevance) * @param {Array<string>} jobTypes - Job types to filter (part time, full time, contract)
* @param {Array<string>} jobTypes - Job types to filter (part time, full time, contract) * @returns {string} - Formatted search URL
* @returns {string} - Formatted search URL */
*/ function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) { let url = `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`;
let url = `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`;
if (orderBy) {
if (orderBy) { url += `&orderby=${orderBy}`;
url += `&orderby=${orderBy}`; }
}
// Add job type filters
// Add job type filters jobTypes.forEach((type) => {
jobTypes.forEach((type) => { url += `&jobtype=${encodeURIComponent(type)}`;
url += `&jobtype=${encodeURIComponent(type)}`; });
});
return url;
return url; }
}
/**
/** * Extract job data from a single job listing element
* Extract job data from a single job listing element * @param {Element} article - Job listing DOM element
* @param {Element} article - Job listing DOM element * @returns {Object} - Extracted job data
* @returns {Object} - Extracted job data */
*/ async function extractJobData(article) {
async function extractJobData(article) { try {
try { // Extract job title and URL
// Extract job title and URL const titleElement = await article.$("h2.post-title a");
const titleElement = await article.$("h2.post-title a"); const title = titleElement ? await titleElement.textContent() : "";
const title = titleElement ? await titleElement.textContent() : ""; const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";
const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";
// Extract date
// Extract date const dateElement = await article.$("time.post-date");
const dateElement = await article.$("time.post-date"); const datePosted = dateElement
const datePosted = dateElement ? await dateElement.getAttribute("datetime")
? await dateElement.getAttribute("datetime") : "";
: ""; const dateText = dateElement ? await dateElement.textContent() : "";
const dateText = dateElement ? await dateElement.textContent() : "";
// Extract company name
// Extract company name const companyElement = await article.$(
const companyElement = await article.$( ".custom_fields_company_name_display_search_results"
".custom_fields_company_name_display_search_results" );
); let company = companyElement ? await companyElement.textContent() : "";
let company = companyElement ? await companyElement.textContent() : ""; company = company.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon
company = company.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon
// Extract days ago
// Extract days ago const daysAgoElement = await article.$(
const daysAgoElement = await article.$( ".custom_fields_job_date_display_search_results"
".custom_fields_job_date_display_search_results" );
); let daysAgo = daysAgoElement ? await daysAgoElement.textContent() : "";
let daysAgo = daysAgoElement ? await daysAgoElement.textContent() : ""; daysAgo = daysAgo.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon
daysAgo = daysAgo.replace(/^\s*[^\s]+\s*/, "").trim(); // Remove icon
// Extract job description excerpt
// Extract job description excerpt const excerptElement = await article.$(".excerpt_part");
const excerptElement = await article.$(".excerpt_part"); const description = excerptElement
const description = excerptElement ? await excerptElement.textContent()
? await excerptElement.textContent() : "";
: "";
// Check if featured/sponsored
// Check if featured/sponsored const featuredElement = await article.$(".custom_fields_sponsored_job");
const featuredElement = await article.$(".custom_fields_sponsored_job"); const isFeatured = !!featuredElement;
const isFeatured = !!featuredElement;
// Extract job ID from article ID
// Extract job ID from article ID const articleId = await article.getAttribute("id");
const articleId = await article.getAttribute("id"); const jobId = articleId ? articleId.replace("post-", "") : "";
const jobId = articleId ? articleId.replace("post-", "") : "";
return {
return { jobId,
jobId, title: cleanText(title),
title: cleanText(title), company: cleanText(company),
company: cleanText(company), jobUrl,
jobUrl, datePosted,
datePosted, dateText: cleanText(dateText),
dateText: cleanText(dateText), daysAgo: cleanText(daysAgo),
daysAgo: cleanText(daysAgo), description: cleanText(description),
description: cleanText(description), isFeatured,
isFeatured, source: "skipthedrive",
source: "skipthedrive", timestamp: new Date().toISOString(),
timestamp: new Date().toISOString(), };
}; } catch (error) {
} catch (error) { logger.error(`Error extracting job data: ${error.message}`);
logger.error(`Error extracting job data: ${error.message}`); return null;
return null; }
} }
}
/**
/** * Parse SkipTheDrive job listings
* Parse SkipTheDrive job listings * @param {Object} options - Parser options
* @param {Object} options - Parser options * @returns {Promise<Array>} - Array of parsed job listings
* @returns {Promise<Array>} - Array of parsed job listings */
*/ async function parseSkipTheDrive(options = {}) {
async function parseSkipTheDrive(options = {}) { const {
const { keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [
keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [ "software engineer",
"software engineer", "developer",
"developer", ],
], jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [],
jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [], locationFilter = process.env.LOCATION_FILTER || "",
locationFilter = process.env.LOCATION_FILTER || "", maxPages = parseInt(process.env.MAX_PAGES) || 5,
maxPages = parseInt(process.env.MAX_PAGES) || 5, headless = process.env.HEADLESS !== "false",
headless = process.env.HEADLESS !== "false", enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
enableAI = process.env.ENABLE_AI_ANALYSIS === "true", aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis", } = options;
useAndLogic = false, // Use AND logic instead of OR logic for keywords
} = options; logger.step("Starting SkipTheDrive parser...");
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
logger.step("Starting SkipTheDrive parser..."); logger.info(
logger.info(`🔍 Keywords: ${keywords.join(", ")}`); `📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`); );
logger.info( logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
`📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}` logger.info(`📄 Max Pages: ${maxPages}`);
);
logger.info(`📍 Location Filter: ${locationFilter || "None"}`); const browser = await chromium.launch({
logger.info(`📄 Max Pages: ${maxPages}`); headless,
args: [
const browser = await chromium.launch({ "--no-sandbox",
headless, "--disable-setuid-sandbox",
args: [ "--disable-dev-shm-usage",
"--no-sandbox", ],
"--disable-setuid-sandbox", });
"--disable-dev-shm-usage",
], const context = await browser.newContext({
}); userAgent:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
const context = await browser.newContext({ });
userAgent:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", const results = [];
}); const rejectedResults = [];
const seenJobs = new Set();
const results = [];
const rejectedResults = []; try {
const seenJobs = new Set(); // Search for each keyword
for (const keyword of keywords) {
try { logger.info(`\n🔍 Searching for: ${keyword}`);
// For AND logic, combine all keywords into a single search query
// For OR logic, search each keyword separately const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords; const page = await context.newPage();
// Search for each keyword (or combined keyword for AND logic) try {
for (const keyword of searchKeywords) { logger.info(
logger.info(`\n🔍 Searching for: ${keyword}`); `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}`
);
const searchUrl = buildSearchUrl(keyword, "date", jobTypes); await page.goto(searchUrl, {
const page = await context.newPage(); waitUntil: "domcontentloaded",
timeout: 30000,
try { });
logger.info( logger.info(
`Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}` `Navigation completed successfully at ${new Date().toISOString()}`
); );
await page.goto(searchUrl, {
waitUntil: "domcontentloaded", // Wait for job listings to load
timeout: 30000, logger.info("Waiting for selector #loops-wrapper");
}); await page
logger.info( .waitForSelector("#loops-wrapper", { timeout: 5000 })
`Navigation completed successfully at ${new Date().toISOString()}` .catch(() => {
); logger.warning(`No results found for keyword: ${keyword}`);
});
// Wait for job listings to load logger.info("Selector wait completed");
logger.info("Waiting for selector #loops-wrapper");
await page let currentPage = 1;
.waitForSelector("#loops-wrapper", { timeout: 5000 }) let hasNextPage = true;
.catch(() => {
logger.warning(`No results found for keyword: ${keyword}`); while (hasNextPage && currentPage <= maxPages) {
}); logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
logger.info("Selector wait completed");
// Extract all job articles on current page
let currentPage = 1; const jobArticles = await page.$$("article[id^='post-']");
let hasNextPage = true; logger.info(
`Found ${jobArticles.length} job listings on page ${currentPage}`
while (hasNextPage && currentPage <= maxPages) { );
logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
for (const article of jobArticles) {
// Extract all job articles on current page const jobData = await extractJobData(article);
const jobArticles = await page.$$("article[id^='post-']");
logger.info( if (!jobData || seenJobs.has(jobData.jobId)) {
`Found ${jobArticles.length} job listings on page ${currentPage}` continue;
); }
for (const article of jobArticles) { seenJobs.add(jobData.jobId);
const jobData = await extractJobData(article);
// Add keyword that found this job
if (!jobData || seenJobs.has(jobData.jobId)) { jobData.searchKeyword = keyword;
continue;
} // Validate job against keywords
const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
seenJobs.add(jobData.jobId); if (!containsAnyKeyword(fullText, keywords)) {
rejectedResults.push({
// Add keyword that found this job ...jobData,
jobData.searchKeyword = keyword; rejected: true,
reason: "Keywords not found in job listing",
// Validate job against keywords });
const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`; continue;
const keywordMatch = useAndLogic }
? containsAllKeywords(fullText, keywords)
: containsAnyKeyword(fullText, keywords); // Location validation (if enabled)
if (locationFilter) {
if (!keywordMatch) { const locationFilters = parseLocationFilters(locationFilter);
rejectedResults.push({ // For SkipTheDrive, most jobs are remote, but we can check the title/description
...jobData, const locationValid =
rejected: true, fullText.toLowerCase().includes("remote") ||
reason: useAndLogic locationFilters.some((filter) =>
? "Not all keywords found in job listing" fullText.toLowerCase().includes(filter.toLowerCase())
: "Keywords not found in job listing", );
});
continue; if (!locationValid) {
} rejectedResults.push({
...jobData,
// Location validation (if enabled) rejected: true,
if (locationFilter) { reason: "Location requirements not met",
const locationFilters = parseLocationFilters(locationFilter); });
// For SkipTheDrive, most jobs are remote, but we can check the title/description continue;
const locationValid = }
fullText.toLowerCase().includes("remote") ||
locationFilters.some((filter) => jobData.locationValid = locationValid;
fullText.toLowerCase().includes(filter.toLowerCase()) }
);
logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`);
if (!locationValid) { results.push(jobData);
rejectedResults.push({ }
...jobData,
rejected: true, // Check for next page
reason: "Location requirements not met", const nextPageLink = await page.$("a.nextp");
}); if (nextPageLink && currentPage < maxPages) {
continue; logger.info("📄 Moving to next page...");
} await nextPageLink.click();
await page.waitForLoadState("domcontentloaded");
jobData.locationValid = locationValid; await page.waitForTimeout(2000); // Wait for content to load
} currentPage++;
} else {
logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`); hasNextPage = false;
results.push(jobData); }
} }
} catch (error) {
// Check for next page logger.error(`Error processing keyword "${keyword}": ${error.message}`);
const nextPageLink = await page.$("a.nextp"); } finally {
if (nextPageLink && currentPage < maxPages) { await page.close();
logger.info("📄 Moving to next page..."); }
await nextPageLink.click(); }
await page.waitForLoadState("domcontentloaded");
await page.waitForTimeout(2000); // Wait for content to load logger.success(`\n✅ Parsing complete!`);
currentPage++; logger.info(`📊 Total jobs found: ${results.length}`);
} else { logger.info(`❌ Rejected jobs: ${rejectedResults.length}`);
hasNextPage = false;
} // Run AI analysis if enabled
} let aiAnalysis = null;
} catch (error) { if (enableAI && results.length > 0) {
logger.error(`Error processing keyword "${keyword}": ${error.message}`); logger.step("Running AI analysis on job listings...");
} finally {
await page.close(); const aiAvailable = await checkOllamaStatus();
} if (aiAvailable) {
} const analysisData = results.map((job) => ({
text: `${job.title} at ${job.company}. ${job.description}`,
logger.success(`\n✅ Parsing complete!`); metadata: {
logger.info(`📊 Total jobs found: ${results.length}`); jobId: job.jobId,
logger.info(`❌ Rejected jobs: ${rejectedResults.length}`); company: job.company,
daysAgo: job.daysAgo,
// Run AI analysis if enabled },
let aiAnalysis = null; }));
if (enableAI && results.length > 0) {
logger.step("Running AI analysis on job listings..."); aiAnalysis = await analyzeBatch(analysisData, aiContext);
const aiAvailable = await checkOllamaStatus(); // Merge AI analysis with results
if (aiAvailable) { results.forEach((job, index) => {
const analysisData = results.map((job) => ({ if (aiAnalysis && aiAnalysis[index]) {
text: `${job.title} at ${job.company}. ${job.description}`, job.aiAnalysis = {
metadata: { isRelevant: aiAnalysis[index].isRelevant,
jobId: job.jobId, confidence: aiAnalysis[index].confidence,
company: job.company, reasoning: aiAnalysis[index].reasoning,
daysAgo: job.daysAgo, };
}, }
})); });
aiAnalysis = await analyzeBatch(analysisData, aiContext); logger.success("✅ AI analysis completed");
} else {
// Merge AI analysis with results logger.warning("⚠️ AI not available - skipping analysis");
results.forEach((job, index) => { }
if (aiAnalysis && aiAnalysis[index]) { }
job.aiAnalysis = {
isRelevant: aiAnalysis[index].isRelevant, return {
confidence: aiAnalysis[index].confidence, results,
reasoning: aiAnalysis[index].reasoning, rejectedResults,
}; metadata: {
} source: "skipthedrive",
}); totalJobs: results.length,
rejectedJobs: rejectedResults.length,
logger.success("✅ AI analysis completed"); keywords: keywords,
} else { jobTypes: jobTypes,
logger.warning("⚠️ AI not available - skipping analysis"); locationFilter: locationFilter,
} aiAnalysisEnabled: enableAI,
} aiAnalysisCompleted: !!aiAnalysis,
timestamp: new Date().toISOString(),
return { },
results, };
rejectedResults, } catch (error) {
metadata: { logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
source: "skipthedrive", throw error;
totalJobs: results.length, } finally {
rejectedJobs: rejectedResults.length, await browser.close();
keywords: keywords, }
jobTypes: jobTypes, }
locationFilter: locationFilter,
aiAnalysisEnabled: enableAI, // Export the parser
aiAnalysisCompleted: !!aiAnalysis, module.exports = {
timestamp: new Date().toISOString(), parseSkipTheDrive,
}, buildSearchUrl,
}; extractJobData,
} catch (error) { };
logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
throw error;
} finally {
await browser.close();
}
}
// Export the parser
module.exports = {
parseSkipTheDrive,
buildSearchUrl,
extractJobData,
};

View File

@ -1,116 +0,0 @@
/**
* CSV Utilities
*
* Functions for converting job search results to CSV format
*/
/**
 * Escapes a single CSV field value per RFC 4180.
 *
 * Wraps the value in double quotes when it contains a comma, a double
 * quote, or a line break — including a bare "\r" (previously only "\n"
 * was checked, so carriage returns from Windows-style text could break
 * row structure) — doubling any embedded quotes.
 *
 * @param {*} value - The value to escape; null/undefined become "".
 * @returns {string} - The escaped value, safe to join with commas.
 */
function escapeCsvField(value) {
  if (value === null || value === undefined) {
    return "";
  }
  const stringValue = String(value);
  // Quote when the value contains a delimiter, a quote, or any line break.
  if (/[",\n\r]/.test(stringValue)) {
    return `"${stringValue.replace(/"/g, '""')}"`;
  }
  return stringValue;
}
/**
 * Converts job results to CSV format.
 *
 * Emits one header row followed by one row per job. Plain columns are
 * read straight off the job object; "ai*" columns are pulled from the
 * nested job.aiAnalysis object when present (blank otherwise).
 *
 * Fix: field lookups now use `??` instead of `||` so legitimate falsy
 * values (e.g. a numeric confidence of 0, a boolean false) survive into
 * the CSV instead of being blanked.
 *
 * @param {Array<Object>} jobs - Array of job objects.
 * @param {Object} [metadata=null] - Metadata object (currently unused;
 *   kept for backward compatibility with existing callers).
 * @returns {string} - CSV string ("" when jobs is empty or missing).
 */
function convertJobsToCsv(jobs, metadata = null) {
  if (!jobs || jobs.length === 0) {
    return "";
  }

  // CSV columns based on the job object structure.
  const columns = [
    "jobId",
    "title",
    "company",
    "location",
    "jobUrl",
    "postedDate",
    "description",
    "roleDuties",
    "jobRequirements",
    "jobType",
    "experienceLevel",
    "keyword",
    "extractedAt",
    "source",
    "aiRelevant",
    "aiConfidence",
    "aiReasoning",
    "aiContext",
    "aiModel",
    "aiAnalyzedAt",
  ];

  // Explicit extractor per AI column, replacing the previous fragile
  // string-surgery mapping ("aiRelevant" -> "relevant", etc.).
  const aiExtractors = {
    aiRelevant: (ai) => (ai.isRelevant ? "Yes" : "No"),
    aiConfidence: (ai) => ai.confidence ?? "",
    aiReasoning: (ai) => ai.reasoning ?? "",
    aiContext: (ai) => ai.context ?? "",
    aiModel: (ai) => ai.model ?? "",
    aiAnalyzedAt: (ai) => ai.analyzedAt ?? "",
  };

  // Header row.
  const headerRow = columns.map((col) => escapeCsvField(col)).join(",");

  // Data rows.
  const dataRows = jobs.map((job) =>
    columns
      .map((col) => {
        if (col in aiExtractors) {
          // AI fields live on the nested aiAnalysis object; blank when
          // no analysis was attached to this job.
          return job.aiAnalysis
            ? escapeCsvField(aiExtractors[col](job.aiAnalysis))
            : "";
        }
        return escapeCsvField(job[col] ?? "");
      })
      .join(",")
  );

  return [headerRow, ...dataRows].join("\n");
}
/**
 * Converts a full results object (with metadata) to CSV.
 *
 * @param {Object} resultsData - Full results object containing a
 *   `results` array and optional `metadata`.
 * @returns {string} - CSV string ("" when there is nothing to convert).
 */
function convertResultsToCsv(resultsData) {
  const jobs = resultsData?.results;
  if (!jobs) {
    return "";
  }
  return convertJobsToCsv(jobs, resultsData.metadata);
}
// Public API: CSV conversion helpers for job search results.
module.exports = {
  convertJobsToCsv,
  convertResultsToCsv,
  escapeCsvField,
};

View File

@ -1,947 +0,0 @@
/**
* Indeed Parsing Strategy
*
* Uses core-parser for browser management and ai-analyzer for utilities
*/
const {
logger,
cleanText,
containsAnyKeyword,
containsAllKeywords,
matchesKeywordGroups,
validateLocationAgainstFilters,
} = require("ai-analyzer");
/**
 * Builds an Indeed job-search URL, always sorted newest-first.
 *
 * @param {string} keyword - Search query ("q" parameter).
 * @param {string} [location=""] - Location string ("l" parameter).
 * @param {Object} [filters={}] - Optional filters: fromage (days back),
 *   jobType, remote, experienceLevel.
 * @returns {string} - Fully-qualified Indeed search URL.
 */
function buildSearchUrl(keyword, location = "", filters = {}) {
  const url = new URL("https://www.indeed.com/jobs");
  url.searchParams.append("q", keyword);
  // Sort by date (newest first).
  url.searchParams.append("sort", "date");

  if (location) {
    url.searchParams.append("l", location);
  }
  if (filters.fromage) {
    // fromage is in days (e.g., 1 = last 24 hours, 7 = last 7 days, 30 = last 30 days)
    url.searchParams.append("fromage", filters.fromage);
  }
  if (filters.jobType) {
    // jt=fulltime, parttime, contract, internship, temporary
    url.searchParams.append("jt", filters.jobType);
  }
  if (filters.remote) {
    url.searchParams.append("remote", "true");
  }
  if (filters.experienceLevel) {
    // explvl=entry_level, mid_level, senior_level
    url.searchParams.append("explvl", filters.experienceLevel);
  }
  return url.toString();
}
/**
 * Indeed parsing strategy function.
 *
 * Drives a core-parser-managed browser page through Indeed search results
 * for each configured keyword: builds the search URL, navigates (with
 * fallback navigation modes), detects CAPTCHA/blocking pages, paginates,
 * and filters extracted jobs by keyword logic and location.
 *
 * @param {Object} coreParser - Browser manager exposing createPage() and
 *   navigateTo(); see core-parser (not visible in this file).
 * @param {Object} [options={}] - Search options; see destructured
 *   defaults below for the full list.
 * @returns {Promise<{results: Array, rejectedResults: Array, summary: Object}>}
 *   Never rejects: a fatal error is caught and reported via summary.error.
 */
async function indeedStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer"],
    keywordGroups = null, // Array of keyword groups for grouped AND/OR logic
    locationFilter = null,
    maxPages = 5,
    location = "", // Indeed location search (e.g., "Toronto, ON", "Canada")
    minDate = null, // Minimum posted date (format: YYYY-MM-DD)
    useAndLogic = false, // Use AND logic instead of OR logic for keywords
  } = options;
  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set(); // de-dupes by jobId across keywords/pages
  try {
    // Create main page
    const page = await coreParser.createPage("indeed-main");
    logger.info("🚀 Starting Indeed parser...");
    logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
    if (keywordGroups) {
      logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
    } else {
      logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
    }
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(`🌍 Indeed Location: ${location || "None"}`);
    logger.info(`📄 Max Pages: ${maxPages}`);
    // Convert minDate to fromage (days ago) — Indeed filters by relative age.
    let fromage = null;
    if (minDate) {
      try {
        const minDateObj = new Date(minDate);
        const now = new Date();
        const daysDiff = Math.floor((now - minDateObj) / (1000 * 60 * 60 * 24));
        if (daysDiff > 0 && daysDiff <= 30) {
          fromage = daysDiff;
          logger.info(`📅 Min Date Filter: ${minDate} (${fromage} days ago)`);
        } else if (daysDiff > 30) {
          fromage = 30; // Indeed's maximum is typically 30 days
          logger.info(`📅 Min Date Filter: ${minDate} (limited to 30 days)`);
        }
        // NOTE(review): an unparseable minDate yields an Invalid Date and
        // NaN daysDiff, which silently skips the filter rather than
        // reaching the catch below — confirm whether that is intended.
      } catch (error) {
        logger.warning(`⚠️ Invalid date format for minDate: ${minDate}. Expected format: YYYY-MM-DD`);
      }
    }
    // Determine search keywords based on logic type
    let searchKeywords;
    if (keywordGroups) {
      // For grouped AND/OR logic, search each keyword in each group (OR within groups)
      searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups
    } else if (useAndLogic) {
      // For simple AND logic, combine all keywords into a single search query
      searchKeywords = [keywords.join(" ")];
    } else {
      // For OR logic, search each keyword separately
      searchKeywords = keywords;
    }
    // Search for each keyword (or combined keyword for AND logic)
    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching Indeed for: "${keyword}"`);
      const searchUrl = buildSearchUrl(keyword, location, {
        fromage: fromage,
      });
      logger.info(`🔗 Search URL: ${searchUrl}`);
      try {
        // Navigate to job search results
        // Use domcontentloaded instead of networkidle for faster loading
        // Indeed can be slow to fully load, so we'll wait for DOM and then check for content
        try {
          await coreParser.navigateTo(searchUrl, {
            pageId: "indeed-main",
            retries: 2,
            waitUntil: "domcontentloaded",
            timeout: 60000, // Increase timeout to 60 seconds
          });
        } catch (navError) {
          // If navigation fails, try with load event instead
          logger.warning(`⚠️ Initial navigation failed, trying with 'load' event: ${navError.message}`);
          try {
            await coreParser.navigateTo(searchUrl, {
              pageId: "indeed-main",
              retries: 1,
              waitUntil: "load",
              timeout: 60000,
            });
          } catch (loadError) {
            // Last resort: try direct page navigation
            logger.warning(`⚠️ Load event failed, trying direct navigation: ${loadError.message}`);
            await page.goto(searchUrl, { timeout: 60000, waitUntil: "domcontentloaded" }).catch(() => {
              throw new Error(`Failed to navigate to Indeed after all attempts: ${loadError.message}`);
            });
          }
        }
        // Wait for page to load and let JavaScript execute
        await new Promise((resolve) => setTimeout(resolve, 5000));
        // Check if we're on the right page
        const currentUrl = page.url();
        logger.info(`📍 Current page URL: ${currentUrl}`);
        // Check if we were redirected or blocked (check URL first)
        if (currentUrl.includes('captcha') || currentUrl.includes('blocked') || currentUrl.includes('access-denied') || currentUrl.includes('verify')) {
          logger.error(`❌ Indeed appears to be blocking access. URL: ${currentUrl}`);
          throw new Error('Indeed is showing a CAPTCHA or verification page. Please try running in non-headless mode (set HEADLESS=false in .env) or wait and try again later.');
        }
        // Check page content for CAPTCHA/human verification indicators
        try {
          const pageContent = await page.evaluate(() => {
            const bodyText = document.body?.textContent?.toLowerCase() || '';
            const title = document.title?.toLowerCase() || '';
            // Check for common CAPTCHA/verification indicators
            const captchaIndicators = [
              'verify you\'re human',
              'verify you are human',
              'captcha',
              'prove you\'re not a robot',
              'unusual traffic',
              'automated queries',
              'please verify',
              'security check',
              'access denied',
              'blocked',
            ];
            const foundIndicators = captchaIndicators.filter(indicator =>
              bodyText.includes(indicator) || title.includes(indicator)
            );
            return {
              hasCaptcha: foundIndicators.length > 0,
              indicators: foundIndicators,
              title: document.title,
              bodyPreview: bodyText.substring(0, 500),
            };
          });
          if (pageContent.hasCaptcha) {
            logger.error(`❌ Indeed is showing a CAPTCHA/verification page.`);
            logger.error(` Detected indicators: ${pageContent.indicators.join(', ')}`);
            logger.error(` Page title: ${pageContent.title}`);
            logger.error(`\n💡 Solutions:`);
            logger.error(` 1. Run in non-headless mode: Set HEADLESS=false in .env file`);
            logger.error(` 2. Wait a few minutes and try again`);
            logger.error(` 3. Use a different IP address or VPN`);
            logger.error(` 4. Manually solve the CAPTCHA in a browser, then try again`);
            throw new Error(`Indeed CAPTCHA detected: ${pageContent.indicators.join(', ')}. Please see suggestions above.`);
          }
        } catch (checkError) {
          // If the check itself fails, log but don't throw (might be a different error)
          if (checkError.message.includes('CAPTCHA')) {
            throw checkError; // Re-throw CAPTCHA errors
          }
          logger.debug(`Could not check for CAPTCHA: ${checkError.message}`);
        }
        // Check for results count
        try {
          const resultsText = await page.evaluate(() => {
            const countElement = document.querySelector(".jobsearch-JobCountAndSortPane-jobCount");
            return countElement ? countElement.textContent : "No results count found";
          });
          logger.info(`📊 Indeed results info: ${resultsText}`);
        } catch (e) {
          logger.debug(`Could not get results count: ${e.message}`);
        }
        // Wait for job listings container — Indeed's markup varies, so
        // probe a cascade of known container selectors.
        let hasResults = false;
        const possibleSelectors = [
          "#mosaic-provider-jobcards",
          ".job_seen_beacon",
          "[data-jk]",
          ".jobsearch-SerpJobCard",
          ".jobCard",
        ];
        for (const selector of possibleSelectors) {
          try {
            await page.waitForSelector(selector, { timeout: 5000 });
            const count = await page.$$(selector).then((elements) => elements.length);
            if (count > 0) {
              hasResults = true;
              logger.info(`✅ Found job results container with selector: ${selector} (${count} jobs)`);
              break;
            }
          } catch (e) {
            // Try next selector
            continue;
          }
        }
        if (!hasResults) {
          logger.warning(`⚠️ No job results container found for keyword: ${keyword}`);
          continue;
        }
        // Process multiple pages
        let currentPage = 1;
        const maxPagesToProcess = maxPages > 0 ? maxPages : 999; // 0 means unlimited
        logger.info(`📄 Processing pages (max: ${maxPagesToProcess === 999 ? 'unlimited' : maxPagesToProcess}) for "${keyword}"...`);
        while (currentPage <= maxPagesToProcess) {
          logger.info(`📄 Processing page ${currentPage}...`);
          // Wait for page to fully load
          await new Promise((resolve) => setTimeout(resolve, 2000));
          // Extract jobs from current page
          const pageJobs = await extractJobsFromPage(page, keyword, locationFilter);
          logger.info(`📋 Extracted ${pageJobs.length} jobs from page ${currentPage}`);
          if (pageJobs.length === 0) {
            logger.warning(`⚠️ No jobs found on page ${currentPage}, stopping pagination`);
            break;
          }
          // Process each job
          for (const job of pageJobs) {
            // Skip duplicates
            if (seenJobs.has(job.jobId)) {
              continue;
            }
            seenJobs.add(job.jobId);
            // Validate keywords based on logic type
            if (keywordGroups) {
              // Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR)
              const fullText = `${job.title} ${job.description} ${job.company}`;
              if (!matchesKeywordGroups(fullText, keywordGroups)) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: "Job does not match all keyword groups",
                });
                continue;
              }
            } else if (useAndLogic) {
              // Simple AND logic: all keywords must match
              const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase();
              if (!containsAllKeywords(fullText, keywords)) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: "Not all keywords found in job listing",
                });
                continue;
              }
            }
            // Validate location if filtering enabled
            if (locationFilter) {
              const locationValid = validateLocationAgainstFilters(
                job.location,
                locationFilter
              );
              if (!locationValid.isValid) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: locationValid.reasoning || "Location filter mismatch",
                });
                continue;
              }
            }
            results.push(job);
          }
          // Check if there's a next page
          const hasNext = await hasNextPageAvailable(page);
          if (!hasNext) {
            logger.info(`✅ No more pages available. Total jobs extracted: ${results.length}`);
            break;
          }
          // Navigate to next page if we haven't reached maxPages
          if (currentPage < maxPagesToProcess) {
            logger.info(`➡️ Navigating to page ${currentPage + 1}...`);
            const navigationSuccess = await navigateToNextPage(page);
            if (!navigationSuccess) {
              logger.warning(`⚠️ Failed to navigate to next page, stopping pagination`);
              break;
            }
            currentPage++;
          } else {
            logger.info(`📊 Reached max pages limit (${maxPagesToProcess}). Total jobs extracted: ${results.length}`);
            break;
          }
        }
        // NOTE(review): results/rejectedResults accumulate across keywords,
        // so the per-keyword totals below include earlier keywords' jobs.
        const totalExtracted = results.length + rejectedResults.length;
        logger.info(`📋 Extracted ${results.length} accepted jobs, ${rejectedResults.length} rejected jobs (${totalExtracted} total) across ${currentPage} page(s) for "${keyword}"`);
      } catch (error) {
        // Per-keyword failure is non-fatal: log and try the next keyword.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
        logger.error(`Stack: ${error.stack}`);
      }
    }
    logger.info(
      `🎯 Indeed parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
    );
    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "indeed",
      },
    };
  } catch (error) {
    // Fatal failure: still return whatever was collected, with the error
    // recorded in the summary.
    logger.error(`❌ Indeed parsing failed: ${error.message}`);
    logger.error(`Stack: ${error.stack}`);
    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "indeed",
        error: error.message,
      },
    };
  }
}
/**
 * Collects job records from every job card visible on the current page.
 *
 * Locates job-card elements via a cascade of fallback selectors (Indeed
 * job listings are typically divs with a data-jk attribute), scrolls each
 * card into view on a best-effort basis, and delegates per-card scraping
 * to extractJobData().
 *
 * @param {Object} page - Browser page handle.
 * @param {string} keyword - Search keyword that produced these results.
 * @param {string|Object|null} locationFilter - Accepted for signature
 *   compatibility; location filtering happens in the caller.
 * @returns {Promise<Array<Object>>} - Extracted jobs (possibly empty).
 */
async function extractJobsFromPage(page, keyword, locationFilter) {
  const jobs = [];
  // Indeed job listings are typically in divs with data-jk attribute (job key)
  const cardSelectors = [
    "[data-jk]",
    ".job_seen_beacon",
    ".jobsearch-SerpJobCard",
    ".jobCard",
    "div[data-testid='job-card']",
  ];
  try {
    let jobElements = [];
    for (const selector of cardSelectors) {
      try {
        await page.waitForSelector(selector, { timeout: 3000 }).catch(() => {});
        const found = await page.$$(selector);
        if (found.length > 0) {
          jobElements = found;
          logger.info(`✅ Found ${jobElements.length} job elements using selector: ${selector}`);
          break;
        }
      } catch (e) {
        continue;
      }
    }
    if (jobElements.length === 0) {
      logger.warning(`⚠️ No job elements found with any selector`);
      return jobs;
    }
    for (const card of jobElements) {
      try {
        // Best-effort scroll: some cards live in hidden or lazy-loaded
        // containers, so never let a scroll failure block extraction.
        try {
          const scrollTimeout = new Promise((_, reject) =>
            setTimeout(() => reject(new Error('Scroll timeout')), 2000)
          );
          await Promise.race([card.scrollIntoViewIfNeeded(), scrollTimeout]);
          await new Promise((resolve) => setTimeout(resolve, 100));
        } catch (scrollError) {
          // Fall back to a plain DOM scrollIntoView.
          try {
            await card.evaluate((el) => {
              el.scrollIntoView({ behavior: 'auto', block: 'center' });
            });
            await new Promise((resolve) => setTimeout(resolve, 100));
          } catch (simpleScrollError) {
            // Data extraction can still proceed without scrolling.
            logger.debug(`Could not scroll element into view, continuing anyway: ${simpleScrollError.message}`);
          }
        }
        const job = await extractJobData(card, keyword);
        if (job && (job.title || job.jobId)) {
          jobs.push(job);
        }
      } catch (error) {
        logger.warning(`Failed to extract job data: ${error.message}`);
      }
    }
  } catch (error) {
    logger.error(`Failed to extract jobs from page: ${error.message}`);
  }
  return jobs;
}
/**
 * Extracts structured job data from a single Indeed job-card element.
 *
 * Phase 1 runs in the page context, scraping raw fields through cascades
 * of fallback selectors (Indeed's markup varies across experiments).
 * Phase 2 runs Node-side: cleans text, resolves the job URL, and stamps
 * provenance metadata.
 *
 * Fixes over the previous version:
 *  - salary condition had an operator-precedence bug:
 *    `text && text.includes("$") || text.match(/\d+/)` parsed as
 *    `(text && includes) || match`; now explicitly parenthesized;
 *  - the "nothing extracted" null-return ran *after* a fallback jobId was
 *    generated, so it could never fire; the check now precedes it;
 *  - deprecated `substr` replaced with `slice`; `parseInt` given a radix.
 *
 * @param {Object} jobElement - Element handle for one job card.
 * @param {string} keyword - Search keyword that produced this card.
 * @returns {Promise<Object|null>} - Normalized job record, or null when
 *   the card yields neither a job id nor a title, or extraction throws.
 */
async function extractJobData(jobElement, keyword) {
  try {
    const jobData = await jobElement.evaluate((el) => {
      const data = {
        jobId: "",
        title: "",
        company: "",
        location: "",
        jobUrl: "",
        postedDate: "",
        description: "",
        salary: "",
        jobType: "",
      };
      // Extract job ID from data-jk attribute (Indeed's "job key").
      data.jobId = el.getAttribute("data-jk") || "";
      // Extract title and URL (the title anchor usually carries the link).
      const titleSelectors = [
        "h2.jobTitle a",
        "h2.jobTitle",
        "a[data-jk]",
        "h2 a",
        ".jobTitle a",
        "[class*='jobTitle'] a",
      ];
      for (const selector of titleSelectors) {
        const titleElement = el.querySelector(selector);
        if (titleElement) {
          data.title = titleElement.textContent?.trim() || titleElement.innerText?.trim() || "";
          if (titleElement.tagName === "A") {
            data.jobUrl = titleElement.getAttribute("href") || "";
          } else {
            const link = titleElement.querySelector("a");
            if (link) {
              data.jobUrl = link.getAttribute("href") || "";
            }
          }
          if (data.title) break;
        }
      }
      // Extract company name.
      const companySelectors = [
        "[data-testid='company-name']",
        ".companyName",
        "[class*='companyName']",
        "span.companyName",
        "a[data-testid='company-name']",
      ];
      for (const selector of companySelectors) {
        const companyElement = el.querySelector(selector);
        if (companyElement) {
          const text = companyElement.textContent?.trim() || companyElement.innerText?.trim() || "";
          if (text && text.length > 0) {
            data.company = text;
            break;
          }
        }
      }
      // Extract location.
      const locationSelectors = [
        "[data-testid='job-location']",
        ".companyLocation",
        "[class*='companyLocation']",
        "[class*='location']",
      ];
      for (const selector of locationSelectors) {
        const locationElement = el.querySelector(selector);
        if (locationElement) {
          const text = locationElement.textContent?.trim() || locationElement.innerText?.trim() || "";
          if (text && text.length > 0) {
            data.location = text;
            break;
          }
        }
      }
      // Extract salary.
      const salarySelectors = [
        "[data-testid='attribute_snippet_testid']",
        ".salary-snippet",
        "[class*='salary']",
        ".salaryText",
      ];
      for (const selector of salarySelectors) {
        const salaryElement = el.querySelector(selector);
        if (salaryElement) {
          const text = salaryElement.textContent?.trim() || salaryElement.innerText?.trim() || "";
          // Accept only text that looks like a salary: contains "$" or a digit.
          if (text && (text.includes("$") || text.match(/\d+/))) {
            data.salary = text;
            break;
          }
        }
      }
      // Extract posted date.
      const dateSelectors = [
        "[data-testid='myJobsStateDate']",
        ".date",
        "[class*='date']",
        "span.date",
      ];
      for (const selector of dateSelectors) {
        const dateElement = el.querySelector(selector);
        if (dateElement) {
          const text = dateElement.textContent?.trim() || dateElement.innerText?.trim() || "";
          if (text) {
            // Parse relative dates like "2 days ago", "Just posted", etc.
            // into YYYY-MM-DD; anything unrecognized is kept verbatim.
            const now = new Date();
            if (text.match(/just posted|today/i)) {
              data.postedDate = now.toISOString().split("T")[0];
            } else if (text.match(/\d+\s*(day|days)/i)) {
              const match = text.match(/(\d+)\s*day/i);
              if (match) {
                const daysAgo = parseInt(match[1], 10);
                const date = new Date(now);
                date.setDate(date.getDate() - daysAgo);
                data.postedDate = date.toISOString().split("T")[0];
              }
            } else {
              data.postedDate = text;
            }
            break;
          }
        }
      }
      // Extract description snippet.
      const descSelectors = [
        ".job-snippet",
        "[class*='job-snippet']",
        "[class*='summary']",
        ".summary",
      ];
      for (const selector of descSelectors) {
        const descElement = el.querySelector(selector);
        if (descElement) {
          const text = descElement.textContent?.trim() || descElement.innerText?.trim() || "";
          // Require a minimum length so stray labels aren't mistaken for
          // the snippet.
          if (text && text.length > 20) {
            data.description = text.substring(0, 500); // Limit description length
            break;
          }
        }
      }
      return data;
    });
    // Clean and format Node-side.
    const title = cleanText(jobData.title);
    let jobUrl = jobData.jobUrl || "";
    // Make URL absolute if relative; fall back to the canonical viewjob URL.
    if (jobUrl && !jobUrl.startsWith("http")) {
      if (jobUrl.startsWith("/")) {
        jobUrl = `https://www.indeed.com${jobUrl}`;
      } else {
        jobUrl = `https://www.indeed.com/viewjob?jk=${jobData.jobId}`;
      }
    } else if (!jobUrl && jobData.jobId) {
      jobUrl = `https://www.indeed.com/viewjob?jk=${jobData.jobId}`;
    }
    // Bail out when the card yielded neither a real Indeed job key nor a
    // title — there is nothing useful to report.
    if (!jobData.jobId && !title) {
      return null;
    }
    // Generate a synthetic id when Indeed didn't expose data-jk.
    const jobId =
      jobData.jobId ||
      `indeed-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;
    return {
      jobId,
      title,
      company: cleanText(jobData.company),
      location: cleanText(jobData.location),
      jobUrl,
      postedDate: jobData.postedDate,
      description: cleanText(jobData.description),
      salary: cleanText(jobData.salary),
      jobType: jobData.jobType,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "indeed",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Splits a raw job description into duties text and requirements text.
 *
 * The description is broken into blank-line-separated blocks; each block
 * is routed to the "duties" or "requirements" bucket based on the most
 * recently seen section header (requirement headers take priority when a
 * block matches both). Blocks before any header default to duties. If no
 * header is ever matched and both buckets end up empty, the text is split
 * ~60/40 between the two fields as a fallback.
 *
 * @param {string} description - Raw job description text.
 * @returns {{duties: string, requirements: string}} - Separated sections
 *   (both "" for empty/blank input).
 */
function parseDutiesAndRequirements(description) {
  if (!description || description.trim().length === 0) {
    return { duties: "", requirements: "" };
  }

  // Headers that signal a duties/responsibilities section.
  const dutyHeaders = [
    /responsibilities?:/i,
    /duties?:/i,
    /what you['\u2019]ll do/i,
    /key responsibilities/i,
    /your role/i,
    /position overview/i,
    /about the role/i,
  ];
  // Headers that signal a requirements/qualifications section.
  const requirementHeaders = [
    /requirements?:/i,
    /qualifications?:/i,
    /must have/i,
    /required:/i,
    /what you['\u2019]ll bring/i,
    /you have:/i,
    /skills required/i,
    /minimum requirements/i,
  ];

  // Break into blank-line-separated blocks, dropping empty ones.
  const blocks = description
    .split(/\n\s*\n|\r\n\s*\r\n/)
    .filter((block) => block.trim().length > 0);

  let mode = "duties";
  const duties = [];
  const requirements = [];

  for (const block of blocks) {
    if (requirementHeaders.some((re) => re.test(block))) {
      mode = "requirements";
    } else if (dutyHeaders.some((re) => re.test(block))) {
      mode = "duties";
    }
    (mode === "requirements" ? requirements : duties).push(block.trim());
  }

  let dutiesText = duties.join("\n\n");
  let requirementsText = requirements.join("\n\n");

  // Fallback: nothing was bucketed at all — split ~60/40 so both fields
  // carry content.
  if (!dutiesText && !requirementsText) {
    const cut = Math.floor(description.length * 0.6);
    dutiesText = description.substring(0, cut).trim();
    requirementsText = description.substring(cut).trim();
  }

  return {
    duties: dutiesText,
    requirements: requirementsText,
  };
}
/**
 * Checks whether an enabled "Next" pagination control exists on the page.
 *
 * Probes a cascade of known next-button selectors; a match counts only
 * when the element is not marked disabled (attribute, aria-disabled, or
 * a "disabled" CSS class).
 *
 * @param {Object} page - Browser page handle.
 * @returns {Promise<boolean>} - true when a clickable next-page control
 *   was found; false otherwise (including on errors).
 */
async function hasNextPageAvailable(page) {
  const candidates = [
    "a[aria-label='Next']",
    "a[aria-label='Next Page']",
    "a[data-testid='pagination-page-next']",
    "[data-testid='pagination-page-next']",
    "a[aria-label*='Next']",
  ];
  try {
    for (const selector of candidates) {
      try {
        const link = await page.$(selector);
        if (!link) {
          continue;
        }
        // Treat the control as unusable when it is explicitly disabled.
        const disabled = await link
          .evaluate(
            (el) =>
              el.hasAttribute("disabled") ||
              el.getAttribute("aria-disabled") === "true" ||
              el.classList.contains("disabled")
          )
          .catch(() => false);
        if (!disabled) {
          return true;
        }
      } catch (e) {
        continue;
      }
    }
    return false;
  } catch (error) {
    logger.debug(`Error checking for next page: ${error.message}`);
    return false;
  }
}
/**
 * Navigate to the next page of Indeed search results.
 *
 * Finds an enabled "Next" control, clicks it, then waits for either a URL
 * change (full navigation) or new job cards (AJAX navigation), running
 * CAPTCHA checks along the way.
 *
 * @param {Object} page - Browser page handle.
 * @returns {Promise<boolean>} - true when job cards are present after the
 *   click; false when no usable next button exists or the new page shows
 *   no job cards.
 */
async function navigateToNextPage(page) {
  try {
    const nextButtonSelectors = [
      "a[aria-label='Next']",
      "a[aria-label='Next Page']",
      "a[data-testid='pagination-page-next']",
      "[data-testid='pagination-page-next']",
      "a[aria-label*='Next']",
    ];
    for (const selector of nextButtonSelectors) {
      try {
        const nextButton = await page.$(selector);
        if (nextButton) {
          // Skip buttons marked disabled via attribute, ARIA, or class.
          const isDisabled = await nextButton.evaluate((el) => {
            return el.hasAttribute("disabled") ||
              el.getAttribute("aria-disabled") === "true" ||
              el.classList.contains("disabled");
          }).catch(() => false);
          if (!isDisabled) {
            // Get current URL before navigation
            const urlBefore = page.url();
            await nextButton.scrollIntoViewIfNeeded().catch(() => {});
            await new Promise((resolve) => setTimeout(resolve, 500));
            await nextButton.click();
            logger.info(`✅ Clicked next page button`);
            // Wait for navigation to complete (URL change or content load)
            // Indeed might use AJAX, so wait for either URL change or content update
            let navigationComplete = false;
            const maxWaitTime = 10000; // 10 seconds max wait
            const startTime = Date.now();
            while (!navigationComplete && (Date.now() - startTime) < maxWaitTime) {
              await new Promise((resolve) => setTimeout(resolve, 500));
              // Check if URL changed (full page navigation)
              const currentUrl = page.url();
              if (currentUrl !== urlBefore) {
                logger.info(`📍 URL changed to: ${currentUrl}`);
                navigationComplete = true;
                break;
              }
              // Check if job elements appeared (AJAX navigation)
              const jobCount = await page.$$eval(
                "[data-jk], .job_seen_beacon, .jobsearch-SerpJobCard, .jobCard",
                (elements) => elements.length
              ).catch(() => 0);
              if (jobCount > 0) {
                logger.info(`✅ Found ${jobCount} job elements (AJAX navigation)`);
                navigationComplete = true;
                break;
              }
            }
            // Additional wait for content to stabilize
            await new Promise((resolve) => setTimeout(resolve, 2000));
            // Check for CAPTCHA after navigation.
            // NOTE(review): CAPTCHA errors thrown from here are caught by
            // the per-selector `catch (e) { continue; }` below, so they do
            // not propagate to the caller — confirm whether that is
            // intended.
            const currentUrl = page.url();
            if (currentUrl.includes('captcha') || currentUrl.includes('verify') || currentUrl.includes('blocked')) {
              logger.error(`❌ CAPTCHA detected after navigation to page. URL: ${currentUrl}`);
              throw new Error('Indeed is showing a CAPTCHA. Please run in non-headless mode (HEADLESS=false) or wait and try again.');
            }
            // Check page content for CAPTCHA
            try {
              const hasCaptcha = await page.evaluate(() => {
                const bodyText = document.body?.textContent?.toLowerCase() || '';
                const indicators = ['verify you\'re human', 'captcha', 'unusual traffic', 'automated queries'];
                return indicators.some(ind => bodyText.includes(ind));
              });
              if (hasCaptcha) {
                logger.error(`❌ CAPTCHA detected on page content after navigation`);
                throw new Error('Indeed CAPTCHA detected. Please run in non-headless mode (HEADLESS=false) to solve it manually.');
              }
            } catch (captchaError) {
              // Re-throw only CAPTCHA errors; evaluate() failures are ignored.
              if (captchaError.message.includes('CAPTCHA')) {
                throw captchaError;
              }
            }
            // Scroll page to trigger any lazy loading
            try {
              await page.evaluate(() => {
                window.scrollTo(0, 300);
              });
              await new Promise((resolve) => setTimeout(resolve, 1000));
            } catch (e) {
              // Ignore scroll errors
            }
            // Final check for job elements with multiple selectors
            const finalJobCount = await page.$$eval(
              "[data-jk], .job_seen_beacon, .jobsearch-SerpJobCard, .jobCard, div[data-testid='job-card']",
              (elements) => elements.length
            ).catch(() => 0);
            if (finalJobCount > 0) {
              logger.info(`✅ Navigation successful, found ${finalJobCount} job elements`);
              return true;
            } else {
              logger.warning(`⚠️ No job elements found after navigation (waited ${maxWaitTime}ms)`);
              // Debug: check what's on the page
              try {
                const pageTitle = await page.title();
                const pageUrl = page.url();
                logger.debug(`Page title: ${pageTitle}, URL: ${pageUrl}`);
                // Check if it's a CAPTCHA page
                const bodyText = await page.evaluate(() => document.body?.textContent?.toLowerCase() || '');
                if (bodyText.includes('captcha') || bodyText.includes('verify')) {
                  logger.error(`❌ Page appears to be a CAPTCHA page`);
                  throw new Error('Indeed is showing a CAPTCHA. Please run in non-headless mode (HEADLESS=false).');
                }
              } catch (e) {
                if (e.message.includes('CAPTCHA')) {
                  throw e;
                }
                // Ignore other debug errors
              }
              return false;
            }
          }
        }
      } catch (e) {
        // Try the next selector on any failure (see NOTE(review) above:
        // this also swallows the CAPTCHA errors thrown in this block).
        continue;
      }
    }
    logger.warning(`⚠️ Could not find or click next page button`);
    return false;
  } catch (error) {
    logger.warning(`Failed to navigate to next page: ${error.message}`);
    return false;
  }
}
// Public API: the Indeed strategy entry point and its URL builder.
module.exports = {
  indeedStrategy,
  buildSearchUrl,
};

File diff suppressed because it is too large Load Diff

View File

@ -1,493 +1,302 @@
/** /**
* SkipTheDrive Parsing Strategy * SkipTheDrive Parsing Strategy
* *
* Uses core-parser for browser management and ai-analyzer for utilities * Uses core-parser for browser management and ai-analyzer for utilities
*/ */
const { const {
logger, logger,
cleanText, cleanText,
containsAnyKeyword, containsAnyKeyword,
containsAllKeywords, validateLocationAgainstFilters,
matchesKeywordGroups, } = require("ai-analyzer");
validateLocationAgainstFilters,
} = require("ai-analyzer"); /**
* SkipTheDrive URL builder
/** */
* SkipTheDrive URL builder function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
*/ const baseUrl = "https://www.skipthedrive.com/";
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) { const params = new URLSearchParams({
const baseUrl = "https://www.skipthedrive.com/"; s: keyword,
const params = new URLSearchParams({ orderby: orderBy,
s: keyword, });
orderby: orderBy,
}); if (jobTypes && jobTypes.length > 0) {
params.append("job_type", jobTypes.join(","));
if (jobTypes && jobTypes.length > 0) { }
params.append("job_type", jobTypes.join(","));
} return `${baseUrl}?${params.toString()}`;
}
return `${baseUrl}?${params.toString()}`;
} /**
* SkipTheDrive parsing strategy function
/** */
* SkipTheDrive parsing strategy function async function skipthedriveStrategy(coreParser, options = {}) {
*/ const {
async function skipthedriveStrategy(coreParser, options = {}) { keywords = ["software engineer", "developer", "programmer"],
const { locationFilter = null,
keywords = ["software engineer", "developer", "programmer"], maxPages = 5,
keywordGroups = null, // Array of keyword groups for grouped AND/OR logic jobTypes = [],
locationFilter = null, } = options;
maxPages = 5,
jobTypes = [], const results = [];
useAndLogic = false, // Use AND logic instead of OR logic for keywords const rejectedResults = [];
} = options; const seenJobs = new Set();
const results = []; try {
const rejectedResults = []; // Create main page
const seenJobs = new Set(); const page = await coreParser.createPage("skipthedrive-main");
try { logger.info("🚀 Starting SkipTheDrive parser...");
// Create main page logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
const page = await coreParser.createPage("skipthedrive-main"); logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
logger.info(`📄 Max Pages: ${maxPages}`);
logger.info("🚀 Starting SkipTheDrive parser...");
logger.info(`🔍 Keywords: ${keywords.join(", ")}`); // Search for each keyword
if (keywordGroups) { for (const keyword of keywords) {
logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`); logger.info(`\n🔍 Searching for: ${keyword}`);
} else {
logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`); const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
}
logger.info(`📍 Location Filter: ${locationFilter || "None"}`); try {
logger.info(`📄 Max Pages: ${maxPages}`); // Navigate to search results
await coreParser.navigateTo(searchUrl, {
// Determine search keywords based on logic type pageId: "skipthedrive-main",
let searchKeywords; retries: 2,
if (keywordGroups) { timeout: 30000,
// For grouped AND/OR logic, search each keyword in each group (OR within groups) });
searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups
} else if (useAndLogic) { // Wait for job listings to load
// For simple AND logic, combine all keywords into a single search query const hasResults = await coreParser
searchKeywords = [keywords.join(" ")]; .waitForSelector(
} else { "#loops-wrapper",
// For OR logic, search each keyword separately {
searchKeywords = keywords; timeout: 5000,
} },
"skipthedrive-main"
// Search for each keyword (or combined keyword for AND logic) )
for (const keyword of searchKeywords) { .catch(() => {
logger.info(`\n🔍 Searching for: ${keyword}`); logger.warning(`No results found for keyword: ${keyword}`);
return false;
const searchUrl = buildSearchUrl(keyword, "date", jobTypes); });
try { if (!hasResults) {
// Navigate to search results continue;
await coreParser.navigateTo(searchUrl, { }
pageId: "skipthedrive-main",
retries: 2, // Process multiple pages
timeout: 30000, let currentPage = 1;
}); let hasNextPage = true;
// Wait for job listings to load while (hasNextPage && currentPage <= maxPages) {
const hasResults = await page logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
.waitForSelector("#loops-wrapper", {
timeout: 5000, // Extract jobs from current page
}) const pageJobs = await extractJobsFromPage(
.then(() => true) page,
.catch(() => { keyword,
logger.warning(`No results found for keyword: ${keyword}`); locationFilter
return false; );
});
for (const job of pageJobs) {
if (!hasResults) { // Skip duplicates
continue; if (seenJobs.has(job.jobId)) continue;
} seenJobs.add(job.jobId);
// Process multiple pages // Validate location if filtering enabled
let currentPage = 1; if (locationFilter) {
let hasNextPage = true; const locationValid = validateLocationAgainstFilters(
job.location,
while (hasNextPage && currentPage <= maxPages) { locationFilter
logger.info(`📄 Processing page ${currentPage} for "${keyword}"`); );
// Extract jobs from current page if (!locationValid) {
const pageJobs = await extractJobsFromPage( rejectedResults.push({
page, ...job,
keyword, rejectionReason: "Location filter mismatch",
locationFilter, });
keywords, continue;
keywordGroups, }
useAndLogic }
);
results.push(job);
for (const job of pageJobs) { }
// Skip duplicates
if (seenJobs.has(job.jobId)) continue; // Check for next page
seenJobs.add(job.jobId); hasNextPage = await hasNextPageAvailable(page);
if (hasNextPage && currentPage < maxPages) {
// Validate keywords based on logic type await navigateToNextPage(page, currentPage + 1);
if (keywordGroups) { currentPage++;
// Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR)
const fullText = `${job.title} ${job.description} ${job.company}`; // Wait for new page to load
if (!matchesKeywordGroups(fullText, keywordGroups)) { await page.waitForTimeout(2000);
rejectedResults.push({ } else {
...job, hasNextPage = false;
rejectionReason: "Job does not match all keyword groups", }
}); }
continue; } catch (error) {
} logger.error(`Error processing keyword "${keyword}": ${error.message}`);
} else if (useAndLogic) { }
// Simple AND logic: all keywords must match }
const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase();
if (!containsAllKeywords(fullText, keywords)) { logger.info(
rejectedResults.push({ `🎯 SkipTheDrive parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
...job, );
rejectionReason: "Not all keywords found in job listing",
}); return {
continue; results,
} rejectedResults,
} summary: {
totalJobs: results.length,
// Validate location if filtering enabled totalRejected: rejectedResults.length,
if (locationFilter) { keywords: keywords.join(", "),
const locationValid = validateLocationAgainstFilters( locationFilter,
job.location, source: "skipthedrive",
locationFilter },
); };
} catch (error) {
if (!locationValid) { logger.error(`❌ SkipTheDrive parsing failed: ${error.message}`);
rejectedResults.push({ throw error;
...job, }
rejectionReason: "Location filter mismatch", }
});
continue; /**
} * Extract jobs from current page
} */
async function extractJobsFromPage(page, keyword, locationFilter) {
results.push(job); const jobs = [];
}
try {
// Check for next page // Get all job article elements
hasNextPage = await hasNextPageAvailable(page); const jobElements = await page.$$("article.job_listing");
if (hasNextPage && currentPage < maxPages) {
await navigateToNextPage(page, currentPage + 1); for (const jobElement of jobElements) {
currentPage++; try {
const job = await extractJobData(jobElement, keyword);
// Wait for new page to load if (job) {
await page.waitForTimeout(2000); jobs.push(job);
} else { }
hasNextPage = false; } catch (error) {
} logger.warning(`Failed to extract job data: ${error.message}`);
} }
} catch (error) { }
logger.error(`Error processing keyword "${keyword}": ${error.message}`); } catch (error) {
} logger.error(`Failed to extract jobs from page: ${error.message}`);
} }
logger.info( return jobs;
`🎯 SkipTheDrive parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected` }
);
/**
return { * Extract data from individual job element
results, */
rejectedResults, async function extractJobData(jobElement, keyword) {
summary: { try {
totalJobs: results.length, // Extract job ID
totalRejected: rejectedResults.length, const articleId = (await jobElement.getAttribute("id")) || "";
keywords: keywords.join(", "), const jobId = articleId ? articleId.replace("post-", "") : "";
locationFilter,
source: "skipthedrive", // Extract title
}, const titleElement = await jobElement.$(".job_listing-title a");
}; const title = titleElement
} catch (error) { ? cleanText(await titleElement.textContent())
logger.error(`❌ SkipTheDrive parsing failed: ${error.message}`); : "";
throw error; const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";
}
} // Extract company
const companyElement = await jobElement.$(".company");
/** const company = companyElement
* Extract jobs from current page ? cleanText(await companyElement.textContent())
*/ : "";
async function extractJobsFromPage(page, keyword, locationFilter, allKeywords = [], keywordGroups = null, useAndLogic = false) {
const jobs = []; // Extract location
const locationElement = await jobElement.$(".location");
try { const location = locationElement
// Get all job article elements ? cleanText(await locationElement.textContent())
const jobElements = await page.$$("article.job_listing"); : "";
for (const jobElement of jobElements) { // Extract date posted
try { const dateElement = await jobElement.$(".job-date");
const job = await extractJobData(jobElement, keyword); const dateText = dateElement
if (job) { ? cleanText(await dateElement.textContent())
jobs.push(job); : "";
}
} catch (error) { // Extract description
logger.warning(`Failed to extract job data: ${error.message}`); const descElement = await jobElement.$(".job_listing-description");
} const description = descElement
} ? cleanText(await descElement.textContent())
} catch (error) { : "";
logger.error(`Failed to extract jobs from page: ${error.message}`);
} // Check if featured
const featuredElement = await jobElement.$(".featured");
return jobs; const isFeatured = featuredElement !== null;
}
// Parse date
/** let datePosted = null;
* Parse job description to separate role duties from job requirements let daysAgo = null;
*/
function parseDutiesAndRequirements(description) { if (dateText) {
if (!description || description.trim().length === 0) { const match = dateText.match(/(\d+)\s+days?\s+ago/);
return { duties: "", requirements: "" }; if (match) {
} daysAgo = parseInt(match[1]);
const date = new Date();
// Common section headers that indicate duties/responsibilities date.setDate(date.getDate() - daysAgo);
const dutiesKeywords = [ datePosted = date.toISOString().split("T")[0];
/responsibilities?:/i, }
/duties?:/i, }
/what you['\u2019]ll do/i,
/key responsibilities/i, return {
/your role/i, jobId,
/position overview/i, title,
/about the role/i, company,
/role overview/i, location,
/what we need/i, jobUrl,
/you will:/i, datePosted,
/you['\u2019]ll be responsible/i, dateText,
]; daysAgo,
description,
// Common section headers that indicate requirements/qualifications isFeatured,
const requirementsKeywords = [ keyword,
/requirements?:/i, extractedAt: new Date().toISOString(),
/qualifications?:/i, source: "skipthedrive",
/must have/i, };
/required:/i, } catch (error) {
/what you['\u2019]ll bring/i, logger.warning(`Error extracting job data: ${error.message}`);
/you have:/i, return null;
/skills required/i, }
/minimum requirements/i, }
/preferred qualifications/i,
/education:/i, /**
/experience:/i, * Check if next page is available
/you must have/i, */
/we['\u2019]re looking for/i, async function hasNextPageAvailable(page) {
]; try {
const nextButton = await page.$(".next-page");
// Split description into sections (by common delimiters) return nextButton !== null;
const sections = description.split(/\n\s*\n|\r\n\s*\r\n/).filter(s => s.trim().length > 0); } catch {
return false;
let currentSection = "duties"; // Default to duties }
let dutiesText = ""; }
let requirementsText = "";
/**
for (const section of sections) { * Navigate to next page
const sectionLower = section.toLowerCase(); */
async function navigateToNextPage(page, pageNumber) {
// Check if this section is about requirements try {
let isRequirementsSection = false; const nextButton = await page.$(".next-page");
for (const keyword of requirementsKeywords) { if (nextButton) {
if (keyword.test(section)) { await nextButton.click();
isRequirementsSection = true; }
currentSection = "requirements"; } catch (error) {
break; logger.warning(
} `Failed to navigate to page ${pageNumber}: ${error.message}`
} );
}
// Check if this section is about duties/responsibilities }
if (!isRequirementsSection) {
for (const keyword of dutiesKeywords) { module.exports = {
if (keyword.test(section)) { skipthedriveStrategy,
currentSection = "duties"; buildSearchUrl,
break; extractJobsFromPage,
} extractJobData,
} };
}
// Add to appropriate section
if (currentSection === "requirements") {
requirementsText += (requirementsText ? "\n\n" : "") + section.trim();
} else {
dutiesText += (dutiesText ? "\n\n" : "") + section.trim();
}
}
// If we couldn't split by sections, try to find bullet points or numbered lists
if (!dutiesText && !requirementsText) {
const lines = description.split(/\n/);
let foundRequirementsHeader = false;
for (let i = 0; i < lines.length; i++) {
const line = lines[i].trim();
if (line.length === 0) continue;
// Check if this line is a requirements header
for (const keyword of requirementsKeywords) {
if (keyword.test(line)) {
foundRequirementsHeader = true;
break;
}
}
if (foundRequirementsHeader) {
requirementsText += (requirementsText ? "\n" : "") + line;
} else {
// Check if it's a duties header
let isDutiesHeader = false;
for (const keyword of dutiesKeywords) {
if (keyword.test(line)) {
isDutiesHeader = true;
break;
}
}
if (!isDutiesHeader) {
// Add to duties if we haven't found requirements header yet
if (!foundRequirementsHeader) {
dutiesText += (dutiesText ? "\n" : "") + line;
} else {
requirementsText += (requirementsText ? "\n" : "") + line;
}
} else {
dutiesText += (dutiesText ? "\n" : "") + line;
}
}
}
}
// Fallback: if we still have nothing separated, put first 60% in duties, rest in requirements
if (!dutiesText && !requirementsText && description) {
const midPoint = Math.floor(description.length * 0.6);
const lastRequirementsKeyword = description.toLowerCase().lastIndexOf("requirement");
const lastQualificationsKeyword = description.toLowerCase().lastIndexOf("qualification");
const splitPoint = Math.max(
lastRequirementsKeyword > 0 ? lastRequirementsKeyword : midPoint,
lastQualificationsKeyword > 0 ? lastQualificationsKeyword : midPoint
);
dutiesText = description.substring(0, splitPoint).trim();
requirementsText = description.substring(splitPoint).trim();
}
return {
duties: dutiesText.trim(),
requirements: requirementsText.trim(),
};
}
/**
 * Extract a normalized job record from a single SkipTheDrive listing element.
 *
 * Reads the DOM id, title link, company, location, relative posting date,
 * description, and "featured" marker from an `article.job_listing` node,
 * then splits the description into duties vs. requirements.
 *
 * @param {object} jobElement - Element handle exposing `$`, `getAttribute`,
 *   and `textContent` (Playwright-style) for one job article.
 * @param {string} keyword - Search keyword that surfaced this listing;
 *   copied onto the result for traceability.
 * @returns {Promise<object|null>} Job record, or null when extraction fails
 *   (the error is logged as a warning and swallowed).
 */
async function extractJobData(jobElement, keyword) {
  try {
    // Job ID is derived from the article's DOM id, e.g. "post-12345" -> "12345".
    const articleId = (await jobElement.getAttribute("id")) || "";
    const jobId = articleId ? articleId.replace("post-", "") : "";

    // Title and canonical job URL come from the same anchor.
    const titleElement = await jobElement.$(".job_listing-title a");
    const title = titleElement
      ? cleanText(await titleElement.textContent())
      : "";
    const jobUrl = titleElement ? await titleElement.getAttribute("href") : "";

    // Company name.
    const companyElement = await jobElement.$(".company");
    const company = companyElement
      ? cleanText(await companyElement.textContent())
      : "";

    // Location string (may be empty; location filtering happens upstream).
    const locationElement = await jobElement.$(".location");
    const location = locationElement
      ? cleanText(await locationElement.textContent())
      : "";

    // Relative posting date, e.g. "3 days ago".
    const dateElement = await jobElement.$(".job-date");
    const dateText = dateElement
      ? cleanText(await dateElement.textContent())
      : "";

    // Full listing description (split into duties/requirements below).
    const descElement = await jobElement.$(".job_listing-description");
    const description = descElement
      ? cleanText(await descElement.textContent())
      : "";

    // A ".featured" child marks promoted listings.
    const featuredElement = await jobElement.$(".featured");
    const isFeatured = featuredElement !== null;

    // Convert "N day(s) ago" into an absolute ISO date (YYYY-MM-DD).
    // NOTE(review): other phrasings such as "today" are left unparsed
    // (datePosted stays null) — confirm the site never emits them.
    let datePosted = null;
    let daysAgo = null;

    if (dateText) {
      const match = dateText.match(/(\d+)\s+days?\s+ago/);
      if (match) {
        // Always pass an explicit radix; bare parseInt is a known footgun.
        daysAgo = Number.parseInt(match[1], 10);
        const date = new Date();
        date.setDate(date.getDate() - daysAgo);
        datePosted = date.toISOString().split("T")[0];
      }
    }

    // Separate free-text description into duties vs. requirements sections.
    const parsed = parseDutiesAndRequirements(description);

    return {
      jobId,
      title,
      company,
      location,
      jobUrl,
      datePosted,
      dateText,
      daysAgo,
      description,
      roleDuties: parsed.duties,
      jobRequirements: parsed.requirements,
      isFeatured,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "skipthedrive",
    };
  } catch (error) {
    // Extraction is best-effort per listing: log and let the caller skip it.
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Determine whether the current results page offers a "next page" control.
 *
 * @param {object} page - Page handle exposing an async `$` selector lookup.
 * @returns {Promise<boolean>} true when a ".next-page" element exists;
 *   false when it is absent or the lookup itself throws.
 */
async function hasNextPageAvailable(page) {
  let pagerControl = null;
  try {
    pagerControl = await page.$(".next-page");
  } catch {
    // Lookup failures (e.g. detached page, navigation in flight) count as
    // "no next page" so the pagination loop simply stops.
    return false;
  }
  return pagerControl !== null;
}
}
/**
 * Click through to the next results page, if a pager control is present.
 *
 * @param {object} page - Page handle exposing an async `$` selector lookup.
 * @param {number} pageNumber - Target page number (used only in the warning
 *   message when navigation fails).
 * @returns {Promise<void>}
 */
async function navigateToNextPage(page, pageNumber) {
  try {
    const pagerLink = await page.$(".next-page");
    // No control present: nothing to click, return silently.
    if (!pagerLink) {
      return;
    }
    await pagerLink.click();
  } catch (error) {
    // Non-fatal: the caller's pagination loop will simply stop advancing.
    logger.warning(
      `Failed to navigate to page ${pageNumber}: ${error.message}`
    );
  }
}
// Public API: the main strategy entry point plus the extraction helpers,
// which are also exported so they can be unit-tested in isolation.
module.exports = {
  skipthedriveStrategy,
  buildSearchUrl,
  extractJobsFromPage,
  extractJobData,
};

View File

@ -10,34 +10,20 @@ const path = require("path");
const fs = require("fs"); const fs = require("fs");
const CoreParser = require("../core-parser"); const CoreParser = require("../core-parser");
const { linkedinStrategy } = require("./strategies/linkedin-strategy"); const { linkedinStrategy } = require("./strategies/linkedin-strategy");
const { logger, analyzeBatch, checkOllamaStatus, DEFAULT_MODEL } = require("ai-analyzer"); const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer");
// Load environment variables - check both linkedin-parser/.env and root .env // Load environment variables
const localEnvPath = path.join(__dirname, ".env"); require("dotenv").config({ path: path.join(__dirname, ".env") });
const rootEnvPath = path.join(__dirname, "..", ".env");
// Try local .env first, then root .env
if (fs.existsSync(localEnvPath)) {
require("dotenv").config({ path: localEnvPath });
} else if (fs.existsSync(rootEnvPath)) {
require("dotenv").config({ path: rootEnvPath });
} else {
// Try default dotenv behavior (looks in current directory and parent directories)
require("dotenv").config();
}
// Configuration from environment // Configuration from environment
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME; const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD; const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
const HEADLESS = process.env.HEADLESS !== "false"; const HEADLESS = process.env.HEADLESS !== "false";
const SEARCH_KEYWORDS = const SEARCH_KEYWORDS =
process.env.SEARCH_KEYWORDS || "layoff";//,downsizing";//,job cuts"; process.env.SEARCH_KEYWORDS || "layoff,downsizing,job cuts";
const LOCATION_FILTER = process.env.LOCATION_FILTER; const LOCATION_FILTER = process.env.LOCATION_FILTER;
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false"; const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends";
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50; const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50;
const EXTRACT_LOCATION_FROM_PROFILE = process.env.EXTRACT_LOCATION_FROM_PROFILE === "true";
/** /**
* Main LinkedIn parser function * Main LinkedIn parser function
@ -72,7 +58,6 @@ async function startLinkedInParser(options = {}) {
keywords, keywords,
locationFilter: LOCATION_FILTER, locationFilter: LOCATION_FILTER,
maxResults: MAX_RESULTS, maxResults: MAX_RESULTS,
extractLocationFromProfile: EXTRACT_LOCATION_FROM_PROFILE,
credentials: { credentials: {
username: LINKEDIN_USERNAME, username: LINKEDIN_USERNAME,
password: LINKEDIN_PASSWORD, password: LINKEDIN_PASSWORD,
@ -81,109 +66,52 @@ async function startLinkedInParser(options = {}) {
const { results, rejectedResults, summary } = parseResult; const { results, rejectedResults, summary } = parseResult;
// AI Analysis if enabled - embed results into each post // AI Analysis if enabled
let resultsWithAI = results; let analysisResults = null;
let aiAnalysisCompleted = false;
if (ENABLE_AI_ANALYSIS && results.length > 0) { if (ENABLE_AI_ANALYSIS && results.length > 0) {
logger.step("🧠 Running AI Analysis..."); logger.step("🧠 Running AI Analysis...");
const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL); const ollamaStatus = await checkOllamaStatus();
if (ollamaAvailable) { if (ollamaStatus.available) {
// Prepare data for analysis (analyzeBatch expects posts with 'text' field) analysisResults = await analyzeBatch(results, {
const analysisData = results.map((post) => ({ context:
text: post.text || post.content || "", "LinkedIn posts analysis focusing on job market trends and layoffs",
location: post.location || "",
keyword: post.keyword || "",
timestamp: post.timestamp || post.extractedAt || "",
}));
const analysisResults = await analyzeBatch(
analysisData,
AI_CONTEXT,
OLLAMA_MODEL
);
// Embed AI analysis into each result
resultsWithAI = results.map((post, index) => {
const aiResult = analysisResults[index];
return {
...post,
aiAnalysis: {
isRelevant: aiResult.isRelevant,
confidence: aiResult.confidence,
reasoning: aiResult.reasoning,
context: AI_CONTEXT,
model: OLLAMA_MODEL,
analyzedAt: new Date().toISOString(),
},
};
}); });
aiAnalysisCompleted = true;
logger.success(`✅ AI Analysis completed for ${results.length} posts`); logger.success(`✅ AI Analysis completed for ${results.length} posts`);
} else { } else {
logger.warning("⚠️ Ollama not available, skipping AI analysis"); logger.warning("⚠️ Ollama not available, skipping AI analysis");
} }
} }
// Prepare results with embedded AI analysis // Save results
const outputData = { const outputData = {
metadata: { metadata: {
timestamp: new Date().toISOString(), extractedAt: new Date().toISOString(),
totalPosts: resultsWithAI.length,
rejectedPosts: rejectedResults.length,
aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
aiAnalysisCompleted: aiAnalysisCompleted,
aiContext: aiAnalysisCompleted ? AI_CONTEXT : undefined,
aiModel: aiAnalysisCompleted ? OLLAMA_MODEL : undefined,
locationFilter: LOCATION_FILTER || undefined,
parser: "linkedin-parser", parser: "linkedin-parser",
version: "2.0.0", version: "2.0.0",
summary,
analysisResults,
}, },
results: resultsWithAI, results,
rejectedResults,
}; };
// Prepare rejected posts file
const rejectedData = rejectedResults.map((post) => ({
rejected: true,
reason: post.rejectionReason || "Location filter failed: Location not in filter",
keyword: post.keyword,
text: post.text || post.content,
profileLink: post.profileLink || post.authorUrl,
location: post.location || post.profileLocation,
timestamp: post.timestamp || post.extractedAt,
}));
const resultsDir = path.join(__dirname, "results"); const resultsDir = path.join(__dirname, "results");
if (!fs.existsSync(resultsDir)) { if (!fs.existsSync(resultsDir)) {
fs.mkdirSync(resultsDir, { recursive: true }); fs.mkdirSync(resultsDir, { recursive: true });
} }
const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const resultsFilename = `linkedin-results-${timestamp}.json`; const filename = `linkedin-results-${timestamp}.json`;
const rejectedFilename = `linkedin-rejected-${timestamp}.json`; const filepath = path.join(resultsDir, filename);
const resultsFilepath = path.join(resultsDir, resultsFilename);
const rejectedFilepath = path.join(resultsDir, rejectedFilename);
// Save results with AI analysis fs.writeFileSync(filepath, JSON.stringify(outputData, null, 2));
fs.writeFileSync(resultsFilepath, JSON.stringify(outputData, null, 2));
// Save rejected posts separately
if (rejectedData.length > 0) {
fs.writeFileSync(
rejectedFilepath,
JSON.stringify(rejectedData, null, 2)
);
}
// Final summary // Final summary
logger.success("✅ LinkedIn parsing completed successfully!"); logger.success("✅ LinkedIn parsing completed successfully!");
logger.info(`📊 Total posts found: ${resultsWithAI.length}`); logger.info(`📊 Total posts found: ${results.length}`);
logger.info(`❌ Total rejected: ${rejectedResults.length}`); logger.info(`❌ Total rejected: ${rejectedResults.length}`);
logger.info(`📁 Results saved to: ${resultsFilepath}`); logger.info(`📁 Results saved to: ${filepath}`);
if (rejectedData.length > 0) {
logger.info(`📁 Rejected posts saved to: ${rejectedFilepath}`);
}
return outputData; return outputData;
} catch (error) { } catch (error) {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

3667
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -1,80 +1,80 @@
const fs = require("fs"); const fs = require("fs");
const assert = require("assert"); const assert = require("assert");
const { analyzeSinglePost, checkOllamaStatus } = require("../ai-analyzer"); const { analyzeSinglePost, checkOllamaStatus } = require("../ai-analyzer");
console.log("AI Analyzer logic tests"); console.log("AI Analyzer logic tests");
const testData = JSON.parse( const testData = JSON.parse(
fs.readFileSync(__dirname + "/test-data.json", "utf-8") fs.readFileSync(__dirname + "/test-data.json", "utf-8")
); );
const aiResults = testData.positive; const aiResults = testData.positive;
const context = "job layoffs and workforce reduction"; const context = "job layoffs and workforce reduction";
const model = process.env.OLLAMA_MODEL || "mistral"; // Use OLLAMA_MODEL from env or default to mistral const model = "mistral"; // or your default model
(async () => { (async () => {
// Check if Ollama is available // Check if Ollama is available
const ollamaAvailable = await checkOllamaStatus(model); const ollamaAvailable = await checkOllamaStatus(model);
if (!ollamaAvailable) { if (!ollamaAvailable) {
console.log("SKIP: Ollama not available - skipping AI analyzer tests"); console.log("SKIP: Ollama not available - skipping AI analyzer tests");
console.log("PASS: AI analyzer tests skipped (Ollama not running)"); console.log("PASS: AI analyzer tests skipped (Ollama not running)");
return; return;
} }
console.log(`Testing AI analyzer with ${aiResults.length} posts...`); console.log(`Testing AI analyzer with ${aiResults.length} posts...`);
for (let i = 0; i < aiResults.length; i++) { for (let i = 0; i < aiResults.length; i++) {
const post = aiResults[i]; const post = aiResults[i];
console.log(`Testing post ${i + 1}: "${post.text.substring(0, 50)}..."`); console.log(`Testing post ${i + 1}: "${post.text.substring(0, 50)}..."`);
const aiOutput = await analyzeSinglePost(post.text, context, model); const aiOutput = await analyzeSinglePost(post.text, context, model);
// Test that the function returns the expected structure // Test that the function returns the expected structure
assert( assert(
typeof aiOutput === "object" && aiOutput !== null, typeof aiOutput === "object" && aiOutput !== null,
`Post ${i} output is not an object` `Post ${i} output is not an object`
); );
assert( assert(
typeof aiOutput.isRelevant === "boolean", typeof aiOutput.isRelevant === "boolean",
`Post ${i} isRelevant is not a boolean: ${typeof aiOutput.isRelevant}` `Post ${i} isRelevant is not a boolean: ${typeof aiOutput.isRelevant}`
); );
assert( assert(
typeof aiOutput.confidence === "number", typeof aiOutput.confidence === "number",
`Post ${i} confidence is not a number: ${typeof aiOutput.confidence}` `Post ${i} confidence is not a number: ${typeof aiOutput.confidence}`
); );
assert( assert(
typeof aiOutput.reasoning === "string", typeof aiOutput.reasoning === "string",
`Post ${i} reasoning is not a string: ${typeof aiOutput.reasoning}` `Post ${i} reasoning is not a string: ${typeof aiOutput.reasoning}`
); );
// Test that confidence is within valid range // Test that confidence is within valid range
assert( assert(
aiOutput.confidence >= 0 && aiOutput.confidence <= 1, aiOutput.confidence >= 0 && aiOutput.confidence <= 1,
`Post ${i} confidence out of range: ${aiOutput.confidence} (should be 0-1)` `Post ${i} confidence out of range: ${aiOutput.confidence} (should be 0-1)`
); );
// Test that reasoning exists and is not empty // Test that reasoning exists and is not empty
assert( assert(
aiOutput.reasoning && aiOutput.reasoning.length > 0, aiOutput.reasoning && aiOutput.reasoning.length > 0,
`Post ${i} missing or empty reasoning` `Post ${i} missing or empty reasoning`
); );
// Test that relevance is a boolean value // Test that relevance is a boolean value
assert( assert(
aiOutput.isRelevant === true || aiOutput.isRelevant === false, aiOutput.isRelevant === true || aiOutput.isRelevant === false,
`Post ${i} isRelevant is not a valid boolean: ${aiOutput.isRelevant}` `Post ${i} isRelevant is not a valid boolean: ${aiOutput.isRelevant}`
); );
console.log( console.log(
` ✓ Post ${i + 1}: relevant=${aiOutput.isRelevant}, confidence=${ ` ✓ Post ${i + 1}: relevant=${aiOutput.isRelevant}, confidence=${
aiOutput.confidence aiOutput.confidence
}` }`
); );
} }
console.log( console.log(
"PASS: AI analyzer returns valid structure and values for all test posts." "PASS: AI analyzer returns valid structure and values for all test posts."
); );
})(); })();