Compare commits
No commits in common. "master" and "indeed" have entirely different histories.
7
.gitignore
vendored
7
.gitignore
vendored
@ -8,3 +8,10 @@ zip*
|
|||||||
*.7z
|
*.7z
|
||||||
*obfuscated.js
|
*obfuscated.js
|
||||||
.history
|
.history
|
||||||
|
# Debug files
|
||||||
|
debug-*.js
|
||||||
|
debug-*.png
|
||||||
|
*.png
|
||||||
|
*.log
|
||||||
|
# Install scripts (optional - remove this entry if you want to commit them)
|
||||||
|
install-ollama.sh
|
||||||
|
|||||||
0
ai-analyzer/cli.js
Normal file → Executable file
0
ai-analyzer/cli.js
Normal file → Executable file
@ -5,11 +5,14 @@ const { logger } = require("./logger");
|
|||||||
* Extracted from ai-analyzer-local.js for reuse across parsers
|
* Extracted from ai-analyzer-local.js for reuse across parsers
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// Default model from environment variable or fallback to "mistral"
|
||||||
|
const DEFAULT_MODEL = process.env.OLLAMA_MODEL || "mistral";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if Ollama is running and the model is available
|
* Check if Ollama is running and the model is available
|
||||||
*/
|
*/
|
||||||
async function checkOllamaStatus(
|
async function checkOllamaStatus(
|
||||||
model = "mistral",
|
model = DEFAULT_MODEL,
|
||||||
ollamaHost = "http://localhost:11434"
|
ollamaHost = "http://localhost:11434"
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
@ -60,75 +63,136 @@ async function checkOllamaStatus(
|
|||||||
async function analyzeBatch(
|
async function analyzeBatch(
|
||||||
posts,
|
posts,
|
||||||
context,
|
context,
|
||||||
model = "mistral",
|
model = DEFAULT_MODEL,
|
||||||
ollamaHost = "http://localhost:11434"
|
ollamaHost = "http://localhost:11434"
|
||||||
) {
|
) {
|
||||||
logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`);
|
logger.ai(`Analyzing batch of ${posts.length} posts with ${model}...`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const prompt = `You are an expert at analyzing LinkedIn posts for relevance to specific contexts.
|
// Detect if context is about a student profile
|
||||||
|
const isStudentContext = /student|undergraduate|first year|second year|third year|fourth year|freshman|sophomore|junior|senior|co-op|internship/i.test(context);
|
||||||
|
|
||||||
CONTEXT TO MATCH: "${context}"
|
// Build enhanced prompt based on context type
|
||||||
|
let analysisInstructions = "";
|
||||||
|
if (isStudentContext) {
|
||||||
|
analysisInstructions = `
|
||||||
|
ANALYSIS FOCUS (Student Context Detected):
|
||||||
|
- Pay special attention to the "Requirements" section
|
||||||
|
- Evaluate if the job requirements match the student's level (${context})
|
||||||
|
- Consider: Are requirements too advanced? Are they appropriate for entry-level/co-op/internship?
|
||||||
|
- Check if the role duties are suitable for a student's skill level
|
||||||
|
- Look for keywords like "co-op", "internship", "entry-level", "student", "junior"
|
||||||
|
- If requirements mention "years of experience", "senior", "expert", "PhD", etc., this may not be suitable
|
||||||
|
- If requirements are reasonable for a student (basic skills, willingness to learn), mark as relevant`;
|
||||||
|
} else {
|
||||||
|
analysisInstructions = `
|
||||||
|
ANALYSIS FOCUS:
|
||||||
|
- Evaluate overall relevance to: "${context}"
|
||||||
|
- Consider job title, description, duties, and requirements
|
||||||
|
- Assess if the job matches the specified criteria`;
|
||||||
|
}
|
||||||
|
|
||||||
Analyze these ${
|
const prompt = `Analyze ${posts.length} job postings for relevance to: "${context}"
|
||||||
posts.length
|
|
||||||
} LinkedIn posts and determine if each relates to the context above.
|
|
||||||
|
|
||||||
POSTS:
|
${analysisInstructions}
|
||||||
|
|
||||||
|
JOB POSTINGS:
|
||||||
${posts
|
${posts
|
||||||
.map(
|
.map(
|
||||||
(post, i) => `
|
(post, i) => {
|
||||||
POST ${i + 1}:
|
// For student contexts, prioritize Requirements section if text is too long
|
||||||
"${post.text.substring(0, 400)}${post.text.length > 400 ? "..." : ""}"
|
let jobText = post.text;
|
||||||
`
|
if (isStudentContext && jobText.length > 1200) {
|
||||||
|
// Try to extract Requirements section if present
|
||||||
|
const requirementsMatch = jobText.match(/Requirements?:[\s\S]{0,600}/i);
|
||||||
|
const dutiesMatch = jobText.match(/Role Duties?:[\s\S]{0,300}/i);
|
||||||
|
const titleMatch = jobText.match(/Title:[\s\S]{0,100}/i);
|
||||||
|
|
||||||
|
if (requirementsMatch) {
|
||||||
|
// Prioritize: Title + Requirements (most important for students)
|
||||||
|
jobText = (titleMatch ? titleMatch[0] + "\n\n" : "") +
|
||||||
|
(requirementsMatch ? requirementsMatch[0] : "") +
|
||||||
|
(dutiesMatch ? "\n\n" + dutiesMatch[0] : "");
|
||||||
|
} else {
|
||||||
|
// Fallback to truncation
|
||||||
|
jobText = jobText.substring(0, 1200) + "...";
|
||||||
|
}
|
||||||
|
} else if (jobText.length > 1200) {
|
||||||
|
jobText = jobText.substring(0, 1200) + "...";
|
||||||
|
}
|
||||||
|
|
||||||
|
return `
|
||||||
|
JOB ${i + 1}:
|
||||||
|
${jobText}
|
||||||
|
`;
|
||||||
|
}
|
||||||
)
|
)
|
||||||
.join("")}
|
.join("")}
|
||||||
|
|
||||||
For each post, provide:
|
REQUIRED FORMAT - Respond with EXACTLY ${posts.length} lines, one per post:
|
||||||
- Is it relevant to "${context}"? (YES/NO)
|
JOB 1: YES | 0.8 | reason here
|
||||||
- Confidence level (0.0 to 1.0)
|
JOB 2: NO | 0.2 | reason here
|
||||||
- Brief reasoning
|
JOB 3: YES | 0.9 | reason here
|
||||||
|
|
||||||
Respond in this EXACT format for each post:
|
RULES:
|
||||||
POST 1: YES/NO | 0.X | brief reason
|
- Use YES or NO (uppercase)
|
||||||
POST 2: YES/NO | 0.X | brief reason
|
- Use pipe character | as separator
|
||||||
POST 3: YES/NO | 0.X | brief reason
|
- Confidence must be 0.0 to 1.0 (decimal number)
|
||||||
|
- Keep reasoning brief (one sentence)
|
||||||
|
- MUST include all ${posts.length} jobs in order
|
||||||
|
${isStudentContext ? "- When analyzing requirements, explicitly mention if requirements are too advanced or appropriate for the student level" : ""}
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
- For layoff context: "laid off 50 employees" = YES | 0.9 | mentions layoffs
|
JOB 1: YES | 0.9 | co-op position suitable for first year students
|
||||||
- For hiring context: "we're hiring developers" = YES | 0.8 | job posting
|
JOB 2: NO | 0.2 | requires 5+ years experience, too advanced
|
||||||
- Unrelated content = NO | 0.1 | not relevant to context`;
|
JOB 3: YES | 0.7 | entry-level role with basic requirements appropriate for students`;
|
||||||
|
|
||||||
const response = await fetch(`${ollamaHost}/api/generate`, {
|
// Add timeout to prevent hanging (5 minutes max)
|
||||||
method: "POST",
|
const controller = new AbortController();
|
||||||
headers: {
|
const timeoutId = setTimeout(() => controller.abort(), 5 * 60 * 1000); // 5 minutes
|
||||||
"Content-Type": "application/json",
|
|
||||||
},
|
try {
|
||||||
body: JSON.stringify({
|
const response = await fetch(`${ollamaHost}/api/generate`, {
|
||||||
model: model,
|
method: "POST",
|
||||||
prompt: prompt,
|
headers: {
|
||||||
stream: false,
|
"Content-Type": "application/json",
|
||||||
options: {
|
|
||||||
temperature: 0.3,
|
|
||||||
top_p: 0.9,
|
|
||||||
},
|
},
|
||||||
}),
|
body: JSON.stringify({
|
||||||
});
|
model: model,
|
||||||
|
prompt: prompt,
|
||||||
|
stream: false,
|
||||||
|
options: {
|
||||||
|
temperature: 0.3,
|
||||||
|
top_p: 0.9,
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
signal: controller.signal,
|
||||||
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
clearTimeout(timeoutId);
|
||||||
throw new Error(
|
|
||||||
`Ollama API error: ${response.status} ${response.statusText}`
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const data = await response.json();
|
if (!response.ok) {
|
||||||
const aiResponse = data.response.trim();
|
throw new Error(
|
||||||
|
`Ollama API error: ${response.status} ${response.statusText}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Parse the response
|
const data = await response.json();
|
||||||
const analyses = [];
|
const aiResponse = data.response.trim();
|
||||||
const lines = aiResponse.split("\n").filter((line) => line.trim());
|
|
||||||
|
|
||||||
for (let i = 0; i < posts.length; i++) {
|
// Parse the response
|
||||||
|
const analyses = [];
|
||||||
|
const lines = aiResponse.split("\n").filter((line) => line.trim());
|
||||||
|
|
||||||
|
// Log the raw response for debugging
|
||||||
|
logger.debug(`AI Response length: ${aiResponse.length} chars`);
|
||||||
|
if (aiResponse.length > 0) {
|
||||||
|
logger.debug(`AI Response (first 1000 chars):\n${aiResponse.substring(0, 1000)}`);
|
||||||
|
} else {
|
||||||
|
logger.warning("⚠️ AI response is empty!");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (let i = 0; i < posts.length; i++) {
|
||||||
let analysis = {
|
let analysis = {
|
||||||
postIndex: i + 1,
|
postIndex: i + 1,
|
||||||
isRelevant: false,
|
isRelevant: false,
|
||||||
@ -136,50 +200,175 @@ Examples:
|
|||||||
reasoning: "Could not parse AI response",
|
reasoning: "Could not parse AI response",
|
||||||
};
|
};
|
||||||
|
|
||||||
// Look for lines that match "POST X:" pattern
|
// Try multiple patterns to find the post analysis
|
||||||
const postPattern = new RegExp(`POST\\s*${i + 1}:?\\s*(.+)`, "i");
|
// IMPORTANT: Try numbered patterns first, only use generic pattern as last resort
|
||||||
|
const numberedPatterns = [
|
||||||
|
// Exact format: POST 1: YES | 0.8 | reason
|
||||||
|
new RegExp(`POST\\s*${i + 1}:?\\s*(.+)`, "i"),
|
||||||
|
// Numbered list: 1. YES | 0.8 | reason
|
||||||
|
new RegExp(`^\\s*${i + 1}[.)]\\s*(.+)`, "i"),
|
||||||
|
// Just the number: 1: YES | 0.8 | reason
|
||||||
|
new RegExp(`^\\s*${i + 1}:\\s*(.+)`, "i"),
|
||||||
|
];
|
||||||
|
|
||||||
|
let found = false;
|
||||||
|
let matchedContent = null;
|
||||||
|
|
||||||
|
// First, try to find a line with the specific post number
|
||||||
for (const line of lines) {
|
for (const line of lines) {
|
||||||
const match = line.match(postPattern);
|
for (const pattern of numberedPatterns) {
|
||||||
if (match) {
|
const match = line.match(pattern);
|
||||||
const content = match[1].trim();
|
if (match) {
|
||||||
|
matchedContent = match[1].trim();
|
||||||
// Parse: YES/NO | 0.X | reasoning
|
found = true;
|
||||||
const parts = content.split("|").map((p) => p.trim());
|
break;
|
||||||
|
}
|
||||||
if (parts.length >= 3) {
|
}
|
||||||
analysis.isRelevant = parts[0].toUpperCase().includes("YES");
|
if (found) break;
|
||||||
analysis.confidence = Math.max(
|
}
|
||||||
0,
|
|
||||||
Math.min(1, parseFloat(parts[1]) || 0.5)
|
// If not found with numbered patterns, try position-based matching as fallback
|
||||||
);
|
if (!found && lines.length > i) {
|
||||||
analysis.reasoning = parts[2] || "No reasoning provided";
|
const targetLine = lines[i];
|
||||||
} else {
|
if (targetLine) {
|
||||||
// Fallback parsing
|
// Try to parse the line even without post number
|
||||||
analysis.isRelevant =
|
const genericMatch = targetLine.match(/^(?:POST\s*\d+:?\s*)?(.+)$/i);
|
||||||
content.toUpperCase().includes("YES") ||
|
if (genericMatch) {
|
||||||
content.toLowerCase().includes("relevant");
|
matchedContent = genericMatch[1].trim();
|
||||||
analysis.confidence = 0.6;
|
found = true;
|
||||||
analysis.reasoning = content.substring(0, 100);
|
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
analyses.push(analysis);
|
if (found && matchedContent) {
|
||||||
}
|
const content = matchedContent;
|
||||||
|
|
||||||
// If we didn't get enough analyses, fill in defaults
|
// Try to parse: YES/NO | 0.X | reasoning
|
||||||
while (analyses.length < posts.length) {
|
let parts = content.split("|").map((p) => p.trim());
|
||||||
analyses.push({
|
|
||||||
postIndex: analyses.length + 1,
|
|
||||||
isRelevant: false,
|
|
||||||
confidence: 0.3,
|
|
||||||
reasoning: "AI response parsing failed",
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return analyses;
|
// If no pipe separator, try other separators
|
||||||
|
if (parts.length < 2) {
|
||||||
|
// Try colon separator: YES: 0.8: reason
|
||||||
|
parts = content.split(":").map((p) => p.trim());
|
||||||
|
}
|
||||||
|
if (parts.length < 2) {
|
||||||
|
// Try dash separator: YES - 0.8 - reason
|
||||||
|
parts = content.split("-").map((p) => p.trim());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract YES/NO
|
||||||
|
const relevanceText = parts[0] || content;
|
||||||
|
analysis.isRelevant =
|
||||||
|
relevanceText.toUpperCase().includes("YES") ||
|
||||||
|
relevanceText.toLowerCase().includes("relevant") ||
|
||||||
|
relevanceText.toLowerCase().includes("yes");
|
||||||
|
|
||||||
|
// Extract confidence (look for number between 0 and 1)
|
||||||
|
if (parts.length >= 2) {
|
||||||
|
const confidenceMatch = parts[1].match(/(0?\.\d+|1\.0|0|1)/);
|
||||||
|
if (confidenceMatch) {
|
||||||
|
analysis.confidence = Math.max(
|
||||||
|
0,
|
||||||
|
Math.min(1, parseFloat(confidenceMatch[1]) || 0.5)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Try to find confidence in the whole content
|
||||||
|
const confidenceMatch = content.match(/(0?\.\d+|1\.0|0|1)/);
|
||||||
|
if (confidenceMatch) {
|
||||||
|
analysis.confidence = Math.max(
|
||||||
|
0,
|
||||||
|
Math.min(1, parseFloat(confidenceMatch[1]) || 0.5)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract reasoning (everything after confidence, or whole content if no structure)
|
||||||
|
if (parts.length >= 3) {
|
||||||
|
analysis.reasoning = parts.slice(2).join(" ").trim() || parts[2] || "No reasoning provided";
|
||||||
|
} else if (parts.length === 2) {
|
||||||
|
// If only 2 parts, second part might be reasoning
|
||||||
|
analysis.reasoning = parts[1].substring(0, 200);
|
||||||
|
} else {
|
||||||
|
// Use the whole content as reasoning, but remove YES/NO and confidence
|
||||||
|
let reasoning = content
|
||||||
|
.replace(/YES|NO/gi, "")
|
||||||
|
.replace(/0?\.\d+|1\.0/g, "")
|
||||||
|
.replace(/\|/g, "")
|
||||||
|
.trim();
|
||||||
|
analysis.reasoning = reasoning || "Analysis provided but format unclear";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If still not found, try to extract from the entire response by position
|
||||||
|
if (!found && lines.length > 0) {
|
||||||
|
// Try to get the line at position i (allowing for some variance)
|
||||||
|
const targetLine = lines[Math.min(i, lines.length - 1)];
|
||||||
|
if (targetLine) {
|
||||||
|
// Extract any YES/NO indication
|
||||||
|
analysis.isRelevant =
|
||||||
|
targetLine.toUpperCase().includes("YES") ||
|
||||||
|
targetLine.toLowerCase().includes("relevant");
|
||||||
|
|
||||||
|
// Extract confidence
|
||||||
|
const confidenceMatch = targetLine.match(/(0?\.\d+|1\.0|0|1)/);
|
||||||
|
if (confidenceMatch) {
|
||||||
|
analysis.confidence = Math.max(
|
||||||
|
0,
|
||||||
|
Math.min(1, parseFloat(confidenceMatch[1]) || 0.5)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the line as reasoning
|
||||||
|
analysis.reasoning = targetLine.substring(0, 200).trim() || "Parsed from unstructured response";
|
||||||
|
found = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Last resort: if still not found, try to extract from the entire response text
|
||||||
|
if (!found && aiResponse.length > 0) {
|
||||||
|
// Look for any mention of relevance in the response
|
||||||
|
const responseLower = aiResponse.toLowerCase();
|
||||||
|
const hasRelevant = responseLower.includes("relevant") || responseLower.includes("yes");
|
||||||
|
analysis.isRelevant = hasRelevant;
|
||||||
|
|
||||||
|
// Try to find any confidence number
|
||||||
|
const allConfidenceMatches = aiResponse.match(/(0?\.\d+|1\.0|0|1)/g);
|
||||||
|
if (allConfidenceMatches && allConfidenceMatches.length > i) {
|
||||||
|
analysis.confidence = Math.max(
|
||||||
|
0,
|
||||||
|
Math.min(1, parseFloat(allConfidenceMatches[i]) || 0.5)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use a portion of the response as reasoning
|
||||||
|
const responseSnippet = aiResponse.substring(i * 100, (i + 1) * 200).trim();
|
||||||
|
analysis.reasoning = responseSnippet || "Could not parse structured response, using fallback";
|
||||||
|
|
||||||
|
logger.warning(`⚠️ Post ${i + 1}: Using fallback parsing - AI response format unclear`);
|
||||||
|
}
|
||||||
|
|
||||||
|
analyses.push(analysis);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we didn't get enough analyses, fill in defaults
|
||||||
|
while (analyses.length < posts.length) {
|
||||||
|
analyses.push({
|
||||||
|
postIndex: analyses.length + 1,
|
||||||
|
isRelevant: false,
|
||||||
|
confidence: 0.3,
|
||||||
|
reasoning: "AI response parsing failed",
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return analyses;
|
||||||
|
} catch (error) {
|
||||||
|
clearTimeout(timeoutId);
|
||||||
|
if (error.name === 'AbortError') {
|
||||||
|
throw new Error('Request timeout: AI analysis took longer than 5 minutes');
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Error in batch AI analysis: ${error.message}`);
|
logger.error(`Error in batch AI analysis: ${error.message}`);
|
||||||
|
|
||||||
@ -199,7 +388,7 @@ Examples:
|
|||||||
async function analyzeSinglePost(
|
async function analyzeSinglePost(
|
||||||
text,
|
text,
|
||||||
context,
|
context,
|
||||||
model = "mistral",
|
model = DEFAULT_MODEL,
|
||||||
ollamaHost = "http://localhost:11434"
|
ollamaHost = "http://localhost:11434"
|
||||||
) {
|
) {
|
||||||
const prompt = `Analyze this LinkedIn post for relevance to: "${context}"
|
const prompt = `Analyze this LinkedIn post for relevance to: "${context}"
|
||||||
@ -298,4 +487,5 @@ module.exports = {
|
|||||||
analyzeBatch,
|
analyzeBatch,
|
||||||
analyzeSinglePost,
|
analyzeSinglePost,
|
||||||
findLatestResultsFile,
|
findLatestResultsFile,
|
||||||
|
DEFAULT_MODEL, // Export so other modules can use it
|
||||||
};
|
};
|
||||||
|
|||||||
@ -45,6 +45,43 @@ function containsAnyKeyword(text, keywords) {
|
|||||||
return keywords.some((keyword) => lowerText.includes(keyword.toLowerCase()));
|
return keywords.some((keyword) => lowerText.includes(keyword.toLowerCase()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if text contains all of the specified keywords (case insensitive)
|
||||||
|
*/
|
||||||
|
function containsAllKeywords(text, keywords) {
|
||||||
|
if (!text || !Array.isArray(keywords)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const lowerText = text.toLowerCase();
|
||||||
|
return keywords.every((keyword) => lowerText.includes(keyword.toLowerCase()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if text matches keyword groups with AND logic between groups and OR logic within groups
|
||||||
|
* @param {string} text - Text to search in
|
||||||
|
* @param {Array<Array<string>>} keywordGroups - Array of keyword groups, each group is an array of OR keywords
|
||||||
|
* @returns {boolean} - True if text matches all groups (AND logic) and at least one keyword in each group (OR logic)
|
||||||
|
*/
|
||||||
|
function matchesKeywordGroups(text, keywordGroups) {
|
||||||
|
if (!text || !Array.isArray(keywordGroups) || keywordGroups.length === 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const lowerText = text.toLowerCase();
|
||||||
|
|
||||||
|
// All groups must match (AND logic)
|
||||||
|
return keywordGroups.every((group) => {
|
||||||
|
if (!Array.isArray(group) || group.length === 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// At least one keyword in the group must match (OR logic)
|
||||||
|
return group.some((keyword) =>
|
||||||
|
lowerText.includes(keyword.toLowerCase().trim())
|
||||||
|
);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validate if text meets basic quality criteria
|
* Validate if text meets basic quality criteria
|
||||||
*/
|
*/
|
||||||
@ -101,6 +138,8 @@ function normalizeUrl(url) {
|
|||||||
module.exports = {
|
module.exports = {
|
||||||
cleanText,
|
cleanText,
|
||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
|
containsAllKeywords,
|
||||||
|
matchesKeywordGroups,
|
||||||
isValidText,
|
isValidText,
|
||||||
extractDomain,
|
extractDomain,
|
||||||
normalizeUrl,
|
normalizeUrl,
|
||||||
|
|||||||
@ -20,7 +20,26 @@ class CoreParser {
|
|||||||
this.browser = await playwright.chromium.launch({
|
this.browser = await playwright.chromium.launch({
|
||||||
headless: this.config.headless
|
headless: this.config.headless
|
||||||
});
|
});
|
||||||
this.context = await this.browser.newContext();
|
|
||||||
|
// Create context with user agent to appear more like a real browser
|
||||||
|
const contextOptions = {
|
||||||
|
userAgent: this.config.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||||
|
viewport: { width: 1920, height: 1080 },
|
||||||
|
locale: 'en-US',
|
||||||
|
timezoneId: 'America/New_York',
|
||||||
|
};
|
||||||
|
|
||||||
|
// Add extra HTTP headers to appear more legitimate
|
||||||
|
contextOptions.extraHTTPHeaders = {
|
||||||
|
'Accept-Language': 'en-US,en;q=0.9',
|
||||||
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||||
|
'Accept-Encoding': 'gzip, deflate, br',
|
||||||
|
'DNT': '1',
|
||||||
|
'Connection': 'keep-alive',
|
||||||
|
'Upgrade-Insecure-Requests': '1',
|
||||||
|
};
|
||||||
|
|
||||||
|
this.context = await this.browser.newContext(contextOptions);
|
||||||
}
|
}
|
||||||
|
|
||||||
async createPage(id) {
|
async createPage(id) {
|
||||||
@ -61,3 +80,7 @@ class CoreParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
module.exports = CoreParser;
|
module.exports = CoreParser;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -3,5 +3,7 @@
|
|||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"description": "Core parser utilities for browser management",
|
"description": "Core parser utilities for browser management",
|
||||||
"dependencies": {}
|
"dependencies": {
|
||||||
|
"playwright": "^1.40.0"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -60,13 +60,122 @@ JOB_TYPES="full time,contract" node index.js --sites=skipthedrive
|
|||||||
node index.js --sites=skipthedrive --demo
|
node index.js --sites=skipthedrive --demo
|
||||||
```
|
```
|
||||||
|
|
||||||
### 🚧 Planned Parsers
|
#### LinkedIn Jobs Parser
|
||||||
|
|
||||||
- **Indeed**: Comprehensive job aggregator
|
Professional network job postings with comprehensive job data.
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
|
||||||
|
- LinkedIn authentication support
|
||||||
|
- Keyword-based job search
|
||||||
|
- Location filtering (both LinkedIn location and post-extraction filter)
|
||||||
|
- Multi-page result parsing with pagination
|
||||||
|
- Job type and experience level extraction
|
||||||
|
- Automatic duplicate detection
|
||||||
|
- Infinite scroll handling
|
||||||
|
|
||||||
|
**Requirements:**
|
||||||
|
|
||||||
|
- LinkedIn credentials (username and password) must be set in `.env` file:
|
||||||
|
```env
|
||||||
|
LINKEDIN_USERNAME=******@gmail.com
|
||||||
|
LINKEDIN_PASSWORD=***
|
||||||
|
LINKEDIN_JOB_LOCATION=Canada # Optional: LinkedIn location filter
|
||||||
|
```
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Search LinkedIn jobs
|
||||||
|
node index.js --sites=linkedin --keywords="software engineer,developer"
|
||||||
|
|
||||||
|
# Search with location filter
|
||||||
|
node index.js --sites=linkedin --keywords="co-op" --location="Ontario"
|
||||||
|
|
||||||
|
# Search with date filter (jobs posted after specific date)
|
||||||
|
node index.js --sites=linkedin --keywords="co-op" --min-date="2025-12-01"
|
||||||
|
|
||||||
|
# Combine filters
|
||||||
|
node index.js --sites=linkedin --keywords="co-op" --location="Ontario" --min-date="2025-12-01"
|
||||||
|
|
||||||
|
# Combine multiple sites
|
||||||
|
node index.js --sites=linkedin,skipthedrive,indeed --keywords="intern,co-op"
|
||||||
|
|
||||||
|
# Use AND logic - jobs must match ALL keywords (e.g., "co-op" AND "summer 2026")
|
||||||
|
node index.js --sites=linkedin --keywords="co-op,summer 2026" --and
|
||||||
|
|
||||||
|
# Use grouped AND/OR logic - (co-op OR intern) AND (summer 2026)
|
||||||
|
# Use a pipe (|) for OR within a group and a comma (,) to separate AND groups
|
||||||
|
node index.js --sites=linkedin --keywords="co-op|intern,summer 2026" --and
|
||||||
|
|
||||||
|
# Multiple AND groups - (co-op OR intern) AND (summer 2026) AND (remote)
|
||||||
|
node index.js --sites=linkedin --keywords="co-op|intern,summer 2026,remote" --and
|
||||||
|
```
|
||||||
|
|
||||||
|
**Date Filter Notes:**
|
||||||
|
- The date filter uses LinkedIn's `f_TPR` parameter to filter at the LinkedIn level before parsing
|
||||||
|
- Format: `YYYY-MM-DD` (e.g., `2025-12-01`)
|
||||||
|
- LinkedIn supports relative timeframes up to ~30 days
|
||||||
|
- For dates older than 30 days, LinkedIn may limit results to the maximum supported timeframe
|
||||||
|
|
||||||
|
#### Indeed Parser
|
||||||
|
|
||||||
|
Comprehensive job aggregator with extensive job listings.
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
|
||||||
|
- Keyword-based job search
|
||||||
|
- Location filtering (both Indeed location and post-extraction filter)
|
||||||
|
- Multi-page result parsing with pagination
|
||||||
|
- Salary information extraction
|
||||||
|
- Date filtering (jobs posted within the last 30 days)
|
||||||
|
- Automatic duplicate detection
|
||||||
|
- Job type and experience level support
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Search Indeed jobs
|
||||||
|
node index.js --sites=indeed --keywords="software engineer,developer"
|
||||||
|
|
||||||
|
# Search with location filter
|
||||||
|
node index.js --sites=indeed --keywords="co-op" --location="Ontario"
|
||||||
|
|
||||||
|
# Search with date filter (jobs posted after specific date)
|
||||||
|
node index.js --sites=indeed --keywords="co-op" --min-date="2025-12-01"
|
||||||
|
|
||||||
|
# Combine filters
|
||||||
|
node index.js --sites=indeed --keywords="co-op" --location="Ontario" --min-date="2025-12-01"
|
||||||
|
|
||||||
|
# Combine multiple sites
|
||||||
|
node index.js --sites=indeed,linkedin --keywords="intern,co-op"
|
||||||
|
|
||||||
|
# Use AND logic - jobs must match ALL keywords
|
||||||
|
node index.js --sites=indeed --keywords="co-op,summer 2026" --and
|
||||||
|
|
||||||
|
# Use grouped AND/OR logic - (co-op OR intern) AND (summer 2026)
|
||||||
|
node index.js --sites=indeed --keywords="co-op|intern,summer 2026" --and
|
||||||
|
```
|
||||||
|
|
||||||
|
**Date Filter Notes:**
|
||||||
|
- The date filter converts to Indeed's `fromage` parameter (days ago)
|
||||||
|
- Format: `YYYY-MM-DD` (e.g., `2025-12-01`)
|
||||||
|
- Indeed supports up to 30 days for date filtering
|
||||||
|
- For dates older than 30 days, Indeed limits results to the maximum supported timeframe
|
||||||
|
|
||||||
|
**CAPTCHA/Verification Handling:**
|
||||||
|
- Indeed may show CAPTCHA or human verification pages when it detects automated access
|
||||||
|
- If you encounter CAPTCHA errors, try:
|
||||||
|
1. Run in non-headless mode: Set `HEADLESS=false` in `.env` file (you can manually solve CAPTCHA)
|
||||||
|
2. Wait a few minutes between runs to avoid rate limiting
|
||||||
|
3. Use a different IP address or VPN if available
|
||||||
|
4. Reduce the number of pages or keywords per run
|
||||||
|
- The parser will automatically detect and report CAPTCHA pages with helpful error messages
|
||||||
|
|
||||||
|
### 🚧 Planned Parsers
|
||||||
- **Glassdoor**: Jobs with company reviews and salary data
|
- **Glassdoor**: Jobs with company reviews and salary data
|
||||||
- **Monster**: Traditional job board
|
- **Monster**: Traditional job board
|
||||||
- **SimplyHired**: Job aggregator with salary estimates
|
- **SimplyHired**: Job aggregator with salary estimates
|
||||||
- **LinkedIn Jobs**: Professional network job postings
|
|
||||||
- **AngelList**: Startup and tech jobs
|
- **AngelList**: Startup and tech jobs
|
||||||
- **Remote.co**: Dedicated remote work jobs
|
- **Remote.co**: Dedicated remote work jobs
|
||||||
- **FlexJobs**: Flexible and remote positions
|
- **FlexJobs**: Flexible and remote positions
|
||||||
@ -92,23 +201,43 @@ Create a `.env` file in the parser directory:
|
|||||||
|
|
||||||
```env
|
```env
|
||||||
# Job Search Configuration
|
# Job Search Configuration
|
||||||
SEARCH_SOURCES=linkedin,indeed,glassdoor
|
SEARCH_KEYWORDS=software engineer,developer,programmer
|
||||||
TARGET_ROLES=software engineer,data scientist,product manager
|
# For grouped AND/OR logic, use pipe (|) for OR within groups and comma (,) for AND groups:
|
||||||
LOCATION_FILTER=Toronto,Vancouver,Calgary
|
# SEARCH_KEYWORDS=co-op|intern,summer 2026,remote # (co-op OR intern) AND (summer 2026) AND (remote)
|
||||||
EXPERIENCE_LEVELS=entry,mid,senior
|
USE_AND_LOGIC=false # Set to "true" to enable AND logic (required for grouped keywords)
|
||||||
REMOTE_PREFERENCE=remote,hybrid,onsite
|
LOCATION_FILTER=Ontario,Canada
|
||||||
|
MAX_PAGES=5
|
||||||
|
|
||||||
|
# LinkedIn Configuration (required for LinkedIn jobs)
|
||||||
|
LINKEDIN_USERNAME=your_email@example.com
|
||||||
|
LINKEDIN_PASSWORD=your_password
|
||||||
|
LINKEDIN_JOB_LOCATION=Canada # Optional: LinkedIn location search
|
||||||
|
|
||||||
|
# Date Filter (LinkedIn and Indeed - applied at the site level before parsing)
|
||||||
|
MIN_DATE=2025-12-01 # Format: YYYY-MM-DD (jobs posted after this date)
|
||||||
|
|
||||||
# Analysis Configuration
|
# Analysis Configuration
|
||||||
ENABLE_SALARY_ANALYSIS=true
|
ENABLE_AI_ANALYSIS=false
|
||||||
ENABLE_SKILL_ANALYSIS=true
|
HEADLESS=true
|
||||||
ENABLE_TREND_ANALYSIS=true
|
|
||||||
MIN_SALARY=50000
|
|
||||||
MAX_SALARY=200000
|
|
||||||
|
|
||||||
# Output Configuration
|
# Output Configuration
|
||||||
OUTPUT_FORMAT=json,csv
|
OUTPUT_FORMAT=json # Options: "json", "csv", or "both"
|
||||||
SAVE_RAW_DATA=true
|
```
|
||||||
ANALYSIS_INTERVAL=daily
|
|
||||||
|
**Keyword Examples in .env:**
|
||||||
|
|
||||||
|
```env
|
||||||
|
# Simple OR logic (default) - matches ANY keyword
|
||||||
|
SEARCH_KEYWORDS=co-op,intern
|
||||||
|
USE_AND_LOGIC=false
|
||||||
|
|
||||||
|
# Simple AND logic - matches ALL keywords
|
||||||
|
SEARCH_KEYWORDS=co-op,summer 2026
|
||||||
|
USE_AND_LOGIC=true
|
||||||
|
|
||||||
|
# Grouped AND/OR logic - (co-op OR intern) AND (summer 2026) AND (remote)
|
||||||
|
SEARCH_KEYWORDS=co-op|intern,summer 2026,remote
|
||||||
|
USE_AND_LOGIC=true
|
||||||
```
|
```
|
||||||
|
|
||||||
### Command Line Options
|
### Command Line Options
|
||||||
@ -117,31 +246,52 @@ ANALYSIS_INTERVAL=daily
|
|||||||
# Basic usage
|
# Basic usage
|
||||||
node index.js
|
node index.js
|
||||||
|
|
||||||
# Specific roles
|
# Select sites to parse
|
||||||
node index.js --roles="frontend developer,backend developer"
|
node index.js --sites=linkedin,skipthedrive,indeed
|
||||||
|
|
||||||
# Geographic focus
|
# Search keywords
|
||||||
node index.js --locations="Toronto,Vancouver"
|
node index.js --keywords="software engineer,developer"
|
||||||
|
|
||||||
# Experience level
|
# Location filter
|
||||||
node index.js --experience="senior"
|
node index.js --location="Ontario"
|
||||||
|
|
||||||
# Output format
|
# Max pages to parse
|
||||||
node index.js --output=results/job-market-analysis.json
|
node index.js --max-pages=10
|
||||||
|
|
||||||
|
# Exclude rejected results
|
||||||
|
node index.js --no-rejected
|
||||||
|
|
||||||
|
# Output format (json, csv, or both)
|
||||||
|
node index.js --output=csv
|
||||||
|
node index.js --output=both
|
||||||
|
|
||||||
|
# Date filter (LinkedIn only - filters at LinkedIn level)
|
||||||
|
node index.js --sites=linkedin --min-date="2025-12-01"
|
||||||
|
|
||||||
|
# Use AND logic for keywords (all keywords must match)
|
||||||
|
node index.js --sites=linkedin --keywords="co-op,summer 2026" --and
|
||||||
|
|
||||||
|
# Use grouped AND/OR logic: (co-op OR intern) AND (summer 2026)
|
||||||
|
# Use | (pipe) for OR within a group and , (comma) to separate AND groups
|
||||||
|
node index.js --sites=linkedin --keywords="co-op|intern,summer 2026" --and
|
||||||
|
|
||||||
|
# Multiple AND groups: (co-op OR intern) AND (summer 2026) AND (remote)
|
||||||
|
node index.js --sites=linkedin --keywords="co-op|intern,summer 2026,remote" --and
|
||||||
```
|
```
|
||||||
|
|
||||||
**Available Options:**
|
**Available Options:**
|
||||||
|
|
||||||
- `--roles="role1,role2"`: Target job roles
|
- `--sites="site1,site2"`: Job sites to parse (linkedin, skipthedrive, indeed)
|
||||||
- `--locations="city1,city2"`: Geographic focus
|
- `--keywords="keyword1,keyword2"`: Search keywords
|
||||||
- `--experience="entry|mid|senior"`: Experience level
|
- Use `|` (pipe) to separate OR keywords within a group: `"co-op|intern"` means "co-op" OR "intern"
|
||||||
- `--remote="remote|hybrid|onsite"`: Remote work preference
|
- Use `,` (comma) to separate AND groups when using `--and`: `"co-op|intern,summer 2026"` means (co-op OR intern) AND (summer 2026)
|
||||||
- `--salary-min=NUMBER`: Minimum salary filter
|
- `--location="LOCATION"`: Location filter
|
||||||
- `--salary-max=NUMBER`: Maximum salary filter
|
- `--max-pages=NUMBER`: Maximum pages to parse (0 or "all" for unlimited)
|
||||||
- `--output=FILE`: Output filename
|
- `--min-date="YYYY-MM-DD"`: Minimum posted date filter (LinkedIn only - filters at LinkedIn level before parsing)
|
||||||
- `--format=json|csv`: Output format
|
- `--no-rejected` or `--exclude-rejected`: Exclude rejected results from output
|
||||||
- `--trends`: Enable trend analysis
|
- `--output=FORMAT` or `--format=FORMAT`: Output format - "json", "csv", or "both" (default: "json")
|
||||||
- `--skills`: Enable skill analysis
|
- `--and` or `--all-keywords`: Use AND logic for keywords (all keywords must match). Default is OR logic (any keyword matches)
|
||||||
|
- When combined with `|` (pipe) in keywords, enables grouped AND/OR logic
|
||||||
|
|
||||||
## 📊 Keywords
|
## 📊 Keywords
|
||||||
|
|
||||||
@ -340,12 +490,46 @@ node index.js --companies="Google,Microsoft,Amazon"
|
|||||||
|
|
||||||
### CSV Output
|
### CSV Output
|
||||||
|
|
||||||
The parser can also generate CSV files for easy analysis:
|
The parser can generate CSV files for easy spreadsheet analysis. Use `--output=csv` or `OUTPUT_FORMAT=csv` to export results as CSV.
|
||||||
|
|
||||||
|
**CSV Columns:**
|
||||||
|
- `jobId`: Unique job identifier
|
||||||
|
- `title`: Job title
|
||||||
|
- `company`: Company name
|
||||||
|
- `location`: Job location
|
||||||
|
- `jobUrl`: Link to job posting
|
||||||
|
- `postedDate`: Date job was posted
|
||||||
|
- `description`: Job description
|
||||||
|
- `jobType`: Type of job (full-time, part-time, contract, etc.)
|
||||||
|
- `experienceLevel`: Required experience level
|
||||||
|
- `keyword`: Search keyword that matched
|
||||||
|
- `extractedAt`: Timestamp when job was extracted
|
||||||
|
- `source`: Source site (e.g., "linkedin-jobs", "skipthedrive")
|
||||||
|
- `aiRelevant`: AI analysis relevance (Yes/No)
|
||||||
|
- `aiConfidence`: AI confidence score (0-1)
|
||||||
|
- `aiReasoning`: AI reasoning for relevance
|
||||||
|
- `aiContext`: AI analysis context
|
||||||
|
- `aiModel`: AI model used for analysis
|
||||||
|
- `aiAnalyzedAt`: Timestamp of AI analysis
|
||||||
|
|
||||||
|
**Example CSV Output:**
|
||||||
|
|
||||||
```csv
|
```csv
|
||||||
job_id,title,company,location,remote_type,salary_min,salary_max,required_skills,experience_level,posted_date
|
jobId,title,company,location,jobUrl,postedDate,description,jobType,experienceLevel,keyword,extractedAt,source,aiRelevant,aiConfidence,aiReasoning,aiContext,aiModel,aiAnalyzedAt
|
||||||
job_1,Senior Software Engineer,TechCorp,Toronto,hybrid,100000,140000,"React,Node.js,TypeScript",senior,2024-01-10
|
4344137241,Web Applications Co-op/Intern,Nokia,Kanata ON (Hybrid),https://www.linkedin.com/jobs/view/4344137241,,"Web Applications Co-op/Intern",,co-op,2025-12-17T04:50:05.600Z,linkedin-jobs,Yes,0.8,"The post mentions a co-op/intern position",co-op and internship opportunities for First year Math students,mistral,2025-12-17T04:58:33.479Z
|
||||||
job_2,Data Scientist,DataCorp,Vancouver,remote,90000,130000,"Python,SQL,ML",mid,2024-01-09
|
```
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Export as CSV only
|
||||||
|
node index.js --output=csv
|
||||||
|
|
||||||
|
# Export both JSON and CSV
|
||||||
|
node index.js --output=both
|
||||||
|
|
||||||
|
# Using environment variable
|
||||||
|
OUTPUT_FORMAT=csv node index.js
|
||||||
```
|
```
|
||||||
|
|
||||||
## 🔒 Security & Best Practices
|
## 🔒 Security & Best Practices
|
||||||
|
|||||||
@ -10,7 +10,10 @@ const path = require("path");
|
|||||||
const fs = require("fs");
|
const fs = require("fs");
|
||||||
const CoreParser = require("../core-parser");
|
const CoreParser = require("../core-parser");
|
||||||
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
|
const { skipthedriveStrategy } = require("./strategies/skipthedrive-strategy");
|
||||||
const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer");
|
const { linkedinJobsStrategy } = require("./strategies/linkedin-jobs-strategy");
|
||||||
|
const { indeedStrategy } = require("./strategies/indeed-strategy");
|
||||||
|
const { logger, analyzeBatch, checkOllamaStatus, DEFAULT_MODEL } = require("ai-analyzer");
|
||||||
|
const { convertResultsToCsv } = require("./src/csv-utils");
|
||||||
|
|
||||||
// Load environment variables
|
// Load environment variables
|
||||||
require("dotenv").config({ path: path.join(__dirname, ".env") });
|
require("dotenv").config({ path: path.join(__dirname, ".env") });
|
||||||
@ -18,16 +21,23 @@ require("dotenv").config({ path: path.join(__dirname, ".env") });
|
|||||||
// Configuration from environment
|
// Configuration from environment
|
||||||
const HEADLESS = process.env.HEADLESS !== "false";
|
const HEADLESS = process.env.HEADLESS !== "false";
|
||||||
const SEARCH_KEYWORDS =
|
const SEARCH_KEYWORDS =
|
||||||
process.env.SEARCH_KEYWORDS || "software engineer,developer,programmer";
|
process.env.SEARCH_KEYWORDS || "co-op,intern";//"software engineer,developer,programmer";
|
||||||
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
||||||
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
|
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
|
||||||
|
const AI_CONTEXT = process.env.AI_CONTEXT || "Job market analysis focusing on job postings, skills, and trends";
|
||||||
|
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
|
||||||
const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
|
const MAX_PAGES = parseInt(process.env.MAX_PAGES) || 5;
|
||||||
|
const EXCLUDE_REJECTED = process.env.EXCLUDE_REJECTED === "true";
|
||||||
|
const OUTPUT_FORMAT = process.env.OUTPUT_FORMAT || "json"; // "json", "csv", or "both"
|
||||||
|
const MIN_DATE = process.env.MIN_DATE; // Minimum posted date (format: YYYY-MM-DD)
|
||||||
|
const USE_AND_LOGIC = process.env.USE_AND_LOGIC === "true"; // Use AND logic for keywords
|
||||||
|
|
||||||
// Available site strategies
|
// Available site strategies
|
||||||
const SITE_STRATEGIES = {
|
const SITE_STRATEGIES = {
|
||||||
skipthedrive: skipthedriveStrategy,
|
skipthedrive: skipthedriveStrategy,
|
||||||
|
linkedin: linkedinJobsStrategy,
|
||||||
|
indeed: indeedStrategy,
|
||||||
// Add more site strategies here
|
// Add more site strategies here
|
||||||
// indeed: indeedStrategy,
|
|
||||||
// glassdoor: glassdoorStrategy,
|
// glassdoor: glassdoorStrategy,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -41,6 +51,10 @@ function parseArguments() {
|
|||||||
keywords: null,
|
keywords: null,
|
||||||
locationFilter: null,
|
locationFilter: null,
|
||||||
maxPages: MAX_PAGES,
|
maxPages: MAX_PAGES,
|
||||||
|
excludeRejected: EXCLUDE_REJECTED,
|
||||||
|
outputFormat: OUTPUT_FORMAT,
|
||||||
|
minDate: MIN_DATE,
|
||||||
|
useAndLogic: USE_AND_LOGIC, // Use AND logic instead of OR logic for keywords (from env or CLI)
|
||||||
};
|
};
|
||||||
|
|
||||||
args.forEach((arg) => {
|
args.forEach((arg) => {
|
||||||
@ -57,7 +71,26 @@ function parseArguments() {
|
|||||||
} else if (arg.startsWith("--location=")) {
|
} else if (arg.startsWith("--location=")) {
|
||||||
options.locationFilter = arg.split("=")[1];
|
options.locationFilter = arg.split("=")[1];
|
||||||
} else if (arg.startsWith("--max-pages=")) {
|
} else if (arg.startsWith("--max-pages=")) {
|
||||||
options.maxPages = parseInt(arg.split("=")[1]) || MAX_PAGES;
|
const value = arg.split("=")[1];
|
||||||
|
// Support "all" or "0" to mean unlimited pages
|
||||||
|
if (value === "all" || value === "0") {
|
||||||
|
options.maxPages = 0; // 0 means unlimited
|
||||||
|
} else {
|
||||||
|
options.maxPages = parseInt(value) || MAX_PAGES;
|
||||||
|
}
|
||||||
|
} else if (arg === "--no-rejected" || arg === "--exclude-rejected") {
|
||||||
|
options.excludeRejected = true;
|
||||||
|
} else if (arg.startsWith("--output=") || arg.startsWith("--format=")) {
|
||||||
|
const format = arg.split("=")[1].toLowerCase();
|
||||||
|
if (["json", "csv", "both"].includes(format)) {
|
||||||
|
options.outputFormat = format;
|
||||||
|
} else {
|
||||||
|
logger.warning(`⚠️ Unknown output format: ${format}. Using default: json`);
|
||||||
|
}
|
||||||
|
} else if (arg.startsWith("--min-date=")) {
|
||||||
|
options.minDate = arg.split("=")[1];
|
||||||
|
} else if (arg === "--and" || arg === "--all-keywords") {
|
||||||
|
options.useAndLogic = true; // CLI flag overrides env variable
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -80,21 +113,136 @@ async function startJobSearchParser(options = {}) {
|
|||||||
logger.step("🚀 Job Search Parser Starting...");
|
logger.step("🚀 Job Search Parser Starting...");
|
||||||
|
|
||||||
// Parse keywords
|
// Parse keywords
|
||||||
const keywords =
|
let keywords =
|
||||||
finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
|
finalOptions.keywords || SEARCH_KEYWORDS.split(",").map((k) => k.trim());
|
||||||
|
|
||||||
|
// Parse keyword groups if AND logic is enabled and keywords contain pipe (|) separator
|
||||||
|
// Format: "co-op|intern,summer 2026" means (co-op OR intern) AND (summer 2026)
|
||||||
|
let keywordGroups = null;
|
||||||
|
if (finalOptions.useAndLogic && keywords.some(k => k.includes('|'))) {
|
||||||
|
keywordGroups = keywords.map(group =>
|
||||||
|
group.split('|').map(k => k.trim()).filter(k => k.length > 0)
|
||||||
|
);
|
||||||
|
logger.info(`🔍 Keyword Groups: ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
|
||||||
|
}
|
||||||
|
|
||||||
const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
|
const locationFilter = finalOptions.locationFilter || LOCATION_FILTER;
|
||||||
const sites = finalOptions.sites;
|
const sites = finalOptions.sites;
|
||||||
|
const excludeRejected = finalOptions.excludeRejected !== undefined ? finalOptions.excludeRejected : EXCLUDE_REJECTED;
|
||||||
|
|
||||||
logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
|
logger.info(`📦 Selected job sites: ${sites.join(", ")}`);
|
||||||
logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
|
logger.info(`🔍 Search Keywords: ${keywords.join(", ")}`);
|
||||||
|
if (keywordGroups) {
|
||||||
|
logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
|
||||||
|
} else {
|
||||||
|
logger.info(`🔗 Keyword Logic: ${finalOptions.useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
|
||||||
|
}
|
||||||
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
||||||
|
const minDate = finalOptions.minDate || MIN_DATE;
|
||||||
|
if (minDate) {
|
||||||
|
logger.info(`📅 Min Date Filter: ${minDate} (jobs posted after this date)`);
|
||||||
|
}
|
||||||
logger.info(
|
logger.info(
|
||||||
`🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? "Enabled" : "Disabled"}`
|
`🧠 AI Analysis: ${ENABLE_AI_ANALYSIS ? "Enabled" : "Disabled"}`
|
||||||
);
|
);
|
||||||
|
if (ENABLE_AI_ANALYSIS) {
|
||||||
|
logger.info(` Context: "${AI_CONTEXT}"`);
|
||||||
|
logger.info(` Model: ${OLLAMA_MODEL}`);
|
||||||
|
}
|
||||||
|
|
||||||
const allResults = [];
|
const allResults = [];
|
||||||
const allRejectedResults = [];
|
const allRejectedResults = [];
|
||||||
const siteResults = {};
|
const siteResults = {};
|
||||||
|
let analysisResults = null;
|
||||||
|
|
||||||
|
// Initialize results directory and file for incremental saving
|
||||||
|
const resultsDir = path.join(__dirname, "results");
|
||||||
|
if (!fs.existsSync(resultsDir)) {
|
||||||
|
fs.mkdirSync(resultsDir, { recursive: true });
|
||||||
|
}
|
||||||
|
|
||||||
|
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
||||||
|
const outputFormat = finalOptions.outputFormat || OUTPUT_FORMAT;
|
||||||
|
let incrementalJsonFilepath = null;
|
||||||
|
let incrementalCsvFilepath = null;
|
||||||
|
|
||||||
|
// Initialize incremental save files
|
||||||
|
if (outputFormat === "json" || outputFormat === "both") {
|
||||||
|
const jsonFilename = `job-search-results-${timestamp}.json`;
|
||||||
|
incrementalJsonFilepath = path.join(resultsDir, jsonFilename);
|
||||||
|
}
|
||||||
|
if (outputFormat === "csv" || outputFormat === "both") {
|
||||||
|
const csvFilename = `job-search-results-${timestamp}.csv`;
|
||||||
|
incrementalCsvFilepath = path.join(resultsDir, csvFilename);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Save results incrementally as they're found
|
||||||
|
*/
|
||||||
|
const saveIncrementalResults = (currentResults, currentRejectedResults, currentSiteResults, currentAnalysisResults = null, isComplete = false) => {
|
||||||
|
try {
|
||||||
|
const outputData = {
|
||||||
|
metadata: {
|
||||||
|
extractedAt: new Date().toISOString(),
|
||||||
|
parser: "job-search-parser",
|
||||||
|
version: "2.0.0",
|
||||||
|
sites: sites,
|
||||||
|
keywords: keywords.join(", "),
|
||||||
|
locationFilter,
|
||||||
|
aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
|
||||||
|
aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined,
|
||||||
|
aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined,
|
||||||
|
analysisResults: currentAnalysisResults,
|
||||||
|
rejectedJobsExcluded: excludeRejected,
|
||||||
|
isComplete: isComplete,
|
||||||
|
lastUpdated: new Date().toISOString(),
|
||||||
|
},
|
||||||
|
results: currentResults,
|
||||||
|
siteResults: currentSiteResults,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!excludeRejected) {
|
||||||
|
outputData.rejectedResults = currentRejectedResults;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save JSON incrementally
|
||||||
|
if (incrementalJsonFilepath) {
|
||||||
|
fs.writeFileSync(incrementalJsonFilepath, JSON.stringify(outputData, null, 2));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save CSV incrementally (convert on each save)
|
||||||
|
if (incrementalCsvFilepath) {
|
||||||
|
const csvContent = convertResultsToCsv(outputData);
|
||||||
|
fs.writeFileSync(incrementalCsvFilepath, csvContent);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isComplete) {
|
||||||
|
logger.info(`💾 Incremental save: ${currentResults.length} results saved to ${incrementalJsonFilepath || incrementalCsvFilepath}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.warning(`⚠️ Failed to save incremental results: ${error.message}`);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Save initial empty state
|
||||||
|
saveIncrementalResults([], [], {}, null, false);
|
||||||
|
|
||||||
|
// Set up signal handlers for graceful shutdown
|
||||||
|
let isShuttingDown = false;
|
||||||
|
const gracefulShutdown = async (signal) => {
|
||||||
|
if (isShuttingDown) return;
|
||||||
|
isShuttingDown = true;
|
||||||
|
|
||||||
|
logger.warning(`\n⚠️ Received ${signal}, saving current results before exit...`);
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
|
||||||
|
logger.info(`💾 Saved ${allResults.length} results before shutdown`);
|
||||||
|
|
||||||
|
await coreParser.cleanup();
|
||||||
|
process.exit(0);
|
||||||
|
};
|
||||||
|
|
||||||
|
process.on('SIGINT', () => gracefulShutdown('SIGINT'));
|
||||||
|
process.on('SIGTERM', () => gracefulShutdown('SIGTERM'));
|
||||||
|
|
||||||
// Process each selected site
|
// Process each selected site
|
||||||
for (const site of sites) {
|
for (const site of sites) {
|
||||||
@ -108,18 +256,49 @@ async function startJobSearchParser(options = {}) {
|
|||||||
logger.step(`\n🌐 Parsing ${site}...`);
|
logger.step(`\n🌐 Parsing ${site}...`);
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
|
|
||||||
const parseResult = await strategy(coreParser, {
|
// Prepare strategy options
|
||||||
|
const strategyOptions = {
|
||||||
keywords,
|
keywords,
|
||||||
|
keywordGroups, // Pass grouped keywords if available
|
||||||
locationFilter,
|
locationFilter,
|
||||||
maxPages: finalOptions.maxPages,
|
maxPages: finalOptions.maxPages,
|
||||||
});
|
useAndLogic: finalOptions.useAndLogic || false,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Add credentials for LinkedIn
|
||||||
|
if (site === "linkedin") {
|
||||||
|
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
|
||||||
|
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
|
||||||
|
|
||||||
|
if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
|
||||||
|
logger.error(`❌ LinkedIn credentials not found. Please set LINKEDIN_USERNAME and LINKEDIN_PASSWORD in .env file`);
|
||||||
|
siteResults[site] = {
|
||||||
|
count: 0,
|
||||||
|
rejected: 0,
|
||||||
|
duration: "0s",
|
||||||
|
error: "LinkedIn credentials not found",
|
||||||
|
};
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
strategyOptions.credentials = {
|
||||||
|
username: LINKEDIN_USERNAME,
|
||||||
|
password: LINKEDIN_PASSWORD,
|
||||||
|
};
|
||||||
|
strategyOptions.location = process.env.LINKEDIN_JOB_LOCATION || "";
|
||||||
|
strategyOptions.minDate = minDate; // Add date filter for LinkedIn
|
||||||
|
}
|
||||||
|
|
||||||
|
const parseResult = await strategy(coreParser, strategyOptions);
|
||||||
|
|
||||||
const { results, rejectedResults, summary } = parseResult;
|
const { results, rejectedResults, summary } = parseResult;
|
||||||
const duration = ((Date.now() - startTime) / 1000).toFixed(2);
|
const duration = ((Date.now() - startTime) / 1000).toFixed(2);
|
||||||
|
|
||||||
// Collect results
|
// Collect results
|
||||||
|
logger.info(`📦 Strategy returned: ${results.length} results, ${rejectedResults.length} rejected`);
|
||||||
allResults.push(...results);
|
allResults.push(...results);
|
||||||
allRejectedResults.push(...rejectedResults);
|
allRejectedResults.push(...rejectedResults);
|
||||||
|
logger.info(`📦 Total accumulated: ${allResults.length} results, ${allRejectedResults.length} rejected`);
|
||||||
|
|
||||||
siteResults[site] = {
|
siteResults[site] = {
|
||||||
count: results.length,
|
count: results.length,
|
||||||
@ -131,6 +310,9 @@ async function startJobSearchParser(options = {}) {
|
|||||||
logger.success(
|
logger.success(
|
||||||
`✅ ${site} completed in ${duration}s - Found ${results.length} jobs`
|
`✅ ${site} completed in ${duration}s - Found ${results.length} jobs`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Save results incrementally after each site
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`❌ ${site} parsing failed: ${error.message}`);
|
logger.error(`❌ ${site} parsing failed: ${error.message}`);
|
||||||
siteResults[site] = {
|
siteResults[site] = {
|
||||||
@ -139,60 +321,126 @@ async function startJobSearchParser(options = {}) {
|
|||||||
duration: "0s",
|
duration: "0s",
|
||||||
error: error.message,
|
error: error.message,
|
||||||
};
|
};
|
||||||
|
// Save even on error to preserve what we have
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// AI Analysis if enabled
|
// AI Analysis if enabled
|
||||||
let analysisResults = null;
|
// Save results before AI analysis (in case AI analysis takes a long time)
|
||||||
|
if (allResults.length > 0) {
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, null, false);
|
||||||
|
}
|
||||||
|
|
||||||
if (ENABLE_AI_ANALYSIS && allResults.length > 0) {
|
if (ENABLE_AI_ANALYSIS && allResults.length > 0) {
|
||||||
logger.step("🧠 Running AI Analysis...");
|
logger.step("🧠 Running AI Analysis...");
|
||||||
|
|
||||||
const ollamaStatus = await checkOllamaStatus();
|
const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL);
|
||||||
if (ollamaStatus.available) {
|
if (ollamaAvailable) {
|
||||||
analysisResults = await analyzeBatch(allResults, {
|
// Prepare data for analysis (analyzeBatch expects objects with 'text' field)
|
||||||
context:
|
const analysisData = allResults.map((job) => {
|
||||||
"Job market analysis focusing on job postings, skills, and trends",
|
// Build comprehensive text including all available job information
|
||||||
|
const parts = [];
|
||||||
|
if (job.title) parts.push(`Title: ${job.title}`);
|
||||||
|
if (job.company) parts.push(`Company: ${job.company}`);
|
||||||
|
if (job.description) parts.push(`Description: ${job.description}`);
|
||||||
|
if (job.roleDuties) parts.push(`Role Duties: ${job.roleDuties}`);
|
||||||
|
if (job.jobRequirements) parts.push(`Requirements: ${job.jobRequirements}`);
|
||||||
|
|
||||||
|
return {
|
||||||
|
text: parts.join("\n\n"),
|
||||||
|
location: job.location || "",
|
||||||
|
keyword: job.keyword || "",
|
||||||
|
timestamp: job.extractedAt || job.postedDate || "",
|
||||||
|
roleDuties: job.roleDuties || "",
|
||||||
|
jobRequirements: job.jobRequirements || "",
|
||||||
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Process in smaller batches to avoid timeouts (AI_BATCH_SIZE jobs per batch, default 5)
|
||||||
|
const BATCH_SIZE = parseInt(process.env.AI_BATCH_SIZE) || 5;
|
||||||
|
analysisResults = [];
|
||||||
|
|
||||||
|
for (let i = 0; i < analysisData.length; i += BATCH_SIZE) {
|
||||||
|
const batch = analysisData.slice(i, i + BATCH_SIZE);
|
||||||
|
const batchNumber = Math.floor(i / BATCH_SIZE) + 1;
|
||||||
|
const totalBatches = Math.ceil(analysisData.length / BATCH_SIZE);
|
||||||
|
|
||||||
|
logger.info(` Processing batch ${batchNumber}/${totalBatches} (${batch.length} jobs)...`);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const batchResults = await analyzeBatch(
|
||||||
|
batch,
|
||||||
|
AI_CONTEXT,
|
||||||
|
OLLAMA_MODEL
|
||||||
|
);
|
||||||
|
analysisResults.push(...batchResults);
|
||||||
|
logger.success(` ✅ Batch ${batchNumber} completed`);
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(` ❌ Batch ${batchNumber} failed: ${error.message}`);
|
||||||
|
// Add fallback results for this batch
|
||||||
|
const fallbackResults = batch.map((_, idx) => ({
|
||||||
|
postIndex: i + idx + 1,
|
||||||
|
isRelevant: true,
|
||||||
|
confidence: 0.3,
|
||||||
|
reasoning: `Analysis failed: ${error.message}`,
|
||||||
|
}));
|
||||||
|
analysisResults.push(...fallbackResults);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Embed AI analysis into each job result
|
||||||
|
allResults.forEach((job, index) => {
|
||||||
|
if (analysisResults && analysisResults[index]) {
|
||||||
|
job.aiAnalysis = {
|
||||||
|
isRelevant: analysisResults[index].isRelevant,
|
||||||
|
confidence: analysisResults[index].confidence,
|
||||||
|
reasoning: analysisResults[index].reasoning,
|
||||||
|
context: AI_CONTEXT,
|
||||||
|
model: OLLAMA_MODEL,
|
||||||
|
analyzedAt: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
logger.success(
|
logger.success(
|
||||||
`✅ AI Analysis completed for ${allResults.length} jobs`
|
`✅ AI Analysis completed for ${allResults.length} jobs`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Save results after AI analysis completes
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, false);
|
||||||
} else {
|
} else {
|
||||||
logger.warning("⚠️ Ollama not available, skipping AI analysis");
|
logger.warning("⚠️ Ollama not available, skipping AI analysis");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save results
|
// Final save with complete flag
|
||||||
const outputData = {
|
logger.info(`💾 Preparing final save: ${allResults.length} results, ${allRejectedResults.length} rejected`);
|
||||||
metadata: {
|
logger.info(`💾 EXCLUDE_REJECTED env: ${process.env.EXCLUDE_REJECTED}, excludeRejected variable: ${excludeRejected}`);
|
||||||
extractedAt: new Date().toISOString(),
|
|
||||||
parser: "job-search-parser",
|
|
||||||
version: "2.0.0",
|
|
||||||
sites: sites,
|
|
||||||
keywords: keywords.join(", "),
|
|
||||||
locationFilter,
|
|
||||||
analysisResults,
|
|
||||||
},
|
|
||||||
results: allResults,
|
|
||||||
rejectedResults: allRejectedResults,
|
|
||||||
siteResults,
|
|
||||||
};
|
|
||||||
|
|
||||||
const resultsDir = path.join(__dirname, "results");
|
if (!excludeRejected) {
|
||||||
if (!fs.existsSync(resultsDir)) {
|
logger.info(`✅ Including ${allRejectedResults.length} rejected results in output`);
|
||||||
fs.mkdirSync(resultsDir, { recursive: true });
|
} else {
|
||||||
|
logger.info(`⏭️ Excluding rejected results (EXCLUDE_REJECTED=true)`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
logger.info(`💾 Final output: ${allResults.length} results, ${allRejectedResults.length} rejected`);
|
||||||
const filename = `job-search-results-${timestamp}.json`;
|
|
||||||
const filepath = path.join(resultsDir, filename);
|
|
||||||
|
|
||||||
fs.writeFileSync(filepath, JSON.stringify(outputData, null, 2));
|
// Final save with isComplete flag
|
||||||
|
saveIncrementalResults(allResults, allRejectedResults, siteResults, analysisResults, true);
|
||||||
|
|
||||||
|
const savedFiles = [];
|
||||||
|
if (incrementalJsonFilepath) savedFiles.push(incrementalJsonFilepath);
|
||||||
|
if (incrementalCsvFilepath) savedFiles.push(incrementalCsvFilepath);
|
||||||
|
|
||||||
// Final summary
|
// Final summary
|
||||||
logger.step("\n📊 Job Search Parser Summary");
|
logger.step("\n📊 Job Search Parser Summary");
|
||||||
logger.success(`✅ Total jobs found: ${allResults.length}`);
|
logger.success(`✅ Total jobs found: ${allResults.length}`);
|
||||||
logger.info(`❌ Total rejected: ${allRejectedResults.length}`);
|
logger.info(`❌ Total rejected: ${allRejectedResults.length}`);
|
||||||
logger.info(`📁 Results saved to: ${filepath}`);
|
logger.info(`📁 Results saved to:`);
|
||||||
|
savedFiles.forEach(filepath => {
|
||||||
|
logger.info(` ${filepath}`);
|
||||||
|
});
|
||||||
|
|
||||||
logger.info("\n📈 Results by site:");
|
logger.info("\n📈 Results by site:");
|
||||||
for (const [site, stats] of Object.entries(siteResults)) {
|
for (const [site, stats] of Object.entries(siteResults)) {
|
||||||
@ -207,6 +455,31 @@ async function startJobSearchParser(options = {}) {
|
|||||||
|
|
||||||
logger.success("\n✅ Job Search Parser completed successfully!");
|
logger.success("\n✅ Job Search Parser completed successfully!");
|
||||||
|
|
||||||
|
// Construct output data for return
|
||||||
|
const outputData = {
|
||||||
|
metadata: {
|
||||||
|
extractedAt: new Date().toISOString(),
|
||||||
|
parser: "job-search-parser",
|
||||||
|
version: "2.0.0",
|
||||||
|
sites: sites,
|
||||||
|
keywords: keywords.join(", "),
|
||||||
|
locationFilter,
|
||||||
|
aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
|
||||||
|
aiContext: ENABLE_AI_ANALYSIS ? AI_CONTEXT : undefined,
|
||||||
|
aiModel: ENABLE_AI_ANALYSIS ? OLLAMA_MODEL : undefined,
|
||||||
|
analysisResults: analysisResults,
|
||||||
|
rejectedJobsExcluded: excludeRejected,
|
||||||
|
isComplete: true,
|
||||||
|
lastUpdated: new Date().toISOString(),
|
||||||
|
},
|
||||||
|
results: allResults,
|
||||||
|
siteResults: siteResults,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!excludeRejected) {
|
||||||
|
outputData.rejectedResults = allRejectedResults;
|
||||||
|
}
|
||||||
|
|
||||||
return outputData;
|
return outputData;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`❌ Job Search Parser failed: ${error.message}`);
|
logger.error(`❌ Job Search Parser failed: ${error.message}`);
|
||||||
|
|||||||
@ -13,6 +13,7 @@ const {
|
|||||||
logger,
|
logger,
|
||||||
cleanText,
|
cleanText,
|
||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
|
containsAllKeywords,
|
||||||
parseLocationFilters,
|
parseLocationFilters,
|
||||||
validateLocationAgainstFilters,
|
validateLocationAgainstFilters,
|
||||||
extractLocationFromProfile,
|
extractLocationFromProfile,
|
||||||
@ -125,10 +126,12 @@ async function parseSkipTheDrive(options = {}) {
|
|||||||
headless = process.env.HEADLESS !== "false",
|
headless = process.env.HEADLESS !== "false",
|
||||||
enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
|
enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
|
||||||
aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
|
aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
|
||||||
|
useAndLogic = false, // Use AND logic instead of OR logic for keywords
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
logger.step("Starting SkipTheDrive parser...");
|
logger.step("Starting SkipTheDrive parser...");
|
||||||
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
||||||
|
logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
|
||||||
logger.info(
|
logger.info(
|
||||||
`📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
|
`📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
|
||||||
);
|
);
|
||||||
@ -154,8 +157,12 @@ async function parseSkipTheDrive(options = {}) {
|
|||||||
const seenJobs = new Set();
|
const seenJobs = new Set();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Search for each keyword
|
// For AND logic, combine all keywords into a single search query
|
||||||
for (const keyword of keywords) {
|
// For OR logic, search each keyword separately
|
||||||
|
const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords;
|
||||||
|
|
||||||
|
// Search for each keyword (or combined keyword for AND logic)
|
||||||
|
for (const keyword of searchKeywords) {
|
||||||
logger.info(`\n🔍 Searching for: ${keyword}`);
|
logger.info(`\n🔍 Searching for: ${keyword}`);
|
||||||
|
|
||||||
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
|
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
|
||||||
@ -208,11 +215,17 @@ async function parseSkipTheDrive(options = {}) {
|
|||||||
|
|
||||||
// Validate job against keywords
|
// Validate job against keywords
|
||||||
const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
|
const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
|
||||||
if (!containsAnyKeyword(fullText, keywords)) {
|
const keywordMatch = useAndLogic
|
||||||
|
? containsAllKeywords(fullText, keywords)
|
||||||
|
: containsAnyKeyword(fullText, keywords);
|
||||||
|
|
||||||
|
if (!keywordMatch) {
|
||||||
rejectedResults.push({
|
rejectedResults.push({
|
||||||
...jobData,
|
...jobData,
|
||||||
rejected: true,
|
rejected: true,
|
||||||
reason: "Keywords not found in job listing",
|
reason: useAndLogic
|
||||||
|
? "Not all keywords found in job listing"
|
||||||
|
: "Keywords not found in job listing",
|
||||||
});
|
});
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
116
job-search-parser/src/csv-utils.js
Normal file
116
job-search-parser/src/csv-utils.js
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
/**
|
||||||
|
* CSV Utilities
|
||||||
|
*
|
||||||
|
* Functions for converting job search results to CSV format
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Escapes a single value for inclusion in a CSV row.
 *
 * `null` and `undefined` become an empty field. Every other value is
 * converted with `String()`. A field containing a comma, a double quote, a
 * line feed or a carriage return is wrapped in double quotes, and any
 * embedded double quote is doubled.
 *
 * @param {*} value - The value to escape
 * @returns {string} - The escaped value
 */
function escapeCsvField(value) {
  if (value === null || value === undefined) {
    return "";
  }

  const stringValue = String(value);

  // A bare "\r" is treated as a record separator by many CSV readers, so it
  // has to force quoting exactly like "\n" does.
  if (/[",\r\n]/.test(stringValue)) {
    return `"${stringValue.replace(/"/g, '""')}"`;
  }

  return stringValue;
}
|
||||||
|
|
||||||
|
/**
 * Converts job results to CSV format.
 *
 * The output has one header row followed by one row per job. Plain columns
 * are read directly from the job object; the `ai*` columns are read from the
 * optional `job.aiAnalysis` object and are left empty when it is absent.
 * Missing (`null`/`undefined`) values become empty fields, while legitimate
 * falsy values such as a confidence of `0` are written as-is.
 *
 * @param {Array} jobs - Array of job objects
 * @param {Object} metadata - Metadata object (optional; accepted for API
 *   compatibility, not used when building the CSV)
 * @returns {string} - CSV string, or "" when there are no jobs
 */
function convertJobsToCsv(jobs, metadata = null) {
  if (!jobs || jobs.length === 0) {
    return "";
  }

  // Define CSV columns based on job object structure
  const columns = [
    "jobId",
    "title",
    "company",
    "location",
    "jobUrl",
    "postedDate",
    "description",
    "roleDuties",
    "jobRequirements",
    "jobType",
    "experienceLevel",
    "keyword",
    "extractedAt",
    "source",
    "aiRelevant",
    "aiConfidence",
    "aiReasoning",
    "aiContext",
    "aiModel",
    "aiAnalyzedAt",
  ];

  // How each AI column is derived from job.aiAnalysis
  const aiReaders = new Map([
    ["aiRelevant", (analysis) => (analysis.isRelevant ? "Yes" : "No")],
    ["aiConfidence", (analysis) => analysis.confidence],
    ["aiReasoning", (analysis) => analysis.reasoning],
    ["aiContext", (analysis) => analysis.context],
    ["aiModel", (analysis) => analysis.model],
    ["aiAnalyzedAt", (analysis) => analysis.analyzedAt],
  ]);

  // Create header row
  const headerRow = columns.map((col) => escapeCsvField(col)).join(",");

  // Create data rows
  const dataRows = jobs.map((job) => {
    const cells = columns.map((col) => {
      const readAi = aiReaders.get(col);
      if (readAi) {
        // AI columns stay empty for jobs that were never analyzed
        return job.aiAnalysis ? escapeCsvField(readAi(job.aiAnalysis)) : "";
      }
      // escapeCsvField maps null/undefined to "", so 0 and false survive
      return escapeCsvField(job[col]);
    });
    return cells.join(",");
  });

  // Combine header and data rows
  return [headerRow, ...dataRows].join("\n");
}
|
||||||
|
|
||||||
|
/**
 * Builds a CSV string from a complete results object.
 *
 * Delegates to convertJobsToCsv with the object's `results` and `metadata`.
 * A missing object or a missing/falsy `results` property yields "".
 *
 * @param {Object} resultsData - Full results object with metadata, results, etc.
 * @returns {string} - CSV string
 */
function convertResultsToCsv(resultsData) {
  const jobs = resultsData ? resultsData.results : null;
  return jobs ? convertJobsToCsv(jobs, resultsData.metadata) : "";
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
convertJobsToCsv,
|
||||||
|
convertResultsToCsv,
|
||||||
|
escapeCsvField,
|
||||||
|
};
|
||||||
|
|
||||||
947
job-search-parser/strategies/indeed-strategy.js
Normal file
947
job-search-parser/strategies/indeed-strategy.js
Normal file
@ -0,0 +1,947 @@
|
|||||||
|
/**
|
||||||
|
* Indeed Parsing Strategy
|
||||||
|
*
|
||||||
|
* Uses core-parser for browser management and ai-analyzer for utilities
|
||||||
|
*/
|
||||||
|
|
||||||
|
const {
|
||||||
|
logger,
|
||||||
|
cleanText,
|
||||||
|
containsAnyKeyword,
|
||||||
|
containsAllKeywords,
|
||||||
|
matchesKeywordGroups,
|
||||||
|
validateLocationAgainstFilters,
|
||||||
|
} = require("ai-analyzer");
|
||||||
|
|
||||||
|
/**
 * Builds an Indeed job-search URL.
 *
 * The query always carries `q` (the keyword) and `sort=date`. Further
 * parameters are appended, in this order, only when their source value is
 * truthy:
 *   - `l`       from `location`
 *   - `fromage` from `filters.fromage` (days, e.g. 1, 7, 30)
 *   - `jt`      from `filters.jobType` (fulltime, parttime, contract, internship, temporary)
 *   - `remote`  set to "true" when `filters.remote` is truthy
 *   - `explvl`  from `filters.experienceLevel` (entry_level, mid_level, senior_level)
 *
 * @param {string} keyword - Search query
 * @param {string} [location=""] - Indeed location search text
 * @param {Object} [filters={}] - Optional filters described above
 * @returns {string} - Absolute search URL
 */
function buildSearchUrl(keyword, location = "", filters = {}) {
  const query = new URLSearchParams({
    q: keyword,
    sort: "date", // newest first
  });

  const optionalParams = [
    ["l", location],
    ["fromage", filters.fromage],
    ["jt", filters.jobType],
    ["remote", filters.remote ? "true" : ""],
    ["explvl", filters.experienceLevel],
  ];

  for (const [name, value] of optionalParams) {
    if (value) {
      query.append(name, value);
    }
  }

  return `https://www.indeed.com/jobs?${query.toString()}`;
}
|
||||||
|
|
||||||
|
/**
 * Indeed parsing strategy function.
 *
 * Opens a page through `coreParser`, runs one Indeed search per search
 * keyword, walks the result pages, and sorts every extracted job into
 * `results` or `rejectedResults` according to the keyword and location
 * filters.
 *
 * Keyword modes:
 *   - `keywordGroups` set: every keyword of every group is searched on its
 *     own, and a job is kept only if `matchesKeywordGroups` accepts it.
 *   - `useAndLogic` true: all keywords are joined into one search query and a
 *     job is kept only if `containsAllKeywords` accepts it.
 *   - otherwise: each keyword is searched separately and no keyword check is
 *     applied to the extracted jobs.
 *
 * Errors while processing a single keyword (including CAPTCHA detection) are
 * logged and the next keyword is tried. Any other error is logged and
 * reported through `summary.error`; the function does not reject.
 *
 * @param {Object} coreParser - Browser manager; this function calls its
 *   `createPage(id)` and `navigateTo(url, options)` methods
 * @param {Object} [options]
 * @param {string[]} [options.keywords] - Keywords to search for
 * @param {string[][]|null} [options.keywordGroups] - Groups for grouped AND/OR logic
 * @param {*} [options.locationFilter] - Passed to validateLocationAgainstFilters
 * @param {number} [options.maxPages=5] - Pages per keyword; 0 or less means up to 999
 * @param {string} [options.location] - Indeed location search (e.g. "Toronto, ON")
 * @param {string|null} [options.minDate] - Minimum posted date (YYYY-MM-DD)
 * @param {boolean} [options.useAndLogic=false] - Use AND instead of OR keyword logic
 * @returns {Promise<{results: Object[], rejectedResults: Object[], summary: Object}>}
 */
async function indeedStrategy(coreParser, options = {}) {
  const {
    keywords = ["software engineer", "developer"],
    keywordGroups = null, // Array of keyword groups for grouped AND/OR logic
    locationFilter = null,
    maxPages = 5,
    location = "", // Indeed location search (e.g., "Toronto, ON", "Canada")
    minDate = null, // Minimum posted date (format: YYYY-MM-DD)
    useAndLogic = false, // Use AND logic instead of OR logic for keywords
  } = options;

  const results = [];
  const rejectedResults = [];
  const seenJobs = new Set();

  try {
    // Create main page
    const page = await coreParser.createPage("indeed-main");

    logger.info("🚀 Starting Indeed parser...");
    logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
    if (keywordGroups) {
      logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
    } else {
      logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
    }
    logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
    logger.info(`🌍 Indeed Location: ${location || "None"}`);
    logger.info(`📄 Max Pages: ${maxPages}`);

    // Convert minDate to fromage (days ago)
    let fromage = null;
    if (minDate) {
      const warnInvalidMinDate = () =>
        logger.warning(`⚠️  Invalid date format for minDate: ${minDate}. Expected format: YYYY-MM-DD`);
      try {
        const minDateObj = new Date(minDate);
        const now = new Date();
        const daysDiff = Math.floor((now - minDateObj) / (1000 * 60 * 60 * 24));

        // new Date() does not throw on an unparseable string; it yields an
        // Invalid Date whose arithmetic is NaN, so that case is checked here.
        if (Number.isNaN(daysDiff)) {
          warnInvalidMinDate();
        } else if (daysDiff > 0 && daysDiff <= 30) {
          fromage = daysDiff;
          logger.info(`📅 Min Date Filter: ${minDate} (${fromage} days ago)`);
        } else if (daysDiff > 30) {
          fromage = 30; // Indeed's maximum is typically 30 days
          logger.info(`📅 Min Date Filter: ${minDate} (limited to 30 days)`);
        }
        // daysDiff <= 0 (today or a future date): no date filter is applied
      } catch (error) {
        warnInvalidMinDate();
      }
    }

    // Determine search keywords based on logic type
    let searchKeywords;
    if (keywordGroups) {
      // For grouped AND/OR logic, search each keyword in each group (OR within groups)
      searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups
    } else if (useAndLogic) {
      // For simple AND logic, combine all keywords into a single search query
      searchKeywords = [keywords.join(" ")];
    } else {
      // For OR logic, search each keyword separately
      searchKeywords = keywords;
    }

    // Search for each keyword (or combined keyword for AND logic)
    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching Indeed for: "${keyword}"`);

      const searchUrl = buildSearchUrl(keyword, location, {
        fromage: fromage,
      });
      logger.info(`🔗 Search URL: ${searchUrl}`);

      try {
        // Navigate to job search results.
        // Use domcontentloaded instead of networkidle for faster loading;
        // Indeed can be slow to fully load, so wait for DOM and then check for content.
        try {
          await coreParser.navigateTo(searchUrl, {
            pageId: "indeed-main",
            retries: 2,
            waitUntil: "domcontentloaded",
            timeout: 60000, // Increase timeout to 60 seconds
          });
        } catch (navError) {
          // If navigation fails, try with load event instead
          logger.warning(`⚠️  Initial navigation failed, trying with 'load' event: ${navError.message}`);
          try {
            await coreParser.navigateTo(searchUrl, {
              pageId: "indeed-main",
              retries: 1,
              waitUntil: "load",
              timeout: 60000,
            });
          } catch (loadError) {
            // Last resort: try direct page navigation
            logger.warning(`⚠️  Load event failed, trying direct navigation: ${loadError.message}`);
            await page.goto(searchUrl, { timeout: 60000, waitUntil: "domcontentloaded" }).catch(() => {
              throw new Error(`Failed to navigate to Indeed after all attempts: ${loadError.message}`);
            });
          }
        }

        // Wait for page to load and let JavaScript execute
        await new Promise((resolve) => setTimeout(resolve, 5000));

        // Check if we're on the right page
        const currentUrl = page.url();
        logger.info(`📍 Current page URL: ${currentUrl}`);

        // Check if we were redirected or blocked (check URL first)
        if (currentUrl.includes('captcha') || currentUrl.includes('blocked') || currentUrl.includes('access-denied') || currentUrl.includes('verify')) {
          logger.error(`❌ Indeed appears to be blocking access. URL: ${currentUrl}`);
          throw new Error('Indeed is showing a CAPTCHA or verification page. Please try running in non-headless mode (set HEADLESS=false in .env) or wait and try again later.');
        }

        // Check page content for CAPTCHA/human verification indicators
        try {
          // The callback runs inside the page, so it must stay self-contained.
          const pageContent = await page.evaluate(() => {
            const bodyText = document.body?.textContent?.toLowerCase() || '';
            const title = document.title?.toLowerCase() || '';

            // Check for common CAPTCHA/verification indicators
            const captchaIndicators = [
              'verify you\'re human',
              'verify you are human',
              'captcha',
              'prove you\'re not a robot',
              'unusual traffic',
              'automated queries',
              'please verify',
              'security check',
              'access denied',
              'blocked',
            ];

            const foundIndicators = captchaIndicators.filter(indicator =>
              bodyText.includes(indicator) || title.includes(indicator)
            );

            return {
              hasCaptcha: foundIndicators.length > 0,
              indicators: foundIndicators,
              title: document.title,
              bodyPreview: bodyText.substring(0, 500),
            };
          });

          if (pageContent.hasCaptcha) {
            logger.error(`❌ Indeed is showing a CAPTCHA/verification page.`);
            logger.error(`   Detected indicators: ${pageContent.indicators.join(', ')}`);
            logger.error(`   Page title: ${pageContent.title}`);
            logger.error(`\n💡 Solutions:`);
            logger.error(`   1. Run in non-headless mode: Set HEADLESS=false in .env file`);
            logger.error(`   2. Wait a few minutes and try again`);
            logger.error(`   3. Use a different IP address or VPN`);
            logger.error(`   4. Manually solve the CAPTCHA in a browser, then try again`);
            throw new Error(`Indeed CAPTCHA detected: ${pageContent.indicators.join(', ')}. Please see suggestions above.`);
          }
        } catch (checkError) {
          // If the check itself fails, log but don't throw (might be a different error)
          if (checkError.message.includes('CAPTCHA')) {
            throw checkError; // Re-throw CAPTCHA errors
          }
          logger.debug(`Could not check for CAPTCHA: ${checkError.message}`);
        }

        // Check for results count
        try {
          const resultsText = await page.evaluate(() => {
            const countElement = document.querySelector(".jobsearch-JobCountAndSortPane-jobCount");
            return countElement ? countElement.textContent : "No results count found";
          });
          logger.info(`📊 Indeed results info: ${resultsText}`);
        } catch (e) {
          logger.debug(`Could not get results count: ${e.message}`);
        }

        // Wait for job listings container
        let hasResults = false;
        const possibleSelectors = [
          "#mosaic-provider-jobcards",
          ".job_seen_beacon",
          "[data-jk]",
          ".jobsearch-SerpJobCard",
          ".jobCard",
        ];

        for (const selector of possibleSelectors) {
          try {
            await page.waitForSelector(selector, { timeout: 5000 });
            const count = await page.$$(selector).then((elements) => elements.length);
            if (count > 0) {
              hasResults = true;
              logger.info(`✅ Found job results container with selector: ${selector} (${count} jobs)`);
              break;
            }
          } catch (e) {
            // Try next selector
            continue;
          }
        }

        if (!hasResults) {
          logger.warning(`⚠️  No job results container found for keyword: ${keyword}`);
          continue;
        }

        // Process multiple pages
        let currentPage = 1;
        const maxPagesToProcess = maxPages > 0 ? maxPages : 999; // 0 means unlimited

        logger.info(`📄 Processing pages (max: ${maxPagesToProcess === 999 ? 'unlimited' : maxPagesToProcess}) for "${keyword}"...`);

        while (currentPage <= maxPagesToProcess) {
          logger.info(`📄 Processing page ${currentPage}...`);

          // Wait for page to fully load
          await new Promise((resolve) => setTimeout(resolve, 2000));

          // Extract jobs from current page
          const pageJobs = await extractJobsFromPage(page, keyword, locationFilter);
          logger.info(`📋 Extracted ${pageJobs.length} jobs from page ${currentPage}`);

          if (pageJobs.length === 0) {
            logger.warning(`⚠️  No jobs found on page ${currentPage}, stopping pagination`);
            break;
          }

          // Process each job
          for (const job of pageJobs) {
            // Skip duplicates
            if (seenJobs.has(job.jobId)) {
              continue;
            }
            seenJobs.add(job.jobId);

            // Validate keywords based on logic type
            if (keywordGroups) {
              // Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR)
              const fullText = `${job.title} ${job.description} ${job.company}`;
              if (!matchesKeywordGroups(fullText, keywordGroups)) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: "Job does not match all keyword groups",
                });
                continue;
              }
            } else if (useAndLogic) {
              // Simple AND logic: all keywords must match
              const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase();
              if (!containsAllKeywords(fullText, keywords)) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: "Not all keywords found in job listing",
                });
                continue;
              }
            }

            // Validate location if filtering enabled
            if (locationFilter) {
              const locationValid = validateLocationAgainstFilters(
                job.location,
                locationFilter
              );

              if (!locationValid.isValid) {
                rejectedResults.push({
                  ...job,
                  rejectionReason: locationValid.reasoning || "Location filter mismatch",
                });
                continue;
              }
            }

            results.push(job);
          }

          // Check if there's a next page
          const hasNext = await hasNextPageAvailable(page);
          if (!hasNext) {
            logger.info(`✅ No more pages available. Total jobs extracted: ${results.length}`);
            break;
          }

          // Navigate to next page if we haven't reached maxPages
          if (currentPage < maxPagesToProcess) {
            logger.info(`➡️  Navigating to page ${currentPage + 1}...`);
            const navigationSuccess = await navigateToNextPage(page);

            if (!navigationSuccess) {
              logger.warning(`⚠️  Failed to navigate to next page, stopping pagination`);
              break;
            }

            currentPage++;
          } else {
            logger.info(`📊 Reached max pages limit (${maxPagesToProcess}). Total jobs extracted: ${results.length}`);
            break;
          }
        }

        // Note: these counts are cumulative across all keywords processed so far.
        const totalExtracted = results.length + rejectedResults.length;
        logger.info(`📋 Extracted ${results.length} accepted jobs, ${rejectedResults.length} rejected jobs (${totalExtracted} total) across ${currentPage} page(s) for "${keyword}"`);
      } catch (error) {
        // A failure for one keyword must not abort the remaining keywords.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
        logger.error(`Stack: ${error.stack}`);
      }
    }

    logger.info(
      `🎯 Indeed parsing completed: ${results.length} jobs found, ${rejectedResults.length} rejected`
    );

    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "indeed",
      },
    };
  } catch (error) {
    logger.error(`❌ Indeed parsing failed: ${error.message}`);
    logger.error(`Stack: ${error.stack}`);
    // Return whatever was collected before the failure, plus the error text.
    return {
      results,
      rejectedResults,
      summary: {
        totalJobs: results.length,
        totalRejected: rejectedResults.length,
        keywords: keywords.join(", "),
        locationFilter,
        source: "indeed",
        error: error.message,
      },
    };
  }
}
|
||||||
|
|
||||||
|
/**
 * Extract jobs from current page.
 *
 * Tries a list of candidate selectors and uses the first one that yields at
 * least one element. Each element is scrolled into view on a best-effort
 * basis and handed to `extractJobData`. A job is kept when it has a title or
 * a job ID. Failures on individual elements are logged and skipped; a failure
 * of the whole extraction is logged and whatever was collected is returned.
 *
 * @param {Object} page - Browser page handle; `waitForSelector` and `$$` are used
 * @param {string} keyword - Search keyword recorded on each extracted job
 * @param {*} locationFilter - Currently unused by this function
 * @returns {Promise<Object[]>} Extracted job objects (possibly empty)
 */
async function extractJobsFromPage(page, keyword, locationFilter) {
  const jobs = [];

  try {
    // Indeed job listings are typically in divs with data-jk attribute (job key)
    const jobSelectors = [
      "[data-jk]",
      ".job_seen_beacon",
      ".jobsearch-SerpJobCard",
      ".jobCard",
      "div[data-testid='job-card']",
    ];

    let jobElements = [];
    for (const selector of jobSelectors) {
      try {
        // A wait timeout is not fatal: the selector is still queried below.
        await page.waitForSelector(selector, { timeout: 3000 }).catch(() => {});
        const elements = await page.$$(selector);
        if (elements.length > 0) {
          jobElements = elements;
          logger.info(`✅ Found ${jobElements.length} job elements using selector: ${selector}`);
          break;
        }
      } catch (e) {
        // Try next selector
        continue;
      }
    }

    if (jobElements.length === 0) {
      logger.warning(`⚠️  No job elements found with any selector`);
      return jobs;
    }

    for (const jobElement of jobElements) {
      try {
        // Try to scroll job into view, but don't fail if it times out.
        // Some elements might be in hidden containers or lazy-loaded.
        try {
          let scrollTimer;
          try {
            await Promise.race([
              jobElement.scrollIntoViewIfNeeded(),
              new Promise((_, reject) => {
                scrollTimer = setTimeout(() => reject(new Error('Scroll timeout')), 2000);
              }),
            ]);
          } finally {
            // Without this, every element would leave a live 2 s timer behind.
            clearTimeout(scrollTimer);
          }
          await new Promise((resolve) => setTimeout(resolve, 100));
        } catch (scrollError) {
          // If scrolling fails, try a simpler scroll approach
          try {
            await jobElement.evaluate((el) => {
              el.scrollIntoView({ behavior: 'auto', block: 'center' });
            });
            await new Promise((resolve) => setTimeout(resolve, 100));
          } catch (simpleScrollError) {
            // If even simple scroll fails, continue anyway - we can still extract data
            logger.debug(`Could not scroll element into view, continuing anyway: ${simpleScrollError.message}`);
          }
        }

        const job = await extractJobData(jobElement, keyword);
        if (job && (job.title || job.jobId)) {
          jobs.push(job);
        }
      } catch (error) {
        logger.warning(`Failed to extract job data: ${error.message}`);
      }
    }
  } catch (error) {
    logger.error(`Failed to extract jobs from page: ${error.message}`);
  }

  return jobs;
}
|
||||||
|
|
||||||
|
/**
 * Extract data from individual job element.
 *
 * Reads the raw fields inside the page via `jobElement.evaluate`, then
 * normalizes them: text fields go through `cleanText`, a relative job URL is
 * made absolute against https://www.indeed.com, and a missing URL is rebuilt
 * from the job ID. When the element has a title but no job ID, a synthetic
 * `indeed-<timestamp>-<random>` ID is generated.
 *
 * @param {Object} jobElement - Element handle exposing `evaluate(fn)`
 * @param {string} keyword - Search keyword recorded on the returned job
 * @returns {Promise<Object|null>} The job object, or `null` when the element
 *   has neither a job ID nor a title, or when extraction throws
 */
async function extractJobData(jobElement, keyword) {
  try {
    // This callback executes in the page, so it cannot reference anything
    // from the enclosing module scope.
    const jobData = await jobElement.evaluate((el) => {
      const data = {
        jobId: "",
        title: "",
        company: "",
        location: "",
        jobUrl: "",
        postedDate: "",
        description: "",
        salary: "",
        jobType: "",
      };

      const textOf = (node) =>
        node.textContent?.trim() || node.innerText?.trim() || "";

      // Text of the first selector whose element exists and whose text passes `accept`
      const firstText = (selectors, accept) => {
        for (const selector of selectors) {
          const node = el.querySelector(selector);
          if (node) {
            const text = textOf(node);
            if (accept(text)) {
              return text;
            }
          }
        }
        return "";
      };

      // Extract job ID from data-jk attribute
      data.jobId = el.getAttribute("data-jk") || "";

      // Extract title and URL
      const titleSelectors = [
        "h2.jobTitle a",
        "h2.jobTitle",
        "a[data-jk]",
        "h2 a",
        ".jobTitle a",
        "[class*='jobTitle'] a",
      ];

      for (const selector of titleSelectors) {
        const titleElement = el.querySelector(selector);
        if (titleElement) {
          data.title = textOf(titleElement);
          if (titleElement.tagName === "A") {
            data.jobUrl = titleElement.getAttribute("href") || "";
          } else {
            const link = titleElement.querySelector("a");
            if (link) {
              data.jobUrl = link.getAttribute("href") || "";
            }
          }
          if (data.title) break;
        }
      }

      // Extract company name
      data.company = firstText(
        [
          "[data-testid='company-name']",
          ".companyName",
          "[class*='companyName']",
          "span.companyName",
          "a[data-testid='company-name']",
        ],
        (text) => text.length > 0
      );

      // Extract location
      data.location = firstText(
        [
          "[data-testid='job-location']",
          ".companyLocation",
          "[class*='companyLocation']",
          "[class*='location']",
        ],
        (text) => text.length > 0
      );

      // Extract salary: accept text that has a "$" or at least one digit
      data.salary = firstText(
        [
          "[data-testid='attribute_snippet_testid']",
          ".salary-snippet",
          "[class*='salary']",
          ".salaryText",
        ],
        (text) => text.includes("$") || /\d/.test(text)
      );

      // Extract posted date
      const dateText = firstText(
        [
          "[data-testid='myJobsStateDate']",
          ".date",
          "[class*='date']",
          "span.date",
        ],
        (text) => text.length > 0
      );

      if (dateText) {
        // Parse relative dates like "2 days ago", "Just posted", etc.
        const now = new Date();
        const daysMatch = dateText.match(/(\d+)\s*day/i);
        if (/just posted|today/i.test(dateText)) {
          data.postedDate = now.toISOString().split("T")[0];
        } else if (daysMatch) {
          const daysAgo = parseInt(daysMatch[1], 10);
          const date = new Date(now);
          date.setDate(date.getDate() - daysAgo);
          data.postedDate = date.toISOString().split("T")[0];
        } else {
          // Unrecognized format: keep the raw text
          data.postedDate = dateText;
        }
      }

      // Extract description snippet
      data.description = firstText(
        [
          ".job-snippet",
          "[class*='job-snippet']",
          "[class*='summary']",
          ".summary",
        ],
        (text) => text.length > 20
      ).substring(0, 500); // Limit description length

      return data;
    });

    // Clean and format
    const title = cleanText(jobData.title);
    let jobUrl = jobData.jobUrl || "";

    // Make URL absolute if relative
    if (jobUrl && !jobUrl.startsWith("http")) {
      if (jobUrl.startsWith("/")) {
        jobUrl = `https://www.indeed.com${jobUrl}`;
      } else {
        jobUrl = `https://www.indeed.com/viewjob?jk=${jobData.jobId}`;
      }
    } else if (!jobUrl && jobData.jobId) {
      jobUrl = `https://www.indeed.com/viewjob?jk=${jobData.jobId}`;
    }

    // Nothing identifies this element as a job. This has to be checked before
    // a synthetic ID is generated, otherwise the check can never be true.
    if (!jobData.jobId && !title) {
      return null;
    }

    // Generate job ID if not found
    const jobId = jobData.jobId || `indeed-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;

    return {
      jobId,
      title,
      company: cleanText(jobData.company),
      location: cleanText(jobData.location),
      jobUrl,
      postedDate: jobData.postedDate,
      description: cleanText(jobData.description),
      salary: cleanText(jobData.salary),
      jobType: jobData.jobType,
      keyword,
      extractedAt: new Date().toISOString(),
      source: "indeed",
    };
  } catch (error) {
    logger.warning(`Error extracting job data: ${error.message}`);
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Parse job description to separate role duties from job requirements.
 *
 * The description is split into paragraphs on blank lines. A paragraph that
 * contains a requirements header switches the running section to
 * "requirements"; one that contains a duties header switches it back to
 * "duties"; paragraphs without a header stay in whatever section is current
 * (initially "duties").
 *
 * When a single paragraph contains BOTH kinds of header (typical for
 * descriptions that use single newlines and no blank lines), it is split
 * line by line at the header lines instead of being assigned wholesale to
 * requirements.
 *
 * @param {string} description - Job description text.
 * @returns {{duties: string, requirements: string}} The two parts, each
 *   trimmed; both are empty strings for empty or blank input.
 */
function parseDutiesAndRequirements(description) {
  if (!description || description.trim().length === 0) {
    return { duties: "", requirements: "" };
  }

  // Common section headers that indicate duties/responsibilities
  const dutiesKeywords = [
    /responsibilities?:/i,
    /duties?:/i,
    /what you['\u2019]ll do/i,
    /key responsibilities/i,
    /your role/i,
    /position overview/i,
    /about the role/i,
  ];

  // Common section headers that indicate requirements/qualifications
  const requirementsKeywords = [
    /requirements?:/i,
    /qualifications?:/i,
    /must have/i,
    /required:/i,
    /what you['\u2019]ll bring/i,
    /you have:/i,
    /skills required/i,
    /minimum requirements/i,
  ];

  const matchesAny = (patterns, text) =>
    patterns.some((pattern) => pattern.test(text));

  // Requirements headers take priority when both kinds match the same text.
  const detectHeader = (text) => {
    if (matchesAny(requirementsKeywords, text)) return "requirements";
    if (matchesAny(dutiesKeywords, text)) return "duties";
    return null;
  };

  const collected = { duties: [], requirements: [] };
  const append = (sectionName, text) => {
    const trimmed = text.trim();
    if (trimmed) collected[sectionName].push(trimmed);
  };

  // Split description into paragraphs on blank lines
  const sections = description
    .split(/\n\s*\n|\r\n\s*\r\n/)
    .filter((s) => s.trim().length > 0);

  let currentSection = "duties";

  for (const section of sections) {
    const hasBothHeaders =
      matchesAny(requirementsKeywords, section) &&
      matchesAny(dutiesKeywords, section);

    if (!hasBothHeaders) {
      currentSection = detectHeader(section) || currentSection;
      append(currentSection, section);
      continue;
    }

    // Mixed paragraph: cut it at each line whose header changes the section.
    let pendingLines = [];
    for (const line of section.split(/\r?\n/)) {
      const header = detectHeader(line);
      if (header && header !== currentSection) {
        append(currentSection, pendingLines.join("\n"));
        pendingLines = [];
        currentSection = header;
      }
      pendingLines.push(line);
    }
    append(currentSection, pendingLines.join("\n"));
  }

  return {
    duties: collected.duties.join("\n\n"),
    requirements: collected.requirements.join("\n\n"),
  };
}
|
||||||
|
|
||||||
|
/**
 * Check if next page is available.
 *
 * Tries each known "Next" pagination selector in order and reports whether
 * any of them resolves to an element that is not disabled (no `disabled`
 * attribute, no `aria-disabled="true"`, no `disabled` class).
 *
 * @param {object} page - Browser page handle exposing `$(selector)`; the
 *   returned element handle must expose `evaluate(fn)`.
 * @returns {Promise<boolean>} `true` when an enabled next-page control was
 *   found, otherwise `false`. Never throws.
 */
async function hasNextPageAvailable(page) {
  const nextButtonSelectors = [
    "a[aria-label='Next']",
    "a[aria-label='Next Page']",
    "a[data-testid='pagination-page-next']",
    "[data-testid='pagination-page-next']",
    "a[aria-label*='Next']",
  ];

  try {
    for (const selector of nextButtonSelectors) {
      try {
        const nextButton = await page.$(selector);
        if (!nextButton) continue;

        // If the disabled state cannot be read, treat the button as enabled.
        const isDisabled = await nextButton
          .evaluate(
            (el) =>
              el.hasAttribute("disabled") ||
              el.getAttribute("aria-disabled") === "true" ||
              el.classList.contains("disabled")
          )
          .catch(() => false);

        if (!isDisabled) {
          return true;
        }
      } catch (selectorError) {
        // A failing selector is not fatal: record it and try the next one.
        logger.debug(
          `Next page selector "${selector}" failed: ${selectorError.message}`
        );
      }
    }

    return false;
  } catch (error) {
    logger.debug(`Error checking for next page: ${error.message}`);
    return false;
  }
}
|
||||||
|
|
||||||
|
/**
 * Navigate to next page.
 *
 * Clicks the first enabled pagination "Next" control, waits (up to 10s) for
 * either the URL or the listed job cards to change, then checks the new page
 * for CAPTCHA/block indicators and for job cards.
 *
 * Only locating and clicking the button is retried across selectors. Once a
 * click has happened no further selector is tried, so a CAPTCHA or other
 * post-click failure can never cause a second click that would skip a page.
 *
 * @param {object} page - Browser page handle exposing `$`, `$$eval`,
 *   `evaluate`, `url` and `title` (presumably a Playwright page, since the
 *   element handle's `scrollIntoViewIfNeeded` is used — confirm against caller).
 * @returns {Promise<boolean>} `true` when job cards are present after the
 *   navigation; `false` when no usable button exists, a CAPTCHA/block page is
 *   detected, no job cards are found, or any error occurs. Never throws.
 */
async function navigateToNextPage(page) {
  const nextButtonSelectors = [
    "a[aria-label='Next']",
    "a[aria-label='Next Page']",
    "a[data-testid='pagination-page-next']",
    "[data-testid='pagination-page-next']",
    "a[aria-label*='Next']",
  ];
  const jobCardSelector =
    "[data-jk], .job_seen_beacon, .jobsearch-SerpJobCard, .jobCard";
  const maxWaitTime = 10000; // 10 seconds max wait

  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Identify the currently listed job cards so that an in-place (AJAX) update
  // can be told apart from the stale cards of the previous page.
  const readJobCardIds = async () => {
    try {
      return await page.$$eval(jobCardSelector, (elements) =>
        elements.map(
          (el) =>
            el.getAttribute("data-jk") ||
            (el.textContent || "").trim().slice(0, 80)
        )
      );
    } catch {
      return [];
    }
  };

  try {
    let clicked = false;
    let urlBefore = null;
    let cardsBefore = "";

    for (const selector of nextButtonSelectors) {
      try {
        const nextButton = await page.$(selector);
        if (!nextButton) continue;

        const isDisabled = await nextButton
          .evaluate(
            (el) =>
              el.hasAttribute("disabled") ||
              el.getAttribute("aria-disabled") === "true" ||
              el.classList.contains("disabled")
          )
          .catch(() => false);
        if (isDisabled) continue;

        // Snapshot the state before the click so a change can be detected.
        urlBefore = page.url();
        cardsBefore = (await readJobCardIds()).join("\n");

        await nextButton.scrollIntoViewIfNeeded().catch(() => {});
        await sleep(500);

        await nextButton.click();
        clicked = true;
        break;
      } catch (selectorError) {
        // Locating/clicking via this selector failed; try the next one.
        logger.debug(
          `Next page selector "${selector}" failed: ${selectorError.message}`
        );
      }
    }

    if (!clicked) {
      logger.warning(`⚠️ Could not find or click next page button`);
      return false;
    }
    logger.info(`✅ Clicked next page button`);

    // Wait for navigation to complete (URL change or content update).
    // Indeed might use AJAX, so wait for either signal.
    const startTime = Date.now();
    while (Date.now() - startTime < maxWaitTime) {
      await sleep(500);

      // Full page navigation
      const urlNow = page.url();
      if (urlNow !== urlBefore) {
        logger.info(`📍 URL changed to: ${urlNow}`);
        break;
      }

      // AJAX navigation: the set of listed job cards is different from before
      const cardIds = await readJobCardIds();
      if (cardIds.length > 0 && cardIds.join("\n") !== cardsBefore) {
        logger.info(`✅ Found ${cardIds.length} job elements (AJAX navigation)`);
        break;
      }
    }

    // Additional wait for content to stabilize
    await sleep(2000);

    // Check for CAPTCHA after navigation
    const currentUrl = page.url();
    if (
      currentUrl.includes('captcha') ||
      currentUrl.includes('verify') ||
      currentUrl.includes('blocked')
    ) {
      logger.error(`❌ CAPTCHA detected after navigation to page. URL: ${currentUrl}`);
      throw new Error('Indeed is showing a CAPTCHA. Please run in non-headless mode (HEADLESS=false) or wait and try again.');
    }

    // Check page content for CAPTCHA; a failing probe counts as "no CAPTCHA".
    let hasCaptcha = false;
    try {
      hasCaptcha = await page.evaluate(() => {
        const bodyText = document.body?.textContent?.toLowerCase() || '';
        const indicators = ['verify you\'re human', 'captcha', 'unusual traffic', 'automated queries'];
        return indicators.some(ind => bodyText.includes(ind));
      });
    } catch {
      hasCaptcha = false;
    }
    if (hasCaptcha) {
      logger.error(`❌ CAPTCHA detected on page content after navigation`);
      throw new Error('Indeed CAPTCHA detected. Please run in non-headless mode (HEADLESS=false) to solve it manually.');
    }

    // Scroll page to trigger any lazy loading
    try {
      await page.evaluate(() => {
        window.scrollTo(0, 300);
      });
      await sleep(1000);
    } catch {
      // Ignore scroll errors
    }

    // Final check for job elements with multiple selectors
    const finalJobCount = await page
      .$$eval(
        "[data-jk], .job_seen_beacon, .jobsearch-SerpJobCard, .jobCard, div[data-testid='job-card']",
        (elements) => elements.length
      )
      .catch(() => 0);

    if (finalJobCount > 0) {
      logger.info(`✅ Navigation successful, found ${finalJobCount} job elements`);
      return true;
    }

    logger.warning(`⚠️ No job elements found after navigation (waited ${maxWaitTime}ms)`);

    // Debug: check what's on the page (diagnostics only, failures are ignored)
    let bodyText = '';
    try {
      const pageTitle = await page.title();
      const pageUrl = page.url();
      logger.debug(`Page title: ${pageTitle}, URL: ${pageUrl}`);
      bodyText = await page.evaluate(() => document.body?.textContent?.toLowerCase() || '');
    } catch {
      bodyText = '';
    }

    if (bodyText.includes('captcha') || bodyText.includes('verify')) {
      logger.error(`❌ Page appears to be a CAPTCHA page`);
      throw new Error('Indeed is showing a CAPTCHA. Please run in non-headless mode (HEADLESS=false).');
    }

    return false;
  } catch (error) {
    logger.warning(`Failed to navigate to next page: ${error.message}`);
    return false;
  }
}
|
||||||
|
|
||||||
|
// Public API of this strategy module: the strategy entry point and the
// search-URL builder.
module.exports = { indeedStrategy, buildSearchUrl };
|
||||||
|
|
||||||
1682
job-search-parser/strategies/linkedin-jobs-strategy.js
Normal file
1682
job-search-parser/strategies/linkedin-jobs-strategy.js
Normal file
File diff suppressed because it is too large
Load Diff
@ -8,6 +8,8 @@ const {
|
|||||||
logger,
|
logger,
|
||||||
cleanText,
|
cleanText,
|
||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
|
containsAllKeywords,
|
||||||
|
matchesKeywordGroups,
|
||||||
validateLocationAgainstFilters,
|
validateLocationAgainstFilters,
|
||||||
} = require("ai-analyzer");
|
} = require("ai-analyzer");
|
||||||
|
|
||||||
@ -34,9 +36,11 @@ function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
|
|||||||
async function skipthedriveStrategy(coreParser, options = {}) {
|
async function skipthedriveStrategy(coreParser, options = {}) {
|
||||||
const {
|
const {
|
||||||
keywords = ["software engineer", "developer", "programmer"],
|
keywords = ["software engineer", "developer", "programmer"],
|
||||||
|
keywordGroups = null, // Array of keyword groups for grouped AND/OR logic
|
||||||
locationFilter = null,
|
locationFilter = null,
|
||||||
maxPages = 5,
|
maxPages = 5,
|
||||||
jobTypes = [],
|
jobTypes = [],
|
||||||
|
useAndLogic = false, // Use AND logic instead of OR logic for keywords
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
const results = [];
|
const results = [];
|
||||||
@ -49,11 +53,29 @@ async function skipthedriveStrategy(coreParser, options = {}) {
|
|||||||
|
|
||||||
logger.info("🚀 Starting SkipTheDrive parser...");
|
logger.info("🚀 Starting SkipTheDrive parser...");
|
||||||
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
|
||||||
|
if (keywordGroups) {
|
||||||
|
logger.info(`🔗 Keyword Logic: Grouped AND/OR - ${keywordGroups.map(g => `(${g.join(' OR ')})`).join(' AND ')}`);
|
||||||
|
} else {
|
||||||
|
logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
|
||||||
|
}
|
||||||
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
|
||||||
logger.info(`📄 Max Pages: ${maxPages}`);
|
logger.info(`📄 Max Pages: ${maxPages}`);
|
||||||
|
|
||||||
// Search for each keyword
|
// Determine search keywords based on logic type
|
||||||
for (const keyword of keywords) {
|
let searchKeywords;
|
||||||
|
if (keywordGroups) {
|
||||||
|
// For grouped AND/OR logic, search each keyword in each group (OR within groups)
|
||||||
|
searchKeywords = keywordGroups.flat(); // Flatten all keywords from all groups
|
||||||
|
} else if (useAndLogic) {
|
||||||
|
// For simple AND logic, combine all keywords into a single search query
|
||||||
|
searchKeywords = [keywords.join(" ")];
|
||||||
|
} else {
|
||||||
|
// For OR logic, search each keyword separately
|
||||||
|
searchKeywords = keywords;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Search for each keyword (or combined keyword for AND logic)
|
||||||
|
for (const keyword of searchKeywords) {
|
||||||
logger.info(`\n🔍 Searching for: ${keyword}`);
|
logger.info(`\n🔍 Searching for: ${keyword}`);
|
||||||
|
|
||||||
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
|
const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
|
||||||
@ -67,14 +89,11 @@ async function skipthedriveStrategy(coreParser, options = {}) {
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Wait for job listings to load
|
// Wait for job listings to load
|
||||||
const hasResults = await coreParser
|
const hasResults = await page
|
||||||
.waitForSelector(
|
.waitForSelector("#loops-wrapper", {
|
||||||
"#loops-wrapper",
|
timeout: 5000,
|
||||||
{
|
})
|
||||||
timeout: 5000,
|
.then(() => true)
|
||||||
},
|
|
||||||
"skipthedrive-main"
|
|
||||||
)
|
|
||||||
.catch(() => {
|
.catch(() => {
|
||||||
logger.warning(`No results found for keyword: ${keyword}`);
|
logger.warning(`No results found for keyword: ${keyword}`);
|
||||||
return false;
|
return false;
|
||||||
@ -95,7 +114,10 @@ async function skipthedriveStrategy(coreParser, options = {}) {
|
|||||||
const pageJobs = await extractJobsFromPage(
|
const pageJobs = await extractJobsFromPage(
|
||||||
page,
|
page,
|
||||||
keyword,
|
keyword,
|
||||||
locationFilter
|
locationFilter,
|
||||||
|
keywords,
|
||||||
|
keywordGroups,
|
||||||
|
useAndLogic
|
||||||
);
|
);
|
||||||
|
|
||||||
for (const job of pageJobs) {
|
for (const job of pageJobs) {
|
||||||
@ -103,6 +125,29 @@ async function skipthedriveStrategy(coreParser, options = {}) {
|
|||||||
if (seenJobs.has(job.jobId)) continue;
|
if (seenJobs.has(job.jobId)) continue;
|
||||||
seenJobs.add(job.jobId);
|
seenJobs.add(job.jobId);
|
||||||
|
|
||||||
|
// Validate keywords based on logic type
|
||||||
|
if (keywordGroups) {
|
||||||
|
// Grouped AND/OR logic: all groups must match (AND), at least one keyword per group (OR)
|
||||||
|
const fullText = `${job.title} ${job.description} ${job.company}`;
|
||||||
|
if (!matchesKeywordGroups(fullText, keywordGroups)) {
|
||||||
|
rejectedResults.push({
|
||||||
|
...job,
|
||||||
|
rejectionReason: "Job does not match all keyword groups",
|
||||||
|
});
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else if (useAndLogic) {
|
||||||
|
// Simple AND logic: all keywords must match
|
||||||
|
const fullText = `${job.title} ${job.description} ${job.company}`.toLowerCase();
|
||||||
|
if (!containsAllKeywords(fullText, keywords)) {
|
||||||
|
rejectedResults.push({
|
||||||
|
...job,
|
||||||
|
rejectionReason: "Not all keywords found in job listing",
|
||||||
|
});
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Validate location if filtering enabled
|
// Validate location if filtering enabled
|
||||||
if (locationFilter) {
|
if (locationFilter) {
|
||||||
const locationValid = validateLocationAgainstFilters(
|
const locationValid = validateLocationAgainstFilters(
|
||||||
@ -163,7 +208,7 @@ async function skipthedriveStrategy(coreParser, options = {}) {
|
|||||||
/**
|
/**
|
||||||
* Extract jobs from current page
|
* Extract jobs from current page
|
||||||
*/
|
*/
|
||||||
async function extractJobsFromPage(page, keyword, locationFilter) {
|
async function extractJobsFromPage(page, keyword, locationFilter, allKeywords = [], keywordGroups = null, useAndLogic = false) {
|
||||||
const jobs = [];
|
const jobs = [];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -187,6 +232,147 @@ async function extractJobsFromPage(page, keyword, locationFilter) {
|
|||||||
return jobs;
|
return jobs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Parse job description to separate role duties from job requirements.
 *
 * The description is split into paragraphs on blank lines. A paragraph that
 * contains a requirements header switches the running section to
 * "requirements"; one that contains a duties header switches it back to
 * "duties"; paragraphs without a header stay in whatever section is current
 * (initially "duties").
 *
 * When a single paragraph contains BOTH kinds of header (typical for
 * descriptions that use single newlines and no blank lines), it is split
 * line by line at the header lines instead of being assigned wholesale to
 * requirements.
 *
 * @param {string} description - Job description text.
 * @returns {{duties: string, requirements: string}} The two parts, each
 *   trimmed; both are empty strings for empty or blank input.
 */
function parseDutiesAndRequirements(description) {
  if (!description || description.trim().length === 0) {
    return { duties: "", requirements: "" };
  }

  // Common section headers that indicate duties/responsibilities
  const dutiesKeywords = [
    /responsibilities?:/i,
    /duties?:/i,
    /what you['\u2019]ll do/i,
    /key responsibilities/i,
    /your role/i,
    /position overview/i,
    /about the role/i,
    /role overview/i,
    /what we need/i,
    /you will:/i,
    /you['\u2019]ll be responsible/i,
  ];

  // Common section headers that indicate requirements/qualifications
  const requirementsKeywords = [
    /requirements?:/i,
    /qualifications?:/i,
    /must have/i,
    /required:/i,
    /what you['\u2019]ll bring/i,
    /you have:/i,
    /skills required/i,
    /minimum requirements/i,
    /preferred qualifications/i,
    /education:/i,
    /experience:/i,
    /you must have/i,
    /we['\u2019]re looking for/i,
  ];

  const matchesAny = (patterns, text) =>
    patterns.some((pattern) => pattern.test(text));

  // Requirements headers take priority when both kinds match the same text.
  const detectHeader = (text) => {
    if (matchesAny(requirementsKeywords, text)) return "requirements";
    if (matchesAny(dutiesKeywords, text)) return "duties";
    return null;
  };

  const collected = { duties: [], requirements: [] };
  const append = (sectionName, text) => {
    const trimmed = text.trim();
    if (trimmed) collected[sectionName].push(trimmed);
  };

  // Split description into paragraphs on blank lines
  const sections = description
    .split(/\n\s*\n|\r\n\s*\r\n/)
    .filter((s) => s.trim().length > 0);

  let currentSection = "duties"; // Default to duties

  for (const section of sections) {
    const hasBothHeaders =
      matchesAny(requirementsKeywords, section) &&
      matchesAny(dutiesKeywords, section);

    if (!hasBothHeaders) {
      currentSection = detectHeader(section) || currentSection;
      append(currentSection, section);
      continue;
    }

    // Mixed paragraph: cut it at each line whose header changes the section.
    let pendingLines = [];
    for (const line of section.split(/\r?\n/)) {
      const header = detectHeader(line);
      if (header && header !== currentSection) {
        append(currentSection, pendingLines.join("\n"));
        pendingLines = [];
        currentSection = header;
      }
      pendingLines.push(line);
    }
    append(currentSection, pendingLines.join("\n"));
  }

  return {
    duties: collected.duties.join("\n\n"),
    requirements: collected.requirements.join("\n\n"),
  };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract data from individual job element
|
* Extract data from individual job element
|
||||||
*/
|
*/
|
||||||
@ -245,6 +431,9 @@ async function extractJobData(jobElement, keyword) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parse duties and requirements from description if available
|
||||||
|
const parsed = parseDutiesAndRequirements(description);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
jobId,
|
jobId,
|
||||||
title,
|
title,
|
||||||
@ -255,6 +444,8 @@ async function extractJobData(jobElement, keyword) {
|
|||||||
dateText,
|
dateText,
|
||||||
daysAgo,
|
daysAgo,
|
||||||
description,
|
description,
|
||||||
|
roleDuties: parsed.duties,
|
||||||
|
jobRequirements: parsed.requirements,
|
||||||
isFeatured,
|
isFeatured,
|
||||||
keyword,
|
keyword,
|
||||||
extractedAt: new Date().toISOString(),
|
extractedAt: new Date().toISOString(),
|
||||||
|
|||||||
@ -10,20 +10,34 @@ const path = require("path");
|
|||||||
const fs = require("fs");
|
const fs = require("fs");
|
||||||
const CoreParser = require("../core-parser");
|
const CoreParser = require("../core-parser");
|
||||||
const { linkedinStrategy } = require("./strategies/linkedin-strategy");
|
const { linkedinStrategy } = require("./strategies/linkedin-strategy");
|
||||||
const { logger, analyzeBatch, checkOllamaStatus } = require("ai-analyzer");
|
const { logger, analyzeBatch, checkOllamaStatus, DEFAULT_MODEL } = require("ai-analyzer");
|
||||||
|
|
||||||
// Load environment variables
|
// Load environment variables - check both linkedin-parser/.env and root .env
|
||||||
require("dotenv").config({ path: path.join(__dirname, ".env") });
|
const localEnvPath = path.join(__dirname, ".env");
|
||||||
|
const rootEnvPath = path.join(__dirname, "..", ".env");
|
||||||
|
|
||||||
|
// Try local .env first, then root .env
|
||||||
|
if (fs.existsSync(localEnvPath)) {
|
||||||
|
require("dotenv").config({ path: localEnvPath });
|
||||||
|
} else if (fs.existsSync(rootEnvPath)) {
|
||||||
|
require("dotenv").config({ path: rootEnvPath });
|
||||||
|
} else {
|
||||||
|
// Try default dotenv behavior (looks in current directory and parent directories)
|
||||||
|
require("dotenv").config();
|
||||||
|
}
|
||||||
|
|
||||||
// Configuration from environment
|
// Configuration from environment
|
||||||
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
|
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
|
||||||
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
|
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
|
||||||
const HEADLESS = process.env.HEADLESS !== "false";
|
const HEADLESS = process.env.HEADLESS !== "false";
|
||||||
const SEARCH_KEYWORDS =
|
const SEARCH_KEYWORDS =
|
||||||
process.env.SEARCH_KEYWORDS || "layoff,downsizing,job cuts";
|
process.env.SEARCH_KEYWORDS || "layoff";//,downsizing";//,job cuts";
|
||||||
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
const LOCATION_FILTER = process.env.LOCATION_FILTER;
|
||||||
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS === "true";
|
const ENABLE_AI_ANALYSIS = process.env.ENABLE_AI_ANALYSIS !== "false";
|
||||||
|
const AI_CONTEXT = process.env.AI_CONTEXT || "job market analysis and trends";
|
||||||
|
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || DEFAULT_MODEL;
|
||||||
const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50;
|
const MAX_RESULTS = parseInt(process.env.MAX_RESULTS) || 50;
|
||||||
|
const EXTRACT_LOCATION_FROM_PROFILE = process.env.EXTRACT_LOCATION_FROM_PROFILE === "true";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Main LinkedIn parser function
|
* Main LinkedIn parser function
|
||||||
@ -58,6 +72,7 @@ async function startLinkedInParser(options = {}) {
|
|||||||
keywords,
|
keywords,
|
||||||
locationFilter: LOCATION_FILTER,
|
locationFilter: LOCATION_FILTER,
|
||||||
maxResults: MAX_RESULTS,
|
maxResults: MAX_RESULTS,
|
||||||
|
extractLocationFromProfile: EXTRACT_LOCATION_FROM_PROFILE,
|
||||||
credentials: {
|
credentials: {
|
||||||
username: LINKEDIN_USERNAME,
|
username: LINKEDIN_USERNAME,
|
||||||
password: LINKEDIN_PASSWORD,
|
password: LINKEDIN_PASSWORD,
|
||||||
@ -66,52 +81,109 @@ async function startLinkedInParser(options = {}) {
|
|||||||
|
|
||||||
const { results, rejectedResults, summary } = parseResult;
|
const { results, rejectedResults, summary } = parseResult;
|
||||||
|
|
||||||
// AI Analysis if enabled
|
// AI Analysis if enabled - embed results into each post
|
||||||
let analysisResults = null;
|
let resultsWithAI = results;
|
||||||
|
let aiAnalysisCompleted = false;
|
||||||
if (ENABLE_AI_ANALYSIS && results.length > 0) {
|
if (ENABLE_AI_ANALYSIS && results.length > 0) {
|
||||||
logger.step("🧠 Running AI Analysis...");
|
logger.step("🧠 Running AI Analysis...");
|
||||||
|
|
||||||
const ollamaStatus = await checkOllamaStatus();
|
const ollamaAvailable = await checkOllamaStatus(OLLAMA_MODEL);
|
||||||
if (ollamaStatus.available) {
|
if (ollamaAvailable) {
|
||||||
analysisResults = await analyzeBatch(results, {
|
// Prepare data for analysis (analyzeBatch expects posts with 'text' field)
|
||||||
context:
|
const analysisData = results.map((post) => ({
|
||||||
"LinkedIn posts analysis focusing on job market trends and layoffs",
|
text: post.text || post.content || "",
|
||||||
|
location: post.location || "",
|
||||||
|
keyword: post.keyword || "",
|
||||||
|
timestamp: post.timestamp || post.extractedAt || "",
|
||||||
|
}));
|
||||||
|
|
||||||
|
const analysisResults = await analyzeBatch(
|
||||||
|
analysisData,
|
||||||
|
AI_CONTEXT,
|
||||||
|
OLLAMA_MODEL
|
||||||
|
);
|
||||||
|
|
||||||
|
// Embed AI analysis into each result
|
||||||
|
resultsWithAI = results.map((post, index) => {
|
||||||
|
const aiResult = analysisResults[index];
|
||||||
|
return {
|
||||||
|
...post,
|
||||||
|
aiAnalysis: {
|
||||||
|
isRelevant: aiResult.isRelevant,
|
||||||
|
confidence: aiResult.confidence,
|
||||||
|
reasoning: aiResult.reasoning,
|
||||||
|
context: AI_CONTEXT,
|
||||||
|
model: OLLAMA_MODEL,
|
||||||
|
analyzedAt: new Date().toISOString(),
|
||||||
|
},
|
||||||
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
|
aiAnalysisCompleted = true;
|
||||||
logger.success(`✅ AI Analysis completed for ${results.length} posts`);
|
logger.success(`✅ AI Analysis completed for ${results.length} posts`);
|
||||||
} else {
|
} else {
|
||||||
logger.warning("⚠️ Ollama not available, skipping AI analysis");
|
logger.warning("⚠️ Ollama not available, skipping AI analysis");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save results
|
// Prepare results with embedded AI analysis
|
||||||
const outputData = {
|
const outputData = {
|
||||||
metadata: {
|
metadata: {
|
||||||
extractedAt: new Date().toISOString(),
|
timestamp: new Date().toISOString(),
|
||||||
|
totalPosts: resultsWithAI.length,
|
||||||
|
rejectedPosts: rejectedResults.length,
|
||||||
|
aiAnalysisEnabled: ENABLE_AI_ANALYSIS,
|
||||||
|
aiAnalysisCompleted: aiAnalysisCompleted,
|
||||||
|
aiContext: aiAnalysisCompleted ? AI_CONTEXT : undefined,
|
||||||
|
aiModel: aiAnalysisCompleted ? OLLAMA_MODEL : undefined,
|
||||||
|
locationFilter: LOCATION_FILTER || undefined,
|
||||||
parser: "linkedin-parser",
|
parser: "linkedin-parser",
|
||||||
version: "2.0.0",
|
version: "2.0.0",
|
||||||
summary,
|
|
||||||
analysisResults,
|
|
||||||
},
|
},
|
||||||
results,
|
results: resultsWithAI,
|
||||||
rejectedResults,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Prepare rejected posts file
|
||||||
|
const rejectedData = rejectedResults.map((post) => ({
|
||||||
|
rejected: true,
|
||||||
|
reason: post.rejectionReason || "Location filter failed: Location not in filter",
|
||||||
|
keyword: post.keyword,
|
||||||
|
text: post.text || post.content,
|
||||||
|
profileLink: post.profileLink || post.authorUrl,
|
||||||
|
location: post.location || post.profileLocation,
|
||||||
|
timestamp: post.timestamp || post.extractedAt,
|
||||||
|
}));
|
||||||
|
|
||||||
const resultsDir = path.join(__dirname, "results");
|
const resultsDir = path.join(__dirname, "results");
|
||||||
if (!fs.existsSync(resultsDir)) {
|
if (!fs.existsSync(resultsDir)) {
|
||||||
fs.mkdirSync(resultsDir, { recursive: true });
|
fs.mkdirSync(resultsDir, { recursive: true });
|
||||||
}
|
}
|
||||||
|
|
||||||
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
||||||
const filename = `linkedin-results-${timestamp}.json`;
|
const resultsFilename = `linkedin-results-${timestamp}.json`;
|
||||||
const filepath = path.join(resultsDir, filename);
|
const rejectedFilename = `linkedin-rejected-${timestamp}.json`;
|
||||||
|
const resultsFilepath = path.join(resultsDir, resultsFilename);
|
||||||
|
const rejectedFilepath = path.join(resultsDir, rejectedFilename);
|
||||||
|
|
||||||
fs.writeFileSync(filepath, JSON.stringify(outputData, null, 2));
|
// Save results with AI analysis
|
||||||
|
fs.writeFileSync(resultsFilepath, JSON.stringify(outputData, null, 2));
|
||||||
|
|
||||||
|
// Save rejected posts separately
|
||||||
|
if (rejectedData.length > 0) {
|
||||||
|
fs.writeFileSync(
|
||||||
|
rejectedFilepath,
|
||||||
|
JSON.stringify(rejectedData, null, 2)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Final summary
|
// Final summary
|
||||||
logger.success("✅ LinkedIn parsing completed successfully!");
|
logger.success("✅ LinkedIn parsing completed successfully!");
|
||||||
logger.info(`📊 Total posts found: ${results.length}`);
|
logger.info(`📊 Total posts found: ${resultsWithAI.length}`);
|
||||||
logger.info(`❌ Total rejected: ${rejectedResults.length}`);
|
logger.info(`❌ Total rejected: ${rejectedResults.length}`);
|
||||||
logger.info(`📁 Results saved to: ${filepath}`);
|
logger.info(`📁 Results saved to: ${resultsFilepath}`);
|
||||||
|
if (rejectedData.length > 0) {
|
||||||
|
logger.info(`📁 Rejected posts saved to: ${rejectedFilepath}`);
|
||||||
|
}
|
||||||
|
|
||||||
return outputData;
|
return outputData;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|||||||
3705
linkedin-parser/package-lock.json
generated
Normal file
3705
linkedin-parser/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
@ -10,6 +10,7 @@ const {
|
|||||||
containsAnyKeyword,
|
containsAnyKeyword,
|
||||||
validateLocationAgainstFilters,
|
validateLocationAgainstFilters,
|
||||||
extractLocationFromProfile,
|
extractLocationFromProfile,
|
||||||
|
parseLocationFilters,
|
||||||
} = require("ai-analyzer");
|
} = require("ai-analyzer");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -20,6 +21,7 @@ async function linkedinStrategy(coreParser, options = {}) {
|
|||||||
keywords = ["layoff", "downsizing", "job cuts"],
|
keywords = ["layoff", "downsizing", "job cuts"],
|
||||||
locationFilter = null,
|
locationFilter = null,
|
||||||
maxResults = 50,
|
maxResults = 50,
|
||||||
|
extractLocationFromProfile = false,
|
||||||
credentials = {},
|
credentials = {},
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
@ -48,22 +50,65 @@ async function linkedinStrategy(coreParser, options = {}) {
|
|||||||
await coreParser.navigateTo(searchUrl, {
|
await coreParser.navigateTo(searchUrl, {
|
||||||
pageId: "linkedin-main",
|
pageId: "linkedin-main",
|
||||||
retries: 2,
|
retries: 2,
|
||||||
|
waitUntil: "networkidle", // Wait for network to be idle
|
||||||
});
|
});
|
||||||
|
|
||||||
// Wait for search results
|
// Wait for page to load and content to render
|
||||||
const hasResults = await coreParser.navigationManager.navigateAndWaitFor(
|
await new Promise(resolve => setTimeout(resolve, 5000)); // Give LinkedIn time to render dynamic content
|
||||||
searchUrl,
|
|
||||||
|
// Scroll down a bit to trigger lazy loading
|
||||||
|
try {
|
||||||
|
await page.evaluate(() => {
|
||||||
|
window.scrollTo(0, 500);
|
||||||
|
});
|
||||||
|
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||||
|
} catch (e) {
|
||||||
|
logger.debug(`Could not scroll page: ${e.message}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for search results - try multiple selectors
|
||||||
|
let hasResults = false;
|
||||||
|
const possibleSelectors = [
|
||||||
|
".feed-shared-update-v2",
|
||||||
|
"article[data-urn*='urn:li:activity']",
|
||||||
|
"article",
|
||||||
".search-results-container",
|
".search-results-container",
|
||||||
{ pageId: "linkedin-main", timeout: 10000 }
|
".search-results__list",
|
||||||
);
|
".reusable-search__result-container",
|
||||||
|
"[data-test-id='search-results']",
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of possibleSelectors) {
|
||||||
|
try {
|
||||||
|
await page.waitForSelector(selector, { timeout: 10000 });
|
||||||
|
// Verify we actually have post elements
|
||||||
|
const count = await page.$$(selector).then(elements => elements.length);
|
||||||
|
if (count > 0) {
|
||||||
|
hasResults = true;
|
||||||
|
logger.info(`✅ Found ${count} post elements with selector: ${selector}`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Try next selector
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!hasResults) {
|
if (!hasResults) {
|
||||||
logger.warning(`No search results found for keyword: ${keyword}`);
|
logger.warning(`⚠️ No search results container found for keyword: ${keyword}`);
|
||||||
|
// Take screenshot for debugging
|
||||||
|
try {
|
||||||
|
const screenshotPath = `debug-${keyword.replace(/\s+/g, '-')}-${Date.now()}.png`;
|
||||||
|
await page.screenshot({ path: screenshotPath, fullPage: true });
|
||||||
|
logger.info(`📸 Debug screenshot saved: ${screenshotPath}`);
|
||||||
|
} catch (e) {
|
||||||
|
logger.warning(`Could not take screenshot: ${e.message}`);
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract posts from current page
|
// Extract posts from current page
|
||||||
const posts = await extractPostsFromPage(page, keyword);
|
const posts = await extractPostsFromPage(page, keyword, extractLocationFromProfile);
|
||||||
|
logger.info(`📊 Found ${posts.length} posts for keyword "${keyword}"`);
|
||||||
|
|
||||||
for (const post of posts) {
|
for (const post of posts) {
|
||||||
// Skip duplicates
|
// Skip duplicates
|
||||||
@ -72,17 +117,25 @@ async function linkedinStrategy(coreParser, options = {}) {
|
|||||||
|
|
||||||
// Validate location if filtering enabled
|
// Validate location if filtering enabled
|
||||||
if (locationFilter) {
|
if (locationFilter) {
|
||||||
|
const postLocation = post.location || post.profileLocation || "";
|
||||||
|
// Parse locationFilter string into array if it's a string
|
||||||
|
const locationFiltersArray = typeof locationFilter === 'string'
|
||||||
|
? parseLocationFilters(locationFilter)
|
||||||
|
: locationFilter;
|
||||||
const locationValid = validateLocationAgainstFilters(
|
const locationValid = validateLocationAgainstFilters(
|
||||||
post.location || post.profileLocation,
|
postLocation,
|
||||||
locationFilter
|
locationFiltersArray
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!locationValid) {
|
if (!locationValid.isValid) {
|
||||||
|
logger.debug(`⏭️ Post rejected: location "${postLocation}" doesn't match filter "${locationFilter}"`);
|
||||||
rejectedResults.push({
|
rejectedResults.push({
|
||||||
...post,
|
...post,
|
||||||
rejectionReason: "Location filter mismatch",
|
rejectionReason: locationValid.reasoning || `Location filter mismatch: "${postLocation}" not in "${locationFilter}"`,
|
||||||
});
|
});
|
||||||
continue;
|
continue;
|
||||||
|
} else {
|
||||||
|
logger.debug(`✅ Post location "${postLocation}" matches filter "${locationFilter}" (${locationValid.reasoning || 'matched'})`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -120,25 +173,120 @@ async function linkedinStrategy(coreParser, options = {}) {
|
|||||||
/**
|
/**
|
||||||
* Extract posts from current search results page
|
* Extract posts from current search results page
|
||||||
*/
|
*/
|
||||||
async function extractPostsFromPage(page, keyword) {
|
async function extractPostsFromPage(page, keyword, extractLocationFromProfile = false) {
|
||||||
const posts = [];
|
const posts = [];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Get all post elements
|
// Try multiple selectors for post elements (LinkedIn changes these frequently)
|
||||||
const postElements = await page.$$(".feed-shared-update-v2");
|
// Prioritize selectors that are more specific to actual posts
|
||||||
|
const postSelectors = [
|
||||||
|
"article[data-urn*='urn:li:activity']", // Most specific - posts with activity ID
|
||||||
|
".feed-shared-update-v2[data-urn*='urn:li:activity']",
|
||||||
|
"article.feed-shared-update-v2",
|
||||||
|
".feed-shared-update-v2",
|
||||||
|
"[data-urn*='urn:li:activity']",
|
||||||
|
".reusable-search__result-container",
|
||||||
|
".search-result__wrapper",
|
||||||
|
"article",
|
||||||
|
];
|
||||||
|
|
||||||
for (const postElement of postElements) {
|
let postElements = [];
|
||||||
|
let usedSelector = null;
|
||||||
|
|
||||||
|
for (const selector of postSelectors) {
|
||||||
try {
|
try {
|
||||||
const post = await extractPostData(postElement, keyword);
|
// Wait a bit for elements to be available
|
||||||
if (post) {
|
await page.waitForSelector(selector, { timeout: 3000 }).catch(() => {});
|
||||||
posts.push(post);
|
postElements = await page.$$(selector);
|
||||||
|
|
||||||
|
// Filter to only elements that have a data-urn attribute (actual posts)
|
||||||
|
if (postElements.length > 0) {
|
||||||
|
const validElements = [];
|
||||||
|
for (const elem of postElements) {
|
||||||
|
try {
|
||||||
|
const dataUrn = await elem.getAttribute("data-urn");
|
||||||
|
if (dataUrn && dataUrn.includes("urn:li:activity")) {
|
||||||
|
validElements.push(elem);
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Element might have been detached, skip it
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (validElements.length > 0) {
|
||||||
|
postElements = validElements;
|
||||||
|
usedSelector = selector;
|
||||||
|
logger.info(`✅ Found ${postElements.length} valid post elements using selector: ${selector}`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (e) {
|
||||||
logger.warning(`Failed to extract post data: ${error.message}`);
|
// Try next selector
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (postElements.length === 0) {
|
||||||
|
logger.warning(`⚠️ No post elements found with any selector. Page might have different structure.`);
|
||||||
|
// Log page title and URL for debugging
|
||||||
|
try {
|
||||||
|
const pageTitle = await page.title();
|
||||||
|
const pageUrl = page.url();
|
||||||
|
logger.info(`📄 Page title: ${pageTitle}`);
|
||||||
|
logger.info(`🔗 Page URL: ${pageUrl}`);
|
||||||
|
} catch (e) {
|
||||||
|
// Ignore
|
||||||
|
}
|
||||||
|
return posts;
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(`🔍 Processing ${postElements.length} post elements...`);
|
||||||
|
|
||||||
|
for (let i = 0; i < postElements.length; i++) {
|
||||||
|
try {
|
||||||
|
// Scroll element into view to ensure it's fully rendered
|
||||||
|
try {
|
||||||
|
await postElements[i].evaluate((el) => {
|
||||||
|
el.scrollIntoView({ behavior: 'smooth', block: 'center' });
|
||||||
|
});
|
||||||
|
await new Promise(resolve => setTimeout(resolve, 500)); // Small delay for rendering
|
||||||
|
} catch (e) {
|
||||||
|
// Element might already be in view or detached, continue anyway
|
||||||
|
}
|
||||||
|
|
||||||
|
const post = await extractPostData(postElements[i], keyword);
|
||||||
|
if (post) {
|
||||||
|
// If location is missing and we're enabled to extract from profile, try to get it
|
||||||
|
if (!post.location && extractLocationFromProfile && post.authorUrl) {
|
||||||
|
try {
|
||||||
|
logger.debug(`📍 Location missing for post ${i + 1}, attempting to extract from profile...`);
|
||||||
|
const profileLocation = await extractLocationFromProfilePage(page, post.authorUrl);
|
||||||
|
if (profileLocation) {
|
||||||
|
post.location = profileLocation;
|
||||||
|
post.profileLocation = profileLocation;
|
||||||
|
logger.debug(`✅ Extracted location from profile: ${profileLocation}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.debug(`⚠️ Could not extract location from profile: ${error.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
posts.push(post);
|
||||||
|
const hasContent = post.content && post.content.length > 0;
|
||||||
|
const hasAuthor = post.authorName && post.authorName.length > 0;
|
||||||
|
const hasLocation = post.location && post.location.length > 0;
|
||||||
|
logger.debug(`✅ Extracted post ${i + 1}/${postElements.length}: ${post.postId.substring(0, 20)}... (content: ${hasContent ? 'yes' : 'no'}, author: ${hasAuthor ? 'yes' : 'no'}, location: ${hasLocation ? 'yes' : 'no'})`);
|
||||||
|
} else {
|
||||||
|
logger.debug(`⏭️ Post ${i + 1}/${postElements.length} filtered out (no keyword match or missing data)`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.warning(`❌ Failed to extract post ${i + 1} data: ${error.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(`✅ Successfully extracted ${posts.length} valid posts from ${postElements.length} elements`);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Failed to extract posts from page: ${error.message}`);
|
logger.error(`❌ Failed to extract posts from page: ${error.message}`);
|
||||||
|
logger.error(`Stack trace: ${error.stack}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return posts;
|
return posts;
|
||||||
@ -146,75 +294,606 @@ async function extractPostsFromPage(page, keyword) {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract data from individual post element
|
* Extract data from individual post element
|
||||||
|
* Uses evaluate() to extract data directly from DOM for better reliability
|
||||||
*/
|
*/
|
||||||
async function extractPostData(postElement, keyword) {
|
async function extractPostData(postElement, keyword) {
|
||||||
try {
|
try {
|
||||||
// Extract post ID
|
// Use evaluate to extract data directly from the DOM element
|
||||||
const postId = (await postElement.getAttribute("data-urn")) || "";
|
// This is more reliable than using selectors which may not match
|
||||||
|
const postData = await postElement.evaluate((el, keyword) => {
|
||||||
|
const data = {
|
||||||
|
postId: "",
|
||||||
|
authorName: "",
|
||||||
|
authorUrl: "",
|
||||||
|
content: "",
|
||||||
|
timestamp: "",
|
||||||
|
location: "",
|
||||||
|
likes: 0,
|
||||||
|
comments: 0,
|
||||||
|
};
|
||||||
|
|
||||||
// Extract author info
|
// Extract post ID from data-urn attribute
|
||||||
const authorElement = await postElement.$(".feed-shared-actor__name");
|
data.postId = el.getAttribute("data-urn") ||
|
||||||
const authorName = authorElement
|
el.getAttribute("data-activity-id") ||
|
||||||
? cleanText(await authorElement.textContent())
|
el.querySelector("[data-urn]")?.getAttribute("data-urn") || "";
|
||||||
: "";
|
|
||||||
|
|
||||||
const authorLinkElement = await postElement.$(".feed-shared-actor__name a");
|
// Extract author name - try multiple selectors and approaches
|
||||||
const authorUrl = authorLinkElement
|
const authorSelectors = [
|
||||||
? await authorLinkElement.getAttribute("href")
|
".feed-shared-actor__name",
|
||||||
: "";
|
".feed-shared-actor__name-link",
|
||||||
|
".update-components-actor__name",
|
||||||
|
".feed-shared-actor__name a",
|
||||||
|
"[data-test-id='actor-name']",
|
||||||
|
"span[aria-label*='name']",
|
||||||
|
"a[href*='/in/'] span",
|
||||||
|
".feed-shared-actor a span",
|
||||||
|
".feed-shared-actor span",
|
||||||
|
".feed-shared-actor__name-link span",
|
||||||
|
];
|
||||||
|
|
||||||
// Extract post content
|
for (const selector of authorSelectors) {
|
||||||
const contentElement = await postElement.$(".feed-shared-text");
|
const elem = el.querySelector(selector);
|
||||||
const content = contentElement
|
if (elem) {
|
||||||
? cleanText(await contentElement.textContent())
|
const text = elem.textContent?.trim() || elem.innerText?.trim();
|
||||||
: "";
|
if (text && text.length > 0 && text.length < 100) { // Reasonable name length
|
||||||
|
data.authorName = text;
|
||||||
|
// Try to get link from same element or parent
|
||||||
|
const link = elem.closest("a") || elem.querySelector("a");
|
||||||
|
if (link) {
|
||||||
|
data.authorUrl = link.getAttribute("href") || "";
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Extract timestamp
|
// If author name found but no URL, try to find link separately
|
||||||
const timeElement = await postElement.$(
|
if (data.authorName && !data.authorUrl) {
|
||||||
".feed-shared-actor__sub-description time"
|
const authorLink = el.querySelector(".feed-shared-actor__name-link, .feed-shared-actor__name a, a[href*='/in/']");
|
||||||
);
|
if (authorLink) {
|
||||||
const timestamp = timeElement
|
data.authorUrl = authorLink.getAttribute("href") || "";
|
||||||
? await timeElement.getAttribute("datetime")
|
}
|
||||||
: "";
|
}
|
||||||
|
|
||||||
// Extract engagement metrics
|
// Fallback: Look for any link with /in/ pattern and get the name from nearby text
|
||||||
const likesElement = await postElement.$(".social-counts-reactions__count");
|
if (!data.authorName) {
|
||||||
const likesText = likesElement
|
const profileLinks = el.querySelectorAll("a[href*='/in/']");
|
||||||
? cleanText(await likesElement.textContent())
|
for (const link of profileLinks) {
|
||||||
: "0";
|
// Skip if it's a company link
|
||||||
|
if (link.getAttribute("href")?.includes("/company/")) continue;
|
||||||
|
|
||||||
const commentsElement = await postElement.$(
|
// Get text from the link or nearby
|
||||||
".social-counts-comments__count"
|
const linkText = link.textContent?.trim() || link.innerText?.trim();
|
||||||
);
|
if (linkText && linkText.length > 0 && linkText.length < 100 && !linkText.includes("View")) {
|
||||||
const commentsText = commentsElement
|
data.authorName = linkText;
|
||||||
? cleanText(await commentsElement.textContent())
|
data.authorUrl = link.getAttribute("href") || "";
|
||||||
: "0";
|
break;
|
||||||
|
}
|
||||||
|
// Try to get text from first child span
|
||||||
|
const childSpan = link.querySelector("span");
|
||||||
|
if (childSpan) {
|
||||||
|
const spanText = childSpan.textContent?.trim() || childSpan.innerText?.trim();
|
||||||
|
if (spanText && spanText.length > 0 && spanText.length < 100) {
|
||||||
|
data.authorName = spanText;
|
||||||
|
data.authorUrl = link.getAttribute("href") || "";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Try to get text from parent
|
||||||
|
const parentText = link.parentElement?.textContent?.trim();
|
||||||
|
if (parentText && parentText.length < 100 && !parentText.includes("View")) {
|
||||||
|
// Extract just the name part (first line or first few words)
|
||||||
|
const namePart = parentText.split("\n")[0].split("·")[0].trim();
|
||||||
|
if (namePart.length > 0 && namePart.length < 100) {
|
||||||
|
data.authorName = namePart;
|
||||||
|
data.authorUrl = link.getAttribute("href") || "";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Check if post contains relevant keywords
|
// Last resort: Extract from actor section by looking at all text
|
||||||
const isRelevant = containsAnyKeyword(content, [keyword]);
|
if (!data.authorName) {
|
||||||
|
const actorSection = el.querySelector(".feed-shared-actor, .update-components-actor, [class*='actor']");
|
||||||
|
if (actorSection) {
|
||||||
|
const actorText = actorSection.textContent || actorSection.innerText || "";
|
||||||
|
const lines = actorText.split("\n").map(l => l.trim()).filter(l => l.length > 0);
|
||||||
|
// First non-empty line is often the name
|
||||||
|
for (const line of lines) {
|
||||||
|
if (line.length > 0 && line.length < 100 &&
|
||||||
|
!line.includes("·") &&
|
||||||
|
!line.includes("ago") &&
|
||||||
|
!line.match(/^\d+/) &&
|
||||||
|
!line.toLowerCase().includes("view")) {
|
||||||
|
data.authorName = line;
|
||||||
|
// Try to find associated link
|
||||||
|
const link = actorSection.querySelector("a[href*='/in/']");
|
||||||
|
if (link) {
|
||||||
|
data.authorUrl = link.getAttribute("href") || "";
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!isRelevant) {
|
// Extract post content - try multiple selectors
|
||||||
return null; // Skip irrelevant posts
|
const contentSelectors = [
|
||||||
|
".feed-shared-text",
|
||||||
|
".feed-shared-text__text-view",
|
||||||
|
".feed-shared-update-v2__description",
|
||||||
|
".update-components-text",
|
||||||
|
"[data-test-id='post-text']",
|
||||||
|
".feed-shared-text span",
|
||||||
|
".feed-shared-update-v2__description-wrapper",
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of contentSelectors) {
|
||||||
|
const elem = el.querySelector(selector);
|
||||||
|
if (elem) {
|
||||||
|
const text = elem.textContent?.trim() || elem.innerText?.trim();
|
||||||
|
if (text && text.length > 10) { // Only use if substantial content
|
||||||
|
data.content = text;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract timestamp
|
||||||
|
const timeSelectors = [
|
||||||
|
".feed-shared-actor__sub-description time",
|
||||||
|
"time[datetime]",
|
||||||
|
"[data-test-id='timestamp']",
|
||||||
|
".feed-shared-actor__sub-description time[datetime]",
|
||||||
|
"time",
|
||||||
|
".feed-shared-actor__sub-description time",
|
||||||
|
"span[aria-label*='time']",
|
||||||
|
"span[aria-label*='ago']",
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of timeSelectors) {
|
||||||
|
const elem = el.querySelector(selector);
|
||||||
|
if (elem) {
|
||||||
|
data.timestamp = elem.getAttribute("datetime") ||
|
||||||
|
elem.getAttribute("title") ||
|
||||||
|
elem.getAttribute("aria-label") ||
|
||||||
|
elem.textContent?.trim() || "";
|
||||||
|
if (data.timestamp) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: Look for time-like patterns in sub-description
|
||||||
|
if (!data.timestamp) {
|
||||||
|
const subDesc = el.querySelector(".feed-shared-actor__sub-description");
|
||||||
|
if (subDesc) {
|
||||||
|
const subDescText = subDesc.textContent || subDesc.innerText || "";
|
||||||
|
// Look for patterns like "2h", "3d", "1w", "2 months ago", etc.
|
||||||
|
const timePatterns = [
|
||||||
|
/\d+\s*(minute|hour|day|week|month|year)s?\s*ago/i,
|
||||||
|
/\d+\s*(h|d|w|mo|yr)/i,
|
||||||
|
/(just now|today|yesterday)/i,
|
||||||
|
];
|
||||||
|
for (const pattern of timePatterns) {
|
||||||
|
const match = subDescText.match(pattern);
|
||||||
|
if (match) {
|
||||||
|
data.timestamp = match[0];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract location - try multiple approaches
|
||||||
|
const locationSelectors = [
|
||||||
|
".feed-shared-actor__sub-description .feed-shared-actor__sub-description-link",
|
||||||
|
".feed-shared-actor__sub-description-link--without-hover",
|
||||||
|
"span[aria-label*='location' i]",
|
||||||
|
"span[aria-label*='Location']",
|
||||||
|
".feed-shared-actor__sub-description span",
|
||||||
|
".feed-shared-actor__sub-description a",
|
||||||
|
"a[href*='/company/']",
|
||||||
|
"a[href*='/location/']",
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of locationSelectors) {
|
||||||
|
const elem = el.querySelector(selector);
|
||||||
|
if (elem) {
|
||||||
|
const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || elem.innerText?.trim() || "";
|
||||||
|
// Check if it looks like a location (contains comma or common location words)
|
||||||
|
if (text && text.length > 2 && text.length < 100) {
|
||||||
|
// More flexible location detection
|
||||||
|
if (text.includes(",") ||
|
||||||
|
/(city|province|state|country|region|ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|prince edward island|yukon|northwest territories|nunavut)/i.test(text) ||
|
||||||
|
/^[A-Z][a-z]+,\s*[A-Z][a-z]+/i.test(text)) {
|
||||||
|
data.location = text;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no location found, try parsing from sub-description text
|
||||||
|
if (!data.location) {
|
||||||
|
const subDesc = el.querySelector(".feed-shared-actor__sub-description");
|
||||||
|
if (subDesc) {
|
||||||
|
const subDescText = subDesc.textContent || subDesc.innerText || "";
|
||||||
|
|
||||||
|
// First, try to get all links in sub-description (location is often a link)
|
||||||
|
const subDescLinks = subDesc.querySelectorAll("a");
|
||||||
|
for (const link of subDescLinks) {
|
||||||
|
const linkText = link.textContent?.trim() || link.innerText?.trim() || "";
|
||||||
|
const linkHref = link.getAttribute("href") || "";
|
||||||
|
|
||||||
|
// Skip if it's a time/date link or company link
|
||||||
|
if (linkHref.includes("/company/") || linkText.match(/\d+\s*(minute|hour|day|week|month|year|h|d|w)/i)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If link text looks like a location
|
||||||
|
if (linkText && linkText.length > 2 && linkText.length < 100) {
|
||||||
|
if (linkText.includes(",") ||
|
||||||
|
/(ontario|alberta|british columbia|quebec|manitoba|saskatchewan|nova scotia|new brunswick|newfoundland|prince edward island|yukon|northwest territories|nunavut|toronto|vancouver|calgary|ottawa|montreal|winnipeg|edmonton|halifax|victoria|regina|saskatoon|windsor|kitchener|hamilton|london|st\.?\s*catharines|oshawa|barrie|greater sudbury|sherbrooke|kelowna|abbotsford|trois-rivières|guelph|cambridge|coquitlam|saanich|saint john|thunder bay|waterloo|delta|chatham|red deer|kamloops|brantford|whitehorse|yellowknife|iqaluit)/i.test(linkText)) {
|
||||||
|
data.location = linkText;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If still no location, try pattern matching on the full text
|
||||||
|
if (!data.location && subDescText) {
|
||||||
|
// Look for location patterns (City, Province/State, Country)
|
||||||
|
const locationPatterns = [
|
||||||
|
// Full location: "City, Province, Country"
|
||||||
|
/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,\s*([A-Z][a-z]+))?/,
|
||||||
|
// City, Province
|
||||||
|
/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*,\s*([A-Z][a-z]+)/,
|
||||||
|
// Just province/state names
|
||||||
|
/\b(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut|ON|AB|BC|QC|MB|SK|NS|NB|NL|PE|YT|NT|NU)\b/i,
|
||||||
|
// Major cities
|
||||||
|
/\b(Toronto|Vancouver|Calgary|Ottawa|Montreal|Winnipeg|Edmonton|Halifax|Victoria|Regina|Saskatoon)\b/i,
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const pattern of locationPatterns) {
|
||||||
|
const match = subDescText.match(pattern);
|
||||||
|
if (match) {
|
||||||
|
// Get more context around the match
|
||||||
|
const matchIndex = subDescText.indexOf(match[0]);
|
||||||
|
const contextStart = Math.max(0, matchIndex - 30);
|
||||||
|
const contextEnd = Math.min(subDescText.length, matchIndex + match[0].length + 30);
|
||||||
|
const context = subDescText.substring(contextStart, contextEnd).trim();
|
||||||
|
|
||||||
|
// Extract just the location part (remove time/date info)
|
||||||
|
let locationText = match[0].trim();
|
||||||
|
// If we have more context, try to get a better location string
|
||||||
|
if (context.includes(",") && context.length < 100) {
|
||||||
|
// Try to extract "City, Province" pattern from context
|
||||||
|
const cityProvinceMatch = context.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*([A-Z][a-z]+)/);
|
||||||
|
if (cityProvinceMatch) {
|
||||||
|
locationText = cityProvinceMatch[0].trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
data.location = locationText;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Last resort: extract any text that looks location-like from sub-description
|
||||||
|
if (!data.location && subDescText) {
|
||||||
|
// Split by common separators and look for location-like text
|
||||||
|
const parts = subDescText.split(/[·•|]/).map(p => p.trim()).filter(p => p.length > 0);
|
||||||
|
for (const part of parts) {
|
||||||
|
// Skip if it looks like time/date
|
||||||
|
if (part.match(/\d+\s*(minute|hour|day|week|month|year|h|d|w|ago)/i)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Check if it looks like a location
|
||||||
|
if (part.length > 2 && part.length < 100 &&
|
||||||
|
(part.includes(",") ||
|
||||||
|
/(ontario|alberta|british columbia|quebec|manitoba|toronto|vancouver|calgary|ottawa|montreal)/i.test(part))) {
|
||||||
|
data.location = part;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final fallback: look anywhere in the actor section for location-like text
|
||||||
|
if (!data.location) {
|
||||||
|
const actorSection = el.querySelector(".feed-shared-actor, .update-components-actor");
|
||||||
|
if (actorSection) {
|
||||||
|
const actorText = actorSection.textContent || actorSection.innerText || "";
|
||||||
|
// Look for province names
|
||||||
|
const provinceMatch = actorText.match(/\b(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)\b/i);
|
||||||
|
if (provinceMatch) {
|
||||||
|
// Try to get city, province if available
|
||||||
|
const cityProvinceMatch = actorText.match(/([A-Z][a-z]+),\s*(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)/i);
|
||||||
|
if (cityProvinceMatch) {
|
||||||
|
data.location = cityProvinceMatch[0].trim();
|
||||||
|
} else {
|
||||||
|
data.location = provinceMatch[0].trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to extract from any hover cards or mini profiles in the DOM
|
||||||
|
if (!data.location) {
|
||||||
|
// Look for mini profile cards or tooltips
|
||||||
|
const miniProfileSelectors = [
|
||||||
|
"[data-control-name='hovercard']",
|
||||||
|
".artdeco-hoverable-trigger",
|
||||||
|
".feed-shared-actor__meta",
|
||||||
|
".pv-text-details__left-panel",
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of miniProfileSelectors) {
|
||||||
|
const elem = el.querySelector(selector);
|
||||||
|
if (elem) {
|
||||||
|
const text = elem.textContent || elem.innerText || "";
|
||||||
|
// Look for location patterns
|
||||||
|
const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),\s*(Ontario|Alberta|British Columbia|Quebec|Manitoba|Saskatchewan|Nova Scotia|New Brunswick|Newfoundland|Prince Edward Island|Yukon|Northwest Territories|Nunavut)/i);
|
||||||
|
if (locationMatch) {
|
||||||
|
data.location = locationMatch[0].trim();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to extract from data attributes or hidden elements
|
||||||
|
if (!data.location) {
|
||||||
|
// Check for data attributes that might contain location
|
||||||
|
const actorSection = el.querySelector(".feed-shared-actor");
|
||||||
|
if (actorSection) {
|
||||||
|
// Check all data attributes
|
||||||
|
for (const attr of actorSection.attributes) {
|
||||||
|
if (attr.name.startsWith("data-") && attr.value) {
|
||||||
|
const value = attr.value.toLowerCase();
|
||||||
|
// Look for location-like patterns in data attributes
|
||||||
|
if (/(ontario|alberta|british columbia|quebec|toronto|vancouver|calgary|ottawa|montreal)/i.test(value)) {
|
||||||
|
// Try to extract the actual location text
|
||||||
|
const locationMatch = attr.value.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
|
||||||
|
if (locationMatch) {
|
||||||
|
data.location = locationMatch[0];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for hidden spans or divs with location info
|
||||||
|
const hiddenElements = actorSection.querySelectorAll("span[style*='display: none'], div[style*='display: none'], [aria-hidden='true']");
|
||||||
|
for (const hiddenElem of hiddenElements) {
|
||||||
|
const text = hiddenElem.textContent || hiddenElem.getAttribute("aria-label") || "";
|
||||||
|
if (text && /(ontario|alberta|british columbia|quebec|toronto|vancouver)/i.test(text)) {
|
||||||
|
const locationMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+)/);
|
||||||
|
if (locationMatch) {
|
||||||
|
data.location = locationMatch[0].trim();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract engagement metrics - try multiple approaches
|
||||||
|
const likesSelectors = [
|
||||||
|
".social-counts-reactions__count",
|
||||||
|
"[data-test-id='reactions-count']",
|
||||||
|
".social-counts__reactions-count",
|
||||||
|
".feed-shared-social-action-bar__reactions-count",
|
||||||
|
"button[aria-label*='reaction']",
|
||||||
|
"button[aria-label*='like']",
|
||||||
|
".social-actions-button__reactions-count",
|
||||||
|
"[data-test-id='social-actions__reactions-count']",
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of likesSelectors) {
|
||||||
|
const elem = el.querySelector(selector);
|
||||||
|
if (elem) {
|
||||||
|
const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || "";
|
||||||
|
const match = text.match(/(\d+)/);
|
||||||
|
if (match) {
|
||||||
|
data.likes = parseInt(match[1], 10) || 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: Look for any button or element with reaction/like text
|
||||||
|
if (data.likes === 0) {
|
||||||
|
const allButtons = el.querySelectorAll("button, span, div");
|
||||||
|
for (const btn of allButtons) {
|
||||||
|
const text = btn.textContent?.trim() || btn.getAttribute("aria-label") || "";
|
||||||
|
if (/reaction|like/i.test(text)) {
|
||||||
|
const match = text.match(/(\d+)/);
|
||||||
|
if (match) {
|
||||||
|
data.likes = parseInt(match[1], 10) || 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const commentsSelectors = [
|
||||||
|
".social-counts-comments__count",
|
||||||
|
"[data-test-id='comments-count']",
|
||||||
|
".social-counts__comments-count",
|
||||||
|
".feed-shared-social-action-bar__comments-count",
|
||||||
|
"button[aria-label*='comment']",
|
||||||
|
".social-actions-button__comments-count",
|
||||||
|
"[data-test-id='social-actions__comments-count']",
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of commentsSelectors) {
|
||||||
|
const elem = el.querySelector(selector);
|
||||||
|
if (elem) {
|
||||||
|
const text = elem.textContent?.trim() || elem.getAttribute("aria-label") || "";
|
||||||
|
const match = text.match(/(\d+)/);
|
||||||
|
if (match) {
|
||||||
|
data.comments = parseInt(match[1], 10) || 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: Look for any button or element with comment text
|
||||||
|
if (data.comments === 0) {
|
||||||
|
const allButtons = el.querySelectorAll("button, span, div");
|
||||||
|
for (const btn of allButtons) {
|
||||||
|
const text = btn.textContent?.trim() || btn.getAttribute("aria-label") || "";
|
||||||
|
if (/comment/i.test(text)) {
|
||||||
|
const match = text.match(/(\d+)/);
|
||||||
|
if (match) {
|
||||||
|
data.comments = parseInt(match[1], 10) || 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return data;
|
||||||
|
}, keyword);
|
||||||
|
|
||||||
|
// Clean and format the extracted data
|
||||||
|
const authorName = cleanText(postData.authorName);
|
||||||
|
let authorUrl = postData.authorUrl || "";
|
||||||
|
if (authorUrl && !authorUrl.startsWith("http")) {
|
||||||
|
authorUrl = `https://www.linkedin.com${authorUrl}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
const content = cleanText(postData.content);
|
||||||
|
const location = cleanText(postData.location);
|
||||||
|
const timestamp = postData.timestamp || "";
|
||||||
|
|
||||||
|
// Validate we have minimum required data
|
||||||
|
if (!postData.postId && !content) {
|
||||||
|
logger.debug(`⏭️ Post filtered: missing both postId and content`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Log extraction results for debugging
|
||||||
|
const missingFields = [];
|
||||||
|
if (!authorName) missingFields.push("authorName");
|
||||||
|
if (!authorUrl) missingFields.push("authorUrl");
|
||||||
|
if (!location) missingFields.push("location");
|
||||||
|
if (!timestamp) missingFields.push("timestamp");
|
||||||
|
if (postData.likes === 0 && postData.comments === 0) missingFields.push("engagement");
|
||||||
|
|
||||||
|
if (missingFields.length > 0 && postData.postId) {
|
||||||
|
logger.debug(`⚠️ Post ${postData.postId.substring(0, 20)}... missing: ${missingFields.join(", ")}`);
|
||||||
|
|
||||||
|
// If location is missing, log sub-description content for debugging
|
||||||
|
if (!location && process.env.DEBUG_EXTRACTION === "true") {
|
||||||
|
try {
|
||||||
|
const subDescInfo = await postElement.evaluate((el) => {
|
||||||
|
const subDesc = el.querySelector(".feed-shared-actor__sub-description");
|
||||||
|
if (subDesc) {
|
||||||
|
return {
|
||||||
|
text: subDesc.textContent || subDesc.innerText || "",
|
||||||
|
html: subDesc.innerHTML.substring(0, 500),
|
||||||
|
links: Array.from(subDesc.querySelectorAll("a")).map(a => ({
|
||||||
|
text: a.textContent?.trim(),
|
||||||
|
href: a.getAttribute("href")
|
||||||
|
}))
|
||||||
|
};
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
if (subDescInfo) {
|
||||||
|
logger.debug(`Sub-description text: "${subDescInfo.text}"`);
|
||||||
|
logger.debug(`Sub-description links: ${JSON.stringify(subDescInfo.links)}`);
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Ignore errors in debugging
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Optionally log HTML structure for first failed extraction (to help debug)
|
||||||
|
if (process.env.DEBUG_EXTRACTION === "true" && missingFields.length >= 3) {
|
||||||
|
try {
|
||||||
|
const htmlSnippet = await postElement.evaluate((el) => {
|
||||||
|
// Get the outer HTML of the element (limited to first 2000 chars)
|
||||||
|
const html = el.outerHTML || "";
|
||||||
|
return html.substring(0, 2000);
|
||||||
|
});
|
||||||
|
logger.debug(`HTML structure (first 2000 chars):\n${htmlSnippet}`);
|
||||||
|
} catch (e) {
|
||||||
|
// Ignore errors in debugging
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
postId: cleanText(postId),
|
postId: cleanText(postData.postId),
|
||||||
authorName,
|
authorName,
|
||||||
authorUrl,
|
authorUrl,
|
||||||
content,
|
profileLink: authorUrl,
|
||||||
|
text: content,
|
||||||
|
content: content,
|
||||||
|
location: location,
|
||||||
|
profileLocation: location, // Alias for compatibility
|
||||||
timestamp,
|
timestamp,
|
||||||
keyword,
|
keyword,
|
||||||
likes: extractNumber(likesText),
|
likes: postData.likes || 0,
|
||||||
comments: extractNumber(commentsText),
|
comments: postData.comments || 0,
|
||||||
extractedAt: new Date().toISOString(),
|
extractedAt: new Date().toISOString(),
|
||||||
source: "linkedin",
|
source: "linkedin",
|
||||||
|
parser: "linkedout-parser",
|
||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.warning(`Error extracting post data: ${error.message}`);
|
logger.warning(`Error extracting post data: ${error.message}`);
|
||||||
|
logger.debug(`Stack trace: ${error.stack}`);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extract location from a LinkedIn profile page.
 *
 * Opens the profile in a new tab of the same browser context, waits briefly
 * for the page to settle, delegates the scraping to
 * `extractLocationFromProfile`, and always closes the tab afterwards.
 *
 * This is a best-effort lookup: every failure (bad URL, navigation error,
 * extraction error) is logged at debug level and reported as an empty string
 * rather than thrown.
 *
 * @param {object} page - Page handle exposing `context().newPage()`
 *   (looks like the Playwright API - confirm against the caller).
 * @param {string} profileUrl - Absolute profile URL, or a path relative to
 *   https://www.linkedin.com. Query string and fragment are stripped.
 * @param {object} [options] - Optional tuning knobs.
 * @param {number} [options.navigationTimeoutMs=15000] - Timeout passed to `goto`.
 * @param {number} [options.settleDelayMs=2000] - Pause after navigation so
 *   late-rendered content has a chance to appear; 0 disables the pause.
 * @returns {Promise<string>} Whatever `extractLocationFromProfile` returns on
 *   success, or "" when the lookup could not be performed.
 */
async function extractLocationFromProfilePage(page, profileUrl, options = {}) {
  const { navigationTimeoutMs = 15000, settleDelayMs = 2000 } = options;

  // An empty URL would otherwise resolve to the LinkedIn home page and yield
  // a location that has nothing to do with the post author.
  if (typeof profileUrl !== "string" || profileUrl.trim() === "") {
    logger.debug("Skipping profile location lookup: no profile URL provided");
    return "";
  }

  let profilePage = null;

  try {
    // Resolve relative paths (with or without a leading slash) against the
    // LinkedIn origin instead of concatenating strings.
    const targetUrl = new URL(profileUrl.trim(), "https://www.linkedin.com");
    if (targetUrl.protocol !== "http:" && targetUrl.protocol !== "https:") {
      throw new Error(`unsupported URL protocol "${targetUrl.protocol}"`);
    }

    // Remove query parameters (and fragment) that might cause issues
    targetUrl.search = "";
    targetUrl.hash = "";

    // Open profile in new tab
    profilePage = await page.context().newPage();

    await profilePage.goto(targetUrl.href, {
      waitUntil: "domcontentloaded",
      timeout: navigationTimeoutMs,
    });

    // Wait a bit for content to load
    if (settleDelayMs > 0) {
      await new Promise((resolve) => setTimeout(resolve, settleDelayMs));
    }

    // Use the extractLocationFromProfile utility from ai-analyzer
    return await extractLocationFromProfile(profilePage);
  } catch (error) {
    logger.debug(`Failed to extract location from profile ${profileUrl}: ${error.message}`);
    return "";
  } finally {
    // Single cleanup path: the tab is closed exactly once, and a failing
    // close() must not discard a location that was already extracted.
    if (profilePage) {
      try {
        await profilePage.close();
      } catch (closeError) {
        logger.debug(`Failed to close profile tab for ${profileUrl}: ${closeError.message}`);
      }
    }
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract numbers from text (e.g., "15 likes" -> 15)
|
* Extract numbers from text (e.g., "15 likes" -> 15)
|
||||||
*/
|
*/
|
||||||
|
|||||||
3667
package-lock.json
generated
3667
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -9,7 +9,7 @@ const testData = JSON.parse(
|
|||||||
);
|
);
|
||||||
const aiResults = testData.positive;
|
const aiResults = testData.positive;
|
||||||
const context = "job layoffs and workforce reduction";
|
const context = "job layoffs and workforce reduction";
|
||||||
const model = "mistral"; // or your default model
|
const model = process.env.OLLAMA_MODEL || "mistral"; // Use OLLAMA_MODEL from env or default to mistral
|
||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
// Check if Ollama is available
|
// Check if Ollama is available
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user