add ai-analyzer and location-utils
This commit is contained in:
parent
ae22315c59
commit
b62854909b
7
.gitignore
vendored
7
.gitignore
vendored
@ -1,9 +1,10 @@
|
||||
.vscode/
|
||||
*.md
|
||||
!README.md
|
||||
node_modules/
|
||||
.env
|
||||
results/
|
||||
linkedout.exe
|
||||
linkedout-macos
|
||||
zip*
|
||||
*.7z
|
||||
*obfuscated.js
|
||||
.history
|
||||
.history
|
||||
|
||||
247
README.md
Normal file
247
README.md
Normal file
@ -0,0 +1,247 @@
|
||||
# LinkedOut - LinkedIn Posts Scraper
|
||||
|
||||
A Node.js application that automates LinkedIn login and scrapes posts containing specific keywords. The tool is designed to help track job market trends, layoffs, and open work opportunities by monitoring LinkedIn content.
|
||||
|
||||
## Features
|
||||
|
||||
- **Automated LinkedIn Login**: Uses Playwright to automate browser interactions
|
||||
- **Keyword-based Search**: Searches for posts containing keywords from CSV files or CLI
|
||||
- **Flexible Keyword Sources**: Supports multiple CSV files in `keywords/` or CLI-only mode
|
||||
- **Configurable Search Parameters**: Customizable date ranges, sorting options, city, and scroll behavior
|
||||
- **Duplicate Detection**: Prevents duplicate posts and profiles in results
|
||||
- **Clean Text Processing**: Removes hashtags, emojis, and URLs from post content
|
||||
- **Timestamped Results**: Saves results to JSON files with timestamps
|
||||
- **Command-line Overrides**: Support for runtime parameter adjustments
|
||||
- **Enhanced Geographic Location Validation**: Validates user locations against 200+ Canadian cities with smart matching
|
||||
- **Local AI Analysis (Ollama)**: Free, private, and fast post-processing with local LLMs
|
||||
- **Flexible Processing**: Disable features, run AI analysis immediately, or process results later
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Node.js (v14 or higher)
|
||||
- Valid LinkedIn account credentials
|
||||
- [Ollama](https://ollama.ai/) with a model (free, private, local AI)
|
||||
|
||||
## Installation
|
||||
|
||||
1. Clone the repository or download the files
|
||||
2. Install dependencies:
|
||||
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
3. Copy the configuration template and customize:
|
||||
|
||||
```bash
|
||||
cp env-config.example .env
|
||||
```
|
||||
|
||||
4. Edit `.env` with your settings (see Configuration section below)
|
||||
|
||||
## Configuration
|
||||
|
||||
### Environment Variables (.env file)
|
||||
|
||||
Create a `.env` file from `env-config.example`:
|
||||
|
||||
```env
|
||||
# LinkedIn Credentials (Required)
|
||||
LINKEDIN_USERNAME=your_email@example.com
|
||||
LINKEDIN_PASSWORD=your_password
|
||||
|
||||
# Basic Settings
|
||||
HEADLESS=true
|
||||
KEYWORDS=keywords-layoff.csv # Just the filename; always looks in keywords/ unless path is given
|
||||
DATE_POSTED=past-week
|
||||
SORT_BY=date_posted
|
||||
CITY=Toronto
|
||||
WHEELS=5
|
||||
|
||||
# Enhanced Location Filtering
|
||||
LOCATION_FILTER=Ontario,Manitoba
|
||||
ENABLE_LOCATION_CHECK=true
|
||||
|
||||
# Local AI Analysis (Ollama)
|
||||
ENABLE_LOCAL_AI=true
|
||||
OLLAMA_MODEL=mistral
|
||||
OLLAMA_HOST=http://localhost:11434
|
||||
RUN_LOCAL_AI_AFTER_SCRAPING=false # true = run after scraping, false = run manually
|
||||
AI_CONTEXT=job layoffs and workforce reduction
|
||||
AI_CONFIDENCE=0.7
|
||||
AI_BATCH_SIZE=3
|
||||
```
|
||||
|
||||
### Configuration Options
|
||||
|
||||
#### Required
|
||||
|
||||
- `LINKEDIN_USERNAME`: Your LinkedIn email/username
|
||||
- `LINKEDIN_PASSWORD`: Your LinkedIn password
|
||||
|
||||
#### Basic Settings
|
||||
|
||||
- `HEADLESS`: Browser headless mode (`true`/`false`, default: `true`)
|
||||
- `KEYWORDS`: CSV file name (default: `keywords-layoff.csv` in `keywords/` folder)
|
||||
- `DATE_POSTED`: Filter by date (`past-24h`, `past-week`, `past-month`, or empty)
|
||||
- `SORT_BY`: Sort results (`relevance` or `date_posted`)
|
||||
- `CITY`: Search location (default: `Toronto`)
|
||||
- `WHEELS`: Number of scrolls to load posts (default: `5`)
|
||||
|
||||
#### Enhanced Location Filtering
|
||||
|
||||
- `LOCATION_FILTER`: Geographic filter - supports multiple provinces/cities:
|
||||
- Single: `Ontario` or `Toronto`
|
||||
- Multiple: `Ontario,Manitoba` or `Toronto,Vancouver`
|
||||
- `ENABLE_LOCATION_CHECK`: Enable location validation (`true`/`false`)
|
||||
|
||||
#### Local AI Analysis (Ollama)
|
||||
|
||||
- `ENABLE_LOCAL_AI=true`: Enable local AI analysis
|
||||
- `OLLAMA_MODEL`: Model to use (`mistral`, `llama2`, `codellama`)
|
||||
- `OLLAMA_HOST`: Ollama server URL (default: `http://localhost:11434`)
|
||||
- `RUN_LOCAL_AI_AFTER_SCRAPING`: Run AI immediately after scraping (`true`/`false`)
|
||||
- `AI_CONTEXT`: Context for analysis (e.g., `job layoffs`)
|
||||
- `AI_CONFIDENCE`: Minimum confidence threshold (0.0-1.0, default: 0.7)
|
||||
- `AI_BATCH_SIZE`: Posts per batch (default: 3)
|
||||
|
||||
## Usage
|
||||
|
||||
### Basic Commands
|
||||
|
||||
```bash
|
||||
# Standard scraping with configured settings
|
||||
node linkedout.js
|
||||
|
||||
# Visual mode (see browser)
|
||||
node linkedout.js --headless=false
|
||||
|
||||
# Use only these keywords (ignore CSV)
|
||||
node linkedout.js --keyword="layoff,downsizing"
|
||||
|
||||
# Add extra keywords to CSV/CLI list
|
||||
node linkedout.js --add-keyword="hiring freeze,open to work"
|
||||
|
||||
# Override city and date
|
||||
node linkedout.js --city="Vancouver" --date_posted=past-month
|
||||
|
||||
# Custom output file
|
||||
node linkedout.js --output=results/myfile.json
|
||||
|
||||
# Skip location and AI filtering (fastest)
|
||||
node linkedout.js --no-location --no-ai
|
||||
|
||||
# Run AI analysis immediately after scraping
|
||||
node linkedout.js --ai-after
|
||||
|
||||
# Show help
|
||||
node linkedout.js --help
|
||||
```
|
||||
|
||||
### All Command-line Options
|
||||
|
||||
- `--headless=true|false`: Override browser headless mode
|
||||
- `--keyword="kw1,kw2"`: Use only these keywords (comma-separated, overrides CSV)
|
||||
- `--add-keyword="kw1,kw2"`: Add extra keywords to CSV/CLI list
|
||||
- `--city="CityName"`: Override city
|
||||
- `--date_posted=VALUE`: Override date posted (past-24h, past-week, past-month, or empty)
|
||||
- `--sort_by=VALUE`: Override sort by (date_posted or relevance)
|
||||
- `--location_filter=VALUE`: Override location filter
|
||||
- `--output=FILE`: Output file name
|
||||
- `--no-location`: Disable location filtering
|
||||
- `--no-ai`: Disable AI analysis
|
||||
- `--ai-after`: Run local AI analysis after scraping
|
||||
- `--help, -h`: Show help message
|
||||
|
||||
### Keyword Files
|
||||
|
||||
- Place all keyword CSVs in the `keywords/` folder
|
||||
- Example: `keywords/keywords-layoff.csv`, `keywords/keywords-open-work.csv`
|
||||
- Custom CSV format: header `keyword` with one keyword per line
|
||||
|
||||
### Local AI Analysis Commands
|
||||
|
||||
After scraping, you can run AI analysis on the results:
|
||||
|
||||
```bash
|
||||
# Analyze latest results
|
||||
node ai-analyzer-local.js --context="job layoffs"
|
||||
|
||||
# Analyze specific file
|
||||
node ai-analyzer-local.js --input=results/results-2024-01-15.json --context="hiring"
|
||||
|
||||
# Use different model
|
||||
node ai-analyzer-local.js --model=llama2 --context="remote work"
|
||||
|
||||
# Change confidence and batch size
|
||||
node ai-analyzer-local.js --context="job layoffs" --confidence=0.8 --batch-size=5
|
||||
```
|
||||
|
||||
## Workflow Examples
|
||||
|
||||
### 1. Quick Start (All Features)
|
||||
|
||||
```bash
|
||||
node linkedout.js --ai-after
|
||||
```
|
||||
|
||||
### 2. Fast Scraping Only
|
||||
|
||||
```bash
|
||||
node linkedout.js --no-location --no-ai
|
||||
```
|
||||
|
||||
### 3. Location-Only Filtering
|
||||
|
||||
```bash
|
||||
node linkedout.js --no-ai
|
||||
```
|
||||
|
||||
### 4. Test Different AI Contexts
|
||||
|
||||
```bash
|
||||
node linkedout.js --no-ai
|
||||
node ai-analyzer-local.js --context="job layoffs"
|
||||
node ai-analyzer-local.js --context="hiring opportunities"
|
||||
node ai-analyzer-local.js --context="remote work"
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
linkedout/
|
||||
├── .env # Your configuration (create from template)
|
||||
├── env-config.example # Configuration template
|
||||
├── linkedout.js # Main scraper
|
||||
├── ai-analyzer-local.js # Free local AI analyzer (Ollama)
|
||||
├── location-utils.js # Enhanced location utilities
|
||||
├── package.json # Dependencies
|
||||
├── keywords/ # All keyword CSVs go here
|
||||
│ ├── keywords-layoff.csv
|
||||
│ └── keywords-open-work.csv
|
||||
├── results/ # Output directory
|
||||
└── README.md # This documentation
|
||||
```
|
||||
|
||||
## Legal & Security
|
||||
|
||||
- **Credentials**: Store securely in `.env`, add to `.gitignore`
|
||||
- **LinkedIn ToS**: Respect rate limits and usage guidelines
|
||||
- **Privacy**: Local AI keeps all data on your machine
|
||||
- **Usage**: Educational and research purposes only
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `playwright`: Browser automation
|
||||
- `dotenv`: Environment variables
|
||||
- `csv-parser`: CSV file reading
|
||||
- Built-in: `fs`, `path`, `child_process`
|
||||
|
||||
## Support
|
||||
|
||||
For issues:
|
||||
|
||||
1. Check this README
|
||||
2. Verify `.env` configuration
|
||||
3. Test with `--headless=false` for debugging
|
||||
4. Check Ollama status: `ollama list`
|
||||
540
ai-analyzer-local.js
Normal file
540
ai-analyzer-local.js
Normal file
@ -0,0 +1,540 @@
|
||||
#!/usr/bin/env node

/**
 * Local AI Post-Processing Analyzer for LinkedOut
 *
 * Uses Ollama for completely FREE local AI analysis.
 *
 * FEATURES:
 * - Analyze LinkedOut results for context relevance (layoffs, hiring, etc.)
 * - Works on latest or specified results file
 * - Batch processing for speed
 * - Configurable context, model, confidence, batch size
 * - CLI and .env configuration
 * - 100% local, private, and free
 *
 * USAGE:
 *   node ai-analyzer-local.js [options]
 *
 * COMMAND-LINE OPTIONS:
 *   --input=<file>      Input JSON file (default: latest in results/)
 *   --context=<text>    AI context to analyze against (required)
 *   --confidence=<num>  Minimum confidence threshold (0.0-1.0, default: 0.7)
 *   --model=<name>      Ollama model to use (default: llama2)
 *   --batch-size=<num>  Number of posts to process at once (default: 3)
 *   --output=<file>     Output file (default: adds -ai-local suffix)
 *   --help, -h          Show this help message
 *
 * EXAMPLES:
 *   node ai-analyzer-local.js --context="job layoffs"
 *   node ai-analyzer-local.js --input=results/results-2024-01-15.json --context="hiring"
 *   node ai-analyzer-local.js --model=mistral --context="remote work"
 *   node ai-analyzer-local.js --context="job layoffs" --confidence=0.8 --batch-size=5
 *
 * ENVIRONMENT VARIABLES (.env file):
 *   AI_CONTEXT, AI_CONFIDENCE, AI_BATCH_SIZE, OLLAMA_MODEL, OLLAMA_HOST
 *   See README for full list.
 *
 * OUTPUT:
 *   - Saves to results/ with -ai-local suffix unless --output is specified
 *
 * DEPENDENCIES:
 *   - Ollama (https://ollama.ai/)
 *   - Node.js built-ins: fs, path, fetch
 *
 * SECURITY & LEGAL:
 *   - All analysis is local, no data leaves your machine
 *   - Use responsibly for educational/research purposes
 */

require("dotenv").config();
const fs = require("fs");
const path = require("path");

// Configuration defaults from the environment; the CLI flags parsed
// below override these on a per-run basis.
const DEFAULT_CONTEXT =
  process.env.AI_CONTEXT || "job layoffs and workforce reduction";
const DEFAULT_CONFIDENCE = parseFloat(process.env.AI_CONFIDENCE || "0.7");
const DEFAULT_BATCH_SIZE = parseInt(process.env.AI_BATCH_SIZE || "3", 10);
const DEFAULT_MODEL = process.env.OLLAMA_MODEL || "llama2";
const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://localhost:11434";

// Parse command line arguments
const args = process.argv.slice(2);
let inputFile = null;
let context = DEFAULT_CONTEXT;
let confidenceThreshold = DEFAULT_CONFIDENCE;
let batchSize = DEFAULT_BATCH_SIZE;
let model = DEFAULT_MODEL;
let outputFile = null;

// Everything after the FIRST "=" is the value. The previous
// arg.split("=")[1] silently truncated values that themselves contain
// "=" (e.g. --context="supply = demand" or --output=a=b.json).
const argValue = (arg) => arg.slice(arg.indexOf("=") + 1);

for (const arg of args) {
  if (arg.startsWith("--input=")) {
    inputFile = argValue(arg);
  } else if (arg.startsWith("--context=")) {
    context = argValue(arg);
  } else if (arg.startsWith("--confidence=")) {
    confidenceThreshold = parseFloat(argValue(arg));
  } else if (arg.startsWith("--batch-size=")) {
    batchSize = parseInt(argValue(arg), 10);
  } else if (arg.startsWith("--model=")) {
    model = argValue(arg);
  } else if (arg.startsWith("--output=")) {
    outputFile = argValue(arg);
  }
}

// Context can only be empty here if the user passed --context= with no
// value (the env/default fallback is non-empty).
if (!context) {
  console.error("❌ Error: No AI context specified");
  console.error('Use --context="your context" or set AI_CONTEXT in .env');
  process.exit(1);
}
|
||||
|
||||
/**
 * Verify that the Ollama server is reachable and the requested model is
 * installed. Prints status information as it goes.
 *
 * Reads the module-level `OLLAMA_HOST` and `model` settings. On any
 * failure (server unreachable, model missing) the process exits with
 * code 1 after printing setup hints.
 *
 * @returns {Promise<boolean>} true when server and model are both ready
 */
async function checkOllamaStatus() {
  try {
    // A successful /api/tags call doubles as a liveness probe.
    const tagsResponse = await fetch(`${OLLAMA_HOST}/api/tags`);
    if (!tagsResponse.ok) {
      throw new Error(`Ollama not running on ${OLLAMA_HOST}`);
    }

    const { models } = await tagsResponse.json();
    const availableModels = models.map((entry) => entry.name);
    // Tags look like "mistral:latest"; display only the base names.
    const shortNames = availableModels
      .map((name) => name.split(":")[0])
      .join(", ");

    console.log(`🤖 Ollama is running`);
    console.log(`📦 Available models: ${shortNames}`);

    // "mistral" should match installed tag "mistral:latest".
    const modelExists = availableModels.some((name) => name.startsWith(model));
    if (!modelExists) {
      console.error(`❌ Model "${model}" not found`);
      console.error(`💡 Install it with: ollama pull ${model}`);
      console.error(`💡 Or choose from: ${shortNames}`);
      process.exit(1);
    }

    console.log(`✅ Using model: ${model}`);
    return true;
  } catch (error) {
    console.error("❌ Error connecting to Ollama:", error.message);
    console.error("💡 Make sure Ollama is installed and running:");
    console.error(" 1. Install: https://ollama.ai/");
    console.error(" 2. Start: ollama serve");
    console.error(` 3. Install model: ollama pull ${model}`);
    process.exit(1);
  }
}
|
||||
|
||||
/**
 * Locate the newest raw scraper output in the results/ directory.
 *
 * File names are timestamped (results-YYYY-MM-DD...), so lexicographic
 * order equals chronological order; already-analyzed files (containing
 * "-ai-") are excluded.
 *
 * @returns {string} path to the most recent results JSON file
 * @throws {Error} when the directory is missing or holds no matching file
 */
function findLatestResultsFile() {
  const resultsDir = "results";
  if (!fs.existsSync(resultsDir)) {
    throw new Error("Results directory not found. Run the scraper first.");
  }

  const isRawResults = (name) =>
    name.startsWith("results-") &&
    name.endsWith(".json") &&
    !name.includes("-ai-");

  const candidates = fs.readdirSync(resultsDir).filter(isRawResults).sort();

  if (candidates.length === 0) {
    throw new Error("No results files found. Run the scraper first.");
  }

  // Ascending sort puts the newest timestamp last.
  return path.join(resultsDir, candidates[candidates.length - 1]);
}
|
||||
|
||||
/**
 * Analyze multiple posts in a single Ollama request.
 *
 * Builds one prompt containing every post in the batch (each truncated
 * to 400 chars), asks the model to reply in a fixed
 * "POST n: YES/NO | confidence | reason" line format, then parses those
 * lines back out.
 *
 * @param {Array<{text: string}>} posts - posts to classify
 * @param {string} context - topic the posts are matched against
 * @param {string} model - Ollama model name
 * @returns {Promise<Array<{postIndex: number, isRelevant: boolean, confidence: number, reasoning: string}>>}
 *   one entry per input post, in order; on request failure every post is
 *   marked relevant with confidence 0.3 so nothing is silently dropped
 */
async function analyzeBatch(posts, context, model) {
  console.log(`🤖 Analyzing batch of ${posts.length} posts with ${model}...`);

  try {
    // NOTE(review): the example response block hardcodes POST 1-3 even
    // when the batch is larger; the per-post regex below still parses
    // any count, but larger batches rely on the model generalizing.
    const prompt = `You are an expert at analyzing LinkedIn posts for relevance to specific contexts.

CONTEXT TO MATCH: "${context}"

Analyze these ${
      posts.length
    } LinkedIn posts and determine if each relates to the context above.

POSTS:
${posts
  .map(
    (post, i) => `
POST ${i + 1}:
"${post.text.substring(0, 400)}${post.text.length > 400 ? "..." : ""}"
`
  )
  .join("")}

For each post, provide:
- Is it relevant to "${context}"? (YES/NO)
- Confidence level (0.0 to 1.0)
- Brief reasoning

Respond in this EXACT format for each post:
POST 1: YES/NO | 0.X | brief reason
POST 2: YES/NO | 0.X | brief reason
POST 3: YES/NO | 0.X | brief reason

Examples:
- For layoff context: "laid off 50 employees" = YES | 0.9 | mentions layoffs
- For hiring context: "we're hiring developers" = YES | 0.8 | job posting
- Unrelated content = NO | 0.1 | not relevant to context`;

    // Non-streaming generate call; low temperature keeps the output
    // close to the requested line format.
    const response = await fetch(`${OLLAMA_HOST}/api/generate`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model: model,
        prompt: prompt,
        stream: false,
        options: {
          temperature: 0.3,
          top_p: 0.9,
        },
      }),
    });

    if (!response.ok) {
      throw new Error(
        `Ollama API error: ${response.status} ${response.statusText}`
      );
    }

    const data = await response.json();
    const aiResponse = data.response.trim();

    // Parse the response one non-empty line at a time.
    const analyses = [];
    const lines = aiResponse.split("\n").filter((line) => line.trim());

    for (let i = 0; i < posts.length; i++) {
      // Default used when no "POST i:" line is found for this post.
      let analysis = {
        postIndex: i + 1,
        isRelevant: false,
        confidence: 0.5,
        reasoning: "Could not parse AI response",
      };

      // Look for lines that match "POST X:" pattern (colon optional,
      // case-insensitive).
      const postPattern = new RegExp(`POST\\s*${i + 1}:?\\s*(.+)`, "i");

      for (const line of lines) {
        const match = line.match(postPattern);
        if (match) {
          const content = match[1].trim();

          // Parse: YES/NO | 0.X | reasoning
          const parts = content.split("|").map((p) => p.trim());

          if (parts.length >= 3) {
            analysis.isRelevant = parts[0].toUpperCase().includes("YES");
            // Clamp to [0, 1]; NaN (and a literal 0) falls back to 0.5
            // via the || short-circuit.
            analysis.confidence = Math.max(
              0,
              Math.min(1, parseFloat(parts[1]) || 0.5)
            );
            analysis.reasoning = parts[2] || "No reasoning provided";
          } else {
            // Fallback parsing: the model ignored the pipe format, so
            // sniff for keywords instead.
            analysis.isRelevant =
              content.toUpperCase().includes("YES") ||
              content.toLowerCase().includes("relevant");
            analysis.confidence = 0.6;
            analysis.reasoning = content.substring(0, 100);
          }
          break;
        }
      }

      analyses.push(analysis);
    }

    // If we didn't get enough analyses, fill in defaults. (Defensive:
    // the loop above already pushes one entry per post, so this should
    // not normally run.)
    while (analyses.length < posts.length) {
      analyses.push({
        postIndex: analyses.length + 1,
        isRelevant: false,
        confidence: 0.3,
        reasoning: "AI response parsing failed",
      });
    }

    return analyses;
  } catch (error) {
    console.error(`❌ Error in batch AI analysis: ${error.message}`);

    // Fallback: mark all as relevant with low confidence so downstream
    // filtering can still inspect them.
    return posts.map((_, i) => ({
      postIndex: i + 1,
      isRelevant: true,
      confidence: 0.3,
      reasoning: `Analysis failed: ${error.message}`,
    }));
  }
}
|
||||
|
||||
/**
 * Analyze one post against the context using local Ollama (fallback
 * path for when batch analysis is not used).
 *
 * @param {string} text - full post content
 * @param {string} context - topic to match against
 * @param {string} model - Ollama model name
 * @returns {Promise<{isRelevant: boolean, confidence: number, reasoning: string}>}
 *   On any error the post is marked relevant with confidence 0.3 so it
 *   is never silently dropped.
 */
async function analyzeSinglePost(text, context, model) {
  const prompt = `Analyze this LinkedIn post for relevance to: "${context}"

Post: "${text}"

Is this post relevant to "${context}"? Provide:
1. YES or NO
2. Confidence (0.0 to 1.0)
3. Brief reason

Format: YES/NO | 0.X | reason`;

  try {
    const res = await fetch(`${OLLAMA_HOST}/api/generate`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model,
        prompt,
        stream: false,
        options: {
          temperature: 0.3,
        },
      }),
    });

    if (!res.ok) {
      throw new Error(`Ollama API error: ${res.status}`);
    }

    const payload = await res.json();
    const answer = payload.response.trim();

    // Expected shape: "YES/NO | 0.X | reason"
    const fields = answer.split("|").map((field) => field.trim());

    if (fields.length < 3) {
      // The model ignored the pipe format; fall back to keyword sniffing.
      const lowered = answer.toLowerCase();
      return {
        isRelevant: lowered.includes("yes") || lowered.includes("relevant"),
        confidence: 0.6,
        reasoning: answer.substring(0, 100),
      };
    }

    // NaN (and a literal 0) confidence falls back to 0.5, then clamp.
    const parsedConfidence = parseFloat(fields[1]) || 0.5;
    return {
      isRelevant: fields[0].toUpperCase().includes("YES"),
      confidence: Math.max(0, Math.min(1, parsedConfidence)),
      reasoning: fields[2],
    };
  } catch (error) {
    // Default to including the post when analysis fails.
    return {
      isRelevant: true,
      confidence: 0.3,
      reasoning: `Analysis failed: ${error.message}`,
    };
  }
}
|
||||
|
||||
/**
 * Main processing function.
 *
 * Pipeline: verify Ollama -> resolve input file (latest results file
 * when --input was not given) -> load and validate the JSON array ->
 * analyze in batches -> write the filtered/annotated array to the
 * output file.
 *
 * Filtering rule: posts at or above the confidence threshold are kept
 * only when relevant; posts BELOW the threshold are always kept but
 * flagged with lowConfidence: true.
 *
 * Mutates the module-level `inputFile` / `outputFile` when they were
 * not supplied on the command line. Exits the process with code 1 on
 * any error.
 */
async function main() {
  try {
    console.log("🚀 LinkedOut Local AI Analyzer Starting...");
    console.log(`📊 Context: "${context}"`);
    console.log(`🎯 Confidence Threshold: ${confidenceThreshold}`);
    console.log(`📦 Batch Size: ${batchSize}`);
    console.log(`🤖 Model: ${model}`);

    // Check Ollama status (exits the process if unavailable).
    await checkOllamaStatus();

    // Determine input file: explicit --input wins, otherwise the newest
    // raw results file.
    if (!inputFile) {
      inputFile = findLatestResultsFile();
      console.log(`📂 Using latest results file: ${inputFile}`);
    } else {
      console.log(`📂 Using specified file: ${inputFile}`);
    }

    // Load results
    if (!fs.existsSync(inputFile)) {
      throw new Error(`Input file not found: ${inputFile}`);
    }

    const rawData = fs.readFileSync(inputFile, "utf-8");
    const results = JSON.parse(rawData);

    if (!Array.isArray(results) || results.length === 0) {
      throw new Error("No posts found in input file");
    }

    console.log(`📋 Loaded ${results.length} posts for analysis`);

    // Process in batches
    const processedResults = [];
    let totalRelevant = 0;
    let totalProcessed = 0;

    for (let i = 0; i < results.length; i += batchSize) {
      const batch = results.slice(i, i + batchSize);
      console.log(
        `\n📦 Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(
          results.length / batchSize
        )} (${batch.length} posts)`
      );

      const analyses = await analyzeBatch(batch, context, model);

      // Apply analyses to posts (analyses[j] corresponds to batch[j]).
      for (let j = 0; j < batch.length; j++) {
        const post = batch[j];
        const analysis = analyses[j];

        // Copy the post and append the ai* annotation fields.
        const enhancedPost = {
          ...post,
          aiRelevant: analysis.isRelevant,
          aiConfidence: analysis.confidence,
          aiReasoning: analysis.reasoning,
          aiModel: model,
          // en-CA locale gives a YYYY-MM-DD-style local timestamp.
          aiAnalyzedAt: new Date().toLocaleString("en-CA", {
            year: "numeric",
            month: "2-digit",
            day: "2-digit",
            hour: "2-digit",
            minute: "2-digit",
            second: "2-digit",
            hour12: false,
          }),
          aiType: "local-ollama",
          aiProcessed: true,
        };

        // Apply confidence threshold: confident+relevant posts are
        // kept, confident+irrelevant posts are dropped.
        if (analysis.confidence >= confidenceThreshold) {
          if (analysis.isRelevant) {
            processedResults.push(enhancedPost);
            totalRelevant++;
          }
        } else {
          // Include low-confidence posts but flag them for review.
          enhancedPost.lowConfidence = true;
          processedResults.push(enhancedPost);
        }

        totalProcessed++;
        console.log(
          ` ${
            analysis.isRelevant ? "✅" : "❌"
          } Post ${totalProcessed}: ${analysis.confidence.toFixed(
            2
          )} confidence - ${analysis.reasoning.substring(0, 100)}...`
        );
      }

      // Small delay between batches to be nice to the system.
      if (i + batchSize < results.length) {
        console.log("⏳ Brief pause...");
        await new Promise((resolve) => setTimeout(resolve, 500));
      }
    }

    // Determine output file: default is <input>-ai-local.json next to
    // the input file.
    if (!outputFile) {
      const inputBasename = path.basename(inputFile, ".json");
      const inputDir = path.dirname(inputFile);
      outputFile = path.join(inputDir, `${inputBasename}-ai-local.json`);
    }

    // Save results
    fs.writeFileSync(
      outputFile,
      JSON.stringify(processedResults, null, 2),
      "utf-8"
    );

    console.log("\n🎉 Local AI Analysis Complete!");
    console.log(`📊 Results:`);
    console.log(` Total posts processed: ${totalProcessed}`);
    console.log(` Relevant posts found: ${totalRelevant}`);
    console.log(` Final results saved: ${processedResults.length}`);
    console.log(`📁 Output saved to: ${outputFile}`);
    console.log(`💰 Cost: $0.00 (completely free!)`);
  } catch (error) {
    console.error("❌ Error:", error.message);
    process.exit(1);
  }
}
|
||||
|
||||
// Show help if requested. Checked before main() runs so --help never
// touches Ollama or the results directory; exits immediately after
// printing.
if (args.includes("--help") || args.includes("-h")) {
  console.log(`
LinkedOut Local AI Analyzer (Ollama)

🚀 FREE local AI analysis - No API costs, complete privacy!

Usage: node ai-analyzer-local.js [options]

Options:
--input=<file> Input JSON file (default: latest in results/)
--context=<text> AI context to analyze against (required)
--confidence=<num> Minimum confidence threshold (0.0-1.0, default: 0.7)
--model=<name> Ollama model to use (default: llama2)
--batch-size=<num> Number of posts to process at once (default: 3)
--output=<file> Output file (default: adds -ai-local suffix)
--help, -h Show this help message

Examples:
node ai-analyzer-local.js --context="job layoffs"
node ai-analyzer-local.js --model=mistral --context="hiring opportunities"
node ai-analyzer-local.js --context="remote work" --confidence=0.8

Prerequisites:
1. Install Ollama: https://ollama.ai/
2. Install a model: ollama pull llama2
3. Start Ollama: ollama serve

Popular Models:
- llama2 (good general purpose)
- mistral (fast and accurate)
- codellama (good for technical content)
- llama2:13b (more accurate, slower)

Environment Variables:
AI_CONTEXT Default context for analysis
AI_CONFIDENCE Default confidence threshold
AI_BATCH_SIZE Default batch size
OLLAMA_MODEL Default model (llama2, mistral, etc.)
OLLAMA_HOST Ollama host (default: http://localhost:11434)
`);
  process.exit(0);
}

// Run the analyzer. main() handles its own errors (logs and exits 1),
// so no .catch() is needed on this call.
main();
|
||||
720
linkedout.js
720
linkedout.js
@ -1,57 +1,132 @@
|
||||
/**
|
||||
* LinkedIn Posts Scraper (linkedout)
|
||||
* LinkedIn Posts Scraper (LinkedOut)
|
||||
*
|
||||
* This script logs into LinkedIn using credentials stored in a .env file,
|
||||
* reads keywords from a CSV file (keywords.csv), and scrapes posts matching
|
||||
* those keywords from LinkedIn's content search.
|
||||
* A comprehensive tool for scraping LinkedIn posts based on keyword searches.
|
||||
* Designed to track job market trends, layoffs, and open work opportunities
|
||||
* by monitoring LinkedIn content automatically.
|
||||
*
|
||||
* Usage:
|
||||
* node linkedout.js [--headless=true|false] [--keyword=additional_keyword]
|
||||
* FEATURES:
|
||||
* - Automated LinkedIn login with browser automation
|
||||
* - Keyword-based post searching from CSV files or CLI
|
||||
* - Configurable search parameters (date, location, sorting)
|
||||
* - Duplicate detection for posts and profiles
|
||||
* - Text cleaning (removes hashtags, URLs, emojis)
|
||||
* - Timestamped JSON output files
|
||||
* - Command-line parameter overrides (see below)
|
||||
* - Enhanced geographic location validation
|
||||
* - Optional local AI-powered context analysis (Ollama)
|
||||
*
|
||||
* Command-line Parameters:
|
||||
* --headless: Override the headless mode (true or false). Defaults to value in .env (HEADLESS).
|
||||
* --keyword: Append an additional keyword to the list of keywords from keywords.csv.
|
||||
* USAGE:
|
||||
* node linkedout.js [options]
|
||||
*
|
||||
* Output:
|
||||
* Saves results to a timestamped JSON file in the 'results' directory.
|
||||
* COMMAND-LINE OPTIONS:
|
||||
* --headless=true|false Override browser headless mode
|
||||
* --keyword="kw1,kw2" Use only these keywords (comma-separated, overrides CSV)
|
||||
* --add-keyword="kw1,kw2" Add extra keywords to CSV/CLI list
|
||||
* --city="CityName" Override city
|
||||
* --date_posted=VALUE Override date posted (past-24h, past-week, past-month, or empty)
|
||||
* --sort_by=VALUE Override sort by (date_posted or relevance)
|
||||
* --location_filter=VALUE Override location filter
|
||||
* --output=FILE Output file name
|
||||
* --no-location Disable location filtering
|
||||
* --no-ai Disable AI analysis
|
||||
* --ai-after Run local AI analysis after scraping
|
||||
* --help, -h Show this help message
|
||||
*
|
||||
* Requirements:
|
||||
* - Node.js environment (or use the compiled executable)
|
||||
* - Playwright installed (or included in the binary)
|
||||
* - dotenv package for environment variables
|
||||
* - csv-parser package for reading CSV files
|
||||
* EXAMPLES:
|
||||
* node linkedout.js # Standard scraping
|
||||
* node linkedout.js --headless=false # Visual mode
|
||||
* node linkedout.js --keyword="layoff,downsizing" # Only these keywords
|
||||
* node linkedout.js --add-keyword="hiring freeze" # Add extra keyword(s)
|
||||
* node linkedout.js --city="Vancouver" --date_posted=past-month
|
||||
* node linkedout.js --output=results/myfile.json
|
||||
* node linkedout.js --no-location --no-ai # Fastest, no filters
|
||||
* node linkedout.js --ai-after # Run AI after scraping
|
||||
*
|
||||
* Environment Variables (.env):
|
||||
* LINKEDIN_USERNAME - Your LinkedIn username
|
||||
* LINKEDIN_PASSWORD - Your LinkedIn password
|
||||
* HEADLESS - Default headless mode (true or false)
|
||||
* POST-PROCESSING AI ANALYSIS:
|
||||
* node ai-analyzer-local.js --context="job layoffs" # Run on latest results file
|
||||
* node ai-analyzer-local.js --input=results/results-2024-01-15.json --context="hiring"
|
||||
*
|
||||
* Example:
|
||||
* node linkedout.js --headless=true --keyword=layoff
|
||||
* ENVIRONMENT VARIABLES (.env file):
|
||||
* KEYWORDS=keywords-layoff.csv (filename only, always looks in keywords/ folder unless path is given)
|
||||
* See README for full list.
|
||||
*
|
||||
* OUTPUT:
|
||||
* - Saves to results/results-YYYY-MM-DD-HH-MM.json (or as specified by --output)
|
||||
* - Enhanced format with optional location validation and local AI analysis
|
||||
*
|
||||
* KEYWORD FILES:
|
||||
* - Place all keyword CSVs in the keywords/ folder
|
||||
* - keywords-layoff.csv: 33+ layoff-related terms
|
||||
* - keywords-open-work.csv: Terms for finding people open to work
|
||||
* - Custom CSV format: header "keyword" with one keyword per line
|
||||
*
|
||||
* DEPENDENCIES:
|
||||
* - playwright: Browser automation
|
||||
* - dotenv: Environment variable management
|
||||
* - csv-parser: CSV file parsing
|
||||
* - Node.js built-ins: fs, path, child_process
|
||||
*
|
||||
* SECURITY & LEGAL:
|
||||
* - Store credentials securely in .env file
|
||||
* - Respect LinkedIn's Terms of Service
|
||||
* - Use responsibly for educational/research purposes
|
||||
* - Consider rate limiting and LinkedIn API for production use
|
||||
*/
|
||||
process.env.PLAYWRIGHT_BROWSERS_PATH = "0";
|
||||
//process.env.PLAYWRIGHT_BROWSERS_PATH = "0";
|
||||
// Suppress D-Bus notification errors in WSL
|
||||
process.env.NO_AT_BRIDGE = "1";
|
||||
process.env.DBUS_SESSION_BUS_ADDRESS = "/dev/null";
|
||||
|
||||
const { chromium } = require("playwright");
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
require("dotenv").config();
|
||||
const csv = require("csv-parser");
|
||||
const { spawn } = require("child_process");
|
||||
|
||||
const DATE_POSTED = process.env.DATE_POSTED || "past-week"; // "past-24h", "past-week", "past-month", or ""
|
||||
const SORT_BY = process.env.SORT_BY || "date_posted"; // "relevance", "date_posted"
|
||||
const WHEELS = process.env.WHEELS || 5;
|
||||
// Core configuration
|
||||
const DATE_POSTED = process.env.DATE_POSTED || "past-week";
|
||||
const SORT_BY = process.env.SORT_BY || "date_posted";
|
||||
const WHEELS = parseInt(process.env.WHEELS) || 5;
|
||||
const CITY = process.env.CITY || "Toronto";
|
||||
|
||||
// Read credentials and headless mode from .env
|
||||
// Location filtering configuration
|
||||
const LOCATION_FILTER = process.env.LOCATION_FILTER || "";
|
||||
const ENABLE_LOCATION_CHECK = process.env.ENABLE_LOCATION_CHECK === "true";
|
||||
|
||||
// Local AI analysis configuration
|
||||
const ENABLE_LOCAL_AI = process.env.ENABLE_LOCAL_AI === "true";
|
||||
const RUN_LOCAL_AI_AFTER_SCRAPING =
|
||||
process.env.RUN_LOCAL_AI_AFTER_SCRAPING === "true";
|
||||
const AI_CONTEXT =
|
||||
process.env.AI_CONTEXT || "job layoffs and workforce reduction";
|
||||
|
||||
// Import enhanced location utilities
|
||||
const {
|
||||
parseLocationFilters,
|
||||
validateLocationAgainstFilters,
|
||||
extractLocationFromProfile,
|
||||
} = require("./location-utils");
|
||||
|
||||
// Read credentials
|
||||
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
|
||||
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
|
||||
|
||||
// Default headless mode from .env
|
||||
let HEADLESS = process.env.HEADLESS === "true";
|
||||
|
||||
// Parse command-line arguments
|
||||
const args = process.argv.slice(2);
|
||||
let additionalKeyword = null;
|
||||
let cliKeywords = null; // If set, only use these
|
||||
let additionalKeywords = [];
|
||||
let disableLocation = false;
|
||||
let disableAI = false;
|
||||
let runAIAfter = RUN_LOCAL_AI_AFTER_SCRAPING;
|
||||
let cliCity = null;
|
||||
let cliDatePosted = null;
|
||||
let cliSortBy = null;
|
||||
let cliLocationFilter = null;
|
||||
let cliOutput = null;
|
||||
let showHelp = false;
|
||||
|
||||
for (const arg of args) {
|
||||
if (arg.startsWith("--headless=")) {
|
||||
@ -59,7 +134,99 @@ for (const arg of args) {
|
||||
HEADLESS = val === "true";
|
||||
}
|
||||
if (arg.startsWith("--keyword=")) {
|
||||
additionalKeyword = arg.split("=")[1];
|
||||
cliKeywords = arg
|
||||
.split("=")[1]
|
||||
.split(",")
|
||||
.map((k) => k.trim())
|
||||
.filter(Boolean);
|
||||
}
|
||||
if (arg.startsWith("--add-keyword=")) {
|
||||
additionalKeywords = additionalKeywords.concat(
|
||||
arg
|
||||
.split("=")[1]
|
||||
.split(",")
|
||||
.map((k) => k.trim())
|
||||
.filter(Boolean)
|
||||
);
|
||||
}
|
||||
if (arg === "--no-location") {
|
||||
disableLocation = true;
|
||||
}
|
||||
if (arg === "--no-ai") {
|
||||
disableAI = true;
|
||||
}
|
||||
if (arg === "--ai-after") {
|
||||
runAIAfter = true;
|
||||
}
|
||||
if (arg.startsWith("--city=")) {
|
||||
cliCity = arg.split("=")[1];
|
||||
}
|
||||
if (arg.startsWith("--date_posted=")) {
|
||||
cliDatePosted = arg.split("=")[1];
|
||||
}
|
||||
if (arg.startsWith("--sort_by=")) {
|
||||
cliSortBy = arg.split("=")[1];
|
||||
}
|
||||
if (arg.startsWith("--location_filter=")) {
|
||||
cliLocationFilter = arg.split("=")[1];
|
||||
}
|
||||
if (arg.startsWith("--output=")) {
|
||||
cliOutput = arg.split("=")[1];
|
||||
}
|
||||
if (arg === "--help" || arg === "-h") {
|
||||
showHelp = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (showHelp) {
|
||||
console.log(
|
||||
`\nLinkedOut - LinkedIn Posts Scraper\n\nUsage: node linkedout.js [options]\n\nOptions:\n --headless=true|false Override browser headless mode\n --keyword="kw1,kw2" Use only these keywords (comma-separated, overrides CSV)\n --add-keyword="kw1,kw2" Add extra keywords to CSV list\n --city="CityName" Override city\n --date_posted=VALUE Override date posted (past-24h, past-week, past-month or '')\n --sort_by=VALUE Override sort by (date_posted or relevance)\n --location_filter=VALUE Override location filter\n --output=FILE Output file name\n --no-location Disable location filtering\n --no-ai Disable AI analysis\n --ai-after Run local AI analysis after scraping\n --help, -h Show this help message\n\nExamples:\n node linkedout.js --keyword="layoff,downsizing"\n node linkedout.js --add-keyword="hiring freeze"\n node linkedout.js --city="Vancouver" --date_posted=past-month\n node linkedout.js --output=results/myfile.json\n`
|
||||
);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
// Use CLI overrides if provided
|
||||
const EFFECTIVE_CITY = cliCity || CITY;
|
||||
const EFFECTIVE_DATE_POSTED = cliDatePosted || DATE_POSTED;
|
||||
const EFFECTIVE_SORT_BY = cliSortBy || SORT_BY;
|
||||
const EFFECTIVE_LOCATION_FILTER = cliLocationFilter || LOCATION_FILTER;
|
||||
|
||||
// Read keywords from CSV or CLI
|
||||
const keywords = [];
|
||||
let keywordEnv = process.env.KEYWORDS || "keywords-layoff.csv";
|
||||
let csvPath = path.join(
|
||||
process.cwd(),
|
||||
keywordEnv.includes("/") ? keywordEnv : `keywords/${keywordEnv}`
|
||||
);
|
||||
|
||||
function loadKeywordsAndStart() {
|
||||
if (cliKeywords) {
|
||||
// Only use CLI keywords
|
||||
cliKeywords.forEach((k) => keywords.push(k));
|
||||
if (additionalKeywords.length > 0) {
|
||||
additionalKeywords.forEach((k) => keywords.push(k));
|
||||
}
|
||||
startScraper();
|
||||
} else {
|
||||
// Load from CSV, then add any additional keywords
|
||||
fs.createReadStream(csvPath)
|
||||
.pipe(csv())
|
||||
.on("data", (row) => {
|
||||
if (row.keyword) keywords.push(row.keyword.trim());
|
||||
})
|
||||
.on("end", () => {
|
||||
if (keywords.length === 0) {
|
||||
console.error("No keywords found in csv");
|
||||
process.exit(1);
|
||||
}
|
||||
if (additionalKeywords.length > 0) {
|
||||
additionalKeywords.forEach((k) => keywords.push(k));
|
||||
console.log(
|
||||
`Added additional keywords: ${additionalKeywords.join(", ")}`
|
||||
);
|
||||
}
|
||||
startScraper();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@ -84,9 +251,10 @@ function buildSearchUrl(keyword, city) {
|
||||
let url = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
|
||||
keyword + " " + city
|
||||
)}`;
|
||||
if (DATE_POSTED)
|
||||
url += `&datePosted=${encodeURIComponent(`"${DATE_POSTED}"`)}`;
|
||||
if (SORT_BY) url += `&sortBy=${encodeURIComponent(`"${SORT_BY}"`)}`;
|
||||
if (EFFECTIVE_DATE_POSTED)
|
||||
url += `&datePosted=${encodeURIComponent(`"${EFFECTIVE_DATE_POSTED}"`)}`;
|
||||
if (EFFECTIVE_SORT_BY)
|
||||
url += `&sortBy=${encodeURIComponent(`"${EFFECTIVE_SORT_BY}"`)}`;
|
||||
url += `&origin=FACETED_SEARCH`;
|
||||
return url;
|
||||
}
|
||||
@ -95,144 +263,386 @@ function containsAnyKeyword(text, keywords) {
|
||||
return keywords.some((k) => text.toLowerCase().includes(k.toLowerCase()));
|
||||
}
|
||||
|
||||
// Read keywords from CSV
|
||||
const keywords = [];
|
||||
const csvPath = path.join(
|
||||
process.cwd(),
|
||||
process.env.KEYWORDS || "keywords-layoff.csv"
|
||||
);
|
||||
/**
|
||||
* Enhanced profile location validation with smart waiting (no timeouts)
|
||||
* Uses a new tab to avoid disrupting the main scraping flow
|
||||
*/
|
||||
async function validateProfileLocation(
|
||||
context,
|
||||
profileLink,
|
||||
locationFilterString
|
||||
) {
|
||||
if (!locationFilterString || !ENABLE_LOCATION_CHECK || disableLocation) {
|
||||
return {
|
||||
isValid: true,
|
||||
location: "Not checked",
|
||||
matchedFilter: null,
|
||||
reasoning: "Location check disabled",
|
||||
error: null,
|
||||
};
|
||||
}
|
||||
|
||||
fs.createReadStream(csvPath)
|
||||
.pipe(csv())
|
||||
.on("data", (row) => {
|
||||
if (row.keyword) keywords.push(row.keyword.trim());
|
||||
})
|
||||
.on("end", async () => {
|
||||
if (keywords.length === 0) {
|
||||
console.error("No keywords found in csv");
|
||||
process.exit(1);
|
||||
}
|
||||
let profilePage = null;
|
||||
try {
|
||||
console.log(`🌍 Checking profile location: ${profileLink}`);
|
||||
|
||||
// Append additional keyword if provided
|
||||
if (additionalKeyword) {
|
||||
keywords.push(additionalKeyword);
|
||||
console.log(`Added additional keyword from CLI: ${additionalKeyword}`);
|
||||
}
|
||||
|
||||
const browser = await chromium.launch({
|
||||
headless: HEADLESS,
|
||||
args: ["--no-sandbox", "--disable-setuid-sandbox"],
|
||||
});
|
||||
const context = await browser.newContext();
|
||||
const page = await Promise.race([
|
||||
context.newPage(),
|
||||
new Promise((_, reject) =>
|
||||
setTimeout(() => reject(new Error("newPage timeout")), 10000)
|
||||
),
|
||||
]).catch((err) => {
|
||||
console.error("Failed to create new page:", err);
|
||||
process.exit(1);
|
||||
// Create a new page/tab for profile validation
|
||||
profilePage = await context.newPage();
|
||||
await profilePage.goto(profileLink, {
|
||||
waitUntil: "domcontentloaded",
|
||||
timeout: 10000,
|
||||
});
|
||||
|
||||
try {
|
||||
await page.goto("https://www.linkedin.com/login");
|
||||
await page.fill('input[name="session_key"]', LINKEDIN_USERNAME);
|
||||
await page.fill('input[name="session_password"]', LINKEDIN_PASSWORD);
|
||||
await page.click('button[type="submit"]');
|
||||
await page.waitForSelector("img.global-nav__me-photo", {
|
||||
timeout: 10000,
|
||||
});
|
||||
// Always use smart waiting for key profile elements
|
||||
await Promise.race([
|
||||
profilePage.waitForSelector("h1", { timeout: 3000 }),
|
||||
profilePage.waitForSelector("[data-field='experience_section']", {
|
||||
timeout: 3000,
|
||||
}),
|
||||
profilePage.waitForSelector(".pv-text-details__left-panel", {
|
||||
timeout: 3000,
|
||||
}),
|
||||
]);
|
||||
|
||||
const seenPosts = new Set();
|
||||
const seenProfiles = new Set();
|
||||
const results = [];
|
||||
// Use enhanced location extraction
|
||||
const location = await extractLocationFromProfile(profilePage);
|
||||
|
||||
for (const keyword of keywords) {
|
||||
const searchUrl = buildSearchUrl(keyword, CITY);
|
||||
await page.goto(searchUrl, { waitUntil: "load" });
|
||||
if (!location) {
|
||||
return {
|
||||
isValid: false,
|
||||
location: "Location not found",
|
||||
matchedFilter: null,
|
||||
reasoning: "Could not extract location from profile",
|
||||
error: "Location extraction failed",
|
||||
};
|
||||
}
|
||||
|
||||
try {
|
||||
await page.waitForSelector(".feed-shared-update-v2", {
|
||||
timeout: 3000,
|
||||
// Parse location filters
|
||||
const locationFilters = parseLocationFilters(locationFilterString);
|
||||
|
||||
// Validate against filters
|
||||
const validationResult = validateLocationAgainstFilters(
|
||||
location,
|
||||
locationFilters
|
||||
);
|
||||
|
||||
return {
|
||||
isValid: validationResult.isValid,
|
||||
location,
|
||||
matchedFilter: validationResult.matchedFilter,
|
||||
reasoning: validationResult.reasoning,
|
||||
error: validationResult.isValid ? null : validationResult.reasoning,
|
||||
};
|
||||
} catch (error) {
|
||||
console.error(`❌ Error checking profile location: ${error.message}`);
|
||||
return {
|
||||
isValid: false,
|
||||
location: "Error checking location",
|
||||
matchedFilter: null,
|
||||
reasoning: `Error: ${error.message}`,
|
||||
error: error.message,
|
||||
};
|
||||
} finally {
|
||||
// Always close the profile page to clean up
|
||||
if (profilePage) {
|
||||
try {
|
||||
await profilePage.close();
|
||||
} catch (closeError) {
|
||||
console.error(`⚠️ Error closing profile page: ${closeError.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run local AI analysis after scraping is complete
|
||||
*/
|
||||
async function runPostScrapingLocalAI(resultsFile) {
|
||||
if (disableAI || !ENABLE_LOCAL_AI || !runAIAfter) {
|
||||
return;
|
||||
}
|
||||
|
||||
console.log("\n🧠 Starting post-scraping local AI analysis...");
|
||||
|
||||
const analyzerScript = "ai-analyzer-local.js";
|
||||
const args = [`--input=${resultsFile}`, `--context=${AI_CONTEXT}`];
|
||||
|
||||
console.log(`🚀 Running: node ${analyzerScript} ${args.join(" ")}`);
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
const child = spawn("node", [analyzerScript, ...args], {
|
||||
stdio: "inherit",
|
||||
cwd: process.cwd(),
|
||||
});
|
||||
|
||||
child.on("close", (code) => {
|
||||
if (code === 0) {
|
||||
console.log("✅ Local AI analysis completed successfully");
|
||||
resolve();
|
||||
} else {
|
||||
console.error(`❌ Local AI analysis failed with code ${code}`);
|
||||
reject(new Error(`Local AI analysis process exited with code ${code}`));
|
||||
}
|
||||
});
|
||||
|
||||
child.on("error", (error) => {
|
||||
console.error(`❌ Failed to run local AI analysis: ${error.message}`);
|
||||
reject(error);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async function startScraper() {
|
||||
console.log("\n🚀 LinkedOut Scraper Starting...");
|
||||
console.log(`📊 Keywords: ${keywords.length}`);
|
||||
console.log(
|
||||
`🌍 Location Filter: ${
|
||||
ENABLE_LOCATION_CHECK && !disableLocation
|
||||
? LOCATION_FILTER || "None"
|
||||
: "Disabled"
|
||||
}`
|
||||
);
|
||||
console.log(
|
||||
`🧠 Local AI Analysis: ${
|
||||
ENABLE_LOCAL_AI && !disableAI
|
||||
? runAIAfter
|
||||
? "After scraping"
|
||||
: "Manual"
|
||||
: "Disabled"
|
||||
}`
|
||||
);
|
||||
|
||||
const browser = await chromium.launch({
|
||||
headless: HEADLESS,
|
||||
args: ["--no-sandbox", "--disable-setuid-sandbox"],
|
||||
});
|
||||
const context = await browser.newContext();
|
||||
const page = await Promise.race([
|
||||
context.newPage(),
|
||||
new Promise((_, reject) =>
|
||||
setTimeout(() => reject(new Error("newPage timeout")), 10000)
|
||||
),
|
||||
]).catch((err) => {
|
||||
console.error("Failed to create new page:", err);
|
||||
process.exit(1);
|
||||
});
|
||||
|
||||
let scrapeError = null;
|
||||
try {
|
||||
await page.goto("https://www.linkedin.com/login");
|
||||
await page.fill('input[name="session_key"]', LINKEDIN_USERNAME);
|
||||
await page.fill('input[name="session_password"]', LINKEDIN_PASSWORD);
|
||||
await page.click('button[type="submit"]');
|
||||
await page.waitForSelector("img.global-nav__me-photo", {
|
||||
timeout: 15000,
|
||||
});
|
||||
|
||||
const seenPosts = new Set();
|
||||
const seenProfiles = new Set();
|
||||
const results = [];
|
||||
const rejectedResults = [];
|
||||
|
||||
for (const keyword of keywords) {
|
||||
const searchUrl = buildSearchUrl(keyword, EFFECTIVE_CITY);
|
||||
await page.goto(searchUrl, { waitUntil: "load" });
|
||||
|
||||
try {
|
||||
await page.waitForSelector(".feed-shared-update-v2", {
|
||||
timeout: 3000,
|
||||
});
|
||||
} catch (error) {
|
||||
console.log(
|
||||
`---\nNo posts found for keyword: ${keyword}\nCity: ${EFFECTIVE_CITY}\nDate posted: ${EFFECTIVE_DATE_POSTED}\nSort by: ${EFFECTIVE_SORT_BY}`
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (let i = 0; i < WHEELS; i++) {
|
||||
await page.mouse.wheel(0, 1000);
|
||||
await page.waitForTimeout(1000);
|
||||
}
|
||||
|
||||
const postContainers = await page.$$(".feed-shared-update-v2");
|
||||
for (const container of postContainers) {
|
||||
let text = "";
|
||||
const textHandle = await container.$(
|
||||
"div.update-components-text, span.break-words"
|
||||
);
|
||||
if (textHandle) {
|
||||
text = (await textHandle.textContent()) || "";
|
||||
text = cleanText(text);
|
||||
}
|
||||
if (
|
||||
!text ||
|
||||
seenPosts.has(text) ||
|
||||
text.length < 30 ||
|
||||
!/[a-zA-Z0-9]/.test(text)
|
||||
) {
|
||||
rejectedResults.push({
|
||||
rejected: true,
|
||||
reason: !text
|
||||
? "No text"
|
||||
: seenPosts.has(text)
|
||||
? "Duplicate post"
|
||||
: text.length < 30
|
||||
? "Text too short"
|
||||
: "No alphanumeric content",
|
||||
keyword,
|
||||
text,
|
||||
profileLink: null,
|
||||
timestamp: new Date().toISOString(),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
seenPosts.add(text);
|
||||
|
||||
let profileLink = "";
|
||||
const profileLinkElement = await container.$('a[href*="/in/"]');
|
||||
if (profileLinkElement) {
|
||||
profileLink = await profileLinkElement.getAttribute("href");
|
||||
if (profileLink && !profileLink.startsWith("http")) {
|
||||
profileLink = `https://www.linkedin.com${profileLink}`;
|
||||
}
|
||||
profileLink = profileLink.split("?")[0];
|
||||
}
|
||||
|
||||
if (!profileLink || seenProfiles.has(profileLink)) {
|
||||
rejectedResults.push({
|
||||
rejected: true,
|
||||
reason: !profileLink ? "No profile link" : "Duplicate profile",
|
||||
keyword,
|
||||
text,
|
||||
profileLink,
|
||||
timestamp: new Date().toISOString(),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
seenProfiles.add(profileLink);
|
||||
|
||||
// Double-check keyword presence
|
||||
if (!containsAnyKeyword(text, keywords)) {
|
||||
rejectedResults.push({
|
||||
rejected: true,
|
||||
reason: "Keyword not present",
|
||||
keyword,
|
||||
text,
|
||||
profileLink,
|
||||
timestamp: new Date().toISOString(),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
console.log("---");
|
||||
console.log("Keyword:", keyword);
|
||||
console.log("Post:", text.substring(0, 100) + "...");
|
||||
console.log("Profile:", profileLink);
|
||||
|
||||
// Enhanced location validation
|
||||
const locationCheck = await validateProfileLocation(
|
||||
context,
|
||||
profileLink,
|
||||
EFFECTIVE_LOCATION_FILTER
|
||||
);
|
||||
console.log("📍 Location:", locationCheck.location);
|
||||
console.log("🎯 Match:", locationCheck.reasoning);
|
||||
|
||||
if (!locationCheck.isValid) {
|
||||
rejectedResults.push({
|
||||
rejected: true,
|
||||
reason: `Location filter failed: ${locationCheck.error}`,
|
||||
keyword,
|
||||
text,
|
||||
profileLink,
|
||||
location: locationCheck.location,
|
||||
locationReasoning: locationCheck.reasoning,
|
||||
timestamp: new Date().toISOString(),
|
||||
});
|
||||
} catch (error) {
|
||||
console.log(
|
||||
`---\nNo posts found for keyword: ${keyword}\nDate posted: ${DATE_POSTED}\nSort by: ${SORT_BY}`
|
||||
"❌ Skipping - Location filter failed:",
|
||||
locationCheck.error
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (let i = 0; i < WHEELS; i++) {
|
||||
await page.mouse.wheel(0, 1000);
|
||||
await page.waitForTimeout(1000);
|
||||
}
|
||||
console.log("✅ Post passed all filters");
|
||||
|
||||
const postContainers = await page.$$(".feed-shared-update-v2");
|
||||
for (const container of postContainers) {
|
||||
let text = "";
|
||||
const textHandle = await container.$(
|
||||
"div.update-components-text, span.break-words"
|
||||
);
|
||||
if (textHandle) {
|
||||
text = (await textHandle.textContent()) || "";
|
||||
text = cleanText(text);
|
||||
}
|
||||
if (
|
||||
!text ||
|
||||
seenPosts.has(text) ||
|
||||
text.length < 30 ||
|
||||
!/[a-zA-Z0-9]/.test(text)
|
||||
)
|
||||
continue;
|
||||
seenPosts.add(text);
|
||||
|
||||
let profileLink = "";
|
||||
const profileLinkElement = await container.$('a[href*="/in/"]');
|
||||
if (profileLinkElement) {
|
||||
profileLink = await profileLinkElement.getAttribute("href");
|
||||
if (profileLink && !profileLink.startsWith("http")) {
|
||||
profileLink = `https://www.linkedin.com${profileLink}`;
|
||||
}
|
||||
profileLink = profileLink.split("?")[0];
|
||||
}
|
||||
|
||||
if (!profileLink || seenProfiles.has(profileLink)) continue;
|
||||
seenProfiles.add(profileLink);
|
||||
|
||||
// Double-check keyword presence
|
||||
if (!containsAnyKeyword(text, keywords)) continue;
|
||||
|
||||
console.log("---");
|
||||
console.log("Keyword:", keyword);
|
||||
console.log("Post:", text);
|
||||
console.log("Profile:", profileLink);
|
||||
|
||||
results.push({
|
||||
keyword,
|
||||
text,
|
||||
profileLink,
|
||||
});
|
||||
}
|
||||
results.push({
|
||||
keyword,
|
||||
text,
|
||||
profileLink,
|
||||
location: locationCheck.location,
|
||||
locationValid: locationCheck.isValid,
|
||||
locationMatchedFilter: locationCheck.matchedFilter,
|
||||
locationReasoning: locationCheck.reasoning,
|
||||
timestamp: new Date().toLocaleString("en-CA", {
|
||||
year: "numeric",
|
||||
month: "2-digit",
|
||||
day: "2-digit",
|
||||
hour: "2-digit",
|
||||
minute: "2-digit",
|
||||
second: "2-digit",
|
||||
hour12: false,
|
||||
}),
|
||||
aiProcessed: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
const now = new Date();
|
||||
const timestamp = `${now.getFullYear()}-${String(
|
||||
now.getMonth() + 1
|
||||
).padStart(2, "0")}-${String(now.getDate()).padStart(2, "0")}-${String(
|
||||
const now = new Date();
|
||||
const timestamp =
|
||||
cliOutput ||
|
||||
`${now.getFullYear()}-${String(now.getMonth() + 1).padStart(
|
||||
2,
|
||||
"0"
|
||||
)}-${String(now.getDate()).padStart(2, "0")}-${String(
|
||||
now.getHours()
|
||||
).padStart(2, "0")}-${String(now.getMinutes()).padStart(2, "0")}`;
|
||||
const resultsDir = "results";
|
||||
const resultsFile = `${resultsDir}/results-${timestamp}.json`;
|
||||
const resultsDir = "results";
|
||||
const resultsFile = `${resultsDir}/results-${timestamp}.json`;
|
||||
const rejectedFile = `${resultsDir}/results-${timestamp}-rejected.json`;
|
||||
|
||||
if (!fs.existsSync(resultsDir)) {
|
||||
fs.mkdirSync(resultsDir);
|
||||
}
|
||||
|
||||
fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2), "utf-8");
|
||||
console.log(`Saved ${results.length} posts to ${resultsFile}`);
|
||||
} catch (err) {
|
||||
console.error("Error:", err);
|
||||
} finally {
|
||||
await browser.close();
|
||||
if (!fs.existsSync(resultsDir)) {
|
||||
fs.mkdirSync(resultsDir);
|
||||
}
|
||||
});
|
||||
|
||||
fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2), "utf-8");
|
||||
fs.writeFileSync(
|
||||
rejectedFile,
|
||||
JSON.stringify(rejectedResults, null, 2),
|
||||
"utf-8"
|
||||
);
|
||||
console.log(`\n🎉 Scraping Complete!`);
|
||||
console.log(`📊 Saved ${results.length} posts to ${resultsFile}`);
|
||||
console.log(
|
||||
`📋 Saved ${rejectedResults.length} rejected posts to ${rejectedFile}`
|
||||
);
|
||||
|
||||
// Run local AI analysis if requested
|
||||
if (runAIAfter && results.length > 0 && !scrapeError) {
|
||||
try {
|
||||
await runPostScrapingLocalAI(resultsFile);
|
||||
} catch (error) {
|
||||
console.error(
|
||||
"⚠️ Local AI analysis failed, but scraping completed successfully"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\n💡 Next steps:`);
|
||||
console.log(` 📋 Review results in ${resultsFile}`);
|
||||
if (!runAIAfter && !disableAI) {
|
||||
console.log(` 🧠 Local AI Analysis:`);
|
||||
console.log(` node ai-analyzer-local.js --context="${AI_CONTEXT}"`);
|
||||
console.log(
|
||||
` node ai-analyzer-local.js --input=${resultsFile} --context="your context"`
|
||||
);
|
||||
}
|
||||
} catch (err) {
|
||||
scrapeError = err;
|
||||
console.error("Error:", err);
|
||||
} finally {
|
||||
await browser.close();
|
||||
}
|
||||
}
|
||||
|
||||
loadKeywordsAndStart();
|
||||
|
||||
1126
location-utils.js
Normal file
1126
location-utils.js
Normal file
File diff suppressed because it is too large
Load Diff
16
package-lock.json
generated
16
package-lock.json
generated
@ -11,7 +11,7 @@
|
||||
"dependencies": {
|
||||
"csv-parser": "^3.2.0",
|
||||
"dotenv": "^17.0.0",
|
||||
"playwright": "^1.53.1"
|
||||
"playwright": "^1.53.2"
|
||||
}
|
||||
},
|
||||
"node_modules/csv-parser": {
|
||||
@ -53,12 +53,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/playwright": {
|
||||
"version": "1.53.1",
|
||||
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.53.1.tgz",
|
||||
"integrity": "sha512-LJ13YLr/ocweuwxyGf1XNFWIU4M2zUSo149Qbp+A4cpwDjsxRPj7k6H25LBrEHiEwxvRbD8HdwvQmRMSvquhYw==",
|
||||
"version": "1.53.2",
|
||||
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.53.2.tgz",
|
||||
"integrity": "sha512-6K/qQxVFuVQhRQhFsVZ9fGeatxirtrpPgxzBYWyZLEXJzqYwuL4fuNmfOfD5et1tJE4GScKyPNeLhZeRwuTU3A==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"playwright-core": "1.53.1"
|
||||
"playwright-core": "1.53.2"
|
||||
},
|
||||
"bin": {
|
||||
"playwright": "cli.js"
|
||||
@ -71,9 +71,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/playwright-core": {
|
||||
"version": "1.53.1",
|
||||
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.53.1.tgz",
|
||||
"integrity": "sha512-Z46Oq7tLAyT0lGoFx4DOuB1IA9D1TPj0QkYxpPVUnGDqHHvDpCftu1J2hM2PiWsNMoZh8+LQaarAWcDfPBc6zg==",
|
||||
"version": "1.53.2",
|
||||
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.53.2.tgz",
|
||||
"integrity": "sha512-ox/OytMy+2w1jcYEYlOo1Hhp8hZkLCximMTUTMBXjGUA1KoFfiSZ+DU+3a739jsPY0yoKH2TFy9S2fsJas8yAw==",
|
||||
"license": "Apache-2.0",
|
||||
"bin": {
|
||||
"playwright-core": "cli.js"
|
||||
|
||||
@ -13,6 +13,6 @@
|
||||
"dependencies": {
|
||||
"csv-parser": "^3.2.0",
|
||||
"dotenv": "^17.0.0",
|
||||
"playwright": "^1.53.1"
|
||||
"playwright": "^1.53.2"
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,19 +1,19 @@
|
||||
console.log("START!");
|
||||
|
||||
const { chromium } = require("playwright");
|
||||
(async () => {
|
||||
console.log("browser!");
|
||||
|
||||
const browser = await chromium.launch({
|
||||
headless: true,
|
||||
args: ["--no-sandbox", "--disable-setuid-sandbox"],
|
||||
});
|
||||
console.log("new page!");
|
||||
|
||||
const page = await browser.newPage();
|
||||
console.log("GOTO!");
|
||||
|
||||
await page.goto("https://example.com");
|
||||
console.log("Success!");
|
||||
await browser.close();
|
||||
})();
|
||||
console.log("START!");
|
||||
|
||||
const { chromium } = require("playwright");
|
||||
(async () => {
|
||||
console.log("browser!");
|
||||
|
||||
const browser = await chromium.launch({
|
||||
headless: true,
|
||||
args: ["--no-sandbox", "--disable-setuid-sandbox"],
|
||||
});
|
||||
console.log("new page!");
|
||||
|
||||
const page = await browser.newPage();
|
||||
console.log("GOTO!");
|
||||
|
||||
await page.goto("https://example.com");
|
||||
console.log("Success!");
|
||||
await browser.close();
|
||||
})();
|
||||
Loading…
x
Reference in New Issue
Block a user