/**
|
|
* LinkedIn Posts Scraper (LinkedOut)
|
|
*
|
|
* A comprehensive tool for scraping LinkedIn posts based on keyword searches.
|
|
* Designed to track job market trends, layoffs, and open work opportunities
|
|
* by monitoring LinkedIn content automatically.
|
|
*
|
|
* FEATURES:
|
|
* - Automated LinkedIn login with browser automation
|
|
* - Keyword-based post searching from CSV files or CLI
|
|
* - Configurable search parameters (date, location, sorting)
|
|
* - Duplicate detection for posts and profiles
|
|
* - Text cleaning (removes hashtags, URLs, emojis)
|
|
* - Timestamped JSON output files
|
|
* - Command-line parameter overrides (see below)
|
|
* - Enhanced geographic location validation
|
|
* - Optional local AI-powered context analysis (Ollama)
|
|
*
|
|
* USAGE:
|
|
* node linkedout.js [options]
|
|
*
|
|
* COMMAND-LINE OPTIONS:
|
|
* --headless=true|false Override browser headless mode
|
|
* --keyword="kw1,kw2" Use only these keywords (comma-separated, overrides CSV)
|
|
* --add-keyword="kw1,kw2" Add extra keywords to CSV/CLI list
|
|
* --city="CityName" Override city
|
|
* --date_posted=VALUE Override date posted (past-24h, past-week, past-month, or empty)
|
|
* --sort_by=VALUE Override sort by (date_posted or relevance)
|
|
* --location_filter=VALUE Override location filter
|
|
* --output=FILE Output file name
|
|
* --no-location Disable location filtering
|
|
* --no-ai Disable AI analysis
|
|
* --ai-after Run local AI analysis after scraping
|
|
* --help, -h Show this help message
|
|
*
|
|
* EXAMPLES:
|
|
* node linkedout.js # Standard scraping
|
|
* node linkedout.js --headless=false # Visual mode
|
|
* node linkedout.js --keyword="layoff,downsizing" # Only these keywords
|
|
* node linkedout.js --add-keyword="hiring freeze" # Add extra keyword(s)
|
|
* node linkedout.js --city="Vancouver" --date_posted=past-month
|
|
* node linkedout.js --output=results/myfile.json
|
|
* node linkedout.js --no-location --no-ai # Fastest, no filters
|
|
* node linkedout.js --ai-after # Run AI after scraping
|
|
*
|
|
* POST-PROCESSING AI ANALYSIS:
|
|
* node ai-analyzer-local.js --context="job layoffs" # Run on latest results file
|
|
* node ai-analyzer-local.js --input=results/results-2024-01-15.json --context="hiring"
|
|
*
|
|
* ENVIRONMENT VARIABLES (.env file):
|
|
* KEYWORDS=keywords-layoff.csv (filename only, always looks in keywords/ folder unless path is given)
|
|
* See README for full list.
|
|
*
|
|
* OUTPUT:
|
|
* - Saves to results/results-YYYY-MM-DD-HH-MM.json (or as specified by --output)
|
|
* - Enhanced format with optional location validation and local AI analysis
|
|
*
|
|
* KEYWORD FILES:
|
|
* - Place all keyword CSVs in the keywords/ folder
|
|
* - keywords-layoff.csv: 33+ layoff-related terms
|
|
* - keywords-open-work.csv: Terms for finding people open to work
|
|
* - Custom CSV format: header "keyword" with one keyword per line
|
|
*
|
|
* DEPENDENCIES:
|
|
* - playwright: Browser automation
|
|
* - dotenv: Environment variable management
|
|
* - csv-parser: CSV file parsing
|
|
* - Node.js built-ins: fs, path, child_process
|
|
*
|
|
* SECURITY & LEGAL:
|
|
* - Store credentials securely in .env file
|
|
* - Respect LinkedIn's Terms of Service
|
|
* - Use responsibly for educational/research purposes
|
|
* - Consider rate limiting and LinkedIn API for production use
|
|
*/
|
|
//process.env.PLAYWRIGHT_BROWSERS_PATH = "0";
// Suppress D-Bus notification errors in WSL
// (these env vars must be set before Playwright launches the browser).
process.env.NO_AT_BRIDGE = "1";
process.env.DBUS_SESSION_BUS_ADDRESS = "/dev/null";

// Browser automation, env-file loading, CSV parsing, and child processes.
const { chromium } = require("playwright");
const fs = require("fs");
const path = require("path");
require("dotenv").config();
const csv = require("csv-parser");
const { spawn } = require("child_process");
|
|
|
|
// Core configuration (each value can be overridden via .env; see file header)
const DATE_POSTED = process.env.DATE_POSTED || "past-week";
const SORT_BY = process.env.SORT_BY || "date_posted";
// Number of mouse-wheel scrolls per results page. Radix 10 is passed
// explicitly (best practice); a missing/invalid value falls back to 5.
const WHEELS = Number.parseInt(process.env.WHEELS, 10) || 5;
const CITY = process.env.CITY || "Toronto";

// Location filtering configuration
const LOCATION_FILTER = process.env.LOCATION_FILTER || "";
const ENABLE_LOCATION_CHECK = process.env.ENABLE_LOCATION_CHECK === "true";

// Local AI analysis configuration (Ollama-based post-processing)
const ENABLE_LOCAL_AI = process.env.ENABLE_LOCAL_AI === "true";
const RUN_LOCAL_AI_AFTER_SCRAPING =
  process.env.RUN_LOCAL_AI_AFTER_SCRAPING === "true";
const AI_CONTEXT =
  process.env.AI_CONTEXT || "job layoffs and workforce reduction";

// Import enhanced location utilities
const {
  parseLocationFilters,
  validateLocationAgainstFilters,
  extractLocationFromProfile,
} = require("./location-utils");

// Read credentials (validated later, before the scraper starts)
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
// `let` because --headless=... may overwrite it during CLI parsing below.
let HEADLESS = process.env.HEADLESS === "true";
|
|
|
|
// ---------------------------------------------------------------------------
// Command-line argument parsing
// ---------------------------------------------------------------------------
const args = process.argv.slice(2);
let cliKeywords = null; // If set, only use these (overrides the CSV list)
let additionalKeywords = []; // Appended to whichever keyword source wins
let disableLocation = false; // --no-location
let disableAI = false; // --no-ai
let runAIAfter = RUN_LOCAL_AI_AFTER_SCRAPING; // --ai-after can force true
let cliCity = null;
let cliDatePosted = null;
let cliSortBy = null;
let cliLocationFilter = null;
let cliOutput = null;
let showHelp = false;

// Return everything after the FIRST "=" so option values may themselves
// contain "=". (The previous split("=")[1] silently truncated such values,
// e.g. --output=a=b.json became "a".)
const argValue = (arg) => arg.slice(arg.indexOf("=") + 1);

// Split a comma-separated list, trimming entries and dropping empties.
const splitList = (value) =>
  value
    .split(",")
    .map((k) => k.trim())
    .filter(Boolean);

for (const arg of args) {
  if (arg.startsWith("--headless=")) {
    HEADLESS = argValue(arg).toLowerCase() === "true";
  } else if (arg.startsWith("--keyword=")) {
    cliKeywords = splitList(argValue(arg));
  } else if (arg.startsWith("--add-keyword=")) {
    additionalKeywords = additionalKeywords.concat(splitList(argValue(arg)));
  } else if (arg === "--no-location") {
    disableLocation = true;
  } else if (arg === "--no-ai") {
    disableAI = true;
  } else if (arg === "--ai-after") {
    runAIAfter = true;
  } else if (arg.startsWith("--city=")) {
    cliCity = argValue(arg);
  } else if (arg.startsWith("--date_posted=")) {
    cliDatePosted = argValue(arg);
  } else if (arg.startsWith("--sort_by=")) {
    cliSortBy = argValue(arg);
  } else if (arg.startsWith("--location_filter=")) {
    cliLocationFilter = argValue(arg);
  } else if (arg.startsWith("--output=")) {
    cliOutput = argValue(arg);
  } else if (arg === "--help" || arg === "-h") {
    showHelp = true;
  }
}

if (showHelp) {
  console.log(
    `\nLinkedOut - LinkedIn Posts Scraper\n\nUsage: node linkedout.js [options]\n\nOptions:\n --headless=true|false Override browser headless mode\n --keyword="kw1,kw2" Use only these keywords (comma-separated, overrides CSV)\n --add-keyword="kw1,kw2" Add extra keywords to CSV list\n --city="CityName" Override city\n --date_posted=VALUE Override date posted (past-24h, past-week, past-month or '')\n --sort_by=VALUE Override sort by (date_posted or relevance)\n --location_filter=VALUE Override location filter\n --output=FILE Output file name\n --no-location Disable location filtering\n --no-ai Disable AI analysis\n --ai-after Run local AI analysis after scraping\n --help, -h Show this help message\n\nExamples:\n node linkedout.js --keyword="layoff,downsizing"\n node linkedout.js --add-keyword="hiring freeze"\n node linkedout.js --city="Vancouver" --date_posted=past-month\n node linkedout.js --output=results/myfile.json\n`
  );
  process.exit(0);
}
|
|
|
|
// CLI-supplied values take precedence over environment-derived defaults.
const EFFECTIVE_CITY = cliCity || CITY;
const EFFECTIVE_DATE_POSTED = cliDatePosted || DATE_POSTED;
const EFFECTIVE_SORT_BY = cliSortBy || SORT_BY;
const EFFECTIVE_LOCATION_FILTER = cliLocationFilter || LOCATION_FILTER;

// Keyword source: a CSV named by KEYWORDS. A bare filename is resolved
// inside the keywords/ folder; a value containing "/" is used as given.
const keywords = [];
const keywordEnv = process.env.KEYWORDS || "keywords-layoff.csv";
const csvRelativePath = keywordEnv.includes("/")
  ? keywordEnv
  : `keywords/${keywordEnv}`;
const csvPath = path.join(process.cwd(), csvRelativePath);
|
|
|
|
/**
 * Populate the module-level `keywords` list — from --keyword when given,
 * otherwise from the configured CSV file — append any --add-keyword extras,
 * then start the scraper. Exits the process when no keywords can be loaded
 * or the CSV file cannot be read.
 */
function loadKeywordsAndStart() {
  if (cliKeywords) {
    // CLI keywords replace the CSV entirely; --add-keyword still appends.
    keywords.push(...cliKeywords);
    if (additionalKeywords.length > 0) {
      keywords.push(...additionalKeywords);
    }
    startScraper();
  } else {
    // Load from CSV, then add any additional keywords
    fs.createReadStream(csvPath)
      .on("error", (err) => {
        // Without this handler a missing/unreadable CSV crashes the process
        // with an unhandled 'error' event instead of a readable message.
        console.error(
          `Failed to read keywords file ${csvPath}: ${err.message}`
        );
        process.exit(1);
      })
      .pipe(csv())
      .on("data", (row) => {
        if (row.keyword) keywords.push(row.keyword.trim());
      })
      .on("end", () => {
        if (keywords.length === 0) {
          console.error("No keywords found in csv");
          process.exit(1);
        }
        if (additionalKeywords.length > 0) {
          keywords.push(...additionalKeywords);
          console.log(
            `Added additional keywords: ${additionalKeywords.join(", ")}`
          );
        }
        startScraper();
      });
  }
}
|
|
|
|
// Fail fast: the scraper cannot log in without both credentials present.
if (!(LINKEDIN_USERNAME && LINKEDIN_PASSWORD)) {
  throw new Error("Missing LinkedIn credentials in .env file.");
}
|
|
|
|
/**
 * Normalize scraped post text: strip hashtags, LinkedIn "hashtag" artifacts,
 * URLs, and common emoji, then collapse whitespace.
 *
 * @param {string} text - Raw text extracted from a post element.
 * @returns {string} Cleaned, single-spaced, trimmed text (may be "").
 */
function cleanText(text) {
  // BUG FIX: "hashtag-foo" artifacts must be removed BEFORE the bare word
  // "hashtag" — previously \bhashtag\b ran first, consumed the "hashtag"
  // prefix, and stranded "-foo" in the output (leaving the hashtag-\w+
  // pattern as dead code).
  let cleaned = text.replace(/hashtag-\w+/gi, "");
  cleaned = cleaned.replace(/#\w+/g, "");
  cleaned = cleaned.replace(/\bhashtag\b/gi, "");
  // Drop bare URLs.
  cleaned = cleaned.replace(/https?:\/\/[^\s]+/g, "");
  // Remove common emoji blocks (emoticons, symbols, transport, flags).
  cleaned = cleaned.replace(
    /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
    ""
  );
  cleaned = cleaned.replace(/\s+/g, " ").trim();
  return cleaned;
}
|
|
|
|
/**
 * Build a LinkedIn content-search URL for a keyword + city query.
 *
 * Generalized (backward-compatibly) so the date/sort facets can be passed
 * explicitly; omitted arguments fall back to the module-level effective
 * configuration, preserving the original call sites.
 *
 * @param {string} keyword - Search keyword.
 * @param {string} city - City appended to the keyword query.
 * @param {string} [datePosted] - Date facet (e.g. "past-week"); falsy skips it.
 * @param {string} [sortBy] - Sort facet ("date_posted" or "relevance"); falsy skips it.
 * @returns {string} Fully encoded search URL.
 */
function buildSearchUrl(
  keyword,
  city,
  datePosted = EFFECTIVE_DATE_POSTED,
  sortBy = EFFECTIVE_SORT_BY
) {
  let url = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
    keyword + " " + city
  )}`;
  // LinkedIn expects facet values wrapped in (URL-encoded) double quotes.
  if (datePosted)
    url += `&datePosted=${encodeURIComponent(`"${datePosted}"`)}`;
  if (sortBy) url += `&sortBy=${encodeURIComponent(`"${sortBy}"`)}`;
  url += `&origin=FACETED_SEARCH`;
  return url;
}
|
|
|
|
/**
 * Case-insensitive check: does `text` contain at least one of `keywords`?
 *
 * @param {string} text - Text to search.
 * @param {string[]} keywords - Candidate substrings.
 * @returns {boolean} True on the first match; false for an empty list.
 */
function containsAnyKeyword(text, keywords) {
  const haystack = text.toLowerCase();
  for (const keyword of keywords) {
    if (haystack.includes(keyword.toLowerCase())) {
      return true;
    }
  }
  return false;
}
|
|
|
|
/**
 * Validate a poster's profile location against the configured location
 * filters, using a dedicated browser tab so the main scraping page is
 * never disturbed. Waits for key profile elements (with short timeouts)
 * before extracting the location.
 *
 * @param {object} context - Playwright BrowserContext used to open the profile tab.
 * @param {string} profileLink - Absolute URL of the LinkedIn profile to check.
 * @param {string} locationFilterString - Raw filter string; parsed by parseLocationFilters.
 * @returns {Promise<{isValid: boolean, location: string, matchedFilter: ?string,
 *   reasoning: string, error: ?string}>} Verdict object. When filtering is
 *   disabled (empty filter, ENABLE_LOCATION_CHECK off, or --no-location),
 *   returns isValid=true immediately without any network activity.
 */
async function validateProfileLocation(
  context,
  profileLink,
  locationFilterString
) {
  // Fast path: filtering disabled — accept the post without opening a tab.
  if (!locationFilterString || !ENABLE_LOCATION_CHECK || disableLocation) {
    return {
      isValid: true,
      location: "Not checked",
      matchedFilter: null,
      reasoning: "Location check disabled",
      error: null,
    };
  }

  let profilePage = null;
  try {
    console.log(`🌍 Checking profile location: ${profileLink}`);

    // Create a new page/tab for profile validation
    profilePage = await context.newPage();
    await profilePage.goto(profileLink, {
      waitUntil: "domcontentloaded",
      timeout: 10000,
    });

    // Wait for whichever key profile element renders first; if none appears
    // within 3s the race rejects and we fall through to the catch below.
    await Promise.race([
      profilePage.waitForSelector("h1", { timeout: 3000 }),
      profilePage.waitForSelector("[data-field='experience_section']", {
        timeout: 3000,
      }),
      profilePage.waitForSelector(".pv-text-details__left-panel", {
        timeout: 3000,
      }),
    ]);

    // Use enhanced location extraction (delegated to location-utils)
    const location = await extractLocationFromProfile(profilePage);

    if (!location) {
      // Treat an unextractable location as a filter failure, not an accept.
      return {
        isValid: false,
        location: "Location not found",
        matchedFilter: null,
        reasoning: "Could not extract location from profile",
        error: "Location extraction failed",
      };
    }

    // Parse location filters
    const locationFilters = parseLocationFilters(locationFilterString);

    // Validate against filters
    const validationResult = validateLocationAgainstFilters(
      location,
      locationFilters
    );

    return {
      isValid: validationResult.isValid,
      location,
      matchedFilter: validationResult.matchedFilter,
      reasoning: validationResult.reasoning,
      // `error` doubles as the rejection reason surfaced to the caller.
      error: validationResult.isValid ? null : validationResult.reasoning,
    };
  } catch (error) {
    // Navigation/selector timeouts and extraction failures all land here;
    // the post is rejected rather than crashing the scrape loop.
    console.error(`❌ Error checking profile location: ${error.message}`);
    return {
      isValid: false,
      location: "Error checking location",
      matchedFilter: null,
      reasoning: `Error: ${error.message}`,
      error: error.message,
    };
  } finally {
    // Always close the profile page to clean up
    if (profilePage) {
      try {
        await profilePage.close();
      } catch (closeError) {
        console.error(`⚠️ Error closing profile page: ${closeError.message}`);
      }
    }
  }
}
|
|
|
|
/**
 * Run the local AI analyzer (ai-analyzer-local.js) as a child process
 * against the freshly written results file.
 *
 * No-op unless local AI is enabled and post-scrape analysis was requested.
 * Resolves when the analyzer exits with code 0; rejects on a non-zero exit
 * or when the process fails to spawn.
 *
 * @param {string} resultsFile - Path to the results JSON to analyze.
 * @returns {Promise<void>|undefined}
 */
async function runPostScrapingLocalAI(resultsFile) {
  const aiRequested = !disableAI && ENABLE_LOCAL_AI && runAIAfter;
  if (!aiRequested) {
    return;
  }

  console.log("\n🧠 Starting post-scraping local AI analysis...");

  const analyzerScript = "ai-analyzer-local.js";
  const args = [`--input=${resultsFile}`, `--context=${AI_CONTEXT}`];
  console.log(`🚀 Running: node ${analyzerScript} ${args.join(" ")}`);

  return new Promise((resolve, reject) => {
    const proc = spawn("node", [analyzerScript, ...args], {
      stdio: "inherit",
      cwd: process.cwd(),
    });

    // Spawn failures (e.g. node not found) never emit "close" with 0.
    proc.on("error", (error) => {
      console.error(`❌ Failed to run local AI analysis: ${error.message}`);
      reject(error);
    });

    proc.on("close", (code) => {
      if (code !== 0) {
        console.error(`❌ Local AI analysis failed with code ${code}`);
        reject(
          new Error(`Local AI analysis process exited with code ${code}`)
        );
        return;
      }
      console.log("✅ Local AI analysis completed successfully");
      resolve();
    });
  });
}
|
|
|
|
/**
 * Main workflow: log in to LinkedIn, run a content search for every keyword,
 * filter and deduplicate matching posts, optionally validate each poster's
 * location, write accepted and rejected posts to JSON files, and optionally
 * run the local AI analyzer on the results.
 *
 * Reads module-level config (keywords, EFFECTIVE_* values, flags) and has
 * filesystem/network side effects; no return value.
 */
async function startScraper() {
  console.log("\n🚀 LinkedOut Scraper Starting...");
  console.log(`📊 Keywords: ${keywords.length}`);
  console.log(
    `🌍 Location Filter: ${
      ENABLE_LOCATION_CHECK && !disableLocation
        ? LOCATION_FILTER || "None"
        : "Disabled"
    }`
  );
  console.log(
    `🧠 Local AI Analysis: ${
      ENABLE_LOCAL_AI && !disableAI
        ? runAIAfter
          ? "After scraping"
          : "Manual"
        : "Disabled"
    }`
  );

  const browser = await chromium.launch({
    headless: HEADLESS,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  const context = await browser.newContext();
  // Guard against a hung newPage() call (observed on some WSL setups).
  const page = await Promise.race([
    context.newPage(),
    new Promise((_, reject) =>
      setTimeout(() => reject(new Error("newPage timeout")), 10000)
    ),
  ]).catch((err) => {
    console.error("Failed to create new page:", err);
    process.exit(1);
  });

  let scrapeError = null;
  try {
    // --- Login ---
    await page.goto("https://www.linkedin.com/login");
    await page.fill('input[name="session_key"]', LINKEDIN_USERNAME);
    await page.fill('input[name="session_password"]', LINKEDIN_PASSWORD);
    await page.click('button[type="submit"]');
    // The nav-bar avatar only renders after a successful login.
    await page.waitForSelector("img.global-nav__me-photo", {
      timeout: 15000,
    });

    const seenPosts = new Set();
    const seenProfiles = new Set();
    const results = [];
    const rejectedResults = [];

    // --- Search each keyword ---
    for (const keyword of keywords) {
      const searchUrl = buildSearchUrl(keyword, EFFECTIVE_CITY);
      await page.goto(searchUrl, { waitUntil: "load" });

      try {
        await page.waitForSelector(".feed-shared-update-v2", {
          timeout: 3000,
        });
      } catch (error) {
        console.log(
          `---\nNo posts found for keyword: ${keyword}\nCity: ${EFFECTIVE_CITY}\nDate posted: ${EFFECTIVE_DATE_POSTED}\nSort by: ${EFFECTIVE_SORT_BY}`
        );
        continue;
      }

      // Scroll to trigger lazy-loading of additional posts.
      for (let i = 0; i < WHEELS; i++) {
        await page.mouse.wheel(0, 1000);
        await page.waitForTimeout(1000);
      }

      const postContainers = await page.$$(".feed-shared-update-v2");
      for (const container of postContainers) {
        let text = "";
        const textHandle = await container.$(
          "div.update-components-text, span.break-words"
        );
        if (textHandle) {
          text = (await textHandle.textContent()) || "";
          text = cleanText(text);
        }
        // Reject empty, duplicate, too-short, or symbol-only posts.
        if (
          !text ||
          seenPosts.has(text) ||
          text.length < 30 ||
          !/[a-zA-Z0-9]/.test(text)
        ) {
          rejectedResults.push({
            rejected: true,
            reason: !text
              ? "No text"
              : seenPosts.has(text)
              ? "Duplicate post"
              : text.length < 30
              ? "Text too short"
              : "No alphanumeric content",
            keyword,
            text,
            profileLink: null,
            timestamp: new Date().toISOString(),
          });
          continue;
        }
        seenPosts.add(text);

        let profileLink = "";
        const profileLinkElement = await container.$('a[href*="/in/"]');
        if (profileLinkElement) {
          // BUG FIX: getAttribute can return null, which previously made
          // profileLink.split("?") throw; normalize to "" first.
          profileLink = (await profileLinkElement.getAttribute("href")) || "";
          if (profileLink && !profileLink.startsWith("http")) {
            profileLink = `https://www.linkedin.com${profileLink}`;
          }
          // Strip tracking query parameters for stable deduplication.
          profileLink = profileLink.split("?")[0];
        }

        if (!profileLink || seenProfiles.has(profileLink)) {
          rejectedResults.push({
            rejected: true,
            reason: !profileLink ? "No profile link" : "Duplicate profile",
            keyword,
            text,
            profileLink,
            timestamp: new Date().toISOString(),
          });
          continue;
        }
        seenProfiles.add(profileLink);

        // Double-check keyword presence — search results can include
        // loosely related posts that never mention any keyword.
        if (!containsAnyKeyword(text, keywords)) {
          rejectedResults.push({
            rejected: true,
            reason: "Keyword not present",
            keyword,
            text,
            profileLink,
            timestamp: new Date().toISOString(),
          });
          continue;
        }

        console.log("---");
        console.log("Keyword:", keyword);
        console.log("Post:", text.substring(0, 100) + "...");
        console.log("Profile:", profileLink);

        // Enhanced location validation (instant no-op when disabled).
        const locationCheck = await validateProfileLocation(
          context,
          profileLink,
          EFFECTIVE_LOCATION_FILTER
        );
        console.log("📍 Location:", locationCheck.location);
        console.log("🎯 Match:", locationCheck.reasoning);

        if (!locationCheck.isValid) {
          rejectedResults.push({
            rejected: true,
            reason: `Location filter failed: ${locationCheck.error}`,
            keyword,
            text,
            profileLink,
            location: locationCheck.location,
            locationReasoning: locationCheck.reasoning,
            timestamp: new Date().toISOString(),
          });
          console.log(
            "❌ Skipping - Location filter failed:",
            locationCheck.error
          );
          continue;
        }

        console.log("✅ Post passed all filters");

        results.push({
          keyword,
          text,
          profileLink,
          location: locationCheck.location,
          locationValid: locationCheck.isValid,
          locationMatchedFilter: locationCheck.matchedFilter,
          locationReasoning: locationCheck.reasoning,
          timestamp: new Date().toLocaleString("en-CA", {
            year: "numeric",
            month: "2-digit",
            day: "2-digit",
            hour: "2-digit",
            minute: "2-digit",
            second: "2-digit",
            hour12: false,
          }),
          aiProcessed: false,
        });
      }
    }

    // --- Write output files ---
    // BUG FIX: --output previously replaced the *timestamp segment* only, so
    // --output=results/myfile.json produced
    // "results/results-results/myfile.json.json". Now --output names the
    // results file directly, as documented in the file header.
    const now = new Date();
    const timestamp = `${now.getFullYear()}-${String(
      now.getMonth() + 1
    ).padStart(2, "0")}-${String(now.getDate()).padStart(2, "0")}-${String(
      now.getHours()
    ).padStart(2, "0")}-${String(now.getMinutes()).padStart(2, "0")}`;
    const resultsDir = "results";
    const resultsFile = cliOutput || `${resultsDir}/results-${timestamp}.json`;
    const rejectedFile = cliOutput
      ? `${cliOutput.replace(/\.json$/i, "")}-rejected.json`
      : `${resultsDir}/results-${timestamp}-rejected.json`;

    // Ensure the output directory exists (recursive handles custom --output
    // paths with nested directories).
    const outputDir = path.dirname(resultsFile);
    if (!fs.existsSync(outputDir)) {
      fs.mkdirSync(outputDir, { recursive: true });
    }

    fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2), "utf-8");
    fs.writeFileSync(
      rejectedFile,
      JSON.stringify(rejectedResults, null, 2),
      "utf-8"
    );
    console.log(`\n🎉 Scraping Complete!`);
    console.log(`📊 Saved ${results.length} posts to ${resultsFile}`);
    console.log(
      `📋 Saved ${rejectedResults.length} rejected posts to ${rejectedFile}`
    );

    // Run local AI analysis if requested
    if (runAIAfter && results.length > 0 && !scrapeError) {
      try {
        await runPostScrapingLocalAI(resultsFile);
      } catch (error) {
        // AI failure is non-fatal — the scrape results are already on disk.
        console.error(
          "⚠️ Local AI analysis failed, but scraping completed successfully"
        );
      }
    }

    console.log(`\n💡 Next steps:`);
    console.log(` 📋 Review results in ${resultsFile}`);
    if (!runAIAfter && !disableAI) {
      console.log(` 🧠 Local AI Analysis:`);
      console.log(` node ai-analyzer-local.js --context="${AI_CONTEXT}"`);
      console.log(
        ` node ai-analyzer-local.js --input=${resultsFile} --context="your context"`
      );
    }
  } catch (err) {
    scrapeError = err;
    console.error("Error:", err);
  } finally {
    // Always release the browser, even after login/search failures.
    await browser.close();
  }
}
|
|
|
|
// Entry point: resolve the keyword list (CLI flags or CSV), then start scraping.
loadKeywordsAndStart();
|