/**
 * LinkedIn Posts Scraper (LinkedOut)
 *
 * A comprehensive tool for scraping LinkedIn posts based on keyword searches.
 * Designed to track job market trends, layoffs, and open work opportunities
 * by monitoring LinkedIn content automatically.
 *
 * FEATURES:
 * - Automated LinkedIn login with browser automation
 * - Keyword-based post searching from CSV files or CLI
 * - Configurable search parameters (date, location, sorting)
 * - Duplicate detection for posts and profiles
 * - Text cleaning (removes hashtags, URLs, emojis)
 * - Timestamped JSON output files
 * - Command-line parameter overrides (see below)
 * - Enhanced geographic location validation
 * - Optional local AI-powered context analysis (Ollama)
 *
 * USAGE:
 *   node linkedout.js [options]
 *
 * COMMAND-LINE OPTIONS:
 *   --headless=true|false     Override browser headless mode
 *   --keyword="kw1,kw2"       Use only these keywords (comma-separated, overrides CSV)
 *   --add-keyword="kw1,kw2"   Add extra keywords to CSV/CLI list
 *   --city="CityName"         Override city
 *   --date_posted=VALUE       Override date posted (past-24h, past-week, past-month, or empty)
 *   --sort_by=VALUE           Override sort by (date_posted or relevance)
 *   --location_filter=VALUE   Override location filter
 *   --output=FILE             Output file name
 *   --no-location             Disable location filtering
 *   --no-ai                   Disable AI analysis
 *   --ai-after                Run local AI analysis after scraping
 *   --help, -h                Show this help message
 *
 * EXAMPLES:
 *   node linkedout.js                                   # Standard scraping
 *   node linkedout.js --headless=false                  # Visual mode
 *   node linkedout.js --keyword="layoff,downsizing"     # Only these keywords
 *   node linkedout.js --add-keyword="hiring freeze"     # Add extra keyword(s)
 *   node linkedout.js --city="Vancouver" --date_posted=past-month
 *   node linkedout.js --output=results/myfile.json
 *   node linkedout.js --no-location --no-ai             # Fastest, no filters
 *   node linkedout.js --ai-after                        # Run AI after scraping
 *
 * POST-PROCESSING AI ANALYSIS:
 *   node ai-analyzer-local.js --context="job layoffs"   # Run on latest results file
 *   node ai-analyzer-local.js --input=results/results-2024-01-15.json --context="hiring"
 *
 * ENVIRONMENT VARIABLES (.env file):
 *   KEYWORDS=keywords-layoff.csv (filename only, always looks in keywords/ folder unless path is given)
 *   See README for full list.
 *
 * OUTPUT:
 * - Saves to results/results-YYYY-MM-DD-HH-MM.json (or as specified by --output)
 * - Enhanced format with optional location validation and local AI analysis
 *
 * KEYWORD FILES:
 * - Place all keyword CSVs in the keywords/ folder
 * - keywords-layoff.csv: 33+ layoff-related terms
 * - keywords-open-work.csv: Terms for finding people open to work
 * - Custom CSV format: header "keyword" with one keyword per line
 *
 * DEPENDENCIES:
 * - playwright: Browser automation
 * - dotenv: Environment variable management
 * - csv-parser: CSV file parsing
 * - Node.js built-ins: fs, path, child_process
 *
 * SECURITY & LEGAL:
 * - Store credentials securely in .env file
 * - Respect LinkedIn's Terms of Service
 * - Use responsibly for educational/research purposes
 * - Consider rate limiting and LinkedIn API for production use
 */

//process.env.PLAYWRIGHT_BROWSERS_PATH = "0";

// Suppress D-Bus notification errors in WSL
process.env.NO_AT_BRIDGE = "1";
process.env.DBUS_SESSION_BUS_ADDRESS = "/dev/null";

const { chromium } = require("playwright");
const fs = require("fs");
const path = require("path");
// dotenv must be loaded before any process.env value below is read.
require("dotenv").config();
const csv = require("csv-parser");
const { spawn } = require("child_process");

// Core configuration
const DATE_POSTED = process.env.DATE_POSTED || "past-week";
const SORT_BY = process.env.SORT_BY || "date_posted";
// Number of mouse-wheel scrolls per search page; unset, non-numeric or 0 falls back to 5.
const WHEELS = parseInt(process.env.WHEELS, 10) || 5;
const CITY = process.env.CITY || "Toronto";

// Location filtering configuration
const LOCATION_FILTER = process.env.LOCATION_FILTER || "";
const ENABLE_LOCATION_CHECK = process.env.ENABLE_LOCATION_CHECK === "true";

// Local AI analysis configuration
const ENABLE_LOCAL_AI = process.env.ENABLE_LOCAL_AI === "true";
const RUN_LOCAL_AI_AFTER_SCRAPING =
  process.env.RUN_LOCAL_AI_AFTER_SCRAPING === "true";
const AI_CONTEXT =
  process.env.AI_CONTEXT || "job layoffs and workforce reduction";

// Import enhanced location utilities
const {
  parseLocationFilters,
  validateLocationAgainstFilters,
  extractLocationFromProfile,
} = require("./location-utils");

// Read credentials
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
let HEADLESS = process.env.HEADLESS === "true";

// Scraper constants
const POST_SELECTOR = ".feed-shared-update-v2";
const NEW_PAGE_TIMEOUT_MS = 10000;
const DEFAULT_RESULTS_DIR = "results";

const HELP_TEXT = `
LinkedOut - LinkedIn Posts Scraper

Usage: node linkedout.js [options]

Options:
  --headless=true|false     Override browser headless mode
  --keyword="kw1,kw2"       Use only these keywords (comma-separated, overrides CSV)
  --add-keyword="kw1,kw2"   Add extra keywords to CSV list
  --city="CityName"         Override city
  --date_posted=VALUE       Override date posted (past-24h, past-week, past-month or '')
  --sort_by=VALUE           Override sort by (date_posted or relevance)
  --location_filter=VALUE   Override location filter
  --output=FILE             Output file name
  --no-location             Disable location filtering
  --no-ai                   Disable AI analysis
  --ai-after                Run local AI analysis after scraping
  --help, -h                Show this help message

Examples:
  node linkedout.js --keyword="layoff,downsizing"
  node linkedout.js --add-keyword="hiring freeze"
  node linkedout.js --city="Vancouver" --date_posted=past-month
  node linkedout.js --output=results/myfile.json
`;

/**
 * Return everything after the first "=" of a `--name=value` argument.
 * Slicing (instead of `split("=")[1]`) keeps values that themselves contain "=".
 *
 * @param {string} arg - Raw CLI argument that contains at least one "=".
 * @returns {string} The value part (may be an empty string).
 */
function getArgValue(arg) {
  return arg.slice(arg.indexOf("=") + 1);
}

/**
 * Split a comma-separated keyword list into trimmed, non-empty keywords.
 *
 * @param {string} value - Comma-separated keywords.
 * @returns {string[]} Cleaned keyword list (possibly empty).
 */
function parseKeywordList(value) {
  return value
    .split(",")
    .map((k) => k.trim())
    .filter(Boolean);
}

// Parse command-line arguments
const args = process.argv.slice(2);
let cliKeywords = null; // If set, only use these
let additionalKeywords = [];
let disableLocation = false;
let disableAI = false;
let runAIAfter = RUN_LOCAL_AI_AFTER_SCRAPING;
let cliCity = null;
let cliDatePosted = null;
let cliSortBy = null;
let cliLocationFilter = null;
let cliOutput = null;
let showHelp = false;

for (const arg of args) {
  if (arg.startsWith("--headless=")) {
    HEADLESS = getArgValue(arg).toLowerCase() === "true";
  } else if (arg.startsWith("--keyword=")) {
    cliKeywords = parseKeywordList(getArgValue(arg));
  } else if (arg.startsWith("--add-keyword=")) {
    additionalKeywords = additionalKeywords.concat(
      parseKeywordList(getArgValue(arg))
    );
  } else if (arg === "--no-location") {
    disableLocation = true;
  } else if (arg === "--no-ai") {
    disableAI = true;
  } else if (arg === "--ai-after") {
    runAIAfter = true;
  } else if (arg.startsWith("--city=")) {
    cliCity = getArgValue(arg);
  } else if (arg.startsWith("--date_posted=")) {
    cliDatePosted = getArgValue(arg);
  } else if (arg.startsWith("--sort_by=")) {
    cliSortBy = getArgValue(arg);
  } else if (arg.startsWith("--location_filter=")) {
    cliLocationFilter = getArgValue(arg);
  } else if (arg.startsWith("--output=")) {
    cliOutput = getArgValue(arg);
  } else if (arg === "--help" || arg === "-h") {
    showHelp = true;
  }
}

if (showHelp) {
  console.log(HELP_TEXT);
  process.exit(0);
}

// Use CLI overrides if provided
const EFFECTIVE_CITY = cliCity || CITY;
// An explicitly empty --date_posted= is a documented way to clear the date
// filter, so only fall back when the flag was not passed at all.
const EFFECTIVE_DATE_POSTED =
  cliDatePosted !== null ? cliDatePosted : DATE_POSTED;
const EFFECTIVE_SORT_BY = cliSortBy || SORT_BY;
const EFFECTIVE_LOCATION_FILTER = cliLocationFilter || LOCATION_FILTER;

// Read keywords from CSV or CLI
const keywords = [];
const keywordEnv = process.env.KEYWORDS || "keywords-layoff.csv";
const keywordEnvHasDirectory =
  keywordEnv.includes("/") || keywordEnv.includes(path.sep);
// path.resolve leaves absolute paths untouched and anchors relative ones at cwd.
const csvPath = path.resolve(
  process.cwd(),
  keywordEnvHasDirectory ? keywordEnv : path.join("keywords", keywordEnv)
);

/**
 * Start the scraper and make sure a rejected promise is reported instead of
 * becoming an unhandled rejection.
 */
function launchScraper() {
  startScraper().catch((err) => {
    console.error("Error:", err);
    process.exitCode = 1;
  });
}

/**
 * Fill the module-level `keywords` list (from --keyword or from the CSV file at
 * `csvPath`, plus any --add-keyword extras) and then start the scraper.
 * Exits the process with code 1 when no keywords are available or the CSV
 * cannot be read.
 */
function loadKeywordsAndStart() {
  if (cliKeywords) {
    // Only use CLI keywords (plus explicit additions)
    keywords.push(...cliKeywords, ...additionalKeywords);
    if (keywords.length === 0) {
      console.error("No keywords provided via --keyword");
      process.exit(1);
    }
    launchScraper();
    return;
  }

  const onCsvError = (err) => {
    console.error(`Could not read keyword file ${csvPath}: ${err.message}`);
    process.exit(1);
  };

  // Load from CSV, then add any additional keywords.
  // Errors are not forwarded through pipe(), so both streams get a handler.
  fs.createReadStream(csvPath)
    .on("error", onCsvError)
    .pipe(csv())
    .on("error", onCsvError)
    .on("data", (row) => {
      const keyword = row.keyword ? row.keyword.trim() : "";
      // Skip blank cells: an empty keyword would match every post.
      if (keyword) keywords.push(keyword);
    })
    .on("end", () => {
      if (keywords.length === 0) {
        console.error("No keywords found in csv");
        process.exit(1);
      }
      if (additionalKeywords.length > 0) {
        keywords.push(...additionalKeywords);
        console.log(
          `Added additional keywords: ${additionalKeywords.join(", ")}`
        );
      }
      launchScraper();
    });
}

if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
  throw new Error("Missing LinkedIn credentials in .env file.");
}

/**
 * Strip hashtags, literal "hashtag" markers, URLs and common emoji ranges from
 * post text, then collapse whitespace.
 *
 * @param {string} text - Raw post text.
 * @returns {string} Cleaned, trimmed text.
 */
function cleanText(text) {
  return text
    .replace(/#\w+/g, "")
    .replace(/\bhashtag\b/gi, "")
    .replace(/hashtag-\w+/gi, "")
    .replace(/https?:\/\/[^\s]+/g, "")
    .replace(
      /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
      ""
    )
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Build the LinkedIn content-search URL for one keyword and city, applying the
 * effective date/sort settings when they are non-empty.
 *
 * @param {string} keyword - Search keyword.
 * @param {string} city - City appended to the keyword query.
 * @returns {string} Search URL.
 */
function buildSearchUrl(keyword, city) {
  let url = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
    keyword + " " + city
  )}`;
  // The filter values are sent wrapped in double quotes (then URL-encoded).
  if (EFFECTIVE_DATE_POSTED)
    url += `&datePosted=${encodeURIComponent(`"${EFFECTIVE_DATE_POSTED}"`)}`;
  if (EFFECTIVE_SORT_BY)
    url += `&sortBy=${encodeURIComponent(`"${EFFECTIVE_SORT_BY}"`)}`;
  url += `&origin=FACETED_SEARCH`;
  return url;
}

/**
 * Case-insensitive check whether `text` contains at least one keyword.
 *
 * @param {string} text - Text to search in.
 * @param {string[]} keywords - Candidate keywords.
 * @returns {boolean} True when any keyword occurs in the text.
 */
function containsAnyKeyword(text, keywords) {
  const haystack = text.toLowerCase();
  return keywords.some((k) => haystack.includes(k.toLowerCase()));
}

/**
 * Enhanced profile location validation with smart waiting.
 * Uses a new tab to avoid disrupting the main scraping flow.
 *
 * @param {object} context - Playwright browser context used to open the tab.
 * @param {string} profileLink - Profile URL to inspect.
 * @param {string} locationFilterString - Raw location filter; empty disables the check.
 * @returns {Promise<{isValid: boolean, location: string, matchedFilter: *, reasoning: string, error: (string|null)}>}
 *   Validation outcome; never rejects (errors are reported as `isValid: false`).
 */
async function validateProfileLocation(
  context,
  profileLink,
  locationFilterString
) {
  if (!locationFilterString || !ENABLE_LOCATION_CHECK || disableLocation) {
    return {
      isValid: true,
      location: "Not checked",
      matchedFilter: null,
      reasoning: "Location check disabled",
      error: null,
    };
  }

  let profilePage = null;
  try {
    console.log(`🌍 Checking profile location: ${profileLink}`);

    // Create a new page/tab for profile validation
    profilePage = await context.newPage();
    await profilePage.goto(profileLink, {
      waitUntil: "domcontentloaded",
      timeout: 10000,
    });

    // Continue as soon as any of the key profile elements shows up
    await Promise.race([
      profilePage.waitForSelector("h1", { timeout: 3000 }),
      profilePage.waitForSelector("[data-field='experience_section']", {
        timeout: 3000,
      }),
      profilePage.waitForSelector(".pv-text-details__left-panel", {
        timeout: 3000,
      }),
    ]);

    // Use enhanced location extraction
    const location = await extractLocationFromProfile(profilePage);
    if (!location) {
      return {
        isValid: false,
        location: "Location not found",
        matchedFilter: null,
        reasoning: "Could not extract location from profile",
        error: "Location extraction failed",
      };
    }

    // Parse location filters and validate against them
    const locationFilters = parseLocationFilters(locationFilterString);
    const validationResult = validateLocationAgainstFilters(
      location,
      locationFilters
    );

    return {
      isValid: validationResult.isValid,
      location,
      matchedFilter: validationResult.matchedFilter,
      reasoning: validationResult.reasoning,
      error: validationResult.isValid ? null : validationResult.reasoning,
    };
  } catch (error) {
    console.error(`❌ Error checking profile location: ${error.message}`);
    return {
      isValid: false,
      location: "Error checking location",
      matchedFilter: null,
      reasoning: `Error: ${error.message}`,
      error: error.message,
    };
  } finally {
    // Always close the profile page to clean up
    if (profilePage) {
      try {
        await profilePage.close();
      } catch (closeError) {
        console.error(`⚠️ Error closing profile page: ${closeError.message}`);
      }
    }
  }
}

/**
 * Run local AI analysis after scraping is complete.
 * Does nothing unless AI is enabled (ENABLE_LOCAL_AI), not disabled via
 * --no-ai, and requested to run after scraping.
 *
 * @param {string} resultsFile - Path of the results JSON passed as --input.
 * @returns {Promise<void>} Resolves on exit code 0; rejects on a non-zero exit
 *   code or when the child process cannot be spawned.
 */
async function runPostScrapingLocalAI(resultsFile) {
  if (disableAI || !ENABLE_LOCAL_AI || !runAIAfter) {
    return;
  }

  console.log("\n🧠 Starting post-scraping local AI analysis...");

  const analyzerScript = "ai-analyzer-local.js";
  const analyzerArgs = [`--input=${resultsFile}`, `--context=${AI_CONTEXT}`];

  console.log(`🚀 Running: node ${analyzerScript} ${analyzerArgs.join(" ")}`);

  return new Promise((resolve, reject) => {
    // process.execPath runs the analyzer with the same Node binary as this
    // script, even when "node" is not on PATH.
    const child = spawn(process.execPath, [analyzerScript, ...analyzerArgs], {
      stdio: "inherit",
      cwd: process.cwd(),
    });

    child.on("close", (code) => {
      if (code === 0) {
        console.log("✅ Local AI analysis completed successfully");
        resolve();
      } else {
        console.error(`❌ Local AI analysis failed with code ${code}`);
        reject(new Error(`Local AI analysis process exited with code ${code}`));
      }
    });

    child.on("error", (error) => {
      console.error(`❌ Failed to run local AI analysis: ${error.message}`);
      reject(error);
    });
  });
}

/** Print the effective configuration banner shown when scraping starts. */
function logStartupSummary() {
  const locationSummary =
    ENABLE_LOCATION_CHECK && !disableLocation
      ? EFFECTIVE_LOCATION_FILTER || "None"
      : "Disabled";

  let aiSummary = "Disabled";
  if (ENABLE_LOCAL_AI && !disableAI) {
    aiSummary = runAIAfter ? "After scraping" : "Manual";
  }

  console.log("\n🚀 LinkedOut Scraper Starting...");
  console.log(`📊 Keywords: ${keywords.length}`);
  console.log(`🌍 Location Filter: ${locationSummary}`);
  console.log(`🧠 Local AI Analysis: ${aiSummary}`);
}

/**
 * Open a new page, failing with "newPage timeout" if it takes too long.
 * The timer is always cleared so it cannot keep the event loop alive.
 */
async function newPageWithTimeout(context, timeoutMs) {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error("newPage timeout")), timeoutMs);
  });
  try {
    return await Promise.race([context.newPage(), timeout]);
  } finally {
    clearTimeout(timer);
  }
}

/** Close the browser, logging (not throwing) if closing itself fails. */
async function closeBrowserQuietly(browser) {
  try {
    await browser.close();
  } catch (closeError) {
    console.error(`⚠️ Error closing browser: ${closeError.message}`);
  }
}

/** Submit the LinkedIn login form and wait for the signed-in nav avatar. */
async function logIn(page) {
  await page.goto("https://www.linkedin.com/login");
  await page.fill('input[name="session_key"]', LINKEDIN_USERNAME);
  await page.fill('input[name="session_password"]', LINKEDIN_PASSWORD);
  await page.click('button[type="submit"]');
  await page.waitForSelector("img.global-nav__me-photo", {
    timeout: 15000,
  });
}

/** Read and clean the text of one post container ("" when there is none). */
async function extractPostText(container) {
  const textHandle = await container.$(
    "div.update-components-text, span.break-words"
  );
  if (!textHandle) return "";
  return cleanText((await textHandle.textContent()) || "");
}

/**
 * Return the absolute author profile URL of a post without its query string,
 * or "" when the post has no usable profile link.
 */
async function extractProfileLink(container) {
  const anchor = await container.$('a[href*="/in/"]');
  if (!anchor) return "";
  // getAttribute resolves to null when the attribute is missing.
  const href = await anchor.getAttribute("href");
  if (!href) return "";
  const absolute = href.startsWith("http")
    ? href
    : `https://www.linkedin.com${href}`;
  return absolute.split("?")[0];
}

/** Return why a post text is unusable, or null when it is acceptable. */
function getTextRejectionReason(text, seenPosts) {
  if (!text) return "No text";
  if (seenPosts.has(text)) return "Duplicate post";
  if (text.length < 30) return "Text too short";
  if (!/[a-zA-Z0-9]/.test(text)) return "No alphanumeric content";
  return null;
}

/** Build one entry of the rejected-posts log; `extra` fields precede the timestamp. */
function buildRejection(reason, keyword, text, profileLink, extra) {
  return Object.assign(
    { rejected: true, reason, keyword, text, profileLink },
    extra,
    { timestamp: new Date().toISOString() }
  );
}

/** Run one post through all filters and record it as accepted or rejected. */
async function processPost(context, container, keyword, state) {
  const { seenPosts, seenProfiles, results, rejectedResults } = state;

  const text = await extractPostText(container);
  const textRejection = getTextRejectionReason(text, seenPosts);
  if (textRejection) {
    rejectedResults.push(buildRejection(textRejection, keyword, text, null));
    return;
  }
  seenPosts.add(text);

  const profileLink = await extractProfileLink(container);
  if (!profileLink || seenProfiles.has(profileLink)) {
    const reason = !profileLink ? "No profile link" : "Duplicate profile";
    rejectedResults.push(buildRejection(reason, keyword, text, profileLink));
    return;
  }
  seenProfiles.add(profileLink);

  // Double-check keyword presence
  if (!containsAnyKeyword(text, keywords)) {
    rejectedResults.push(
      buildRejection("Keyword not present", keyword, text, profileLink)
    );
    return;
  }

  console.log("---");
  console.log("Keyword:", keyword);
  console.log("Post:", text.substring(0, 100) + "...");
  console.log("Profile:", profileLink);

  // Enhanced location validation
  const locationCheck = await validateProfileLocation(
    context,
    profileLink,
    EFFECTIVE_LOCATION_FILTER
  );
  console.log("📍 Location:", locationCheck.location);
  console.log("🎯 Match:", locationCheck.reasoning);

  if (!locationCheck.isValid) {
    rejectedResults.push(
      buildRejection(
        `Location filter failed: ${locationCheck.error}`,
        keyword,
        text,
        profileLink,
        {
          location: locationCheck.location,
          locationReasoning: locationCheck.reasoning,
        }
      )
    );
    console.log("❌ Skipping - Location filter failed:", locationCheck.error);
    return;
  }

  console.log("✅ Post passed all filters");
  results.push({
    keyword,
    text,
    profileLink,
    location: locationCheck.location,
    locationValid: locationCheck.isValid,
    locationMatchedFilter: locationCheck.matchedFilter,
    locationReasoning: locationCheck.reasoning,
    timestamp: new Date().toLocaleString("en-CA", {
      year: "numeric",
      month: "2-digit",
      day: "2-digit",
      hour: "2-digit",
      minute: "2-digit",
      second: "2-digit",
      hour12: false,
    }),
    aiProcessed: false,
  });
}

/** Search one keyword, scroll to load more posts, and process every post found. */
async function scrapeKeyword(page, context, keyword, state) {
  await page.goto(buildSearchUrl(keyword, EFFECTIVE_CITY), {
    waitUntil: "load",
  });

  try {
    await page.waitForSelector(POST_SELECTOR, { timeout: 3000 });
  } catch (error) {
    console.log(
      `---\nNo posts found for keyword: ${keyword}\nCity: ${EFFECTIVE_CITY}\nDate posted: ${EFFECTIVE_DATE_POSTED}\nSort by: ${EFFECTIVE_SORT_BY}`
    );
    return;
  }

  for (let i = 0; i < WHEELS; i++) {
    await page.mouse.wheel(0, 1000);
    await page.waitForTimeout(1000);
  }

  const postContainers = await page.$$(POST_SELECTOR);
  for (const container of postContainers) {
    await processPost(context, container, keyword, state);
  }
}

/**
 * Decide where accepted and rejected posts are written.
 * With --output the given path is used as-is (a bare file name lands in
 * results/, a missing extension becomes .json); otherwise a timestamped name
 * under results/ is generated.
 */
function resolveOutputPaths(now) {
  if (cliOutput) {
    const parsed = path.parse(cliOutput);
    const dir = parsed.dir || DEFAULT_RESULTS_DIR;
    const ext = parsed.ext || ".json";
    return {
      resultsFile: path.join(dir, `${parsed.name}${ext}`),
      rejectedFile: path.join(dir, `${parsed.name}-rejected${ext}`),
    };
  }

  const pad = (value) => String(value).padStart(2, "0");
  const stamp = `${now.getFullYear()}-${pad(now.getMonth() + 1)}-${pad(
    now.getDate()
  )}-${pad(now.getHours())}-${pad(now.getMinutes())}`;
  return {
    resultsFile: `${DEFAULT_RESULTS_DIR}/results-${stamp}.json`,
    rejectedFile: `${DEFAULT_RESULTS_DIR}/results-${stamp}-rejected.json`,
  };
}

/** Write accepted and rejected posts as pretty-printed JSON, creating folders as needed. */
function saveResults(resultsFile, rejectedFile, results, rejectedResults) {
  fs.mkdirSync(path.dirname(resultsFile), { recursive: true });
  fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2), "utf-8");
  fs.writeFileSync(
    rejectedFile,
    JSON.stringify(rejectedResults, null, 2),
    "utf-8"
  );
}

/**
 * Main entry point: log in, scrape every keyword, save the results and
 * optionally run the local AI analysis.
 *
 * A scraping error no longer discards what was already collected: partial
 * results are still saved and the process exit code is set to 1.
 *
 * @returns {Promise<void>} Rejects only when launching the browser or writing
 *   the result files fails.
 */
async function startScraper() {
  logStartupSummary();

  const browser = await chromium.launch({
    headless: HEADLESS,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  let context;
  let page;
  try {
    context = await browser.newContext();
    page = await newPageWithTimeout(context, NEW_PAGE_TIMEOUT_MS);
  } catch (err) {
    console.error("Failed to create new page:", err);
    await closeBrowserQuietly(browser);
    process.exit(1);
  }

  const state = {
    seenPosts: new Set(),
    seenProfiles: new Set(),
    results: [],
    rejectedResults: [],
  };
  let scrapeError = null;

  try {
    await logIn(page);
    for (const keyword of keywords) {
      await scrapeKeyword(page, context, keyword, state);
    }
  } catch (err) {
    scrapeError = err;
    console.error("Error:", err);
  } finally {
    await browser.close();
  }

  const { results, rejectedResults } = state;

  if (scrapeError) {
    process.exitCode = 1;
    // Nothing was collected before the failure, so there is nothing to save.
    if (results.length === 0 && rejectedResults.length === 0) {
      return;
    }
  }

  const { resultsFile, rejectedFile } = resolveOutputPaths(new Date());
  saveResults(resultsFile, rejectedFile, results, rejectedResults);

  if (scrapeError) {
    console.log(`\n⚠️ Scraping stopped early - saving partial results`);
  } else {
    console.log(`\n🎉 Scraping Complete!`);
  }
  console.log(`📊 Saved ${results.length} posts to ${resultsFile}`);
  console.log(
    `📋 Saved ${rejectedResults.length} rejected posts to ${rejectedFile}`
  );

  // Run local AI analysis if requested (only after a clean scrape)
  if (runAIAfter && results.length > 0 && !scrapeError) {
    try {
      await runPostScrapingLocalAI(resultsFile);
    } catch (error) {
      console.error(
        "⚠️ Local AI analysis failed, but scraping completed successfully"
      );
    }
  }

  console.log(`\n💡 Next steps:`);
  console.log(`   📋 Review results in ${resultsFile}`);
  if (!runAIAfter && !disableAI) {
    console.log(`   🧠 Local AI Analysis:`);
    console.log(`      node ai-analyzer-local.js --context="${AI_CONTEXT}"`);
    console.log(
      `      node ai-analyzer-local.js --input=${resultsFile} --context="your context"`
    );
  }
}

loadKeywordsAndStart();