commit 8860bcecc62503951f81809c87886ffeb1d1a809 Author: ilia Date: Sun Jun 29 14:06:38 2025 -0400 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c58ff6c --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +node_modules/ +.env +results/ +linkedout.exe +linkedout-macos +zip* +*.7z +*obfuscated.js \ No newline at end of file diff --git a/keywords.csv b/keywords.csv new file mode 100644 index 0000000..a54008a --- /dev/null +++ b/keywords.csv @@ -0,0 +1,34 @@ +keyword +layoff +terminated +termination +redundancy +redundancies +restructuring +cost cutting +workforce reduction +job cuts +job loss +downsizing +furlough +separation +outplacement +workforce adjustment +rightsizing +business realignment +organizational change +position elimination +role elimination +job elimination +staff reduction +headcount reduction +voluntary separation +involuntary separation +mass layoff +company reorganization +department closure +site closure +plant closure +office closure +workforce optimization +workforce transition diff --git a/linkedout.js b/linkedout.js new file mode 100644 index 0000000..3ba74d1 --- /dev/null +++ b/linkedout.js @@ -0,0 +1,222 @@ +/** + * LinkedIn Posts Scraper (linkedout) + * + * This script logs into LinkedIn using credentials stored in a .env file, + * reads keywords from a CSV file (keywords.csv), and scrapes posts matching + * those keywords from LinkedIn's content search. + * + * Usage: + * node linkedout.js [--headless=true|false] [--keyword=additional_keyword] + * + * Command-line Parameters: + * --headless: Override the headless mode (true or false). Defaults to value in .env (HEADLESS). + * --keyword: Append an additional keyword to the list of keywords from keywords.csv. + * + * Output: + * Saves results to a timestamped JSON file in the 'results' directory. 
+ *
+ * Requirements:
+ *   - Node.js environment (or use the compiled executable)
+ *   - Playwright installed (or included in the binary)
+ *   - dotenv package for environment variables
+ *   - csv-parser package for reading CSV files
+ *
+ * Environment Variables (.env):
+ *   LINKEDIN_USERNAME - Your LinkedIn username
+ *   LINKEDIN_PASSWORD - Your LinkedIn password
+ *   HEADLESS - Default headless mode (true or false)
+ *
+ * Example:
+ *   node linkedout.js --headless=true --keyword=layoff
+ */
+// Set before playwright is required. NOTE(review): '0' is presumably Playwright's
+// "hermetic install" setting (browsers stored inside the package) - confirm in the docs.
+process.env.PLAYWRIGHT_BROWSERS_PATH = '0';
+
+const { chromium } = require("playwright");
+const fs = require("fs");
+const path = require("path");
+require("dotenv").config();
+const csv = require("csv-parser");
+
+const DATE_POSTED = "past-week"; // "past-24h", "past-week", "past-month", or ""
+const SORT_BY = "date_posted"; // "relevance", "date_posted"
+const WHEELS = 5; // number of mouse-wheel scrolls performed per search page
+const CITY = "Toronto"; // appended to every keyword in the search query
+
+// Read credentials and headless mode from .env
+const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
+const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;
+
+// Default headless mode from .env
+let HEADLESS = process.env.HEADLESS === "true";
+
+// Parse command-line arguments
+const HEADLESS_FLAG = "--headless=";
+const KEYWORD_FLAG = "--keyword=";
+const args = process.argv.slice(2);
+let additionalKeyword = null;
+
+for (const arg of args) {
+  if (arg.startsWith(HEADLESS_FLAG)) {
+    HEADLESS = arg.slice(HEADLESS_FLAG.length).toLowerCase() === "true";
+  } else if (arg.startsWith(KEYWORD_FLAG)) {
+    // Take everything after the flag prefix so a value containing "=" is not
+    // truncated; trim it like the CSV keywords and treat an empty value as absent.
+    additionalKeyword = arg.slice(KEYWORD_FLAG.length).trim() || null;
+  }
+}
+
+if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
+  throw new Error("Missing LinkedIn credentials in .env file.");
+}
+
+/**
+ * Strip hashtags, URLs and a set of emoji ranges from post text, then collapse
+ * all whitespace runs to single spaces.
+ *
+ * @param {string} text - Raw text content of a post.
+ * @returns {string} The cleaned, trimmed text ("" for empty or missing input).
+ */
+function cleanText(text) {
+  if (!text) return "";
+  return (
+    text
+      .replace(/#\w+/g, "")
+      // Remove "hashtag-foo" tokens BEFORE the bare word "hashtag"; in the other
+      // order the bare-word pattern eats "hashtag" first and leaves "-foo" behind.
+      .replace(/hashtag-\w+/gi, "")
+      .replace(/\bhashtag\b/gi, "")
+      .replace(/https?:\/\/[^\s]+/g, "")
+      .replace(
+        /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
+        ""
+      )
+      .replace(/\s+/g, " ")
+      .trim()
+  );
+}
+
+/**
+ * Build the LinkedIn content-search URL for one keyword scoped to a city.
+ * The module-level DATE_POSTED and SORT_BY values are appended as quoted
+ * filter values when they are non-empty.
+ *
+ * @param {string} keyword - Search keyword.
+ * @param {string} city - City name appended to the keyword in the query.
+ * @returns {string} The search URL.
+ */
+function buildSearchUrl(keyword, city) {
+  let url = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
+    keyword + " " + city
+  )}`;
+  if (DATE_POSTED) {
+    url += `&datePosted=${encodeURIComponent(`"${DATE_POSTED}"`)}`;
+  }
+  if (SORT_BY) {
+    url += `&sortBy=${encodeURIComponent(`"${SORT_BY}"`)}`;
+  }
+  url += `&origin=FACETED_SEARCH`;
+  return url;
+}
+
+/**
+ * Case-insensitive check for whether text contains at least one keyword.
+ *
+ * @param {string} text - Text to search.
+ * @param {string[]} keywords - Candidate keywords.
+ * @returns {boolean} True when any keyword occurs in text.
+ */
+function containsAnyKeyword(text, keywords) {
+  const haystack = text.toLowerCase(); // lowercase once, not once per keyword
+  return keywords.some((k) => haystack.includes(k.toLowerCase()));
+}
+
+/**
+ * Turn an href attribute value into an absolute profile URL without its query
+ * string.
+ *
+ * @param {string|null} href - Raw href attribute value (may be null).
+ * @returns {string} Absolute URL up to the first "?", or "" when href is missing.
+ */
+function normalizeProfileLink(href) {
+  if (!href) return "";
+  const absolute = href.startsWith("http")
+    ? href
+    : `https://www.linkedin.com${href}`;
+  return absolute.split("?")[0];
+}
+
+/**
+ * Format a date as YYYY-MM-DD-HH-mm (local time) for use in a file name.
+ *
+ * @param {Date} date - Date to format.
+ * @returns {string} The formatted timestamp.
+ */
+function formatTimestamp(date) {
+  const pad = (n) => String(n).padStart(2, "0");
+  return [
+    date.getFullYear(),
+    pad(date.getMonth() + 1),
+    pad(date.getDate()),
+    pad(date.getHours()),
+    pad(date.getMinutes()),
+  ].join("-");
+}
+
+// Read keywords from CSV
+const keywords = [];
+const csvPath = path.join(process.cwd(), "keywords.csv");
+
+fs.createReadStream(csvPath)
+  // pipe() does not forward source errors, so a missing/unreadable file must be
+  // handled on the read stream itself.
+  .on("error", (err) => {
+    console.error(`Could not read ${csvPath}:`, err.message);
+    process.exit(1);
+  })
+  .pipe(csv())
+  .on("data", (row) => {
+    // Skip blank cells: an empty keyword would match every post.
+    const keyword = (row.keyword || "").trim();
+    if (keyword) keywords.push(keyword);
+  })
+  .on("end", async () => {
+    if (keywords.length === 0) {
+      console.error("No keywords found in keywords.csv");
+      process.exit(1);
+    }
+
+    // Append additional keyword if provided
+    if (additionalKeyword) {
+      keywords.push(additionalKeyword);
+      console.log(`Added additional keyword from CLI: ${additionalKeyword}`);
+    }
+
+    // Launch inside the try so a launch failure is reported like any other
+    // error instead of surfacing as an unhandled rejection.
+    let browser;
+    try {
+      browser = await chromium.launch({ headless: HEADLESS });
+      const context = await browser.newContext();
+      const page = await context.newPage();
+
+      await page.goto("https://www.linkedin.com/login");
+      await page.fill('input[name="session_key"]', LINKEDIN_USERNAME);
+      await page.fill('input[name="session_password"]', LINKEDIN_PASSWORD);
+      await page.click('button[type="submit"]');
+      await page.waitForSelector("img.global-nav__me-photo", { timeout: 10000 });
+
+      const seenPosts = new Set();
+      const seenProfiles = new Set();
+      const results = [];
+
+      for (const keyword of keywords) {
+        const searchUrl = buildSearchUrl(keyword, CITY);
+        await page.goto(searchUrl, { waitUntil: "load" });
+
+        try {
+          await page.waitForSelector(".feed-shared-update-v2", { timeout: 3000 });
+        } catch (error) {
+          console.log(
+            `---\nNo posts found for keyword: ${keyword}\nDate posted: ${DATE_POSTED}\nSort by: ${SORT_BY}`
+          );
+          continue;
+        }
+
+        // Scroll to trigger loading of additional posts.
+        for (let i = 0; i < WHEELS; i++) {
+          await page.mouse.wheel(0, 1000);
+          await page.waitForTimeout(1000);
+        }
+
+        const postContainers = await page.$$(".feed-shared-update-v2");
+        for (const container of postContainers) {
+          const textHandle = await container.$(
+            "div.update-components-text, span.break-words"
+          );
+          const text = textHandle
+            ? cleanText((await textHandle.textContent()) || "")
+            : "";
+          if (
+            !text ||
+            seenPosts.has(text) ||
+            text.length < 30 ||
+            !/[a-zA-Z0-9]/.test(text)
+          ) {
+            continue;
+          }
+          seenPosts.add(text);
+
+          // Double-check keyword presence. Done before the profile is marked as
+          // seen so a non-matching post cannot suppress a later matching post
+          // by the same author.
+          if (!containsAnyKeyword(text, keywords)) continue;
+
+          const profileLinkElement = await container.$('a[href*="/in/"]');
+          // getAttribute may return null; normalizeProfileLink maps that to "".
+          const profileLink = profileLinkElement
+            ? normalizeProfileLink(await profileLinkElement.getAttribute("href"))
+            : "";
+
+          if (!profileLink || seenProfiles.has(profileLink)) continue;
+          seenProfiles.add(profileLink);
+
+          console.log("---");
+          console.log("Keyword:", keyword);
+          console.log("Post:", text);
+          console.log("Profile:", profileLink);
+
+          results.push({
+            keyword,
+            text,
+            profileLink,
+          });
+        }
+      }
+
+      const timestamp = formatTimestamp(new Date());
+      const resultsDir = "results";
+      const resultsFile = `${resultsDir}/results-${timestamp}.json`;
+
+      // recursive: true makes this a no-op when the directory already exists.
+      fs.mkdirSync(resultsDir, { recursive: true });
+
+      fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2), "utf-8");
+      console.log(`Saved ${results.length} posts to ${resultsFile}`);
+    } catch (err) {
+      console.error("Error:", err);
+      // Signal failure to the caller, consistent with the exit(1) paths above.
+      process.exitCode = 1;
+    } finally {
+      if (browser) await browser.close();
+    }
+  });
diff --git
a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..e1c40a5 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,86 @@ +{ + "name": "linkedin-scraper", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "linkedin-scraper", + "version": "1.0.0", + "license": "ISC", + "dependencies": { + "csv-parser": "^3.2.0", + "dotenv": "^17.0.0", + "playwright": "^1.53.1" + } + }, + "node_modules/csv-parser": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/csv-parser/-/csv-parser-3.2.0.tgz", + "integrity": "sha512-fgKbp+AJbn1h2dcAHKIdKNSSjfp43BZZykXsCjzALjKy80VXQNHPFJ6T9Afwdzoj24aMkq8GwDS7KGcDPpejrA==", + "license": "MIT", + "bin": { + "csv-parser": "bin/csv-parser" + }, + "engines": { + "node": ">= 10" + } + }, + "node_modules/dotenv": { + "version": "17.0.0", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.0.0.tgz", + "integrity": "sha512-A0BJ5lrpJVSfnMMXjmeO0xUnoxqsBHWCoqqTnGwGYVdnctqXXUEhJOO7LxmgxJon9tEZFGpe0xPRX0h2v3AANQ==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/playwright": { + "version": "1.53.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.53.1.tgz", + "integrity": "sha512-LJ13YLr/ocweuwxyGf1XNFWIU4M2zUSo149Qbp+A4cpwDjsxRPj7k6H25LBrEHiEwxvRbD8HdwvQmRMSvquhYw==", + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.53.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": 
"2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.53.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.53.1.tgz", + "integrity": "sha512-Z46Oq7tLAyT0lGoFx4DOuB1IA9D1TPj0QkYxpPVUnGDqHHvDpCftu1J2hM2PiWsNMoZh8+LQaarAWcDfPBc6zg==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..58efb3d --- /dev/null +++ b/package.json @@ -0,0 +1,18 @@ +{ + "name": "linkedin-scraper", + "version": "1.0.0", + "description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "keywords": [], + "author": "", + "license": "ISC", + "type": "commonjs", + "dependencies": { + "csv-parser": "^3.2.0", + "dotenv": "^17.0.0", + "playwright": "^1.53.1" + } +}