Initial commit

This commit is contained in:
ilia 2025-06-29 14:06:38 -04:00
commit 8860bcecc6
5 changed files with 368 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
node_modules/
.env
results/
linkedout.exe
linkedout-macos
zip*
*.7z
*obfuscated.js

34
keywords.csv Normal file
View File

@ -0,0 +1,34 @@
keyword
layoff
terminated
termination
redundancy
redundancies
restructuring
cost cutting
workforce reduction
job cuts
job loss
downsizing
furlough
separation
outplacement
workforce adjustment
rightsizing
business realignment
organizational change
position elimination
role elimination
job elimination
staff reduction
headcount reduction
voluntary separation
involuntary separation
mass layoff
company reorganization
department closure
site closure
plant closure
office closure
workforce optimization
workforce transition
1 keyword
2 layoff
3 terminated
4 termination
5 redundancy
6 redundancies
7 restructuring
8 cost cutting
9 workforce reduction
10 job cuts
11 job loss
12 downsizing
13 furlough
14 separation
15 outplacement
16 workforce adjustment
17 rightsizing
18 business realignment
19 organizational change
20 position elimination
21 role elimination
22 job elimination
23 staff reduction
24 headcount reduction
25 voluntary separation
26 involuntary separation
27 mass layoff
28 company reorganization
29 department closure
30 site closure
31 plant closure
32 office closure
33 workforce optimization
34 workforce transition

222
linkedout.js Normal file
View File

@ -0,0 +1,222 @@
/**
* LinkedIn Posts Scraper (linkedout)
*
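* This script logs into LinkedIn using credentials stored in a .env file,
* reads keywords from a CSV file (keywords.csv), and scrapes posts matching
* those keywords from LinkedIn's content search. Each search query is the
* keyword plus the hard-coded CITY constant, filtered by DATE_POSTED and
* ordered by SORT_BY (all defined near the top of this file).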
*
* Usage:
* node linkedout.js [--headless=true|false] [--keyword=additional_keyword]
*
* Command-line Parameters:
* --headless: Override the headless mode (true or false). Defaults to value in .env (HEADLESS).
* --keyword: Append an additional keyword to the list of keywords from keywords.csv.
*
* Output:
* Saves results to a timestamped JSON file in the 'results' directory.
*
* Requirements:
* - Node.js environment (or use the compiled executable)
* - Playwright installed (or included in the binary)
* - dotenv package for environment variables
* - csv-parser package for reading CSV files
*
* Environment Variables (.env):
* LINKEDIN_USERNAME - Your LinkedIn username
* LINKEDIN_PASSWORD - Your LinkedIn password
* HEADLESS - Default headless mode (true or false)
*
* Example:
* node linkedout.js --headless=true --keyword=layoff
*/
process.env.PLAYWRIGHT_BROWSERS_PATH = '0';
const { chromium } = require("playwright");
const fs = require("fs");
const path = require("path");
require("dotenv").config();
const csv = require("csv-parser");
// Search filters applied to every query built by buildSearchUrl().
const DATE_POSTED = "past-week"; // "past-24h", "past-week", "past-month", or ""
const SORT_BY = "date_posted"; // "relevance", "date_posted"
// Number of mouse-wheel scroll steps performed on each results page.
const WHEELS = 5;
// City name appended to every keyword in the search query.
const CITY = "Toronto";

// Read credentials from the environment (populated from .env by dotenv).
const LINKEDIN_USERNAME = process.env.LINKEDIN_USERNAME;
const LINKEDIN_PASSWORD = process.env.LINKEDIN_PASSWORD;

// Default headless mode from .env; only the exact string "true" enables it.
let HEADLESS = process.env.HEADLESS === "true";

// Parse command-line arguments of the form --flag=value.
const args = process.argv.slice(2);
let additionalKeyword = null;
for (const arg of args) {
  // Split on the FIRST "=" only, so a value that itself contains "="
  // (e.g. --keyword=a=b) is kept whole instead of being truncated.
  const separatorIndex = arg.indexOf("=");
  if (separatorIndex === -1) continue;
  const flag = arg.slice(0, separatorIndex);
  const value = arg.slice(separatorIndex + 1);

  if (flag === "--headless") {
    HEADLESS = value.toLowerCase() === "true";
  } else if (flag === "--keyword") {
    // Trimmed for consistency with the keywords read from keywords.csv.
    additionalKeyword = value.trim();
  }
}

// Fail fast: nothing below can work without credentials.
if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
  throw new Error("Missing LinkedIn credentials in .env file.");
}
/**
 * Normalizes scraped post text so it can be de-duplicated and keyword-matched.
 *
 * Removes, in order: "#tag" hashtags, "hashtag-xxx" tokens, the standalone
 * word "hashtag", http(s) URLs, and characters in the four emoji/symbol
 * Unicode ranges listed below. Whitespace runs are then collapsed to a single
 * space and the result is trimmed.
 *
 * @param {string} text - Raw text.
 * @returns {string} The cleaned text; "" when `text` is not a string.
 */
function cleanText(text) {
  if (typeof text !== "string") return "";

  return text
    .replace(/#\w+/g, "")
    // Must run before the bare-word rule below: "-" counts as a word
    // boundary, so stripping "hashtag" first would turn "hashtag-foo"
    // into a dangling "-foo" that this rule could no longer match.
    .replace(/hashtag-\w+/gi, "")
    .replace(/\bhashtag\b/gi, "")
    .replace(/https?:\/\/[^\s]+/g, "")
    .replace(
      /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
      ""
    )
    .replace(/\s+/g, " ")
    .trim();
}
/**
 * Builds the LinkedIn content-search URL for one keyword.
 *
 * The search phrase is `keyword`, a space, then `city`. The module-level
 * DATE_POSTED and SORT_BY settings are each added as a double-quoted,
 * URL-encoded query parameter when they are non-empty.
 *
 * @param {string} keyword - Search keyword.
 * @param {string} city - City appended to the keyword.
 * @returns {string} The complete search URL.
 */
function buildSearchUrl(keyword, city) {
  // Filter values are sent wrapped in literal double quotes.
  const quoted = (value) => encodeURIComponent(`"${value}"`);

  const queryParts = [`keywords=${encodeURIComponent(keyword + " " + city)}`];
  if (DATE_POSTED) {
    queryParts.push(`datePosted=${quoted(DATE_POSTED)}`);
  }
  if (SORT_BY) {
    queryParts.push(`sortBy=${quoted(SORT_BY)}`);
  }
  queryParts.push("origin=FACETED_SEARCH");

  return `https://www.linkedin.com/search/results/content/?${queryParts.join("&")}`;
}
/**
 * Reports whether `text` contains at least one of `keywords` as a
 * case-insensitive substring.
 *
 * Blank keywords (empty or whitespace-only) are ignored: every string
 * "includes" the empty string, so a single blank entry would otherwise make
 * this check pass for any text.
 *
 * @param {string} text - Text to search.
 * @param {string[]} keywords - Candidate keywords.
 * @returns {boolean} True when a non-blank keyword occurs in `text`;
 *   false otherwise, including when `text` is not a string.
 */
function containsAnyKeyword(text, keywords) {
  if (typeof text !== "string") return false;

  // Lowercase the text once rather than once per keyword.
  const haystack = text.toLowerCase();
  return keywords.some(
    (keyword) =>
      typeof keyword === "string" &&
      keyword.trim() !== "" &&
      haystack.includes(keyword.toLowerCase())
  );
}
// Keywords collected from the "keyword" column of keywords.csv, which is
// resolved against the current working directory.
const keywords = [];
const csvPath = path.join(process.cwd(), "keywords.csv");

/**
 * Formats a date as "YYYY-MM-DD-HH-mm" using local time; used in the
 * results file name.
 *
 * @param {Date} date - Date to format.
 * @returns {string} The formatted timestamp.
 */
function formatTimestamp(date) {
  const pad = (value) => String(value).padStart(2, "0");
  return [
    date.getFullYear(),
    pad(date.getMonth() + 1), // getMonth() is zero-based
    pad(date.getDate()),
    pad(date.getHours()),
    pad(date.getMinutes()),
  ].join("-");
}

/**
 * Reads and cleans the text of one post container.
 *
 * @param {object} container - Playwright element handle for a post.
 * @returns {Promise<string>} Cleaned text, or "" when no text node is found.
 */
async function readPostText(container) {
  const textHandle = await container.$(
    "div.update-components-text, span.break-words"
  );
  if (!textHandle) return "";
  return cleanText((await textHandle.textContent()) || "");
}

/**
 * Reads the first "/in/" profile link inside a post container, made absolute
 * and with its query string removed.
 *
 * @param {object} container - Playwright element handle for a post.
 * @returns {Promise<string>} Profile URL, or "" when none is available.
 */
async function readProfileLink(container) {
  const anchor = await container.$('a[href*="/in/"]');
  if (!anchor) return "";
  const href = await anchor.getAttribute("href");
  // getAttribute() can resolve to null; never call string methods on it.
  if (!href) return "";
  const absolute = href.startsWith("http")
    ? href
    : `https://www.linkedin.com${href}`;
  return absolute.split("?")[0];
}

/**
 * Searches one keyword, scrolls the results WHEELS times, and appends every
 * new matching post to `results`.
 *
 * @param {object} page - Logged-in Playwright page.
 * @param {string} keyword - Keyword to search for.
 * @param {Set<string>} seenPosts - Cleaned texts already examined (mutated).
 * @param {Set<string>} seenProfiles - Profile URLs already recorded (mutated).
 * @param {Array<object>} results - Output list (mutated).
 * @returns {Promise<void>}
 */
async function scrapeKeyword(page, keyword, seenPosts, seenProfiles, results) {
  await page.goto(buildSearchUrl(keyword, CITY), { waitUntil: "load" });
  try {
    await page.waitForSelector(".feed-shared-update-v2", { timeout: 3000 });
  } catch (error) {
    // No post appeared within the timeout: report it and skip this keyword.
    console.log(
      `---\nNo posts found for keyword: ${keyword}\nDate posted: ${DATE_POSTED}\nSort by: ${SORT_BY}`
    );
    return;
  }

  // Scroll so that more results get loaded into the page.
  for (let i = 0; i < WHEELS; i++) {
    await page.mouse.wheel(0, 1000);
    await page.waitForTimeout(1000);
  }

  const postContainers = await page.$$(".feed-shared-update-v2");
  for (const container of postContainers) {
    const text = await readPostText(container);
    if (
      !text ||
      seenPosts.has(text) ||
      text.length < 30 ||
      !/[a-zA-Z0-9]/.test(text)
    )
      continue;
    seenPosts.add(text);

    const profileLink = await readProfileLink(container);
    if (!profileLink || seenProfiles.has(profileLink)) continue;

    // Double-check keyword presence
    if (!containsAnyKeyword(text, keywords)) continue;

    // Mark the profile as seen only once one of its posts is accepted, so a
    // rejected post cannot hide a later matching post by the same author.
    seenProfiles.add(profileLink);

    console.log("---");
    console.log("Keyword:", keyword);
    console.log("Post:", text);
    console.log("Profile:", profileLink);
    results.push({
      keyword,
      text,
      profileLink,
    });
  }
}

/**
 * Runs the whole scrape once the keyword list is loaded: logs in, searches
 * every keyword, and writes the results to a timestamped JSON file in the
 * "results" directory. Failures are logged and reflected in the exit code.
 *
 * @returns {Promise<void>}
 */
async function run() {
  if (keywords.length === 0) {
    console.error("No keywords found in keywords.csv");
    process.exit(1);
  }
  // Append additional keyword if provided
  if (additionalKeyword) {
    keywords.push(additionalKeyword);
    console.log(`Added additional keyword from CLI: ${additionalKeyword}`);
  }

  let browser;
  try {
    // Launched inside the try block so that a failure here is reported and
    // cleaned up the same way as any later failure.
    browser = await chromium.launch({ headless: HEADLESS });
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto("https://www.linkedin.com/login");
    await page.fill('input[name="session_key"]', LINKEDIN_USERNAME);
    await page.fill('input[name="session_password"]', LINKEDIN_PASSWORD);
    await page.click('button[type="submit"]');
    // Wait for this nav element before continuing with the searches.
    await page.waitForSelector("img.global-nav__me-photo", { timeout: 10000 });

    const seenPosts = new Set();
    const seenProfiles = new Set();
    const results = [];
    for (const keyword of keywords) {
      await scrapeKeyword(page, keyword, seenPosts, seenProfiles, results);
    }

    const timestamp = formatTimestamp(new Date());
    const resultsDir = "results";
    const resultsFile = `${resultsDir}/results-${timestamp}.json`;
    // recursive: true makes this a no-op when the directory already exists.
    fs.mkdirSync(resultsDir, { recursive: true });
    fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2), "utf-8");
    console.log(`Saved ${results.length} posts to ${resultsFile}`);
  } catch (err) {
    console.error("Error:", err);
    // Signal the failure to the calling shell without skipping cleanup.
    process.exitCode = 1;
  } finally {
    if (browser) await browser.close();
  }
}

// Read keywords from CSV, then start the scrape.
fs.createReadStream(csvPath)
  // Without this handler a missing or unreadable keywords.csv would surface
  // as an unhandled "error" event; pipe() does not forward source errors.
  .on("error", (err) => {
    console.error(`Could not read ${csvPath}: ${err.message}`);
    process.exit(1);
  })
  .pipe(csv())
  .on("data", (row) => {
    if (row.keyword) keywords.push(row.keyword.trim());
  })
  .on("end", () => {
    run().catch((err) => {
      console.error("Error:", err);
      process.exitCode = 1;
    });
  });

86
package-lock.json generated Normal file
View File

@ -0,0 +1,86 @@
{
"name": "linkedin-scraper",
"version": "1.0.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "linkedin-scraper",
"version": "1.0.0",
"license": "ISC",
"dependencies": {
"csv-parser": "^3.2.0",
"dotenv": "^17.0.0",
"playwright": "^1.53.1"
}
},
"node_modules/csv-parser": {
"version": "3.2.0",
"resolved": "https://registry.npmjs.org/csv-parser/-/csv-parser-3.2.0.tgz",
"integrity": "sha512-fgKbp+AJbn1h2dcAHKIdKNSSjfp43BZZykXsCjzALjKy80VXQNHPFJ6T9Afwdzoj24aMkq8GwDS7KGcDPpejrA==",
"license": "MIT",
"bin": {
"csv-parser": "bin/csv-parser"
},
"engines": {
"node": ">= 10"
}
},
"node_modules/dotenv": {
"version": "17.0.0",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.0.0.tgz",
"integrity": "sha512-A0BJ5lrpJVSfnMMXjmeO0xUnoxqsBHWCoqqTnGwGYVdnctqXXUEhJOO7LxmgxJon9tEZFGpe0xPRX0h2v3AANQ==",
"license": "BSD-2-Clause",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://dotenvx.com"
}
},
"node_modules/fsevents": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
"hasInstallScript": true,
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
}
},
"node_modules/playwright": {
"version": "1.53.1",
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.53.1.tgz",
"integrity": "sha512-LJ13YLr/ocweuwxyGf1XNFWIU4M2zUSo149Qbp+A4cpwDjsxRPj7k6H25LBrEHiEwxvRbD8HdwvQmRMSvquhYw==",
"license": "Apache-2.0",
"dependencies": {
"playwright-core": "1.53.1"
},
"bin": {
"playwright": "cli.js"
},
"engines": {
"node": ">=18"
},
"optionalDependencies": {
"fsevents": "2.3.2"
}
},
"node_modules/playwright-core": {
"version": "1.53.1",
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.53.1.tgz",
"integrity": "sha512-Z46Oq7tLAyT0lGoFx4DOuB1IA9D1TPj0QkYxpPVUnGDqHHvDpCftu1J2hM2PiWsNMoZh8+LQaarAWcDfPBc6zg==",
"license": "Apache-2.0",
"bin": {
"playwright-core": "cli.js"
},
"engines": {
"node": ">=18"
}
}
}
}

18
package.json Normal file
View File

@ -0,0 +1,18 @@
{
"name": "linkedin-scraper",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"type": "commonjs",
"dependencies": {
"csv-parser": "^3.2.0",
"dotenv": "^17.0.0",
"playwright": "^1.53.1"
}
}