// linkedout/linkedout.js
// 2025-06-29 17:27:56 -04:00
//
// 236 lines
// 7.4 KiB
// JavaScript

/**
* LinkedIn Posts Scraper (linkedout)
*
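* This script logs into LinkedIn using credentials stored in a .env file,
* reads keywords from a CSV file (keywords.csv, which must have a "keyword"
* column), and scrapes posts matching those keywords from LinkedIn's content
* search. Each search combines the keyword with the CITY constant and is
* filtered by the DATE_POSTED and SORT_BY constants defined in this file.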
*
* Usage:
* node linkedout.js [--headless=true|false] [--keyword=additional_keyword]
*
* Command-line Parameters:
* --headless: Override the headless mode (true or false). Defaults to value in .env (HEADLESS).
* --keyword: Append an additional keyword to the list of keywords from keywords.csv.
*
* Output:
* Saves results to a timestamped JSON file in the 'results' directory.
*
* Requirements:
* - Node.js environment (or use the compiled executable)
* - Playwright installed (or included in the binary)
* - dotenv package for environment variables
* - csv-parser package for reading CSV files
*
* Environment Variables (.env):
* LINKEDIN_USERNAME - Your LinkedIn username
* LINKEDIN_PASSWORD - Your LinkedIn password
* HEADLESS - Default headless mode (true or false)
*
* Example:
* node linkedout.js --headless=true --keyword=layoff
*/
// This override is assigned before "playwright" is required, so it is already
// in place when that module loads.
// NOTE(review): presumably "0" makes Playwright look for its browsers inside
// the package itself (needed for the compiled executable mentioned in the
// header) - confirm against the Playwright browser-installation docs.
process.env.PLAYWRIGHT_BROWSERS_PATH = "0";
const { chromium } = require("playwright");
const fs = require("fs");
const path = require("path");
// Populate process.env from the .env file before any setting is read below.
require("dotenv").config();
const csv = require("csv-parser");
// --- Search settings -------------------------------------------------------
// City appended to every keyword when building the search query.
const CITY = "Toronto";
// How many times the results page is scrolled before posts are collected.
const WHEELS = 5;
// Result ordering: "relevance" or "date_posted".
const SORT_BY = "date_posted";
// Recency filter: "past-24h", "past-week", "past-month", or "" for none.
const DATE_POSTED = "past-week";

// --- Settings loaded from .env ---------------------------------------------
// LinkedIn login credentials.
const { LINKEDIN_USERNAME, LINKEDIN_PASSWORD } = process.env;
// Default headless mode; only the exact string "true" enables it. Declared
// with `let` because the --headless CLI flag may override it afterwards.
let HEADLESS = process.env.HEADLESS === "true";
// Parse command-line arguments

/**
 * Extracts the supported `--name=value` options from a list of CLI arguments.
 *
 * The value is everything after the FIRST "=", so values that themselves
 * contain "=" (e.g. `--keyword=a=b`) are kept intact instead of being cut off
 * at the second "=". When an option is repeated, the last occurrence wins.
 * Unrecognised arguments are ignored.
 *
 * @param {string[]} argv - Arguments without the node/script entries.
 * @returns {{headless?: boolean, keyword?: string}} Only the options that were
 *   actually present on the command line.
 */
function parseCliArgs(argv) {
  const headlessPrefix = "--headless=";
  const keywordPrefix = "--keyword=";
  const options = {};
  for (const arg of argv) {
    if (arg.startsWith(headlessPrefix)) {
      // Case-insensitive; anything other than "true" disables headless mode.
      options.headless =
        arg.slice(headlessPrefix.length).toLowerCase() === "true";
    }
    if (arg.startsWith(keywordPrefix)) {
      options.keyword = arg.slice(keywordPrefix.length);
    }
  }
  return options;
}

const args = process.argv.slice(2);
let additionalKeyword = null;
const cliOptions = parseCliArgs(args);
// Only override the .env default when the flag was explicitly given.
if (cliOptions.headless !== undefined) {
  HEADLESS = cliOptions.headless;
}
if (cliOptions.keyword !== undefined) {
  additionalKeyword = cliOptions.keyword;
}
// Fail fast: logging in is impossible unless both credentials are set.
if ([LINKEDIN_USERNAME, LINKEDIN_PASSWORD].some((value) => !value)) {
  throw new Error("Missing LinkedIn credentials in .env file.");
}
/**
 * Normalises the raw text of a post for de-duplication and keyword matching.
 *
 * Removes, in order: `#tag` hashtags, `hashtag-word` tokens, the standalone
 * word "hashtag", http(s) URLs, and emoji from a fixed set of Unicode ranges;
 * then collapses every run of whitespace to a single space and trims.
 *
 * @param {string} text - Raw post text; null/undefined is treated as empty.
 * @returns {string} The cleaned text (possibly an empty string).
 */
function cleanText(text) {
  if (text == null) return "";
  return (
    String(text)
      .replace(/#\w+/g, "")
      // Must run before the bare-word rule below: otherwise "hashtag" is
      // stripped first and the "-word" remainder is left in the text.
      .replace(/hashtag-\w+/gi, "")
      .replace(/\bhashtag\b/gi, "")
      .replace(/https?:\/\/[^\s]+/g, "")
      .replace(
        /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}]/gu,
        ""
      )
      .replace(/\s+/g, " ")
      .trim()
  );
}
/**
 * Builds the LinkedIn content-search URL for a keyword.
 *
 * @param {string} keyword - Search keyword.
 * @param {string} city - City appended to the keyword; when empty or missing
 *   the keyword is searched on its own.
 * @param {string} [datePosted=DATE_POSTED] - Recency filter value (e.g.
 *   "past-week"); a falsy value omits the filter.
 * @param {string} [sortBy=SORT_BY] - Sort order value (e.g. "date_posted");
 *   a falsy value omits the parameter.
 * @returns {string} The full search URL.
 */
function buildSearchUrl(
  keyword,
  city,
  datePosted = DATE_POSTED,
  sortBy = SORT_BY
) {
  // Drop empty parts so a missing city does not add a trailing space or the
  // literal text "undefined" to the query.
  const query = [keyword, city].filter(Boolean).join(" ");
  let url = `https://www.linkedin.com/search/results/content/?keywords=${encodeURIComponent(
    query
  )}`;
  // Filter values are sent wrapped in double quotes, then URL-encoded.
  if (datePosted) {
    url += `&datePosted=${encodeURIComponent(`"${datePosted}"`)}`;
  }
  if (sortBy) {
    url += `&sortBy=${encodeURIComponent(`"${sortBy}"`)}`;
  }
  url += `&origin=FACETED_SEARCH`;
  return url;
}
/**
 * Case-insensitively checks whether the text contains at least one keyword.
 *
 * @param {string} text - Text to search in.
 * @param {string[]} keywords - Candidate substrings.
 * @returns {boolean} true as soon as one keyword is found, otherwise false
 *   (including when the keyword list is empty).
 */
function containsAnyKeyword(text, keywords) {
  for (const candidate of keywords) {
    const haystack = text.toLowerCase();
    const needle = candidate.toLowerCase();
    if (haystack.includes(needle)) {
      return true;
    }
  }
  return false;
}
// Read keywords from CSV
const keywords = [];
const csvPath = path.join(process.cwd(), "keywords.csv");

/**
 * Opens a new page in the given browser context, giving up after a timeout.
 *
 * @param {object} context - Playwright browser context.
 * @param {number} timeoutMs - Maximum time to wait for the page.
 * @returns {Promise<object>} The newly created page.
 * @throws {Error} "newPage timeout" when the page is not created in time.
 */
async function openPage(context, timeoutMs) {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error("newPage timeout")), timeoutMs);
  });
  try {
    return await Promise.race([context.newPage(), timeout]);
  } finally {
    // Do not leave the timer pending once the race has been decided.
    clearTimeout(timer);
  }
}

/**
 * Submits the LinkedIn login form and waits for the signed-in navigation bar.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function logIn(page) {
  await page.goto("https://www.linkedin.com/login");
  await page.fill('input[name="session_key"]', LINKEDIN_USERNAME);
  await page.fill('input[name="session_password"]', LINKEDIN_PASSWORD);
  await page.click('button[type="submit"]');
  await page.waitForSelector("img.global-nav__me-photo", {
    timeout: 10000,
  });
}

/**
 * Reads and cleans the text of one post container.
 *
 * @param {object} container - Element handle of a post.
 * @returns {Promise<string>} Cleaned text, or "" when no text element exists.
 */
async function readPostText(container) {
  const textHandle = await container.$(
    "div.update-components-text, span.break-words"
  );
  if (!textHandle) return "";
  return cleanText((await textHandle.textContent()) || "");
}

/**
 * Reads the author's profile URL from one post container.
 *
 * @param {object} container - Element handle of a post.
 * @returns {Promise<string>} Absolute profile URL without its query string,
 *   or "" when the post has no usable profile link.
 */
async function readProfileLink(container) {
  const anchor = await container.$('a[href*="/in/"]');
  if (!anchor) return "";
  const href = await anchor.getAttribute("href");
  // Guard against a missing attribute so one odd post cannot abort the run.
  if (!href) return "";
  const absolute = href.startsWith("http")
    ? href
    : `https://www.linkedin.com${href}`;
  return absolute.split("?")[0];
}

/**
 * Runs one search per keyword and collects the matching posts.
 *
 * A post is kept only if its cleaned text is new, at least 30 characters long,
 * contains a letter or digit, comes from a profile not seen before, and
 * mentions at least one of the keywords.
 *
 * @param {object} page - Logged-in Playwright page.
 * @returns {Promise<Array<{keyword: string, text: string, profileLink: string}>>}
 */
async function collectPosts(page) {
  const seenPosts = new Set();
  const seenProfiles = new Set();
  const results = [];
  for (const keyword of keywords) {
    const searchUrl = buildSearchUrl(keyword, CITY);
    await page.goto(searchUrl, { waitUntil: "load" });
    try {
      await page.waitForSelector(".feed-shared-update-v2", {
        timeout: 3000,
      });
    } catch (error) {
      // No post appeared in time: treat as "no results" and move on.
      console.log(
        `---\nNo posts found for keyword: ${keyword}\nDate posted: ${DATE_POSTED}\nSort by: ${SORT_BY}`
      );
      continue;
    }
    // Scroll so that more posts are loaded before they are collected.
    for (let i = 0; i < WHEELS; i++) {
      await page.mouse.wheel(0, 1000);
      await page.waitForTimeout(1000);
    }
    const postContainers = await page.$$(".feed-shared-update-v2");
    for (const container of postContainers) {
      const text = await readPostText(container);
      if (
        !text ||
        seenPosts.has(text) ||
        text.length < 30 ||
        !/[a-zA-Z0-9]/.test(text)
      ) {
        continue;
      }
      seenPosts.add(text);
      const profileLink = await readProfileLink(container);
      if (!profileLink || seenProfiles.has(profileLink)) continue;
      seenProfiles.add(profileLink);
      // Double-check keyword presence
      if (!containsAnyKeyword(text, keywords)) continue;
      console.log("---");
      console.log("Keyword:", keyword);
      console.log("Post:", text);
      console.log("Profile:", profileLink);
      results.push({
        keyword,
        text,
        profileLink,
      });
    }
  }
  return results;
}

/**
 * Writes the results to results/results-YYYY-MM-DD-HH-mm.json (local time).
 *
 * @param {Array<object>} results - Collected posts.
 * @returns {void}
 */
function saveResults(results) {
  const now = new Date();
  const pad = (value) => String(value).padStart(2, "0");
  const timestamp = [
    now.getFullYear(),
    pad(now.getMonth() + 1),
    pad(now.getDate()),
    pad(now.getHours()),
    pad(now.getMinutes()),
  ].join("-");
  const resultsDir = "results";
  const resultsFile = `${resultsDir}/results-${timestamp}.json`;
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(resultsDir, { recursive: true });
  fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2), "utf-8");
  console.log(`Saved ${results.length} posts to ${resultsFile}`);
}

/**
 * Launches the browser, logs in, scrapes every keyword and saves the results.
 * The browser is closed whether or not scraping succeeds.
 *
 * @returns {Promise<void>} Rejects only if the browser or its context cannot
 *   be created; scraping errors are logged and not rethrown.
 */
async function runScraper() {
  const browser = await chromium.launch({
    headless: HEADLESS,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const context = await browser.newContext();
    let page;
    try {
      page = await openPage(context, 10000);
    } catch (err) {
      // Exit immediately rather than closing the browser first.
      // NOTE(review): presumably because close() could hang too - confirm.
      console.error("Failed to create new page:", err);
      process.exit(1);
    }
    try {
      await logIn(page);
      const results = await collectPosts(page);
      saveResults(results);
    } catch (err) {
      console.error("Error:", err);
    }
  } finally {
    await browser.close();
  }
}

fs.createReadStream(csvPath)
  // Errors on the source stream are not forwarded through pipe(); without
  // this handler a missing keywords.csv crashes with an unhandled 'error'.
  .on("error", (err) => {
    console.error(`Failed to read ${csvPath}:`, err);
    process.exit(1);
  })
  .pipe(csv())
  .on("data", (row) => {
    if (row.keyword) keywords.push(row.keyword.trim());
  })
  .on("end", () => {
    if (keywords.length === 0) {
      console.error("No keywords found in keywords.csv");
      process.exit(1);
    }
    // Append additional keyword if provided
    if (additionalKeyword) {
      keywords.push(additionalKeyword);
      console.log(`Added additional keyword from CLI: ${additionalKeyword}`);
    }
    // The event emitter ignores the returned promise, so handle rejections
    // here instead of leaving them unhandled.
    runScraper().catch((err) => {
      console.error("Failed to run scraper:", err);
      process.exit(1);
    });
  });