multiple search term support in jobspy
This commit is contained in:
parent
4b4ce5567f
commit
c4fa1794ea
@ -35,7 +35,8 @@ services:
|
|||||||
|
|
||||||
# JobSpy (Indeed/LinkedIn scraping) - optional
|
# JobSpy (Indeed/LinkedIn scraping) - optional
|
||||||
- JOBSPY_SITES=${JOBSPY_SITES:-indeed,linkedin}
|
- JOBSPY_SITES=${JOBSPY_SITES:-indeed,linkedin}
|
||||||
- JOBSPY_SEARCH_TERM=${JOBSPY_SEARCH_TERM:-web developer}
|
# Preferred: pipe-separated list, e.g. "web developer|frontend developer|react developer"
|
||||||
|
- JOBSPY_SEARCH_TERMS=${JOBSPY_SEARCH_TERMS:-web developer|frontend developer|react developer}
|
||||||
- JOBSPY_LOCATION=${JOBSPY_LOCATION:-UK}
|
- JOBSPY_LOCATION=${JOBSPY_LOCATION:-UK}
|
||||||
- JOBSPY_RESULTS_WANTED=${JOBSPY_RESULTS_WANTED:-200}
|
- JOBSPY_RESULTS_WANTED=${JOBSPY_RESULTS_WANTED:-200}
|
||||||
- JOBSPY_HOURS_OLD=${JOBSPY_HOURS_OLD:-72}
|
- JOBSPY_HOURS_OLD=${JOBSPY_HOURS_OLD:-72}
|
||||||
|
|||||||
@ -5,7 +5,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import { spawn } from 'child_process';
|
import { spawn } from 'child_process';
|
||||||
import { readFile, mkdir } from 'fs/promises';
|
import { readFile, mkdir, unlink } from 'fs/promises';
|
||||||
import { join, dirname } from 'path';
|
import { join, dirname } from 'path';
|
||||||
import { fileURLToPath } from 'url';
|
import { fileURLToPath } from 'url';
|
||||||
import type { CreateJobInput, JobSource } from '../../shared/types.js';
|
import type { CreateJobInput, JobSource } from '../../shared/types.js';
|
||||||
@ -110,7 +110,7 @@ function formatSalary(params: {
|
|||||||
|
|
||||||
export interface RunJobSpyOptions {
|
export interface RunJobSpyOptions {
|
||||||
sites?: Array<JobSource>;
|
sites?: Array<JobSource>;
|
||||||
searchTerm?: string;
|
searchTerms?: string[];
|
||||||
location?: string;
|
location?: string;
|
||||||
resultsWanted?: number;
|
resultsWanted?: number;
|
||||||
hoursOld?: number;
|
hoursOld?: number;
|
||||||
@ -129,14 +129,25 @@ export async function runJobSpy(options: RunJobSpyOptions = {}): Promise<JobSpyR
|
|||||||
const outputDir = join(dataDir, 'imports');
|
const outputDir = join(dataDir, 'imports');
|
||||||
await mkdir(outputDir, { recursive: true });
|
await mkdir(outputDir, { recursive: true });
|
||||||
|
|
||||||
const outputCsv = join(outputDir, 'jobspy_jobs.csv');
|
|
||||||
const outputJson = join(outputDir, 'jobspy_jobs.json');
|
|
||||||
|
|
||||||
const sites = (options.sites ?? ['indeed', 'linkedin'])
|
const sites = (options.sites ?? ['indeed', 'linkedin'])
|
||||||
.filter((s) => s === 'indeed' || s === 'linkedin')
|
.filter((s) => s === 'indeed' || s === 'linkedin')
|
||||||
.join(',');
|
.join(',');
|
||||||
|
|
||||||
|
const searchTerms = resolveSearchTerms(options);
|
||||||
|
if (searchTerms.length === 0) {
|
||||||
|
return { success: true, jobs: [] };
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
const jobs: CreateJobInput[] = [];
|
||||||
|
const seenJobUrls = new Set<string>();
|
||||||
|
|
||||||
|
for (let i = 0; i < searchTerms.length; i++) {
|
||||||
|
const searchTerm = searchTerms[i];
|
||||||
|
const suffix = `${i + 1}_${slugForFilename(searchTerm)}`;
|
||||||
|
const outputCsv = join(outputDir, `jobspy_jobs_${suffix}.csv`);
|
||||||
|
const outputJson = join(outputDir, `jobspy_jobs_${suffix}.json`);
|
||||||
|
|
||||||
await new Promise<void>((resolve, reject) => {
|
await new Promise<void>((resolve, reject) => {
|
||||||
const pythonPath = getPythonPath();
|
const pythonPath = getPythonPath();
|
||||||
const child = spawn(pythonPath, [JOBSPY_SCRIPT], {
|
const child = spawn(pythonPath, [JOBSPY_SCRIPT], {
|
||||||
@ -146,7 +157,7 @@ export async function runJobSpy(options: RunJobSpyOptions = {}): Promise<JobSpyR
|
|||||||
env: {
|
env: {
|
||||||
...process.env,
|
...process.env,
|
||||||
JOBSPY_SITES: sites || 'indeed,linkedin',
|
JOBSPY_SITES: sites || 'indeed,linkedin',
|
||||||
JOBSPY_SEARCH_TERM: options.searchTerm ?? process.env.JOBSPY_SEARCH_TERM ?? 'web developer',
|
JOBSPY_SEARCH_TERM: searchTerm,
|
||||||
JOBSPY_LOCATION: options.location ?? process.env.JOBSPY_LOCATION ?? 'UK',
|
JOBSPY_LOCATION: options.location ?? process.env.JOBSPY_LOCATION ?? 'UK',
|
||||||
JOBSPY_RESULTS_WANTED: String(options.resultsWanted ?? process.env.JOBSPY_RESULTS_WANTED ?? 200),
|
JOBSPY_RESULTS_WANTED: String(options.resultsWanted ?? process.env.JOBSPY_RESULTS_WANTED ?? 200),
|
||||||
JOBSPY_HOURS_OLD: String(options.hoursOld ?? process.env.JOBSPY_HOURS_OLD ?? 72),
|
JOBSPY_HOURS_OLD: String(options.hoursOld ?? process.env.JOBSPY_HOURS_OLD ?? 72),
|
||||||
@ -168,7 +179,80 @@ export async function runJobSpy(options: RunJobSpyOptions = {}): Promise<JobSpyR
|
|||||||
|
|
||||||
const raw = await readFile(outputJson, 'utf-8');
|
const raw = await readFile(outputJson, 'utf-8');
|
||||||
const parsed = JSON.parse(raw) as Array<Record<string, unknown>>;
|
const parsed = JSON.parse(raw) as Array<Record<string, unknown>>;
|
||||||
|
const mapped = mapJobSpyRows(parsed);
|
||||||
|
|
||||||
|
for (const job of mapped) {
|
||||||
|
const url = job.jobUrl;
|
||||||
|
if (seenJobUrls.has(url)) continue;
|
||||||
|
seenJobUrls.add(url);
|
||||||
|
jobs.push(job);
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
await unlink(outputJson);
|
||||||
|
await unlink(outputCsv);
|
||||||
|
} catch {
|
||||||
|
// Ignore cleanup errors
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return { success: true, jobs };
|
||||||
|
} catch (error) {
|
||||||
|
const message = error instanceof Error ? error.message : 'Unknown error';
|
||||||
|
return { success: false, jobs: [], error: message };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Determines the list of search terms for a JobSpy run.
 *
 * Sources are consulted in order of precedence and the first one that yields
 * at least one usable term wins:
 *   1. `options.searchTerms`
 *   2. the `JOBSPY_SEARCH_TERMS` environment variable (list form)
 *   3. the legacy single-term `JOBSPY_SEARCH_TERM` environment variable
 *   4. the built-in default, `'web developer'`
 *
 * Each source is normalized before it is judged: terms are trimmed, blank
 * terms are dropped, and case-insensitive duplicates are removed while
 * keeping the first spelling and the original order. A source that contains
 * only blank entries is therefore treated the same as a missing source
 * instead of producing an empty run.
 *
 * @param options - Run options; only `searchTerms` is read here.
 * @returns A non-empty list of unique, trimmed search terms.
 */
function resolveSearchTerms(options: RunJobSpyOptions): string[] {
  // Trim, drop blanks, and de-duplicate case-insensitively (first spelling wins).
  const normalize = (terms: readonly string[]): string[] => {
    const unique: string[] = [];
    const seenKeys = new Set<string>();
    for (const term of terms) {
      const cleaned = term.trim();
      if (!cleaned) continue;
      const key = cleaned.toLowerCase();
      if (seenKeys.has(key)) continue;
      seenKeys.add(key);
      unique.push(cleaned);
    }
    return unique;
  };

  const legacyTerm = process.env.JOBSPY_SEARCH_TERM;

  // Listed by precedence and read lazily so lower-priority sources are only
  // parsed when the higher-priority ones produced nothing usable.
  const sources: Array<() => readonly string[] | null | undefined> = [
    () => options.searchTerms,
    () => parseSearchTermsEnv(process.env.JOBSPY_SEARCH_TERMS),
    // Kept for deployments that still configure the old single-term variable.
    () => (legacyTerm ? [legacyTerm] : null),
  ];

  for (const readSource of sources) {
    const terms = normalize(readSource() ?? []);
    if (terms.length > 0) return terms;
  }

  return ['web developer'];
}
|
||||||
|
|
||||||
|
/**
 * Parses the raw value of a search-terms environment variable into a list.
 *
 * Accepted formats:
 *   - a JSON array of strings, e.g. `["web developer", "react developer"]`
 *   - a delimited list, split on the first delimiter kind found in this
 *     order of preference: `|`, newline, `,`
 *
 * A value that starts with `[` but is not a valid JSON array of strings is
 * handled as a delimited list.
 *
 * Whichever format is used, every term is trimmed and blank terms are
 * discarded, so the JSON and delimited forms behave identically.
 *
 * @param raw - The environment variable value, possibly undefined.
 * @returns The non-blank, trimmed terms, or `null` when the value is missing,
 *          blank, or contains no usable term (including `[]`).
 */
function parseSearchTermsEnv(raw: string | undefined): string[] | null {
  const trimmed = raw?.trim();
  if (!trimmed) return null;

  let candidates: string[] | null = null;

  if (trimmed.startsWith('[')) {
    try {
      const parsed: unknown = JSON.parse(trimmed);
      if (Array.isArray(parsed) && parsed.every((v): v is string => typeof v === 'string')) {
        candidates = parsed;
      }
    } catch {
      // Not valid JSON; handle the value as a delimited list below.
    }
  }

  if (candidates === null) {
    const delimiter = trimmed.includes('|') ? '|' : trimmed.includes('\n') ? '\n' : ',';
    candidates = trimmed.split(delimiter);
  }

  // Shared clean-up so an empty or all-blank JSON array yields null, letting
  // callers fall back with `??` exactly as they do for a blank value.
  const terms = candidates.map((t) => t.trim()).filter(Boolean);
  return terms.length > 0 ? terms : null;
}
|
||||||
|
|
||||||
|
/**
 * Builds a filesystem-friendly slug from free text for use in output
 * file names.
 *
 * The text is lower-cased, every run of characters outside `a-z0-9` acts as
 * a single `_` separator (with none left at either end before truncation),
 * and the result is cut to at most 40 characters.
 *
 * @param input - Arbitrary text, e.g. a search term.
 * @returns The slug, or `'term'` when the input has no `a-z0-9` characters.
 */
function slugForFilename(input: string): string {
  const MAX_LENGTH = 40;

  // Splitting on non-alphanumeric runs leaves empty pieces only at the
  // edges; removing them is what keeps separators off the start and end.
  const words = input
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((piece) => piece.length > 0);

  const truncated = words.join('_').slice(0, MAX_LENGTH);
  if (truncated.length === 0) {
    return 'term';
  }
  return truncated;
}
|
||||||
|
|
||||||
|
function mapJobSpyRows(parsed: Array<Record<string, unknown>>): CreateJobInput[] {
|
||||||
const jobs: CreateJobInput[] = [];
|
const jobs: CreateJobInput[] = [];
|
||||||
|
|
||||||
for (const row of parsed) {
|
for (const row of parsed) {
|
||||||
@ -233,9 +317,5 @@ export async function runJobSpy(options: RunJobSpyOptions = {}): Promise<JobSpyR
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return { success: true, jobs };
|
return jobs;
|
||||||
} catch (error) {
|
|
||||||
const message = error instanceof Error ? error.message : 'Unknown error';
|
|
||||||
return { success: false, jobs: [], error: message };
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user