job_spy implementation

This commit is contained in:
DaKheera47 2025-12-14 22:24:34 +00:00
parent 4a00b3b900
commit cefb75a9ec
21 changed files with 1152 additions and 52 deletions

View File

@ -23,3 +23,15 @@ NOTION_DATABASE_ID=
# Optional: Webhook secret for n8n automation # Optional: Webhook secret for n8n automation
WEBHOOK_SECRET= WEBHOOK_SECRET=
# =============================================================================
# JobSpy (Indeed/LinkedIn scraping) - optional
# =============================================================================
# These control the Python JobSpy scraper used by the pipeline.
JOBSPY_SITES=indeed,linkedin
JOBSPY_SEARCH_TERM=web developer
JOBSPY_LOCATION=UK
JOBSPY_RESULTS_WANTED=200
JOBSPY_HOURS_OLD=72
JOBSPY_COUNTRY_INDEED=UK
JOBSPY_LINKEDIN_FETCH_DESCRIPTION=1

View File

@ -21,8 +21,8 @@ RUN apt-get update && apt-get install -y \
# Set working directory # Set working directory
WORKDIR /app WORKDIR /app
# Install Playwright and Firefox only # Install Playwright and Firefox only (plus JobSpy for Indeed/LinkedIn scraping)
RUN pip3 install --no-cache-dir --break-system-packages playwright && \ RUN pip3 install --no-cache-dir --break-system-packages playwright python-jobspy && \
npx playwright install firefox npx playwright install firefox
# Copy package files first for better caching # Copy package files first for better caching
@ -43,6 +43,7 @@ RUN npx camoufox fetch
WORKDIR /app WORKDIR /app
COPY orchestrator ./orchestrator COPY orchestrator ./orchestrator
COPY job-extractor ./job-extractor COPY job-extractor ./job-extractor
COPY jobspy-extractor ./jobspy-extractor
COPY resume-generator ./resume-generator COPY resume-generator ./resume-generator
# Build the orchestrator (client + server) # Build the orchestrator (client + server)

View File

@ -33,6 +33,15 @@ services:
- PIPELINE_TOP_N=${PIPELINE_TOP_N:-10} - PIPELINE_TOP_N=${PIPELINE_TOP_N:-10}
- PIPELINE_MIN_SCORE=${PIPELINE_MIN_SCORE:-50} - PIPELINE_MIN_SCORE=${PIPELINE_MIN_SCORE:-50}
# JobSpy (Indeed/LinkedIn scraping) - optional
- JOBSPY_SITES=${JOBSPY_SITES:-indeed,linkedin}
- JOBSPY_SEARCH_TERM=${JOBSPY_SEARCH_TERM:-web developer}
- JOBSPY_LOCATION=${JOBSPY_LOCATION:-UK}
- JOBSPY_RESULTS_WANTED=${JOBSPY_RESULTS_WANTED:-200}
- JOBSPY_HOURS_OLD=${JOBSPY_HOURS_OLD:-72}
- JOBSPY_COUNTRY_INDEED=${JOBSPY_COUNTRY_INDEED:-UK}
- JOBSPY_LINKEDIN_FETCH_DESCRIPTION=${JOBSPY_LINKEDIN_FETCH_DESCRIPTION:-1}
# Optional: Notion integration # Optional: Notion integration
- NOTION_API_KEY=${NOTION_API_KEY:-} - NOTION_API_KEY=${NOTION_API_KEY:-}
- NOTION_DATABASE_ID=${NOTION_DATABASE_ID:-} - NOTION_DATABASE_ID=${NOTION_DATABASE_ID:-}

View File

@ -0,0 +1 @@
python-jobspy

View File

@ -0,0 +1,77 @@
import csv
import os
import sys
from pathlib import Path

from jobspy import scrape_jobs
def _env_str(name: str, default: str) -> str:
    """Return the environment variable *name* as a trimmed string.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is unset, empty, or
            contains only whitespace.

    Returns:
        The variable's value with surrounding whitespace removed, or
        *default* when no usable text is present.
    """
    value = os.getenv(name)
    if value is None:
        return default
    # Trim so padded values (e.g. " UK ") are not handed on verbatim.
    stripped = value.strip()
    return stripped if stripped else default
def _env_int(name: str, default: int) -> int:
    """Return the environment variable *name* parsed as an integer.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is unset, blank, or not
            a valid integer literal.

    Returns:
        The parsed integer, or *default* when no valid integer is present.
    """
    value = os.getenv(name)
    if value is None or value.strip() == "":
        return default
    try:
        return int(value)
    except ValueError:
        # Keep the best-effort fallback, but report it on stderr so a typo
        # in the configuration does not go unnoticed.
        print(
            f"Warning: {name}={value!r} is not a valid integer; "
            f"using default {default}",
            file=sys.stderr,
        )
        return default
def _env_bool(name: str, default: bool) -> bool:
    """Interpret the environment variable *name* as a boolean flag.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        True when the trimmed, lower-cased value is one of "1", "true",
        "yes", "y" or "on"; False for any other non-blank value; *default*
        when the variable is unset or blank.
    """
    raw = os.getenv(name)
    text = "" if raw is None else raw.strip()
    if not text:
        return default
    return text.lower() in {"1", "true", "yes", "y", "on"}
def _parse_sites(raw: str) -> list[str]:
    """Split a comma-separated string into trimmed, non-empty entries.

    Args:
        raw: Comma-separated text, e.g. "indeed, linkedin".

    Returns:
        The entries in their original order, each stripped of surrounding
        whitespace, with empty entries dropped.
    """
    trimmed = (piece.strip() for piece in raw.split(","))
    return [entry for entry in trimmed if entry]
def main() -> int:
    """Scrape jobs via JobSpy using env-driven settings and save the results.

    Settings are read from the JOBSPY_* environment variables, each with a
    built-in default. The result of ``scrape_jobs`` is written to a CSV file
    and a JSON file; the JSON path defaults to the CSV path with a ``.json``
    suffix. Parent directories of both outputs are created when missing.

    Returns:
        0 once both output files have been written.
    """
    # Resolve output locations first; the JSON default depends on the CSV path.
    output_csv = Path(_env_str("JOBSPY_OUTPUT_CSV", "jobs.csv"))
    output_json = Path(
        _env_str("JOBSPY_OUTPUT_JSON", str(output_csv.with_suffix(".json")))
    )
    for target in (output_csv, output_json):
        target.parent.mkdir(parents=True, exist_ok=True)

    # Collect every scraper argument in one place before calling JobSpy.
    scrape_kwargs = {
        "site_name": _parse_sites(_env_str("JOBSPY_SITES", "indeed,linkedin")),
        "search_term": _env_str("JOBSPY_SEARCH_TERM", "web developer"),
        "location": _env_str("JOBSPY_LOCATION", "UK"),
        "results_wanted": _env_int("JOBSPY_RESULTS_WANTED", 200),
        "hours_old": _env_int("JOBSPY_HOURS_OLD", 72),
        "country_indeed": _env_str("JOBSPY_COUNTRY_INDEED", "UK"),
        "linkedin_fetch_description": _env_bool(
            "JOBSPY_LINKEDIN_FETCH_DESCRIPTION", True
        ),
    }

    # NOTE(review): presumably a pandas DataFrame (it exposes to_csv/to_json) - confirm.
    jobs = scrape_jobs(**scrape_kwargs)
    print(f"Found {len(jobs)} jobs")

    csv_options = {
        "quoting": csv.QUOTE_NONNUMERIC,
        "escapechar": "\\",
        "index": False,
    }
    jobs.to_csv(output_csv, **csv_options)
    jobs.to_json(output_json, orient="records", force_ascii=False)

    print(f"Wrote CSV: {output_csv}")
    print(f"Wrote JSON: {output_json}")
    return 0
# Script entry point: run the scraper and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)

View File

@ -19,3 +19,14 @@ PIPELINE_MIN_SCORE=50
# RXResume credentials (for PDF generation) # RXResume credentials (for PDF generation)
RXRESUME_EMAIL= RXRESUME_EMAIL=
RXRESUME_PASSWORD= RXRESUME_PASSWORD=
# =============================================================================
# JobSpy (Indeed/LinkedIn scraping) - optional
# =============================================================================
JOBSPY_SITES=indeed,linkedin
JOBSPY_SEARCH_TERM=web developer
JOBSPY_LOCATION=UK
JOBSPY_RESULTS_WANTED=200
JOBSPY_HOURS_OLD=72
JOBSPY_COUNTRY_INDEED=UK
JOBSPY_LINKEDIN_FETCH_DESCRIPTION=1

View File

@ -9,6 +9,7 @@
"version": "1.0.0", "version": "1.0.0",
"dependencies": { "dependencies": {
"@radix-ui/react-alert-dialog": "^1.1.15", "@radix-ui/react-alert-dialog": "^1.1.15",
"@radix-ui/react-dropdown-menu": "^2.1.15",
"@radix-ui/react-progress": "^1.1.8", "@radix-ui/react-progress": "^1.1.8",
"@radix-ui/react-separator": "^1.1.8", "@radix-ui/react-separator": "^1.1.8",
"@radix-ui/react-slot": "^1.2.4", "@radix-ui/react-slot": "^1.2.4",
@ -1229,6 +1230,40 @@
"node": ">=12" "node": ">=12"
} }
}, },
"node_modules/@floating-ui/core": {
"version": "1.7.3",
"resolved": "https://registry.npmjs.org/@floating-ui/core/-/core-1.7.3.tgz",
"integrity": "sha512-sGnvb5dmrJaKEZ+LDIpguvdX3bDlEllmv4/ClQ9awcmCZrlx5jQyyMWFM5kBI+EyNOCDDiKk8il0zeuX3Zlg/w==",
"dependencies": {
"@floating-ui/utils": "^0.2.10"
}
},
"node_modules/@floating-ui/dom": {
"version": "1.7.4",
"resolved": "https://registry.npmjs.org/@floating-ui/dom/-/dom-1.7.4.tgz",
"integrity": "sha512-OOchDgh4F2CchOX94cRVqhvy7b3AFb+/rQXyswmzmGakRfkMgoWVjfnLWkRirfLEfuD4ysVW16eXzwt3jHIzKA==",
"dependencies": {
"@floating-ui/core": "^1.7.3",
"@floating-ui/utils": "^0.2.10"
}
},
"node_modules/@floating-ui/react-dom": {
"version": "2.1.6",
"resolved": "https://registry.npmjs.org/@floating-ui/react-dom/-/react-dom-2.1.6.tgz",
"integrity": "sha512-4JX6rEatQEvlmgU80wZyq9RT96HZJa88q8hp0pBd+LrczeDI4o6uA2M+uvxngVHo4Ihr8uibXxH6+70zhAFrVw==",
"dependencies": {
"@floating-ui/dom": "^1.7.4"
},
"peerDependencies": {
"react": ">=16.8.0",
"react-dom": ">=16.8.0"
}
},
"node_modules/@floating-ui/utils": {
"version": "0.2.10",
"resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.2.10.tgz",
"integrity": "sha512-aGTxbpbg8/b5JfU1HXSrbH3wXZuLPJcNEcZQFMxLs3oSzgtVu6nFPkbbGGUvBcUjKV2YyB9Wxxabo+HEH9tcRQ=="
},
"node_modules/@jridgewell/gen-mapping": { "node_modules/@jridgewell/gen-mapping": {
"version": "0.3.13", "version": "0.3.13",
"resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz", "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz",
@ -1335,6 +1370,28 @@
} }
} }
}, },
"node_modules/@radix-ui/react-arrow": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
"integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-collection": { "node_modules/@radix-ui/react-collection": {
"version": "1.1.7", "version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz",
@ -1497,6 +1554,34 @@
} }
} }
}, },
"node_modules/@radix-ui/react-dropdown-menu": {
"version": "2.1.16",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dropdown-menu/-/react-dropdown-menu-2.1.16.tgz",
"integrity": "sha512-1PLGQEynI/3OX/ftV54COn+3Sud/Mn8vALg2rWnBLnRaGtJDduNW/22XjlGgPdpcIbiQxjKtb7BkcjP00nqfJw==",
"dependencies": {
"@radix-ui/primitive": "1.1.3",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-id": "1.1.1",
"@radix-ui/react-menu": "2.1.16",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-controllable-state": "1.2.2"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-focus-guards": { "node_modules/@radix-ui/react-focus-guards": {
"version": "1.1.3", "version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz",
@ -1552,6 +1637,93 @@
} }
} }
}, },
"node_modules/@radix-ui/react-menu": {
"version": "2.1.16",
"resolved": "https://registry.npmjs.org/@radix-ui/react-menu/-/react-menu-2.1.16.tgz",
"integrity": "sha512-72F2T+PLlphrqLcAotYPp0uJMr5SjP5SL01wfEspJbru5Zs5vQaSHb4VB3ZMJPimgHHCHG7gMOeOB9H3Hdmtxg==",
"dependencies": {
"@radix-ui/primitive": "1.1.3",
"@radix-ui/react-collection": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-direction": "1.1.1",
"@radix-ui/react-dismissable-layer": "1.1.11",
"@radix-ui/react-focus-guards": "1.1.3",
"@radix-ui/react-focus-scope": "1.1.7",
"@radix-ui/react-id": "1.1.1",
"@radix-ui/react-popper": "1.2.8",
"@radix-ui/react-portal": "1.1.9",
"@radix-ui/react-presence": "1.1.5",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-roving-focus": "1.1.11",
"@radix-ui/react-slot": "1.2.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"aria-hidden": "^1.2.4",
"react-remove-scroll": "^2.6.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-slot": {
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.3.tgz",
"integrity": "sha512-aeNmHnBxbi2St0au6VBVC7JXFlhLlOnvIIlePNniyUNAClzmtAUEY8/pBiK3iHjufOlwA+c20/8jngo7xcrg8A==",
"dependencies": {
"@radix-ui/react-compose-refs": "1.1.2"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-popper": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
"integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
"dependencies": {
"@floating-ui/react-dom": "^2.0.0",
"@radix-ui/react-arrow": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-layout-effect": "1.1.1",
"@radix-ui/react-use-rect": "1.1.1",
"@radix-ui/react-use-size": "1.1.1",
"@radix-ui/rect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-portal": { "node_modules/@radix-ui/react-portal": {
"version": "1.1.9", "version": "1.1.9",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
@ -1896,6 +2068,45 @@
} }
} }
}, },
"node_modules/@radix-ui/react-use-rect": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-rect/-/react-use-rect-1.1.1.tgz",
"integrity": "sha512-QTYuDesS0VtuHNNvMh+CjlKJ4LJickCMUAqjlE3+j8w+RlRpwyX3apEQKGFzbZGdo7XNG1tXa+bQqIE7HIXT2w==",
"dependencies": {
"@radix-ui/rect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-use-size": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-size/-/react-use-size-1.1.1.tgz",
"integrity": "sha512-ewrXRDTAqAXlkl6t/fkXWNAhFX9I+CkKlw6zjEwk86RSPKwZr3xpBRso655aqYafwtnbpHLj6toFzmd6xdVptQ==",
"dependencies": {
"@radix-ui/react-use-layout-effect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/@radix-ui/rect": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/@radix-ui/rect/-/rect-1.1.1.tgz",
"integrity": "sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw=="
},
"node_modules/@rolldown/pluginutils": { "node_modules/@rolldown/pluginutils": {
"version": "1.0.0-beta.27", "version": "1.0.0-beta.27",
"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.27.tgz", "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.27.tgz",

View File

@ -18,6 +18,7 @@
"pipeline:run": "tsx src/server/pipeline/run.ts" "pipeline:run": "tsx src/server/pipeline/run.ts"
}, },
"dependencies": { "dependencies": {
"@radix-ui/react-dropdown-menu": "^2.1.15",
"@radix-ui/react-alert-dialog": "^1.1.15", "@radix-ui/react-alert-dialog": "^1.1.15",
"@radix-ui/react-progress": "^1.1.8", "@radix-ui/react-progress": "^1.1.8",
"@radix-ui/react-separator": "^1.1.8", "@radix-ui/react-separator": "^1.1.8",

View File

@ -6,10 +6,13 @@ import React, { useCallback, useEffect, useState } from "react";
import { toast } from "sonner"; import { toast } from "sonner";
import { Toaster } from "@/components/ui/sonner"; import { Toaster } from "@/components/ui/sonner";
import type { Job, JobStatus } from "../shared/types"; import type { Job, JobSource, JobStatus } from "../shared/types";
import { Header, JobList, PipelineProgress, Stats } from "./components"; import { Header, JobList, PipelineProgress, Stats } from "./components";
import * as api from "./api"; import * as api from "./api";
const DEFAULT_PIPELINE_SOURCES: JobSource[] = ["gradcracker", "indeed", "linkedin"];
const PIPELINE_SOURCES_STORAGE_KEY = "jobops.pipeline.sources";
export const App: React.FC = () => { export const App: React.FC = () => {
const [jobs, setJobs] = useState<Job[]>([]); const [jobs, setJobs] = useState<Job[]>([]);
const [stats, setStats] = useState<Record<JobStatus, number>>({ const [stats, setStats] = useState<Record<JobStatus, number>>({
@ -24,6 +27,27 @@ export const App: React.FC = () => {
const [isPipelineRunning, setIsPipelineRunning] = useState(false); const [isPipelineRunning, setIsPipelineRunning] = useState(false);
const [processingJobId, setProcessingJobId] = useState<string | null>(null); const [processingJobId, setProcessingJobId] = useState<string | null>(null);
const [isProcessingAll, setIsProcessingAll] = useState(false); const [isProcessingAll, setIsProcessingAll] = useState(false);
const [pipelineSources, setPipelineSources] = useState<JobSource[]>(() => {
try {
const raw = localStorage.getItem(PIPELINE_SOURCES_STORAGE_KEY);
if (!raw) return DEFAULT_PIPELINE_SOURCES;
const parsed = JSON.parse(raw) as unknown;
const allowed: JobSource[] = ["gradcracker", "indeed", "linkedin"];
if (!Array.isArray(parsed)) return DEFAULT_PIPELINE_SOURCES;
const next = parsed.filter((value): value is JobSource => allowed.includes(value));
return next.length > 0 ? next : DEFAULT_PIPELINE_SOURCES;
} catch {
return DEFAULT_PIPELINE_SOURCES;
}
});
useEffect(() => {
try {
localStorage.setItem(PIPELINE_SOURCES_STORAGE_KEY, JSON.stringify(pipelineSources));
} catch {
// Ignore localStorage errors
}
}, [pipelineSources]);
const loadJobs = useCallback(async () => { const loadJobs = useCallback(async () => {
try { try {
@ -63,8 +87,10 @@ export const App: React.FC = () => {
const handleRunPipeline = async () => { const handleRunPipeline = async () => {
try { try {
setIsPipelineRunning(true); setIsPipelineRunning(true);
await api.runPipeline(); await api.runPipeline({ sources: pipelineSources });
toast.message("Pipeline started", { description: "This may take a few minutes." }); toast.message("Pipeline started", {
description: `Sources: ${pipelineSources.join(", ")}. This may take a few minutes.`,
});
const pollInterval = setInterval(async () => { const pollInterval = setInterval(async () => {
try { try {
@ -170,6 +196,8 @@ export const App: React.FC = () => {
onClearDatabase={handleClearDatabase} onClearDatabase={handleClearDatabase}
isPipelineRunning={isPipelineRunning} isPipelineRunning={isPipelineRunning}
isLoading={isLoading} isLoading={isLoading}
pipelineSources={pipelineSources}
onPipelineSourcesChange={setPipelineSources}
/> />
<main className="container mx-auto max-w-7xl space-y-6 px-4 py-6 pb-12"> <main className="container mx-auto max-w-7xl space-y-6 px-4 py-6 pb-12">
@ -190,4 +218,3 @@ export const App: React.FC = () => {
</> </>
); );
}; };

View File

@ -7,6 +7,7 @@ import type {
ApiResponse, ApiResponse,
JobsListResponse, JobsListResponse,
PipelineStatusResponse, PipelineStatusResponse,
JobSource,
PipelineRun PipelineRun
} from '../../shared/types'; } from '../../shared/types';
@ -83,6 +84,7 @@ export async function getPipelineRuns(): Promise<PipelineRun[]> {
export async function runPipeline(config?: { export async function runPipeline(config?: {
topN?: number; topN?: number;
minSuitabilityScore?: number; minSuitabilityScore?: number;
sources?: JobSource[];
}): Promise<{ message: string }> { }): Promise<{ message: string }> {
return fetchApi<{ message: string }>('/pipeline/run', { return fetchApi<{ message: string }>('/pipeline/run', {
method: 'POST', method: 'POST',

View File

@ -3,7 +3,7 @@
*/ */
import React from "react"; import React from "react";
import { Loader2, Play, RefreshCcw, Rocket, Trash2 } from "lucide-react"; import { ChevronDown, Loader2, Play, RefreshCcw, Rocket, Trash2 } from "lucide-react";
import { Button } from "@/components/ui/button"; import { Button } from "@/components/ui/button";
import { import {
@ -17,6 +17,16 @@ import {
AlertDialogTitle, AlertDialogTitle,
AlertDialogTrigger, AlertDialogTrigger,
} from "@/components/ui/alert-dialog"; } from "@/components/ui/alert-dialog";
import {
DropdownMenu,
DropdownMenuCheckboxItem,
DropdownMenuContent,
DropdownMenuItem,
DropdownMenuLabel,
DropdownMenuSeparator,
DropdownMenuTrigger,
} from "@/components/ui/dropdown-menu";
import type { JobSource } from "../../shared/types";
interface HeaderProps { interface HeaderProps {
onRunPipeline: () => void; onRunPipeline: () => void;
@ -24,6 +34,8 @@ interface HeaderProps {
onClearDatabase: () => void; onClearDatabase: () => void;
isPipelineRunning: boolean; isPipelineRunning: boolean;
isLoading: boolean; isLoading: boolean;
pipelineSources: JobSource[];
onPipelineSourcesChange: (sources: JobSource[]) => void;
} }
export const Header: React.FC<HeaderProps> = ({ export const Header: React.FC<HeaderProps> = ({
@ -32,7 +44,26 @@ export const Header: React.FC<HeaderProps> = ({
onClearDatabase, onClearDatabase,
isPipelineRunning, isPipelineRunning,
isLoading, isLoading,
pipelineSources,
onPipelineSourcesChange,
}) => { }) => {
const sourceLabel: Record<JobSource, string> = {
gradcracker: "Gradcracker",
indeed: "Indeed",
linkedin: "LinkedIn",
};
const orderedSources: JobSource[] = ["gradcracker", "indeed", "linkedin"];
const toggleSource = (source: JobSource, checked: boolean) => {
const next = checked
? Array.from(new Set([...pipelineSources, source]))
: pipelineSources.filter((s) => s !== source);
if (next.length === 0) return;
onPipelineSourcesChange(next);
};
return ( return (
<header className="sticky top-0 z-40 border-b bg-background/80 backdrop-blur supports-[backdrop-filter]:bg-background/60"> <header className="sticky top-0 z-40 border-b bg-background/80 backdrop-blur supports-[backdrop-filter]:bg-background/60">
<div className="container mx-auto flex max-w-7xl items-center justify-between gap-4 px-4 py-4"> <div className="container mx-auto flex max-w-7xl items-center justify-between gap-4 px-4 py-4">
@ -81,19 +112,62 @@ export const Header: React.FC<HeaderProps> = ({
<span className="hidden sm:inline">Refresh</span> <span className="hidden sm:inline">Refresh</span>
</Button> </Button>
<Button size="sm" onClick={onRunPipeline} disabled={isPipelineRunning}> <div className="flex items-center">
{isPipelineRunning ? ( <Button
<> size="sm"
<Loader2 className="h-4 w-4 animate-spin" /> onClick={onRunPipeline}
Running... disabled={isPipelineRunning}
</> className="rounded-r-none"
) : ( >
<> {isPipelineRunning ? (
<Play className="h-4 w-4" /> <>
Run Pipeline <Loader2 className="h-4 w-4 animate-spin" />
</> Running...
)} </>
</Button> ) : (
<>
<Play className="h-4 w-4" />
Run Pipeline
</>
)}
</Button>
<DropdownMenu>
<DropdownMenuTrigger asChild>
<Button
size="sm"
disabled={isPipelineRunning}
className="rounded-l-none border-l border-primary-foreground/20 px-2"
aria-label="Select pipeline sources"
>
<ChevronDown className="h-4 w-4" />
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent align="end" className="w-56">
<DropdownMenuLabel>Sources</DropdownMenuLabel>
<DropdownMenuSeparator />
{orderedSources.map((source) => (
<DropdownMenuCheckboxItem
key={source}
checked={pipelineSources.includes(source)}
onCheckedChange={(checked) => toggleSource(source, Boolean(checked))}
>
{sourceLabel[source]}
</DropdownMenuCheckboxItem>
))}
<DropdownMenuSeparator />
<DropdownMenuItem onSelect={() => onPipelineSourcesChange(orderedSources)}>
All sources
</DropdownMenuItem>
<DropdownMenuItem onSelect={() => onPipelineSourcesChange(["gradcracker"])}>
Gradcracker only
</DropdownMenuItem>
<DropdownMenuItem onSelect={() => onPipelineSourcesChange(["indeed", "linkedin"])}>
Indeed + LinkedIn only
</DropdownMenuItem>
</DropdownMenuContent>
</DropdownMenu>
</div>
</div> </div>
</div> </div>
</header> </header>

View File

@ -55,6 +55,8 @@ export const JobCard: React.FC<JobCardProps> = ({
}) => { }) => {
const sourceLabel: Record<Job["source"], string> = { const sourceLabel: Record<Job["source"], string> = {
gradcracker: "Gradcracker", gradcracker: "Gradcracker",
indeed: "Indeed",
linkedin: "LinkedIn",
}; };
const hasPdf = !!job.pdfPath; const hasPdf = !!job.pdfPath;

View File

@ -0,0 +1,193 @@
import * as React from "react"
import * as DropdownMenuPrimitive from "@radix-ui/react-dropdown-menu"
import { Check, ChevronRight, Circle } from "lucide-react"
import { cn } from "@/lib/utils"
// Direct aliases of the Radix dropdown-menu primitives that need no extra
// styling; they are re-exported under the local DropdownMenu* names.
const DropdownMenu = DropdownMenuPrimitive.Root
const DropdownMenuTrigger = DropdownMenuPrimitive.Trigger
const DropdownMenuGroup = DropdownMenuPrimitive.Group
const DropdownMenuPortal = DropdownMenuPrimitive.Portal
const DropdownMenuSub = DropdownMenuPrimitive.Sub
const DropdownMenuRadioGroup = DropdownMenuPrimitive.RadioGroup
/**
 * Styled wrapper around the Radix `SubTrigger` primitive.
 *
 * Forwards the ref and all remaining props to the primitive, passes the base
 * classes, the optional `inset` padding class ("pl-8") and the caller's
 * `className` through `cn`, and renders a ChevronRight icon after the children.
 */
const DropdownMenuSubTrigger = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.SubTrigger>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.SubTrigger> & {
    inset?: boolean
  }
>((allProps, forwardedRef) => {
  const { className, inset, children, ...rest } = allProps
  const classes = cn(
    "flex cursor-default select-none items-center rounded-sm px-2 py-1.5 text-sm outline-none focus:bg-accent data-[state=open]:bg-accent",
    inset && "pl-8",
    className
  )

  return (
    <DropdownMenuPrimitive.SubTrigger
      ref={forwardedRef}
      className={classes}
      {...rest}
    >
      {children}
      <ChevronRight className="ml-auto h-4 w-4" />
    </DropdownMenuPrimitive.SubTrigger>
  )
})
DropdownMenuSubTrigger.displayName = DropdownMenuPrimitive.SubTrigger.displayName
/**
 * Styled wrapper around the Radix `SubContent` primitive.
 *
 * Forwards the ref and remaining props, and passes the popover base classes
 * together with the caller's `className` through `cn`.
 */
const DropdownMenuSubContent = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.SubContent>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.SubContent>
>((allProps, forwardedRef) => {
  const { className, ...rest } = allProps
  const classes = cn(
    "z-50 min-w-[8rem] overflow-hidden rounded-md border bg-popover p-1 text-popover-foreground shadow-md data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2",
    className
  )

  return (
    <DropdownMenuPrimitive.SubContent
      ref={forwardedRef}
      className={classes}
      {...rest}
    />
  )
})
DropdownMenuSubContent.displayName = DropdownMenuPrimitive.SubContent.displayName
/**
 * Styled wrapper around the Radix `Content` primitive, rendered inside a
 * `Portal`.
 *
 * `sideOffset` defaults to 4 when the caller does not supply it. The ref and
 * remaining props are forwarded, and the popover base classes plus the
 * caller's `className` are passed through `cn`.
 */
const DropdownMenuContent = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.Content>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.Content>
>((allProps, forwardedRef) => {
  const { className, sideOffset = 4, ...rest } = allProps
  const classes = cn(
    "z-50 min-w-[8rem] overflow-hidden rounded-md border bg-popover p-1 text-popover-foreground shadow-md data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2",
    className
  )

  return (
    <DropdownMenuPrimitive.Portal>
      <DropdownMenuPrimitive.Content
        ref={forwardedRef}
        sideOffset={sideOffset}
        className={classes}
        {...rest}
      />
    </DropdownMenuPrimitive.Portal>
  )
})
DropdownMenuContent.displayName = DropdownMenuPrimitive.Content.displayName
/**
 * Styled wrapper around the Radix `Item` primitive.
 *
 * Forwards the ref and remaining props; passes the base item classes, the
 * optional `inset` padding class ("pl-8") and the caller's `className`
 * through `cn`.
 */
const DropdownMenuItem = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.Item>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.Item> & {
    inset?: boolean
  }
>((allProps, forwardedRef) => {
  const { className, inset, ...rest } = allProps
  const classes = cn(
    "relative flex cursor-default select-none items-center rounded-sm px-2 py-1.5 text-sm outline-none transition-colors focus:bg-accent focus:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50",
    inset && "pl-8",
    className
  )

  return (
    <DropdownMenuPrimitive.Item
      ref={forwardedRef}
      className={classes}
      {...rest}
    />
  )
})
DropdownMenuItem.displayName = DropdownMenuPrimitive.Item.displayName
/**
 * Styled wrapper around the Radix `CheckboxItem` primitive.
 *
 * Forwards the ref, the `checked` prop and the remaining props. A Check icon
 * wrapped in the primitive's `ItemIndicator` is placed in an absolutely
 * positioned slot on the left, followed by the children.
 */
const DropdownMenuCheckboxItem = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.CheckboxItem>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.CheckboxItem>
>((allProps, forwardedRef) => {
  const { className, children, checked, ...rest } = allProps
  const classes = cn(
    "relative flex cursor-default select-none items-center rounded-sm py-1.5 pl-8 pr-2 text-sm outline-none transition-colors focus:bg-accent focus:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50",
    className
  )

  return (
    <DropdownMenuPrimitive.CheckboxItem
      ref={forwardedRef}
      className={classes}
      checked={checked}
      {...rest}
    >
      <span className="absolute left-2 flex h-3.5 w-3.5 items-center justify-center">
        <DropdownMenuPrimitive.ItemIndicator>
          <Check className="h-4 w-4" />
        </DropdownMenuPrimitive.ItemIndicator>
      </span>
      {children}
    </DropdownMenuPrimitive.CheckboxItem>
  )
})
DropdownMenuCheckboxItem.displayName =
  DropdownMenuPrimitive.CheckboxItem.displayName
/**
 * Styled wrapper around the Radix `RadioItem` primitive.
 *
 * Forwards the ref and remaining props. A filled Circle icon wrapped in the
 * primitive's `ItemIndicator` is placed in an absolutely positioned slot on
 * the left, followed by the children.
 */
const DropdownMenuRadioItem = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.RadioItem>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.RadioItem>
>((allProps, forwardedRef) => {
  const { className, children, ...rest } = allProps
  const classes = cn(
    "relative flex cursor-default select-none items-center rounded-sm py-1.5 pl-8 pr-2 text-sm outline-none transition-colors focus:bg-accent focus:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50",
    className
  )

  return (
    <DropdownMenuPrimitive.RadioItem
      ref={forwardedRef}
      className={classes}
      {...rest}
    >
      <span className="absolute left-2 flex h-3.5 w-3.5 items-center justify-center">
        <DropdownMenuPrimitive.ItemIndicator>
          <Circle className="h-2 w-2 fill-current" />
        </DropdownMenuPrimitive.ItemIndicator>
      </span>
      {children}
    </DropdownMenuPrimitive.RadioItem>
  )
})
DropdownMenuRadioItem.displayName = DropdownMenuPrimitive.RadioItem.displayName
/**
 * Styled wrapper around the Radix `Label` primitive.
 *
 * Forwards the ref and remaining props; passes the label classes, the
 * optional `inset` padding class ("pl-8") and the caller's `className`
 * through `cn`.
 */
const DropdownMenuLabel = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.Label>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.Label> & {
    inset?: boolean
  }
>((allProps, forwardedRef) => {
  const { className, inset, ...rest } = allProps
  const classes = cn("px-2 py-1.5 text-sm font-semibold", inset && "pl-8", className)

  return (
    <DropdownMenuPrimitive.Label
      ref={forwardedRef}
      className={classes}
      {...rest}
    />
  )
})
DropdownMenuLabel.displayName = DropdownMenuPrimitive.Label.displayName
/**
 * Styled wrapper around the Radix `Separator` primitive.
 *
 * Forwards the ref and remaining props; passes the separator classes and the
 * caller's `className` through `cn`.
 */
const DropdownMenuSeparator = React.forwardRef<
  React.ElementRef<typeof DropdownMenuPrimitive.Separator>,
  React.ComponentPropsWithoutRef<typeof DropdownMenuPrimitive.Separator>
>((allProps, forwardedRef) => {
  const { className, ...rest } = allProps
  const classes = cn("-mx-1 my-1 h-px bg-muted", className)

  return (
    <DropdownMenuPrimitive.Separator
      ref={forwardedRef}
      className={classes}
      {...rest}
    />
  )
})
DropdownMenuSeparator.displayName = DropdownMenuPrimitive.Separator.displayName
/**
 * Plain `<span>` with shortcut-hint styling.
 *
 * Passes the base classes and the caller's `className` through `cn` and
 * spreads every other span attribute onto the element.
 */
const DropdownMenuShortcut = (
  allProps: React.HTMLAttributes<HTMLSpanElement>
) => {
  const { className, ...rest } = allProps
  const classes = cn("ml-auto text-xs tracking-widest opacity-60", className)

  return <span className={classes} {...rest} />
}
DropdownMenuShortcut.displayName = "DropdownMenuShortcut"
export {
DropdownMenu,
DropdownMenuTrigger,
DropdownMenuContent,
DropdownMenuItem,
DropdownMenuCheckboxItem,
DropdownMenuRadioItem,
DropdownMenuLabel,
DropdownMenuSeparator,
DropdownMenuShortcut,
DropdownMenuGroup,
DropdownMenuPortal,
DropdownMenuSub,
DropdownMenuSubContent,
DropdownMenuSubTrigger,
DropdownMenuRadioGroup,
}

View File

@ -280,6 +280,7 @@ apiRouter.get('/pipeline/runs', async (req: Request, res: Response) => {
const runPipelineSchema = z.object({ const runPipelineSchema = z.object({
topN: z.number().min(1).max(50).optional(), topN: z.number().min(1).max(50).optional(),
minSuitabilityScore: z.number().min(0).max(100).optional(), minSuitabilityScore: z.number().min(0).max(100).optional(),
sources: z.array(z.enum(['gradcracker', 'indeed', 'linkedin'])).min(1).optional(),
}); });
apiRouter.post('/pipeline/run', async (req: Request, res: Response) => { apiRouter.post('/pipeline/run', async (req: Request, res: Response) => {

View File

@ -26,6 +26,33 @@ const migrations = [
`CREATE TABLE IF NOT EXISTS jobs ( `CREATE TABLE IF NOT EXISTS jobs (
id TEXT PRIMARY KEY, id TEXT PRIMARY KEY,
source TEXT NOT NULL DEFAULT 'gradcracker', source TEXT NOT NULL DEFAULT 'gradcracker',
source_job_id TEXT,
job_url_direct TEXT,
date_posted TEXT,
job_type TEXT,
salary_source TEXT,
salary_interval TEXT,
salary_min_amount REAL,
salary_max_amount REAL,
salary_currency TEXT,
is_remote INTEGER,
job_level TEXT,
job_function TEXT,
listing_type TEXT,
emails TEXT,
company_industry TEXT,
company_logo TEXT,
company_url_direct TEXT,
company_addresses TEXT,
company_num_employees TEXT,
company_revenue TEXT,
company_description TEXT,
skills TEXT,
experience_range TEXT,
company_rating REAL,
company_reviews_count INTEGER,
vacancy_count INTEGER,
work_from_home_type TEXT,
title TEXT NOT NULL, title TEXT NOT NULL,
employer TEXT NOT NULL, employer TEXT NOT NULL,
employer_url TEXT, employer_url TEXT,
@ -65,6 +92,35 @@ const migrations = [
`ALTER TABLE jobs ADD COLUMN source TEXT NOT NULL DEFAULT 'gradcracker'`, `ALTER TABLE jobs ADD COLUMN source TEXT NOT NULL DEFAULT 'gradcracker'`,
`UPDATE jobs SET source = 'gradcracker' WHERE source IS NULL OR source = ''`, `UPDATE jobs SET source = 'gradcracker' WHERE source IS NULL OR source = ''`,
// Add JobSpy columns for existing databases (safe to skip if already present)
`ALTER TABLE jobs ADD COLUMN source_job_id TEXT`,
`ALTER TABLE jobs ADD COLUMN job_url_direct TEXT`,
`ALTER TABLE jobs ADD COLUMN date_posted TEXT`,
`ALTER TABLE jobs ADD COLUMN job_type TEXT`,
`ALTER TABLE jobs ADD COLUMN salary_source TEXT`,
`ALTER TABLE jobs ADD COLUMN salary_interval TEXT`,
`ALTER TABLE jobs ADD COLUMN salary_min_amount REAL`,
`ALTER TABLE jobs ADD COLUMN salary_max_amount REAL`,
`ALTER TABLE jobs ADD COLUMN salary_currency TEXT`,
`ALTER TABLE jobs ADD COLUMN is_remote INTEGER`,
`ALTER TABLE jobs ADD COLUMN job_level TEXT`,
`ALTER TABLE jobs ADD COLUMN job_function TEXT`,
`ALTER TABLE jobs ADD COLUMN listing_type TEXT`,
`ALTER TABLE jobs ADD COLUMN emails TEXT`,
`ALTER TABLE jobs ADD COLUMN company_industry TEXT`,
`ALTER TABLE jobs ADD COLUMN company_logo TEXT`,
`ALTER TABLE jobs ADD COLUMN company_url_direct TEXT`,
`ALTER TABLE jobs ADD COLUMN company_addresses TEXT`,
`ALTER TABLE jobs ADD COLUMN company_num_employees TEXT`,
`ALTER TABLE jobs ADD COLUMN company_revenue TEXT`,
`ALTER TABLE jobs ADD COLUMN company_description TEXT`,
`ALTER TABLE jobs ADD COLUMN skills TEXT`,
`ALTER TABLE jobs ADD COLUMN experience_range TEXT`,
`ALTER TABLE jobs ADD COLUMN company_rating REAL`,
`ALTER TABLE jobs ADD COLUMN company_reviews_count INTEGER`,
`ALTER TABLE jobs ADD COLUMN vacancy_count INTEGER`,
`ALTER TABLE jobs ADD COLUMN work_from_home_type TEXT`,
`CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status)`, `CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status)`,
`CREATE INDEX IF NOT EXISTS idx_jobs_discovered_at ON jobs(discovered_at)`, `CREATE INDEX IF NOT EXISTS idx_jobs_discovered_at ON jobs(discovered_at)`,
`CREATE INDEX IF NOT EXISTS idx_pipeline_runs_started_at ON pipeline_runs(started_at)`, `CREATE INDEX IF NOT EXISTS idx_pipeline_runs_started_at ON pipeline_runs(started_at)`,
@ -78,12 +134,12 @@ for (const migration of migrations) {
console.log('✅ Migration applied'); console.log('✅ Migration applied');
} catch (error) { } catch (error) {
const message = error instanceof Error ? error.message : String(error); const message = error instanceof Error ? error.message : String(error);
const isDuplicateSourceColumn = const isDuplicateColumn =
migration.includes('ALTER TABLE jobs ADD COLUMN source') && migration.toLowerCase().includes('alter table jobs add column') &&
message.toLowerCase().includes('duplicate column name'); message.toLowerCase().includes('duplicate column name');
if (isDuplicateSourceColumn) { if (isDuplicateColumn) {
console.log('↩️ Migration skipped (source column already exists)'); console.log('↩️ Migration skipped (column already exists)');
continue; continue;
} }

View File

@ -9,7 +9,10 @@ export const jobs = sqliteTable('jobs', {
id: text('id').primaryKey(), id: text('id').primaryKey(),
// From crawler // From crawler
source: text('source', { enum: ['gradcracker'] }).notNull().default('gradcracker'), source: text('source', { enum: ['gradcracker', 'indeed', 'linkedin'] }).notNull().default('gradcracker'),
sourceJobId: text('source_job_id'),
jobUrlDirect: text('job_url_direct'),
datePosted: text('date_posted'),
title: text('title').notNull(), title: text('title').notNull(),
employer: text('employer').notNull(), employer: text('employer').notNull(),
employerUrl: text('employer_url'), employerUrl: text('employer_url'),
@ -22,6 +25,32 @@ export const jobs = sqliteTable('jobs', {
degreeRequired: text('degree_required'), degreeRequired: text('degree_required'),
starting: text('starting'), starting: text('starting'),
jobDescription: text('job_description'), jobDescription: text('job_description'),
// JobSpy fields (nullable for other sources)
jobType: text('job_type'),
salarySource: text('salary_source'),
salaryInterval: text('salary_interval'),
salaryMinAmount: real('salary_min_amount'),
salaryMaxAmount: real('salary_max_amount'),
salaryCurrency: text('salary_currency'),
isRemote: integer('is_remote', { mode: 'boolean' }),
jobLevel: text('job_level'),
jobFunction: text('job_function'),
listingType: text('listing_type'),
emails: text('emails'),
companyIndustry: text('company_industry'),
companyLogo: text('company_logo'),
companyUrlDirect: text('company_url_direct'),
companyAddresses: text('company_addresses'),
companyNumEmployees: text('company_num_employees'),
companyRevenue: text('company_revenue'),
companyDescription: text('company_description'),
skills: text('skills'),
experienceRange: text('experience_range'),
companyRating: real('company_rating'),
companyReviewsCount: integer('company_reviews_count'),
vacancyCount: integer('vacancy_count'),
workFromHomeType: text('work_from_home_type'),
// Orchestrator enrichments // Orchestrator enrichments
status: text('status', { status: text('status', {

View File

@ -14,13 +14,14 @@ import { readFile } from 'fs/promises';
import { join, dirname } from 'path'; import { join, dirname } from 'path';
import { fileURLToPath } from 'url'; import { fileURLToPath } from 'url';
import { runCrawler } from '../services/crawler.js'; import { runCrawler } from '../services/crawler.js';
import { runJobSpy } from '../services/jobspy.js';
import { scoreAndRankJobs, scoreJobSuitability } from '../services/scorer.js'; import { scoreAndRankJobs, scoreJobSuitability } from '../services/scorer.js';
import { generateSummary } from '../services/summary.js'; import { generateSummary } from '../services/summary.js';
import { generatePdf } from '../services/pdf.js'; import { generatePdf } from '../services/pdf.js';
import * as jobsRepo from '../repositories/jobs.js'; import * as jobsRepo from '../repositories/jobs.js';
import * as pipelineRepo from '../repositories/pipeline.js'; import * as pipelineRepo from '../repositories/pipeline.js';
import { progressHelpers, resetProgress } from './progress.js'; import { progressHelpers, resetProgress, updateProgress } from './progress.js';
import type { Job, PipelineConfig } from '../../shared/types.js'; import type { CreateJobInput, Job, JobSource, PipelineConfig } from '../../shared/types.js';
const __dirname = dirname(fileURLToPath(import.meta.url)); const __dirname = dirname(fileURLToPath(import.meta.url));
const DEFAULT_PROFILE_PATH = join(__dirname, '../../../../resume-generator/base.json'); const DEFAULT_PROFILE_PATH = join(__dirname, '../../../../resume-generator/base.json');
@ -28,7 +29,7 @@ const DEFAULT_PROFILE_PATH = join(__dirname, '../../../../resume-generator/base.
const DEFAULT_CONFIG: PipelineConfig = { const DEFAULT_CONFIG: PipelineConfig = {
topN: 10, topN: 10,
minSuitabilityScore: 50, minSuitabilityScore: 50,
sources: ['gradcracker'], sources: ['gradcracker', 'indeed', 'linkedin'],
profilePath: DEFAULT_PROFILE_PATH, profilePath: DEFAULT_PROFILE_PATH,
outputDir: join(__dirname, '../../../data/pdfs'), outputDir: join(__dirname, '../../../data/pdfs'),
}; };
@ -73,31 +74,65 @@ export async function runPipeline(config: Partial<PipelineConfig> = {}): Promise
console.log('\n🕷 Running crawler...'); console.log('\n🕷 Running crawler...');
progressHelpers.startCrawling(); progressHelpers.startCrawling();
const existingJobUrls = await jobsRepo.getAllJobUrls(); const existingJobUrls = await jobsRepo.getAllJobUrls();
const crawlerResult = await runCrawler({
existingJobUrls, const discoveredJobs: CreateJobInput[] = [];
onProgress: (update) => { const sourceErrors: string[] = [];
progressHelpers.crawlingUpdate({
listPagesProcessed: update.listPagesProcessed, if (mergedConfig.sources.includes('gradcracker')) {
listPagesTotal: update.listPagesTotal, const crawlerResult = await runCrawler({
jobCardsFound: update.jobCardsFound, existingJobUrls,
jobPagesEnqueued: update.jobPagesEnqueued, onProgress: (update) => {
jobPagesSkipped: update.jobPagesSkipped, progressHelpers.crawlingUpdate({
jobPagesProcessed: update.jobPagesProcessed, listPagesProcessed: update.listPagesProcessed,
phase: update.phase, listPagesTotal: update.listPagesTotal,
currentUrl: update.currentUrl, jobCardsFound: update.jobCardsFound,
}); jobPagesEnqueued: update.jobPagesEnqueued,
}, jobPagesSkipped: update.jobPagesSkipped,
}); jobPagesProcessed: update.jobPagesProcessed,
phase: update.phase,
if (!crawlerResult.success) { currentUrl: update.currentUrl,
throw new Error(`Crawler failed: ${crawlerResult.error}`); });
},
});
if (!crawlerResult.success) {
sourceErrors.push(`gradcracker: ${crawlerResult.error ?? 'unknown error'}`);
} else {
discoveredJobs.push(...crawlerResult.jobs);
}
} }
progressHelpers.crawlingComplete(crawlerResult.jobs.length); const jobSpySites = mergedConfig.sources.filter(
(s): s is 'indeed' | 'linkedin' => s === 'indeed' || s === 'linkedin'
);
if (jobSpySites.length > 0) {
updateProgress({
step: 'crawling',
detail: `JobSpy: scraping ${jobSpySites.join(', ')}...`,
});
const jobSpyResult = await runJobSpy({ sites: jobSpySites });
if (!jobSpyResult.success) {
sourceErrors.push(`jobspy: ${jobSpyResult.error ?? 'unknown error'}`);
} else {
discoveredJobs.push(...jobSpyResult.jobs);
}
}
if (discoveredJobs.length === 0 && sourceErrors.length > 0) {
throw new Error(`All sources failed: ${sourceErrors.join('; ')}`);
}
if (sourceErrors.length > 0) {
console.warn(`ƒsÿ,? Some sources failed: ${sourceErrors.join('; ')}`);
}
progressHelpers.crawlingComplete(discoveredJobs.length);
// Step 3: Import discovered jobs // Step 3: Import discovered jobs
console.log('\n💾 Importing jobs to database...'); console.log('\n💾 Importing jobs to database...');
const { created, skipped } = await jobsRepo.bulkCreateJobs(crawlerResult.jobs); const { created, skipped } = await jobsRepo.bulkCreateJobs(discoveredJobs);
console.log(` Created: ${created}, Skipped (duplicates): ${skipped}`); console.log(` Created: ${created}, Skipped (duplicates): ${skipped}`);
progressHelpers.importComplete(created, skipped); progressHelpers.importComplete(created, skipped);

View File

@ -61,6 +61,9 @@ export async function createJob(input: CreateJobInput): Promise<Job> {
await db.insert(jobs).values({ await db.insert(jobs).values({
id, id,
source: input.source, source: input.source,
sourceJobId: input.sourceJobId ?? null,
jobUrlDirect: input.jobUrlDirect ?? null,
datePosted: input.datePosted ?? null,
title: input.title, title: input.title,
employer: input.employer, employer: input.employer,
employerUrl: input.employerUrl ?? null, employerUrl: input.employerUrl ?? null,
@ -73,6 +76,30 @@ export async function createJob(input: CreateJobInput): Promise<Job> {
degreeRequired: input.degreeRequired ?? null, degreeRequired: input.degreeRequired ?? null,
starting: input.starting ?? null, starting: input.starting ?? null,
jobDescription: input.jobDescription ?? null, jobDescription: input.jobDescription ?? null,
jobType: input.jobType ?? null,
salarySource: input.salarySource ?? null,
salaryInterval: input.salaryInterval ?? null,
salaryMinAmount: input.salaryMinAmount ?? null,
salaryMaxAmount: input.salaryMaxAmount ?? null,
salaryCurrency: input.salaryCurrency ?? null,
isRemote: input.isRemote ?? null,
jobLevel: input.jobLevel ?? null,
jobFunction: input.jobFunction ?? null,
listingType: input.listingType ?? null,
emails: input.emails ?? null,
companyIndustry: input.companyIndustry ?? null,
companyLogo: input.companyLogo ?? null,
companyUrlDirect: input.companyUrlDirect ?? null,
companyAddresses: input.companyAddresses ?? null,
companyNumEmployees: input.companyNumEmployees ?? null,
companyRevenue: input.companyRevenue ?? null,
companyDescription: input.companyDescription ?? null,
skills: input.skills ?? null,
experienceRange: input.experienceRange ?? null,
companyRating: input.companyRating ?? null,
companyReviewsCount: input.companyReviewsCount ?? null,
vacancyCount: input.vacancyCount ?? null,
workFromHomeType: input.workFromHomeType ?? null,
status: 'discovered', status: 'discovered',
discoveredAt: now, discoveredAt: now,
createdAt: now, createdAt: now,
@ -173,6 +200,9 @@ function mapRowToJob(row: typeof jobs.$inferSelect): Job {
return { return {
id: row.id, id: row.id,
source: row.source as Job['source'], source: row.source as Job['source'],
sourceJobId: row.sourceJobId ?? null,
jobUrlDirect: row.jobUrlDirect ?? null,
datePosted: row.datePosted ?? null,
title: row.title, title: row.title,
employer: row.employer, employer: row.employer,
employerUrl: row.employerUrl, employerUrl: row.employerUrl,
@ -191,6 +221,30 @@ function mapRowToJob(row: typeof jobs.$inferSelect): Job {
tailoredSummary: row.tailoredSummary, tailoredSummary: row.tailoredSummary,
pdfPath: row.pdfPath, pdfPath: row.pdfPath,
notionPageId: row.notionPageId, notionPageId: row.notionPageId,
jobType: row.jobType ?? null,
salarySource: row.salarySource ?? null,
salaryInterval: row.salaryInterval ?? null,
salaryMinAmount: row.salaryMinAmount ?? null,
salaryMaxAmount: row.salaryMaxAmount ?? null,
salaryCurrency: row.salaryCurrency ?? null,
isRemote: row.isRemote ?? null,
jobLevel: row.jobLevel ?? null,
jobFunction: row.jobFunction ?? null,
listingType: row.listingType ?? null,
emails: row.emails ?? null,
companyIndustry: row.companyIndustry ?? null,
companyLogo: row.companyLogo ?? null,
companyUrlDirect: row.companyUrlDirect ?? null,
companyAddresses: row.companyAddresses ?? null,
companyNumEmployees: row.companyNumEmployees ?? null,
companyRevenue: row.companyRevenue ?? null,
companyDescription: row.companyDescription ?? null,
skills: row.skills ?? null,
experienceRange: row.experienceRange ?? null,
companyRating: row.companyRating ?? null,
companyReviewsCount: row.companyReviewsCount ?? null,
vacancyCount: row.vacancyCount ?? null,
workFromHomeType: row.workFromHomeType ?? null,
discoveredAt: row.discoveredAt, discoveredAt: row.discoveredAt,
processedAt: row.processedAt, processedAt: row.processedAt,
appliedAt: row.appliedAt, appliedAt: row.appliedAt,

View File

@ -1,4 +1,5 @@
export * from './crawler.js'; export * from './crawler.js';
export * from './jobspy.js';
export * from './scorer.js'; export * from './scorer.js';
export * from './summary.js'; export * from './summary.js';
export * from './pdf.js'; export * from './pdf.js';

View File

@ -0,0 +1,241 @@
/**
* Service for scraping jobs via JobSpy (Indeed/LinkedIn/etc) and mapping them into our DB shape.
*
* Uses a small Python wrapper script that writes both CSV + JSON to disk; we ingest the JSON.
*/
import { spawn } from 'child_process';
import { readFile, mkdir } from 'fs/promises';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import type { CreateJobInput, JobSource } from '../../shared/types.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
const JOBSPY_DIR = join(__dirname, '../../../../jobspy-extractor');
const JOBSPY_SCRIPT = join(JOBSPY_DIR, 'scrape_jobs.py');
/**
 * Resolve the Python interpreter used to launch the JobSpy script.
 *
 * A non-empty PYTHON_PATH environment variable takes precedence; otherwise
 * the interpreter name is chosen by platform ('python' on win32, 'python3'
 * everywhere else).
 */
function getPythonPath(): string {
  const override = process.env.PYTHON_PATH;
  if (override) {
    return override;
  }
  if (process.platform === 'win32') {
    return 'python';
  }
  return 'python3';
}
/**
 * Resolve the directory where scraper output is written.
 *
 * Uses the DATA_DIR environment variable when it is set to a non-empty
 * value, otherwise falls back to the `data` directory three levels above
 * this module.
 */
function getDataDir(): string {
  const configured = process.env.DATA_DIR;
  return configured ? configured : join(__dirname, '../../../data');
}
/**
 * Coerce a loosely typed value to a trimmed string.
 *
 * - strings are trimmed; an empty result becomes null
 * - numbers and booleans are stringified with String()
 * - everything else (null, undefined, objects, ...) becomes null
 */
function toStringOrNull(value: unknown): string | null {
  switch (typeof value) {
    case 'string': {
      const text = value.trim();
      return text.length > 0 ? text : null;
    }
    case 'number':
    case 'boolean':
      return String(value);
    default:
      return null;
  }
}
/**
 * Coerce a loosely typed value to a finite number.
 *
 * Numbers are passed through and non-blank strings are parsed with
 * Number(); any result that is not finite (NaN, +/-Infinity) and any other
 * input type yields null.
 */
function toNumberOrNull(value: unknown): number | null {
  let candidate: number;
  if (typeof value === 'number') {
    candidate = value;
  } else if (typeof value === 'string' && value.trim() !== '') {
    candidate = Number(value.trim());
  } else {
    return null;
  }
  return Number.isFinite(candidate) ? candidate : null;
}
/**
 * Coerce a loosely typed value to a boolean.
 *
 * - booleans are passed through
 * - numbers are true when non-zero; NaN yields null
 * - strings are matched case-insensitively (after trimming) against
 *   1/true/yes/y/on and 0/false/no/n/off
 * - anything else, including unrecognised or blank strings, yields null
 */
function toBooleanOrNull(value: unknown): boolean | null {
  if (typeof value === 'boolean') return value;

  if (typeof value === 'number') {
    // NaN !== 0 is true, so without this guard "not a number" would be
    // reported as `true`; treat it as unknown instead.
    return Number.isNaN(value) ? null : value !== 0;
  }

  if (typeof value === 'string') {
    const normalized = value.trim().toLowerCase();
    if (['1', 'true', 'yes', 'y', 'on'].includes(normalized)) return true;
    if (['0', 'false', 'no', 'n', 'off'].includes(normalized)) return false;
  }

  return null;
}
/**
 * Serialise a structured value (array/object/scalar) to a JSON string for
 * storage in a TEXT column.
 *
 * - null/undefined yield null
 * - strings are treated as already serialised and only trimmed
 *   (blank strings yield null)
 * - values that cannot be serialised yield null
 */
function toJsonStringOrNull(value: unknown): string | null {
  if (value === null || value === undefined) return null;
  if (typeof value === 'string') return toStringOrNull(value);
  try {
    // JSON.stringify returns undefined (not a string) for functions and
    // symbols even though its TypeScript signature says `string`; normalise
    // that to null so the declared return type actually holds at runtime.
    const encoded: string | undefined = JSON.stringify(value);
    return encoded ?? null;
  } catch {
    // Circular structures and BigInt values throw; nothing useful to store.
    return null;
  }
}
/**
 * Map the scraper's `site` value onto one of our known job sources.
 *
 * The value is normalised via toStringOrNull and lower-cased before being
 * compared; anything that is not a recognised source yields null.
 */
function toJobSource(site: unknown): JobSource | null {
  const knownSources: readonly JobSource[] = ['gradcracker', 'indeed', 'linkedin'];
  const name = toStringOrNull(site)?.toLowerCase();
  return knownSources.find((candidate) => candidate === name) ?? null;
}
/**
 * Build a compact, locale-neutral salary label such as
 * "GBP 30000-40000 / yearly".
 *
 * Amounts are rounded to whole units. The shape of the numeric part is:
 * - both bounds:  "min-max" (collapsed to a single figure when both bounds
 *                 round to the same value, instead of "N-N")
 * - only minimum: "min+"
 * - only maximum: "max"
 *
 * @returns the label, or null when neither bound is known.
 */
function formatSalary(params: {
  minAmount: number | null;
  maxAmount: number | null;
  currency: string | null;
  interval: string | null;
}): string | null {
  const { minAmount, maxAmount, currency, interval } = params;

  // Avoid locale ambiguity; keep it simple.
  const fmt = (n: number): string => `${Math.round(n)}`;

  let range: string;
  if (minAmount === null) {
    if (maxAmount === null) return null;
    range = fmt(maxAmount);
  } else if (maxAmount === null) {
    range = `${fmt(minAmount)}+`;
  } else {
    const low = fmt(minAmount);
    const high = fmt(maxAmount);
    range = low === high ? low : `${low}-${high}`;
  }

  const currencyPart = currency ? `${currency} ` : '';
  const intervalPart = interval ? ` / ${interval}` : '';
  return `${currencyPart}${range}${intervalPart}`.trim();
}
/**
 * Per-run overrides for the JobSpy scraper.
 *
 * Every field is optional. runJobSpy forwards each setting to the Python
 * script through a JOBSPY_* environment variable, falling back to the
 * process environment and then to a built-in default when a field is
 * omitted.
 */
export interface RunJobSpyOptions {
  // Sites to scrape. Only 'indeed' and 'linkedin' entries are kept; defaults to both.
  sites?: Array<JobSource>;
  // Forwarded as JOBSPY_SEARCH_TERM; falls back to the env var, then 'web developer'.
  searchTerm?: string;
  // Forwarded as JOBSPY_LOCATION; falls back to the env var, then 'UK'.
  location?: string;
  // Forwarded as JOBSPY_RESULTS_WANTED; falls back to the env var, then 200.
  resultsWanted?: number;
  // Forwarded as JOBSPY_HOURS_OLD; falls back to the env var, then 72.
  hoursOld?: number;
  // Forwarded as JOBSPY_COUNTRY_INDEED; falls back to the env var, then 'UK'.
  countryIndeed?: string;
  // Forwarded as JOBSPY_LINKEDIN_FETCH_DESCRIPTION; falls back to the env var, then '1'.
  linkedinFetchDescription?: boolean;
}
/**
 * Outcome of a JobSpy run as returned by runJobSpy.
 */
export interface JobSpyResult {
  // True when the scraper ran and its JSON output was parsed.
  success: boolean;
  // Jobs mapped into CreateJobInput shape; an empty array on failure.
  jobs: CreateJobInput[];
  // Failure message; set only when success is false.
  error?: string;
}
/**
 * Convert one raw JobSpy record into our CreateJobInput shape.
 *
 * Returns null when the record has no recognised `site` or no `job_url`,
 * since such rows cannot be attributed or de-duplicated.
 */
function mapJobSpyRow(row: Record<string, unknown>): CreateJobInput | null {
  const source = toJobSource(row.site);
  if (!source) return null;

  const jobUrl = toStringOrNull(row.job_url);
  if (!jobUrl) return null;

  const jobUrlDirect = toStringOrNull(row.job_url_direct);
  const minAmount = toNumberOrNull(row.min_amount);
  const maxAmount = toNumberOrNull(row.max_amount);
  const currency = toStringOrNull(row.currency);
  const interval = toStringOrNull(row.interval);
  const salary = formatSalary({ minAmount, maxAmount, currency, interval });

  return {
    source,
    sourceJobId: toStringOrNull(row.id) ?? undefined,
    jobUrlDirect: jobUrlDirect ?? undefined,
    datePosted: toStringOrNull(row.date_posted) ?? undefined,
    title: toStringOrNull(row.title) ?? 'Unknown Title',
    employer: toStringOrNull(row.company) ?? 'Unknown Employer',
    employerUrl: toStringOrNull(row.company_url) ?? undefined,
    jobUrl,
    // Prefer the direct link when the source provides one.
    applicationLink: jobUrlDirect ?? jobUrl,
    location: toStringOrNull(row.location) ?? undefined,
    jobDescription: toStringOrNull(row.description) ?? undefined,
    salary: salary ?? undefined,
    jobType: toStringOrNull(row.job_type) ?? undefined,
    salarySource: toStringOrNull(row.salary_source) ?? undefined,
    salaryInterval: interval ?? undefined,
    salaryMinAmount: minAmount ?? undefined,
    salaryMaxAmount: maxAmount ?? undefined,
    salaryCurrency: currency ?? undefined,
    isRemote: toBooleanOrNull(row.is_remote) ?? undefined,
    jobLevel: toStringOrNull(row.job_level) ?? undefined,
    jobFunction: toStringOrNull(row.job_function) ?? undefined,
    listingType: toStringOrNull(row.listing_type) ?? undefined,
    emails: toJsonStringOrNull(row.emails) ?? undefined,
    companyIndustry: toStringOrNull(row.company_industry) ?? undefined,
    companyLogo: toStringOrNull(row.company_logo) ?? undefined,
    companyUrlDirect: toStringOrNull(row.company_url_direct) ?? undefined,
    companyAddresses: toJsonStringOrNull(row.company_addresses) ?? undefined,
    companyNumEmployees: toStringOrNull(row.company_num_employees) ?? undefined,
    companyRevenue: toStringOrNull(row.company_revenue) ?? undefined,
    companyDescription: toStringOrNull(row.company_description) ?? undefined,
    skills: toJsonStringOrNull(row.skills) ?? undefined,
    experienceRange: toJsonStringOrNull(row.experience_range) ?? undefined,
    companyRating: toNumberOrNull(row.company_rating) ?? undefined,
    companyReviewsCount: toNumberOrNull(row.company_reviews_count) ?? undefined,
    vacancyCount: toNumberOrNull(row.vacancy_count) ?? undefined,
    workFromHomeType: toStringOrNull(row.work_from_home_type) ?? undefined,
  };
}

/**
 * Spawn the JobSpy Python script with the given environment and settle once
 * it exits: resolves on exit code 0, rejects on any other exit, on
 * termination by signal, or when the process cannot be started.
 */
function runJobSpyProcess(env: NodeJS.ProcessEnv): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const child = spawn(getPythonPath(), [JOBSPY_SCRIPT], {
      cwd: JOBSPY_DIR,
      shell: false,
      stdio: 'inherit',
      env,
    });

    child.on('close', (code, signal) => {
      if (code === 0) {
        resolve();
      } else if (code === null && signal) {
        // A killed process has no exit code; report the signal instead of "code null".
        reject(new Error(`JobSpy terminated by signal ${signal}`));
      } else {
        reject(new Error(`JobSpy exited with code ${code}`));
      }
    });
    child.on('error', reject);
  });
}

/**
 * Run the JobSpy scraper and return the scraped jobs in CreateJobInput shape.
 *
 * The Python script is configured through JOBSPY_* environment variables
 * (explicit options win over the process environment, which wins over the
 * built-in defaults) and writes CSV + JSON into `<dataDir>/imports`; the
 * JSON file is then read back and mapped row by row. Rows without a
 * recognised site or a job URL are skipped.
 *
 * Failures (output directory creation, process failure, unreadable or
 * malformed output) are reported through the result object rather than
 * thrown, so a caller can keep going with other sources.
 *
 * @param options per-run overrides; see RunJobSpyOptions
 * @returns `{ success: true, jobs }` or `{ success: false, jobs: [], error }`
 */
export async function runJobSpy(options: RunJobSpyOptions = {}): Promise<JobSpyResult> {
  try {
    const outputDir = join(getDataDir(), 'imports');
    await mkdir(outputDir, { recursive: true });

    const outputCsv = join(outputDir, 'jobspy_jobs.csv');
    const outputJson = join(outputDir, 'jobspy_jobs.json');

    const sites = (options.sites ?? ['indeed', 'linkedin'])
      .filter((s) => s === 'indeed' || s === 'linkedin')
      .join(',');

    // The env contract for this flag is 1/0 (see the default below), so an
    // explicit boolean option is encoded the same way instead of "true"/"false".
    const linkedinFetchDescription =
      options.linkedinFetchDescription === undefined
        ? (process.env.JOBSPY_LINKEDIN_FETCH_DESCRIPTION ?? '1')
        : (options.linkedinFetchDescription ? '1' : '0');

    await runJobSpyProcess({
      ...process.env,
      JOBSPY_SITES: sites || 'indeed,linkedin',
      JOBSPY_SEARCH_TERM: options.searchTerm ?? process.env.JOBSPY_SEARCH_TERM ?? 'web developer',
      JOBSPY_LOCATION: options.location ?? process.env.JOBSPY_LOCATION ?? 'UK',
      JOBSPY_RESULTS_WANTED: String(options.resultsWanted ?? process.env.JOBSPY_RESULTS_WANTED ?? 200),
      JOBSPY_HOURS_OLD: String(options.hoursOld ?? process.env.JOBSPY_HOURS_OLD ?? 72),
      JOBSPY_COUNTRY_INDEED: options.countryIndeed ?? process.env.JOBSPY_COUNTRY_INDEED ?? 'UK',
      JOBSPY_LINKEDIN_FETCH_DESCRIPTION: linkedinFetchDescription,
      JOBSPY_OUTPUT_CSV: outputCsv,
      JOBSPY_OUTPUT_JSON: outputJson,
    });

    const parsed: unknown = JSON.parse(await readFile(outputJson, 'utf-8'));
    if (!Array.isArray(parsed)) {
      throw new Error('JobSpy output is not a JSON array');
    }

    const jobs: CreateJobInput[] = [];
    for (const row of parsed) {
      // One malformed entry should not discard the whole batch.
      if (row === null || typeof row !== 'object') continue;
      const job = mapJobSpyRow(row as Record<string, unknown>);
      if (job) jobs.push(job);
    }

    return { success: true, jobs };
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Unknown error';
    return { success: false, jobs: [], error: message };
  }
}

View File

@ -11,13 +11,20 @@ export type JobStatus =
| 'expired'; // Deadline passed | 'expired'; // Deadline passed
export type JobSource = export type JobSource =
| 'gradcracker'; | 'gradcracker'
| 'indeed'
| 'linkedin';
export interface Job { export interface Job {
id: string; id: string;
// From crawler // Source / provenance
source: JobSource; source: JobSource;
sourceJobId: string | null; // External ID (if provided)
jobUrlDirect: string | null; // Source-provided direct URL (if provided)
datePosted: string | null; // Source-provided posting date (if provided)
// From crawler (normalized)
title: string; title: string;
employer: string; employer: string;
employerUrl: string | null; employerUrl: string | null;
@ -38,6 +45,32 @@ export interface Job {
tailoredSummary: string | null; // Generated resume summary tailoredSummary: string | null; // Generated resume summary
pdfPath: string | null; // Path to generated PDF pdfPath: string | null; // Path to generated PDF
notionPageId: string | null; // Notion page ID if synced notionPageId: string | null; // Notion page ID if synced
// JobSpy fields (nullable for non-JobSpy sources)
jobType: string | null;
salarySource: string | null;
salaryInterval: string | null;
salaryMinAmount: number | null;
salaryMaxAmount: number | null;
salaryCurrency: string | null;
isRemote: boolean | null;
jobLevel: string | null;
jobFunction: string | null;
listingType: string | null;
emails: string | null;
companyIndustry: string | null;
companyLogo: string | null;
companyUrlDirect: string | null;
companyAddresses: string | null;
companyNumEmployees: string | null;
companyRevenue: string | null;
companyDescription: string | null;
skills: string | null;
experienceRange: string | null;
companyRating: number | null;
companyReviewsCount: number | null;
vacancyCount: number | null;
workFromHomeType: string | null;
// Timestamps // Timestamps
discoveredAt: string; discoveredAt: string;
@ -61,6 +94,35 @@ export interface CreateJobInput {
degreeRequired?: string; degreeRequired?: string;
starting?: string; starting?: string;
jobDescription?: string; jobDescription?: string;
// JobSpy fields (optional)
sourceJobId?: string;
jobUrlDirect?: string;
datePosted?: string;
jobType?: string;
salarySource?: string;
salaryInterval?: string;
salaryMinAmount?: number;
salaryMaxAmount?: number;
salaryCurrency?: string;
isRemote?: boolean;
jobLevel?: string;
jobFunction?: string;
listingType?: string;
emails?: string;
companyIndustry?: string;
companyLogo?: string;
companyUrlDirect?: string;
companyAddresses?: string;
companyNumEmployees?: string;
companyRevenue?: string;
companyDescription?: string;
skills?: string;
experienceRange?: string;
companyRating?: number;
companyReviewsCount?: number;
vacancyCount?: number;
workFromHomeType?: string;
} }
export interface UpdateJobInput { export interface UpdateJobInput {
@ -76,7 +138,7 @@ export interface UpdateJobInput {
export interface PipelineConfig { export interface PipelineConfig {
topN: number; // Number of top jobs to process topN: number; // Number of top jobs to process
minSuitabilityScore: number; // Minimum score to auto-process minSuitabilityScore: number; // Minimum score to auto-process
sources: string[]; // Job sources to crawl sources: JobSource[]; // Job sources to crawl
profilePath: string; // Path to profile JSON profilePath: string; // Path to profile JSON
outputDir: string; // Directory for generated PDFs outputDir: string; // Directory for generated PDFs
} }