diff --git a/.env.example b/.env.example index 6f6df10..aedda52 100644 --- a/.env.example +++ b/.env.example @@ -23,3 +23,15 @@ NOTION_DATABASE_ID= # Optional: Webhook secret for n8n automation WEBHOOK_SECRET= + +# ============================================================================= +# JobSpy (Indeed/LinkedIn scraping) - optional +# ============================================================================= +# These control the Python JobSpy scraper used by the pipeline. +JOBSPY_SITES=indeed,linkedin +JOBSPY_SEARCH_TERM=web developer +JOBSPY_LOCATION=UK +JOBSPY_RESULTS_WANTED=200 +JOBSPY_HOURS_OLD=72 +JOBSPY_COUNTRY_INDEED=UK +JOBSPY_LINKEDIN_FETCH_DESCRIPTION=1 diff --git a/Dockerfile b/Dockerfile index a60c19d..cbb5f5c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,19 +21,19 @@ RUN apt-get update && apt-get install -y \ # Set working directory WORKDIR /app -# Install Playwright and Firefox only -RUN pip3 install --no-cache-dir --break-system-packages playwright && \ +# Install Playwright and Firefox only (plus JobSpy for Indeed/LinkedIn scraping) +RUN pip3 install --no-cache-dir --break-system-packages playwright python-jobspy && \ npx playwright install firefox # Copy package files first for better caching COPY orchestrator/package*.json ./orchestrator/ -COPY job-extractor/package*.json ./job-extractor/ +COPY extractors/gradcracker/package*.json ./extractors/gradcracker/ # Install Node.js dependencies WORKDIR /app/orchestrator RUN npm install --production=false -WORKDIR /app/job-extractor +WORKDIR /app/extractors/gradcracker RUN npm install --production=false # Install Camoufox browser (downloads its own Firefox fork) @@ -42,7 +42,8 @@ RUN npx camoufox fetch # Copy source code WORKDIR /app COPY orchestrator ./orchestrator -COPY job-extractor ./job-extractor +COPY extractors/gradcracker ./extractors/gradcracker +COPY extractors/jobspy ./extractors/jobspy COPY resume-generator ./resume-generator # Build the orchestrator (client + server) diff --git 
a/README.md b/README.md index 8ad6d60..db4f764 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ Automated job discovery -> AI suitability scoring -> tailored resume PDFs -> a d ## How it works (pipeline) -1. **Crawl**: `job-extractor` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs). -2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`job-extractor/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique). +1. **Crawl**: `extractors/gradcracker` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs). +2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`extractors/gradcracker/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique). 3. **Score**: `orchestrator` scores up to 50 unprocessed jobs via OpenRouter (cached as `suitabilityScore`/`suitabilityReason`). 4. **Select**: take the top `N` jobs above `minSuitabilityScore`. 5. **Process**: for each selected job: @@ -30,7 +30,7 @@ flowchart LR PDFS[(PDFs
pdfs/)] end - subgraph CRAWL["job-extractor (Crawlee/Playwright)"] + subgraph CRAWL["extractors/gradcracker (Crawlee/Playwright)"] C1["Seed search URLs
(locations x roles)"] C2["Parse list pages
enqueue job pages"] C3["Parse job pages
extract JD + apply URL"] @@ -73,7 +73,8 @@ job-ops/ src/server/ # API routes, pipeline, DB, services src/client/ # UI (polls jobs, listens to SSE progress) src/shared/ # shared types (Job, PipelineRun, etc.) - job-extractor/ # Crawlee crawler (Gradcracker) + extractors/gradcracker/ # Crawlee crawler (Gradcracker) + extractors/jobspy/ # JobSpy wrapper (Indeed/LinkedIn/etc) resume-generator/ # Python Playwright automation for rxresu.me base.json # your exported base resume (template) data/ # persisted runtime artifacts (Docker default) @@ -113,7 +114,7 @@ Install Node deps (both packages): ```bash cd orchestrator && npm install -cd ../job-extractor && npm install +cd ../extractors/gradcracker && npm install ``` Configure the orchestrator env + DB: @@ -153,7 +154,7 @@ Dev URLs: ## Notes / sharp edges -- **Crawl targets**: edit `job-extractor/src/main.ts` to change the Gradcracker location/role matrix. +- **Crawl targets**: edit `extractors/gradcracker/src/main.ts` to change the Gradcracker location/role matrix. - **Notion sync is schema-dependent**: `orchestrator/src/server/services/notion.ts` assumes property names; adjust to match your Notion database. - **Pipeline config knobs**: `POST /api/pipeline/run` accepts `{ topN, minSuitabilityScore }`; `PIPELINE_TOP_N`/`PIPELINE_MIN_SCORE` are used by `npm run pipeline:run` (CLI runner). - **Anti-bot reality**: crawling is headless + "humanized", but sites can still block; expect occasional flakiness. 
diff --git a/docker-compose.yml b/docker-compose.yml index 3b85d12..30c7abd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -33,6 +33,15 @@ services: - PIPELINE_TOP_N=${PIPELINE_TOP_N:-10} - PIPELINE_MIN_SCORE=${PIPELINE_MIN_SCORE:-50} + # JobSpy (Indeed/LinkedIn scraping) - optional + - JOBSPY_SITES=${JOBSPY_SITES:-indeed,linkedin} + - JOBSPY_SEARCH_TERM=${JOBSPY_SEARCH_TERM:-web developer} + - JOBSPY_LOCATION=${JOBSPY_LOCATION:-UK} + - JOBSPY_RESULTS_WANTED=${JOBSPY_RESULTS_WANTED:-200} + - JOBSPY_HOURS_OLD=${JOBSPY_HOURS_OLD:-72} + - JOBSPY_COUNTRY_INDEED=${JOBSPY_COUNTRY_INDEED:-UK} + - JOBSPY_LINKEDIN_FETCH_DESCRIPTION=${JOBSPY_LINKEDIN_FETCH_DESCRIPTION:-1} + # Optional: Notion integration - NOTION_API_KEY=${NOTION_API_KEY:-} - NOTION_DATABASE_ID=${NOTION_DATABASE_ID:-} diff --git a/job-extractor/.dockerignore b/extractors/gradcracker/.dockerignore similarity index 100% rename from job-extractor/.dockerignore rename to extractors/gradcracker/.dockerignore diff --git a/job-extractor/.gitignore b/extractors/gradcracker/.gitignore similarity index 100% rename from job-extractor/.gitignore rename to extractors/gradcracker/.gitignore diff --git a/job-extractor/Dockerfile b/extractors/gradcracker/Dockerfile similarity index 100% rename from job-extractor/Dockerfile rename to extractors/gradcracker/Dockerfile diff --git a/job-extractor/README.md b/extractors/gradcracker/README.md similarity index 100% rename from job-extractor/README.md rename to extractors/gradcracker/README.md diff --git a/job-extractor/package-lock.json b/extractors/gradcracker/package-lock.json similarity index 100% rename from job-extractor/package-lock.json rename to extractors/gradcracker/package-lock.json diff --git a/job-extractor/package.json b/extractors/gradcracker/package.json similarity index 100% rename from job-extractor/package.json rename to extractors/gradcracker/package.json diff --git a/job-extractor/src/main.ts b/extractors/gradcracker/src/main.ts similarity index 
100% rename from job-extractor/src/main.ts rename to extractors/gradcracker/src/main.ts diff --git a/job-extractor/src/progress.ts b/extractors/gradcracker/src/progress.ts similarity index 100% rename from job-extractor/src/progress.ts rename to extractors/gradcracker/src/progress.ts diff --git a/job-extractor/src/routes.ts b/extractors/gradcracker/src/routes.ts similarity index 100% rename from job-extractor/src/routes.ts rename to extractors/gradcracker/src/routes.ts diff --git a/job-extractor/tsconfig.json b/extractors/gradcracker/tsconfig.json similarity index 100% rename from job-extractor/tsconfig.json rename to extractors/gradcracker/tsconfig.json diff --git a/extractors/jobspy/requirements.txt b/extractors/jobspy/requirements.txt new file mode 100644 index 0000000..8c5560a --- /dev/null +++ b/extractors/jobspy/requirements.txt @@ -0,0 +1 @@ +python-jobspy diff --git a/extractors/jobspy/scrape_jobs.py b/extractors/jobspy/scrape_jobs.py new file mode 100644 index 0000000..b61b067 --- /dev/null +++ b/extractors/jobspy/scrape_jobs.py @@ -0,0 +1,77 @@ +import csv +import os +from pathlib import Path + +from jobspy import scrape_jobs + + +def _env_str(name: str, default: str) -> str: + value = os.getenv(name) + return value if value and value.strip() else default + + +def _env_int(name: str, default: int) -> int: + value = os.getenv(name) + if value is None or value.strip() == "": + return default + try: + return int(value) + except ValueError: + return default + + +def _env_bool(name: str, default: bool) -> bool: + value = os.getenv(name) + if value is None or value.strip() == "": + return default + return value.strip().lower() in ("1", "true", "yes", "y", "on") + + +def _parse_sites(raw: str) -> list[str]: + return [s.strip() for s in raw.split(",") if s.strip()] + + +def main() -> int: + sites = _parse_sites(_env_str("JOBSPY_SITES", "indeed,linkedin")) + search_term = _env_str("JOBSPY_SEARCH_TERM", "web developer") + location = _env_str("JOBSPY_LOCATION", 
"UK") + results_wanted = _env_int("JOBSPY_RESULTS_WANTED", 200) + hours_old = _env_int("JOBSPY_HOURS_OLD", 72) + country_indeed = _env_str("JOBSPY_COUNTRY_INDEED", "UK") + linkedin_fetch_description = _env_bool("JOBSPY_LINKEDIN_FETCH_DESCRIPTION", True) + + output_csv = Path(_env_str("JOBSPY_OUTPUT_CSV", "jobs.csv")) + output_json = Path(_env_str("JOBSPY_OUTPUT_JSON", str(output_csv.with_suffix(".json")))) + + output_csv.parent.mkdir(parents=True, exist_ok=True) + output_json.parent.mkdir(parents=True, exist_ok=True) + + jobs = scrape_jobs( + site_name=sites, + search_term=search_term, + location=location, + results_wanted=results_wanted, + hours_old=hours_old, + country_indeed=country_indeed, + linkedin_fetch_description=linkedin_fetch_description, + ) + + print(f"Found {len(jobs)} jobs") + + jobs.to_csv( + output_csv, + quoting=csv.QUOTE_NONNUMERIC, + escapechar="\\", + index=False, + ) + + jobs.to_json(output_json, orient="records", force_ascii=False) + + print(f"Wrote CSV: {output_csv}") + print(f"Wrote JSON: {output_json}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/orchestrator/.env.example b/orchestrator/.env.example index fc94358..39a12e5 100644 --- a/orchestrator/.env.example +++ b/orchestrator/.env.example @@ -19,3 +19,14 @@ PIPELINE_MIN_SCORE=50 # RXResume credentials (for PDF generation) RXRESUME_EMAIL= RXRESUME_PASSWORD= + +# ============================================================================= +# JobSpy (Indeed/LinkedIn scraping) - optional +# ============================================================================= +JOBSPY_SITES=indeed,linkedin +JOBSPY_SEARCH_TERM=web developer +JOBSPY_LOCATION=UK +JOBSPY_RESULTS_WANTED=200 +JOBSPY_HOURS_OLD=72 +JOBSPY_COUNTRY_INDEED=UK +JOBSPY_LINKEDIN_FETCH_DESCRIPTION=1 diff --git a/orchestrator/package-lock.json b/orchestrator/package-lock.json index 14deea5..4e2171b 100644 --- a/orchestrator/package-lock.json +++ b/orchestrator/package-lock.json @@ -9,6 +9,7 
@@ "version": "1.0.0", "dependencies": { "@radix-ui/react-alert-dialog": "^1.1.15", + "@radix-ui/react-dropdown-menu": "^2.1.15", "@radix-ui/react-progress": "^1.1.8", "@radix-ui/react-separator": "^1.1.8", "@radix-ui/react-slot": "^1.2.4", @@ -1229,6 +1230,40 @@ "node": ">=12" } }, + "node_modules/@floating-ui/core": { + "version": "1.7.3", + "resolved": "https://registry.npmjs.org/@floating-ui/core/-/core-1.7.3.tgz", + "integrity": "sha512-sGnvb5dmrJaKEZ+LDIpguvdX3bDlEllmv4/ClQ9awcmCZrlx5jQyyMWFM5kBI+EyNOCDDiKk8il0zeuX3Zlg/w==", + "dependencies": { + "@floating-ui/utils": "^0.2.10" + } + }, + "node_modules/@floating-ui/dom": { + "version": "1.7.4", + "resolved": "https://registry.npmjs.org/@floating-ui/dom/-/dom-1.7.4.tgz", + "integrity": "sha512-OOchDgh4F2CchOX94cRVqhvy7b3AFb+/rQXyswmzmGakRfkMgoWVjfnLWkRirfLEfuD4ysVW16eXzwt3jHIzKA==", + "dependencies": { + "@floating-ui/core": "^1.7.3", + "@floating-ui/utils": "^0.2.10" + } + }, + "node_modules/@floating-ui/react-dom": { + "version": "2.1.6", + "resolved": "https://registry.npmjs.org/@floating-ui/react-dom/-/react-dom-2.1.6.tgz", + "integrity": "sha512-4JX6rEatQEvlmgU80wZyq9RT96HZJa88q8hp0pBd+LrczeDI4o6uA2M+uvxngVHo4Ihr8uibXxH6+70zhAFrVw==", + "dependencies": { + "@floating-ui/dom": "^1.7.4" + }, + "peerDependencies": { + "react": ">=16.8.0", + "react-dom": ">=16.8.0" + } + }, + "node_modules/@floating-ui/utils": { + "version": "0.2.10", + "resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.2.10.tgz", + "integrity": "sha512-aGTxbpbg8/b5JfU1HXSrbH3wXZuLPJcNEcZQFMxLs3oSzgtVu6nFPkbbGGUvBcUjKV2YyB9Wxxabo+HEH9tcRQ==" + }, "node_modules/@jridgewell/gen-mapping": { "version": "0.3.13", "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz", @@ -1335,6 +1370,28 @@ } } }, + "node_modules/@radix-ui/react-arrow": { + "version": "1.1.7", + "resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz", + "integrity": 
"sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==", + "dependencies": { + "@radix-ui/react-primitive": "2.1.3" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-collection": { "version": "1.1.7", "resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz", @@ -1497,6 +1554,34 @@ } } }, + "node_modules/@radix-ui/react-dropdown-menu": { + "version": "2.1.16", + "resolved": "https://registry.npmjs.org/@radix-ui/react-dropdown-menu/-/react-dropdown-menu-2.1.16.tgz", + "integrity": "sha512-1PLGQEynI/3OX/ftV54COn+3Sud/Mn8vALg2rWnBLnRaGtJDduNW/22XjlGgPdpcIbiQxjKtb7BkcjP00nqfJw==", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-id": "1.1.1", + "@radix-ui/react-menu": "2.1.16", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-use-controllable-state": "1.2.2" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-focus-guards": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz", @@ -1552,6 +1637,93 @@ } } }, + "node_modules/@radix-ui/react-menu": { + "version": "2.1.16", + "resolved": "https://registry.npmjs.org/@radix-ui/react-menu/-/react-menu-2.1.16.tgz", + "integrity": 
"sha512-72F2T+PLlphrqLcAotYPp0uJMr5SjP5SL01wfEspJbru5Zs5vQaSHb4VB3ZMJPimgHHCHG7gMOeOB9H3Hdmtxg==", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-collection": "1.1.7", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-direction": "1.1.1", + "@radix-ui/react-dismissable-layer": "1.1.11", + "@radix-ui/react-focus-guards": "1.1.3", + "@radix-ui/react-focus-scope": "1.1.7", + "@radix-ui/react-id": "1.1.1", + "@radix-ui/react-popper": "1.2.8", + "@radix-ui/react-portal": "1.1.9", + "@radix-ui/react-presence": "1.1.5", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-roving-focus": "1.1.11", + "@radix-ui/react-slot": "1.2.3", + "@radix-ui/react-use-callback-ref": "1.1.1", + "aria-hidden": "^1.2.4", + "react-remove-scroll": "^2.6.3" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-slot": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.3.tgz", + "integrity": "sha512-aeNmHnBxbi2St0au6VBVC7JXFlhLlOnvIIlePNniyUNAClzmtAUEY8/pBiK3iHjufOlwA+c20/8jngo7xcrg8A==", + "dependencies": { + "@radix-ui/react-compose-refs": "1.1.2" + }, + "peerDependencies": { + "@types/react": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-popper": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz", + "integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==", + 
"dependencies": { + "@floating-ui/react-dom": "^2.0.0", + "@radix-ui/react-arrow": "1.1.7", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-use-callback-ref": "1.1.1", + "@radix-ui/react-use-layout-effect": "1.1.1", + "@radix-ui/react-use-rect": "1.1.1", + "@radix-ui/react-use-size": "1.1.1", + "@radix-ui/rect": "1.1.1" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-portal": { "version": "1.1.9", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz", @@ -1896,6 +2068,45 @@ } } }, + "node_modules/@radix-ui/react-use-rect": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@radix-ui/react-use-rect/-/react-use-rect-1.1.1.tgz", + "integrity": "sha512-QTYuDesS0VtuHNNvMh+CjlKJ4LJickCMUAqjlE3+j8w+RlRpwyX3apEQKGFzbZGdo7XNG1tXa+bQqIE7HIXT2w==", + "dependencies": { + "@radix-ui/rect": "1.1.1" + }, + "peerDependencies": { + "@types/react": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-use-size": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@radix-ui/react-use-size/-/react-use-size-1.1.1.tgz", + "integrity": "sha512-ewrXRDTAqAXlkl6t/fkXWNAhFX9I+CkKlw6zjEwk86RSPKwZr3xpBRso655aqYafwtnbpHLj6toFzmd6xdVptQ==", + "dependencies": { + "@radix-ui/react-use-layout-effect": "1.1.1" + }, + "peerDependencies": { + "@types/react": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, + 
"node_modules/@radix-ui/rect": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@radix-ui/rect/-/rect-1.1.1.tgz", + "integrity": "sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw==" + }, "node_modules/@rolldown/pluginutils": { "version": "1.0.0-beta.27", "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.27.tgz", diff --git a/orchestrator/package.json b/orchestrator/package.json index f8a2634..7184a32 100644 --- a/orchestrator/package.json +++ b/orchestrator/package.json @@ -18,6 +18,7 @@ "pipeline:run": "tsx src/server/pipeline/run.ts" }, "dependencies": { + "@radix-ui/react-dropdown-menu": "^2.1.15", "@radix-ui/react-alert-dialog": "^1.1.15", "@radix-ui/react-progress": "^1.1.8", "@radix-ui/react-separator": "^1.1.8", diff --git a/orchestrator/src/client/App.tsx b/orchestrator/src/client/App.tsx index fbeeda4..60a363c 100644 --- a/orchestrator/src/client/App.tsx +++ b/orchestrator/src/client/App.tsx @@ -6,10 +6,13 @@ import React, { useCallback, useEffect, useState } from "react"; import { toast } from "sonner"; import { Toaster } from "@/components/ui/sonner"; -import type { Job, JobStatus } from "../shared/types"; +import type { Job, JobSource, JobStatus } from "../shared/types"; import { Header, JobList, PipelineProgress, Stats } from "./components"; import * as api from "./api"; +const DEFAULT_PIPELINE_SOURCES: JobSource[] = ["gradcracker", "indeed", "linkedin"]; +const PIPELINE_SOURCES_STORAGE_KEY = "jobops.pipeline.sources"; + export const App: React.FC = () => { const [jobs, setJobs] = useState([]); const [stats, setStats] = useState>({ @@ -24,6 +27,27 @@ export const App: React.FC = () => { const [isPipelineRunning, setIsPipelineRunning] = useState(false); const [processingJobId, setProcessingJobId] = useState(null); const [isProcessingAll, setIsProcessingAll] = useState(false); + const [pipelineSources, setPipelineSources] = useState(() => { + try { + const 
raw = localStorage.getItem(PIPELINE_SOURCES_STORAGE_KEY); + if (!raw) return DEFAULT_PIPELINE_SOURCES; + const parsed = JSON.parse(raw) as unknown; + const allowed: JobSource[] = ["gradcracker", "indeed", "linkedin"]; + if (!Array.isArray(parsed)) return DEFAULT_PIPELINE_SOURCES; + const next = parsed.filter((value): value is JobSource => allowed.includes(value)); + return next.length > 0 ? next : DEFAULT_PIPELINE_SOURCES; + } catch { + return DEFAULT_PIPELINE_SOURCES; + } + }); + + useEffect(() => { + try { + localStorage.setItem(PIPELINE_SOURCES_STORAGE_KEY, JSON.stringify(pipelineSources)); + } catch { + // Ignore localStorage errors + } + }, [pipelineSources]); const loadJobs = useCallback(async () => { try { @@ -63,8 +87,10 @@ export const App: React.FC = () => { const handleRunPipeline = async () => { try { setIsPipelineRunning(true); - await api.runPipeline(); - toast.message("Pipeline started", { description: "This may take a few minutes." }); + await api.runPipeline({ sources: pipelineSources }); + toast.message("Pipeline started", { + description: `Sources: ${pipelineSources.join(", ")}. This may take a few minutes.`, + }); const pollInterval = setInterval(async () => { try { @@ -170,6 +196,8 @@ export const App: React.FC = () => { onClearDatabase={handleClearDatabase} isPipelineRunning={isPipelineRunning} isLoading={isLoading} + pipelineSources={pipelineSources} + onPipelineSourcesChange={setPipelineSources} />
@@ -190,4 +218,3 @@ export const App: React.FC = () => { ); }; - diff --git a/orchestrator/src/client/api/client.ts b/orchestrator/src/client/api/client.ts index e376f9b..5f26054 100644 --- a/orchestrator/src/client/api/client.ts +++ b/orchestrator/src/client/api/client.ts @@ -7,6 +7,7 @@ import type { ApiResponse, JobsListResponse, PipelineStatusResponse, + JobSource, PipelineRun } from '../../shared/types'; @@ -83,6 +84,7 @@ export async function getPipelineRuns(): Promise { export async function runPipeline(config?: { topN?: number; minSuitabilityScore?: number; + sources?: JobSource[]; }): Promise<{ message: string }> { return fetchApi<{ message: string }>('/pipeline/run', { method: 'POST', diff --git a/orchestrator/src/client/components/Header.tsx b/orchestrator/src/client/components/Header.tsx index 7fe16aa..605e3f9 100644 --- a/orchestrator/src/client/components/Header.tsx +++ b/orchestrator/src/client/components/Header.tsx @@ -3,7 +3,7 @@ */ import React from "react"; -import { Loader2, Play, RefreshCcw, Rocket, Trash2 } from "lucide-react"; +import { ChevronDown, Loader2, Play, RefreshCcw, Rocket, Trash2 } from "lucide-react"; import { Button } from "@/components/ui/button"; import { @@ -17,6 +17,16 @@ import { AlertDialogTitle, AlertDialogTrigger, } from "@/components/ui/alert-dialog"; +import { + DropdownMenu, + DropdownMenuCheckboxItem, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuLabel, + DropdownMenuSeparator, + DropdownMenuTrigger, +} from "@/components/ui/dropdown-menu"; +import type { JobSource } from "../../shared/types"; interface HeaderProps { onRunPipeline: () => void; @@ -24,6 +34,8 @@ interface HeaderProps { onClearDatabase: () => void; isPipelineRunning: boolean; isLoading: boolean; + pipelineSources: JobSource[]; + onPipelineSourcesChange: (sources: JobSource[]) => void; } export const Header: React.FC = ({ @@ -32,7 +44,26 @@ export const Header: React.FC = ({ onClearDatabase, isPipelineRunning, isLoading, + pipelineSources, + 
onPipelineSourcesChange, }) => { + const sourceLabel: Record = { + gradcracker: "Gradcracker", + indeed: "Indeed", + linkedin: "LinkedIn", + }; + + const orderedSources: JobSource[] = ["gradcracker", "indeed", "linkedin"]; + + const toggleSource = (source: JobSource, checked: boolean) => { + const next = checked + ? Array.from(new Set([...pipelineSources, source])) + : pipelineSources.filter((s) => s !== source); + + if (next.length === 0) return; + onPipelineSourcesChange(next); + }; + return (
@@ -81,19 +112,62 @@ export const Header: React.FC = ({ Refresh - +
+ + + + + + + + Sources + + {orderedSources.map((source) => ( + toggleSource(source, Boolean(checked))} + > + {sourceLabel[source]} + + ))} + + onPipelineSourcesChange(orderedSources)}> + All sources + + onPipelineSourcesChange(["gradcracker"])}> + Gradcracker only + + onPipelineSourcesChange(["indeed", "linkedin"])}> + Indeed + LinkedIn only + + + +
diff --git a/orchestrator/src/client/components/JobCard.tsx b/orchestrator/src/client/components/JobCard.tsx index 185136d..21aa1e1 100644 --- a/orchestrator/src/client/components/JobCard.tsx +++ b/orchestrator/src/client/components/JobCard.tsx @@ -16,6 +16,7 @@ import { XCircle, } from "lucide-react"; +import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; import { Card, CardContent, CardFooter, CardHeader, CardTitle } from "@/components/ui/card"; import type { Job } from "../../shared/types"; @@ -52,6 +53,12 @@ export const JobCard: React.FC = ({ onProcess, isProcessing, }) => { + const sourceLabel: Record = { + gradcracker: "Gradcracker", + indeed: "Indeed", + linkedin: "LinkedIn", + }; + const hasPdf = !!job.pdfPath; const canApply = job.status === "ready"; const canProcess = job.status === "discovered"; @@ -72,6 +79,9 @@ export const JobCard: React.FC = ({
+ + {sourceLabel[job.source]} +
@@ -181,4 +191,3 @@ export const JobCard: React.FC = ({ ); }; - diff --git a/orchestrator/src/components/ui/dropdown-menu.tsx b/orchestrator/src/components/ui/dropdown-menu.tsx new file mode 100644 index 0000000..35ee4b8 --- /dev/null +++ b/orchestrator/src/components/ui/dropdown-menu.tsx @@ -0,0 +1,193 @@ +import * as React from "react" +import * as DropdownMenuPrimitive from "@radix-ui/react-dropdown-menu" +import { Check, ChevronRight, Circle } from "lucide-react" + +import { cn } from "@/lib/utils" + +const DropdownMenu = DropdownMenuPrimitive.Root + +const DropdownMenuTrigger = DropdownMenuPrimitive.Trigger + +const DropdownMenuGroup = DropdownMenuPrimitive.Group + +const DropdownMenuPortal = DropdownMenuPrimitive.Portal + +const DropdownMenuSub = DropdownMenuPrimitive.Sub + +const DropdownMenuRadioGroup = DropdownMenuPrimitive.RadioGroup + +const DropdownMenuSubTrigger = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef & { + inset?: boolean + } +>(({ className, inset, children, ...props }, ref) => ( + + {children} + + +)) +DropdownMenuSubTrigger.displayName = DropdownMenuPrimitive.SubTrigger.displayName + +const DropdownMenuSubContent = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)) +DropdownMenuSubContent.displayName = DropdownMenuPrimitive.SubContent.displayName + +const DropdownMenuContent = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, sideOffset = 4, ...props }, ref) => ( + + + +)) +DropdownMenuContent.displayName = DropdownMenuPrimitive.Content.displayName + +const DropdownMenuItem = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef & { + inset?: boolean + } +>(({ className, inset, ...props }, ref) => ( + +)) +DropdownMenuItem.displayName = DropdownMenuPrimitive.Item.displayName + +const DropdownMenuCheckboxItem = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, 
children, checked, ...props }, ref) => ( + + + + + + + {children} + +)) +DropdownMenuCheckboxItem.displayName = + DropdownMenuPrimitive.CheckboxItem.displayName + +const DropdownMenuRadioItem = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, children, ...props }, ref) => ( + + + + + + + {children} + +)) +DropdownMenuRadioItem.displayName = DropdownMenuPrimitive.RadioItem.displayName + +const DropdownMenuLabel = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef & { + inset?: boolean + } +>(({ className, inset, ...props }, ref) => ( + +)) +DropdownMenuLabel.displayName = DropdownMenuPrimitive.Label.displayName + +const DropdownMenuSeparator = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)) +DropdownMenuSeparator.displayName = DropdownMenuPrimitive.Separator.displayName + +const DropdownMenuShortcut = ({ + className, + ...props +}: React.HTMLAttributes) => { + return ( + + ) +} +DropdownMenuShortcut.displayName = "DropdownMenuShortcut" + +export { + DropdownMenu, + DropdownMenuTrigger, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuCheckboxItem, + DropdownMenuRadioItem, + DropdownMenuLabel, + DropdownMenuSeparator, + DropdownMenuShortcut, + DropdownMenuGroup, + DropdownMenuPortal, + DropdownMenuSub, + DropdownMenuSubContent, + DropdownMenuSubTrigger, + DropdownMenuRadioGroup, +} + diff --git a/orchestrator/src/server/api/routes.ts b/orchestrator/src/server/api/routes.ts index 57d7212..0ca9845 100644 --- a/orchestrator/src/server/api/routes.ts +++ b/orchestrator/src/server/api/routes.ts @@ -280,6 +280,7 @@ apiRouter.get('/pipeline/runs', async (req: Request, res: Response) => { const runPipelineSchema = z.object({ topN: z.number().min(1).max(50).optional(), minSuitabilityScore: z.number().min(0).max(100).optional(), + sources: z.array(z.enum(['gradcracker', 'indeed', 'linkedin'])).min(1).optional(), }); 
apiRouter.post('/pipeline/run', async (req: Request, res: Response) => { diff --git a/orchestrator/src/server/db/migrate.ts b/orchestrator/src/server/db/migrate.ts index a9b47c7..4792170 100644 --- a/orchestrator/src/server/db/migrate.ts +++ b/orchestrator/src/server/db/migrate.ts @@ -25,6 +25,34 @@ const sqlite = new Database(DB_PATH); const migrations = [ `CREATE TABLE IF NOT EXISTS jobs ( id TEXT PRIMARY KEY, + source TEXT NOT NULL DEFAULT 'gradcracker', + source_job_id TEXT, + job_url_direct TEXT, + date_posted TEXT, + job_type TEXT, + salary_source TEXT, + salary_interval TEXT, + salary_min_amount REAL, + salary_max_amount REAL, + salary_currency TEXT, + is_remote INTEGER, + job_level TEXT, + job_function TEXT, + listing_type TEXT, + emails TEXT, + company_industry TEXT, + company_logo TEXT, + company_url_direct TEXT, + company_addresses TEXT, + company_num_employees TEXT, + company_revenue TEXT, + company_description TEXT, + skills TEXT, + experience_range TEXT, + company_rating REAL, + company_reviews_count INTEGER, + vacancy_count INTEGER, + work_from_home_type TEXT, title TEXT NOT NULL, employer TEXT NOT NULL, employer_url TEXT, @@ -60,6 +88,39 @@ const migrations = [ error_message TEXT )`, + // Add source column for existing databases (safe to skip if already present) + `ALTER TABLE jobs ADD COLUMN source TEXT NOT NULL DEFAULT 'gradcracker'`, + `UPDATE jobs SET source = 'gradcracker' WHERE source IS NULL OR source = ''`, + + // Add JobSpy columns for existing databases (safe to skip if already present) + `ALTER TABLE jobs ADD COLUMN source_job_id TEXT`, + `ALTER TABLE jobs ADD COLUMN job_url_direct TEXT`, + `ALTER TABLE jobs ADD COLUMN date_posted TEXT`, + `ALTER TABLE jobs ADD COLUMN job_type TEXT`, + `ALTER TABLE jobs ADD COLUMN salary_source TEXT`, + `ALTER TABLE jobs ADD COLUMN salary_interval TEXT`, + `ALTER TABLE jobs ADD COLUMN salary_min_amount REAL`, + `ALTER TABLE jobs ADD COLUMN salary_max_amount REAL`, + `ALTER TABLE jobs ADD COLUMN 
salary_currency TEXT`, + `ALTER TABLE jobs ADD COLUMN is_remote INTEGER`, + `ALTER TABLE jobs ADD COLUMN job_level TEXT`, + `ALTER TABLE jobs ADD COLUMN job_function TEXT`, + `ALTER TABLE jobs ADD COLUMN listing_type TEXT`, + `ALTER TABLE jobs ADD COLUMN emails TEXT`, + `ALTER TABLE jobs ADD COLUMN company_industry TEXT`, + `ALTER TABLE jobs ADD COLUMN company_logo TEXT`, + `ALTER TABLE jobs ADD COLUMN company_url_direct TEXT`, + `ALTER TABLE jobs ADD COLUMN company_addresses TEXT`, + `ALTER TABLE jobs ADD COLUMN company_num_employees TEXT`, + `ALTER TABLE jobs ADD COLUMN company_revenue TEXT`, + `ALTER TABLE jobs ADD COLUMN company_description TEXT`, + `ALTER TABLE jobs ADD COLUMN skills TEXT`, + `ALTER TABLE jobs ADD COLUMN experience_range TEXT`, + `ALTER TABLE jobs ADD COLUMN company_rating REAL`, + `ALTER TABLE jobs ADD COLUMN company_reviews_count INTEGER`, + `ALTER TABLE jobs ADD COLUMN vacancy_count INTEGER`, + `ALTER TABLE jobs ADD COLUMN work_from_home_type TEXT`, + `CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status)`, `CREATE INDEX IF NOT EXISTS idx_jobs_discovered_at ON jobs(discovered_at)`, `CREATE INDEX IF NOT EXISTS idx_pipeline_runs_started_at ON pipeline_runs(started_at)`, @@ -72,6 +133,16 @@ for (const migration of migrations) { sqlite.exec(migration); console.log('✅ Migration applied'); } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + const isDuplicateColumn = + migration.toLowerCase().includes('alter table jobs add column') && + message.toLowerCase().includes('duplicate column name'); + + if (isDuplicateColumn) { + console.log('↩️ Migration skipped (column already exists)'); + continue; + } + console.error('❌ Migration failed:', error); process.exit(1); } diff --git a/orchestrator/src/server/db/schema.ts b/orchestrator/src/server/db/schema.ts index 42784a8..3d66498 100644 --- a/orchestrator/src/server/db/schema.ts +++ b/orchestrator/src/server/db/schema.ts @@ -9,6 +9,10 @@ export const jobs = sqliteTable('jobs', { id: text('id').primaryKey(), // From crawler + source: text('source', { enum: ['gradcracker', 'indeed', 'linkedin'] }).notNull().default('gradcracker'), + sourceJobId: text('source_job_id'), + jobUrlDirect: text('job_url_direct'), + datePosted: text('date_posted'), title: text('title').notNull(), employer: text('employer').notNull(), employerUrl: text('employer_url'), @@ -21,6 +25,32 @@ export const jobs = sqliteTable('jobs', { degreeRequired: text('degree_required'), starting: text('starting'), jobDescription: text('job_description'), + + // JobSpy fields (nullable for other sources) + jobType: text('job_type'), + salarySource: text('salary_source'), + salaryInterval: text('salary_interval'), + salaryMinAmount: real('salary_min_amount'), + salaryMaxAmount: real('salary_max_amount'), + salaryCurrency: text('salary_currency'), + isRemote: integer('is_remote', { mode: 'boolean' }), + jobLevel: text('job_level'), + jobFunction: text('job_function'), + listingType: text('listing_type'), + emails: text('emails'), + companyIndustry: text('company_industry'), + companyLogo: text('company_logo'), + companyUrlDirect: text('company_url_direct'), + companyAddresses: text('company_addresses'), + companyNumEmployees: text('company_num_employees'), + companyRevenue: text('company_revenue'), + companyDescription: text('company_description'), + skills: text('skills'), 
+ experienceRange: text('experience_range'), + companyRating: real('company_rating'), + companyReviewsCount: integer('company_reviews_count'), + vacancyCount: integer('vacancy_count'), + workFromHomeType: text('work_from_home_type'), // Orchestrator enrichments status: text('status', { diff --git a/orchestrator/src/server/pipeline/orchestrator.ts b/orchestrator/src/server/pipeline/orchestrator.ts index 18d9980..5ec4f02 100644 --- a/orchestrator/src/server/pipeline/orchestrator.ts +++ b/orchestrator/src/server/pipeline/orchestrator.ts @@ -14,13 +14,14 @@ import { readFile } from 'fs/promises'; import { join, dirname } from 'path'; import { fileURLToPath } from 'url'; import { runCrawler } from '../services/crawler.js'; +import { runJobSpy } from '../services/jobspy.js'; import { scoreAndRankJobs, scoreJobSuitability } from '../services/scorer.js'; import { generateSummary } from '../services/summary.js'; import { generatePdf } from '../services/pdf.js'; import * as jobsRepo from '../repositories/jobs.js'; import * as pipelineRepo from '../repositories/pipeline.js'; -import { progressHelpers, resetProgress } from './progress.js'; -import type { Job, PipelineConfig } from '../../shared/types.js'; +import { progressHelpers, resetProgress, updateProgress } from './progress.js'; +import type { CreateJobInput, Job, JobSource, PipelineConfig } from '../../shared/types.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); const DEFAULT_PROFILE_PATH = join(__dirname, '../../../../resume-generator/base.json'); @@ -28,7 +29,7 @@ const DEFAULT_PROFILE_PATH = join(__dirname, '../../../../resume-generator/base. 
const DEFAULT_CONFIG: PipelineConfig = { topN: 10, minSuitabilityScore: 50, - sources: ['gradcracker'], + sources: ['gradcracker', 'indeed', 'linkedin'], profilePath: DEFAULT_PROFILE_PATH, outputDir: join(__dirname, '../../../data/pdfs'), }; @@ -73,31 +74,65 @@ export async function runPipeline(config: Partial = {}): Promise console.log('\n🕷️ Running crawler...'); progressHelpers.startCrawling(); const existingJobUrls = await jobsRepo.getAllJobUrls(); - const crawlerResult = await runCrawler({ - existingJobUrls, - onProgress: (update) => { - progressHelpers.crawlingUpdate({ - listPagesProcessed: update.listPagesProcessed, - listPagesTotal: update.listPagesTotal, - jobCardsFound: update.jobCardsFound, - jobPagesEnqueued: update.jobPagesEnqueued, - jobPagesSkipped: update.jobPagesSkipped, - jobPagesProcessed: update.jobPagesProcessed, - phase: update.phase, - currentUrl: update.currentUrl, - }); - }, - }); - - if (!crawlerResult.success) { - throw new Error(`Crawler failed: ${crawlerResult.error}`); + + const discoveredJobs: CreateJobInput[] = []; + const sourceErrors: string[] = []; + + if (mergedConfig.sources.includes('gradcracker')) { + const crawlerResult = await runCrawler({ + existingJobUrls, + onProgress: (update) => { + progressHelpers.crawlingUpdate({ + listPagesProcessed: update.listPagesProcessed, + listPagesTotal: update.listPagesTotal, + jobCardsFound: update.jobCardsFound, + jobPagesEnqueued: update.jobPagesEnqueued, + jobPagesSkipped: update.jobPagesSkipped, + jobPagesProcessed: update.jobPagesProcessed, + phase: update.phase, + currentUrl: update.currentUrl, + }); + }, + }); + + if (!crawlerResult.success) { + sourceErrors.push(`gradcracker: ${crawlerResult.error ?? 
'unknown error'}`); + } else { + discoveredJobs.push(...crawlerResult.jobs); + } } - - progressHelpers.crawlingComplete(crawlerResult.jobs.length); + + const jobSpySites = mergedConfig.sources.filter( + (s): s is 'indeed' | 'linkedin' => s === 'indeed' || s === 'linkedin' + ); + + if (jobSpySites.length > 0) { + updateProgress({ + step: 'crawling', + detail: `JobSpy: scraping ${jobSpySites.join(', ')}...`, + }); + + const jobSpyResult = await runJobSpy({ sites: jobSpySites }); + if (!jobSpyResult.success) { + sourceErrors.push(`jobspy: ${jobSpyResult.error ?? 'unknown error'}`); + } else { + discoveredJobs.push(...jobSpyResult.jobs); + } + } + + if (discoveredJobs.length === 0 && sourceErrors.length > 0) { + throw new Error(`All sources failed: ${sourceErrors.join('; ')}`); + } + + if (sourceErrors.length > 0) { + console.warn(`⚠️ Some sources failed: ${sourceErrors.join('; ')}`); + } + + progressHelpers.crawlingComplete(discoveredJobs.length); // Step 3: Import discovered jobs console.log('\n💾 Importing jobs to database...'); - const { created, skipped } = await jobsRepo.bulkCreateJobs(crawlerResult.jobs); + const { created, skipped } = await jobsRepo.bulkCreateJobs(discoveredJobs); console.log(` Created: ${created}, Skipped (duplicates): ${skipped}`); progressHelpers.importComplete(created, skipped); diff --git a/orchestrator/src/server/repositories/jobs.ts b/orchestrator/src/server/repositories/jobs.ts index 19a3461..98e469f 100644 --- a/orchestrator/src/server/repositories/jobs.ts +++ b/orchestrator/src/server/repositories/jobs.ts @@ -60,6 +60,10 @@ export async function createJob(input: CreateJobInput): Promise { await db.insert(jobs).values({ id, + source: input.source, + sourceJobId: input.sourceJobId ?? null, + jobUrlDirect: input.jobUrlDirect ?? null, + datePosted: input.datePosted ?? null, title: input.title, employer: input.employer, employerUrl: input.employerUrl ?? 
null, @@ -72,6 +76,30 @@ export async function createJob(input: CreateJobInput): Promise { degreeRequired: input.degreeRequired ?? null, starting: input.starting ?? null, jobDescription: input.jobDescription ?? null, + jobType: input.jobType ?? null, + salarySource: input.salarySource ?? null, + salaryInterval: input.salaryInterval ?? null, + salaryMinAmount: input.salaryMinAmount ?? null, + salaryMaxAmount: input.salaryMaxAmount ?? null, + salaryCurrency: input.salaryCurrency ?? null, + isRemote: input.isRemote ?? null, + jobLevel: input.jobLevel ?? null, + jobFunction: input.jobFunction ?? null, + listingType: input.listingType ?? null, + emails: input.emails ?? null, + companyIndustry: input.companyIndustry ?? null, + companyLogo: input.companyLogo ?? null, + companyUrlDirect: input.companyUrlDirect ?? null, + companyAddresses: input.companyAddresses ?? null, + companyNumEmployees: input.companyNumEmployees ?? null, + companyRevenue: input.companyRevenue ?? null, + companyDescription: input.companyDescription ?? null, + skills: input.skills ?? null, + experienceRange: input.experienceRange ?? null, + companyRating: input.companyRating ?? null, + companyReviewsCount: input.companyReviewsCount ?? null, + vacancyCount: input.vacancyCount ?? null, + workFromHomeType: input.workFromHomeType ?? null, status: 'discovered', discoveredAt: now, createdAt: now, @@ -171,6 +199,10 @@ export async function getJobsForProcessing(limit: number = 10): Promise { function mapRowToJob(row: typeof jobs.$inferSelect): Job { return { id: row.id, + source: row.source as Job['source'], + sourceJobId: row.sourceJobId ?? null, + jobUrlDirect: row.jobUrlDirect ?? null, + datePosted: row.datePosted ?? null, title: row.title, employer: row.employer, employerUrl: row.employerUrl, @@ -189,6 +221,30 @@ function mapRowToJob(row: typeof jobs.$inferSelect): Job { tailoredSummary: row.tailoredSummary, pdfPath: row.pdfPath, notionPageId: row.notionPageId, + jobType: row.jobType ?? 
null, + salarySource: row.salarySource ?? null, + salaryInterval: row.salaryInterval ?? null, + salaryMinAmount: row.salaryMinAmount ?? null, + salaryMaxAmount: row.salaryMaxAmount ?? null, + salaryCurrency: row.salaryCurrency ?? null, + isRemote: row.isRemote ?? null, + jobLevel: row.jobLevel ?? null, + jobFunction: row.jobFunction ?? null, + listingType: row.listingType ?? null, + emails: row.emails ?? null, + companyIndustry: row.companyIndustry ?? null, + companyLogo: row.companyLogo ?? null, + companyUrlDirect: row.companyUrlDirect ?? null, + companyAddresses: row.companyAddresses ?? null, + companyNumEmployees: row.companyNumEmployees ?? null, + companyRevenue: row.companyRevenue ?? null, + companyDescription: row.companyDescription ?? null, + skills: row.skills ?? null, + experienceRange: row.experienceRange ?? null, + companyRating: row.companyRating ?? null, + companyReviewsCount: row.companyReviewsCount ?? null, + vacancyCount: row.vacancyCount ?? null, + workFromHomeType: row.workFromHomeType ?? null, discoveredAt: row.discoveredAt, processedAt: row.processedAt, appliedAt: row.appliedAt, diff --git a/orchestrator/src/server/services/crawler.ts b/orchestrator/src/server/services/crawler.ts index c654dae..d825c1e 100644 --- a/orchestrator/src/server/services/crawler.ts +++ b/orchestrator/src/server/services/crawler.ts @@ -1,5 +1,5 @@ /** - * Service for running the job crawler (job-extractor). + * Service for running the Gradcracker crawler (extractors/gradcracker). * Wraps the existing Crawlee-based crawler. 
*/ @@ -11,7 +11,7 @@ import { createInterface } from 'readline'; import type { CreateJobInput } from '../../shared/types.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); -const CRAWLER_DIR = join(__dirname, '../../../../job-extractor'); +const CRAWLER_DIR = join(__dirname, '../../../../extractors/gradcracker'); const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default'); const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops'); @@ -29,7 +29,7 @@ export interface RunCrawlerOptions { existingJobUrls?: string[]; /** - * Optional callback for live crawl progress emitted by job-extractor. + * Optional callback for live crawl progress emitted by the Gradcracker extractor. */ onProgress?: (update: JobExtractorProgress) => void; } @@ -57,7 +57,7 @@ async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined): } /** - * Run the job-extractor crawler and return discovered jobs. + * Run the Gradcracker crawler and return discovered jobs. */ export async function runCrawler(options: RunCrawlerOptions = {}): Promise { console.log('🕷️ Starting job crawler...'); @@ -144,6 +144,7 @@ async function readCrawledJobs(): Promise { // Map crawler output to our job input format jobs.push({ + source: 'gradcracker', title: data.title || 'Unknown Title', employer: data.employer || 'Unknown Employer', employerUrl: data.employerUrl, diff --git a/orchestrator/src/server/services/index.ts b/orchestrator/src/server/services/index.ts index 43a7e1f..72f6b05 100644 --- a/orchestrator/src/server/services/index.ts +++ b/orchestrator/src/server/services/index.ts @@ -1,4 +1,5 @@ export * from './crawler.js'; +export * from './jobspy.js'; export * from './scorer.js'; export * from './summary.js'; export * from './pdf.js'; diff --git a/orchestrator/src/server/services/jobspy.ts b/orchestrator/src/server/services/jobspy.ts new file mode 100644 index 0000000..0846bad --- /dev/null +++ b/orchestrator/src/server/services/jobspy.ts @@ -0,0 +1,241 @@ +/** + * 
Service for scraping jobs via JobSpy (Indeed/LinkedIn/etc) and mapping them into our DB shape. + * + * Uses a small Python wrapper script that writes both CSV + JSON to disk; we ingest the JSON. + */ + +import { spawn } from 'child_process'; +import { readFile, mkdir } from 'fs/promises'; +import { join, dirname } from 'path'; +import { fileURLToPath } from 'url'; +import type { CreateJobInput, JobSource } from '../../shared/types.js'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const JOBSPY_DIR = join(__dirname, '../../../../extractors/jobspy'); +const JOBSPY_SCRIPT = join(JOBSPY_DIR, 'scrape_jobs.py'); + +function getPythonPath(): string { + if (process.env.PYTHON_PATH) return process.env.PYTHON_PATH; + return process.platform === 'win32' ? 'python' : 'python3'; +} + +function getDataDir(): string { + if (process.env.DATA_DIR) return process.env.DATA_DIR; + return join(__dirname, '../../../data'); +} + +function toStringOrNull(value: unknown): string | null { + if (value === null || value === undefined) return null; + if (typeof value === 'string') { + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : null; + } + if (typeof value === 'number' || typeof value === 'boolean') return String(value); + return null; +} + +function toNumberOrNull(value: unknown): number | null { + if (value === null || value === undefined) return null; + if (typeof value === 'number') return Number.isFinite(value) ? value : null; + if (typeof value === 'string') { + const trimmed = value.trim(); + if (!trimmed) return null; + const parsed = Number(trimmed); + return Number.isFinite(parsed) ? 
parsed : null; + } + return null; +} + +function toBooleanOrNull(value: unknown): boolean | null { + if (value === null || value === undefined) return null; + if (typeof value === 'boolean') return value; + if (typeof value === 'number') return value !== 0; + if (typeof value === 'string') { + const normalized = value.trim().toLowerCase(); + if (!normalized) return null; + if (['1', 'true', 'yes', 'y', 'on'].includes(normalized)) return true; + if (['0', 'false', 'no', 'n', 'off'].includes(normalized)) return false; + } + return null; +} + +function toJsonStringOrNull(value: unknown): string | null { + if (value === null || value === undefined) return null; + if (typeof value === 'string') return toStringOrNull(value); + try { + return JSON.stringify(value); + } catch { + return null; + } +} + +function toJobSource(site: unknown): JobSource | null { + const raw = toStringOrNull(site)?.toLowerCase(); + if (raw === 'gradcracker') return 'gradcracker'; + if (raw === 'indeed') return 'indeed'; + if (raw === 'linkedin') return 'linkedin'; + return null; +} + +function formatSalary(params: { + minAmount: number | null; + maxAmount: number | null; + currency: string | null; + interval: string | null; +}): string | null { + const { minAmount, maxAmount, currency, interval } = params; + if (minAmount === null && maxAmount === null) return null; + + const fmt = (n: number) => { + // Avoid locale ambiguity; keep it simple. + const rounded = Math.round(n); + return `${rounded}`; + }; + + let range: string; + if (minAmount !== null && maxAmount !== null) { + range = `${fmt(minAmount)}-${fmt(maxAmount)}`; + } else if (minAmount !== null) { + range = `${fmt(minAmount)}+`; + } else if (maxAmount !== null) { + range = `${fmt(maxAmount)}`; + } else { + return null; + } + + const currencyPart = currency ? `${currency} ` : ''; + const intervalPart = interval ? 
` / ${interval}` : ''; + return `${currencyPart}${range}${intervalPart}`.trim(); +} + +export interface RunJobSpyOptions { + sites?: Array; + searchTerm?: string; + location?: string; + resultsWanted?: number; + hoursOld?: number; + countryIndeed?: string; + linkedinFetchDescription?: boolean; +} + +export interface JobSpyResult { + success: boolean; + jobs: CreateJobInput[]; + error?: string; +} + +export async function runJobSpy(options: RunJobSpyOptions = {}): Promise { + const dataDir = getDataDir(); + const outputDir = join(dataDir, 'imports'); + await mkdir(outputDir, { recursive: true }); + + const outputCsv = join(outputDir, 'jobspy_jobs.csv'); + const outputJson = join(outputDir, 'jobspy_jobs.json'); + + const sites = (options.sites ?? ['indeed', 'linkedin']) + .filter((s) => s === 'indeed' || s === 'linkedin') + .join(','); + + try { + await new Promise((resolve, reject) => { + const pythonPath = getPythonPath(); + const child = spawn(pythonPath, [JOBSPY_SCRIPT], { + cwd: JOBSPY_DIR, + shell: false, + stdio: 'inherit', + env: { + ...process.env, + JOBSPY_SITES: sites || 'indeed,linkedin', + JOBSPY_SEARCH_TERM: options.searchTerm ?? process.env.JOBSPY_SEARCH_TERM ?? 'web developer', + JOBSPY_LOCATION: options.location ?? process.env.JOBSPY_LOCATION ?? 'UK', + JOBSPY_RESULTS_WANTED: String(options.resultsWanted ?? process.env.JOBSPY_RESULTS_WANTED ?? 200), + JOBSPY_HOURS_OLD: String(options.hoursOld ?? process.env.JOBSPY_HOURS_OLD ?? 72), + JOBSPY_COUNTRY_INDEED: options.countryIndeed ?? process.env.JOBSPY_COUNTRY_INDEED ?? 'UK', + JOBSPY_LINKEDIN_FETCH_DESCRIPTION: String( + options.linkedinFetchDescription ?? process.env.JOBSPY_LINKEDIN_FETCH_DESCRIPTION ?? 
'1' + ), + JOBSPY_OUTPUT_CSV: outputCsv, + JOBSPY_OUTPUT_JSON: outputJson, + }, + }); + + child.on('close', (code) => { + if (code === 0) resolve(); + else reject(new Error(`JobSpy exited with code ${code}`)); + }); + child.on('error', reject); + }); + + const raw = await readFile(outputJson, 'utf-8'); + const parsed = JSON.parse(raw) as Array>; + + const jobs: CreateJobInput[] = []; + + for (const row of parsed) { + const source = toJobSource(row.site); + if (!source) continue; + + const jobUrl = toStringOrNull(row.job_url); + if (!jobUrl) continue; + + const title = toStringOrNull(row.title) ?? 'Unknown Title'; + const employer = toStringOrNull(row.company) ?? 'Unknown Employer'; + + const jobUrlDirect = toStringOrNull(row.job_url_direct); + const applicationLink = jobUrlDirect ?? jobUrl; + + const minAmount = toNumberOrNull(row.min_amount); + const maxAmount = toNumberOrNull(row.max_amount); + const currency = toStringOrNull(row.currency); + const interval = toStringOrNull(row.interval); + + const salary = formatSalary({ minAmount, maxAmount, currency, interval }); + + jobs.push({ + source, + sourceJobId: toStringOrNull(row.id) ?? undefined, + jobUrlDirect: jobUrlDirect ?? undefined, + datePosted: toStringOrNull(row.date_posted) ?? undefined, + + title, + employer, + employerUrl: toStringOrNull(row.company_url) ?? undefined, + jobUrl, + applicationLink, + location: toStringOrNull(row.location) ?? undefined, + jobDescription: toStringOrNull(row.description) ?? undefined, + salary: salary ?? undefined, + + jobType: toStringOrNull(row.job_type) ?? undefined, + salarySource: toStringOrNull(row.salary_source) ?? undefined, + salaryInterval: interval ?? undefined, + salaryMinAmount: minAmount ?? undefined, + salaryMaxAmount: maxAmount ?? undefined, + salaryCurrency: currency ?? undefined, + isRemote: toBooleanOrNull(row.is_remote) ?? undefined, + jobLevel: toStringOrNull(row.job_level) ?? undefined, + jobFunction: toStringOrNull(row.job_function) ?? 
undefined, + listingType: toStringOrNull(row.listing_type) ?? undefined, + emails: toJsonStringOrNull(row.emails) ?? undefined, + companyIndustry: toStringOrNull(row.company_industry) ?? undefined, + companyLogo: toStringOrNull(row.company_logo) ?? undefined, + companyUrlDirect: toStringOrNull(row.company_url_direct) ?? undefined, + companyAddresses: toJsonStringOrNull(row.company_addresses) ?? undefined, + companyNumEmployees: toStringOrNull(row.company_num_employees) ?? undefined, + companyRevenue: toStringOrNull(row.company_revenue) ?? undefined, + companyDescription: toStringOrNull(row.company_description) ?? undefined, + skills: toJsonStringOrNull(row.skills) ?? undefined, + experienceRange: toJsonStringOrNull(row.experience_range) ?? undefined, + companyRating: toNumberOrNull(row.company_rating) ?? undefined, + companyReviewsCount: toNumberOrNull(row.company_reviews_count) ?? undefined, + vacancyCount: toNumberOrNull(row.vacancy_count) ?? undefined, + workFromHomeType: toStringOrNull(row.work_from_home_type) ?? undefined, + }); + } + + return { success: true, jobs }; + } catch (error) { + const message = error instanceof Error ? 
error.message : 'Unknown error'; + return { success: false, jobs: [], error: message }; + } +} diff --git a/orchestrator/src/shared/types.ts b/orchestrator/src/shared/types.ts index f6cd0a0..235969c 100644 --- a/orchestrator/src/shared/types.ts +++ b/orchestrator/src/shared/types.ts @@ -10,10 +10,21 @@ export type JobStatus = | 'rejected' // User rejected this job | 'expired'; // Deadline passed +export type JobSource = + | 'gradcracker' + | 'indeed' + | 'linkedin'; + export interface Job { id: string; - // From crawler + // Source / provenance + source: JobSource; + sourceJobId: string | null; // External ID (if provided) + jobUrlDirect: string | null; // Source-provided direct URL (if provided) + datePosted: string | null; // Source-provided posting date (if provided) + + // From crawler (normalized) title: string; employer: string; employerUrl: string | null; @@ -34,6 +45,32 @@ export interface Job { tailoredSummary: string | null; // Generated resume summary pdfPath: string | null; // Path to generated PDF notionPageId: string | null; // Notion page ID if synced + + // JobSpy fields (nullable for non-JobSpy sources) + jobType: string | null; + salarySource: string | null; + salaryInterval: string | null; + salaryMinAmount: number | null; + salaryMaxAmount: number | null; + salaryCurrency: string | null; + isRemote: boolean | null; + jobLevel: string | null; + jobFunction: string | null; + listingType: string | null; + emails: string | null; + companyIndustry: string | null; + companyLogo: string | null; + companyUrlDirect: string | null; + companyAddresses: string | null; + companyNumEmployees: string | null; + companyRevenue: string | null; + companyDescription: string | null; + skills: string | null; + experienceRange: string | null; + companyRating: number | null; + companyReviewsCount: number | null; + vacancyCount: number | null; + workFromHomeType: string | null; // Timestamps discoveredAt: string; @@ -44,6 +81,7 @@ export interface Job { } export 
interface CreateJobInput { + source: JobSource; title: string; employer: string; employerUrl?: string; @@ -56,6 +94,35 @@ export interface CreateJobInput { degreeRequired?: string; starting?: string; jobDescription?: string; + + // JobSpy fields (optional) + sourceJobId?: string; + jobUrlDirect?: string; + datePosted?: string; + jobType?: string; + salarySource?: string; + salaryInterval?: string; + salaryMinAmount?: number; + salaryMaxAmount?: number; + salaryCurrency?: string; + isRemote?: boolean; + jobLevel?: string; + jobFunction?: string; + listingType?: string; + emails?: string; + companyIndustry?: string; + companyLogo?: string; + companyUrlDirect?: string; + companyAddresses?: string; + companyNumEmployees?: string; + companyRevenue?: string; + companyDescription?: string; + skills?: string; + experienceRange?: string; + companyRating?: number; + companyReviewsCount?: number; + vacancyCount?: number; + workFromHomeType?: string; } export interface UpdateJobInput { @@ -71,7 +138,7 @@ export interface UpdateJobInput { export interface PipelineConfig { topN: number; // Number of top jobs to process minSuitabilityScore: number; // Minimum score to auto-process - sources: string[]; // Job sources to crawl + sources: JobSource[]; // Job sources to crawl profilePath: string; // Path to profile JSON outputDir: string; // Directory for generated PDFs }