diff --git a/Dockerfile b/Dockerfile index d95c108..cbb5f5c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,13 +27,13 @@ RUN pip3 install --no-cache-dir --break-system-packages playwright python-jobspy # Copy package files first for better caching COPY orchestrator/package*.json ./orchestrator/ -COPY job-extractor/package*.json ./job-extractor/ +COPY extractors/gradcracker/package*.json ./extractors/gradcracker/ # Install Node.js dependencies WORKDIR /app/orchestrator RUN npm install --production=false -WORKDIR /app/job-extractor +WORKDIR /app/extractors/gradcracker RUN npm install --production=false # Install Camoufox browser (downloads its own Firefox fork) @@ -42,8 +42,8 @@ RUN npx camoufox fetch # Copy source code WORKDIR /app COPY orchestrator ./orchestrator -COPY job-extractor ./job-extractor -COPY jobspy-extractor ./jobspy-extractor +COPY extractors/gradcracker ./extractors/gradcracker +COPY extractors/jobspy ./extractors/jobspy COPY resume-generator ./resume-generator # Build the orchestrator (client + server) diff --git a/README.md b/README.md index 8ad6d60..db4f764 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ Automated job discovery -> AI suitability scoring -> tailored resume PDFs -> a d ## How it works (pipeline) -1. **Crawl**: `job-extractor` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs). -2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`job-extractor/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique). +1. **Crawl**: `extractors/gradcracker` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs). +2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`extractors/gradcracker/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique). 3. **Score**: `orchestrator` scores up to 50 unprocessed jobs via OpenRouter (cached as `suitabilityScore`/`suitabilityReason`). 4. **Select**: take the top `N` jobs above `minSuitabilityScore`. 5. **Process**: for each selected job: @@ -30,7 +30,7 @@ flowchart LR PDFS[(PDFs
pdfs/)] end - subgraph CRAWL["job-extractor (Crawlee/Playwright)"] + subgraph CRAWL["extractors/gradcracker (Crawlee/Playwright)"] C1["Seed search URLs
(locations x roles)"] C2["Parse list pages
enqueue job pages"] C3["Parse job pages
extract JD + apply URL"] @@ -73,7 +73,8 @@ job-ops/ src/server/ # API routes, pipeline, DB, services src/client/ # UI (polls jobs, listens to SSE progress) src/shared/ # shared types (Job, PipelineRun, etc.) - job-extractor/ # Crawlee crawler (Gradcracker) + extractors/gradcracker/ # Crawlee crawler (Gradcracker) + extractors/jobspy/ # JobSpy wrapper (Indeed/LinkedIn/etc) resume-generator/ # Python Playwright automation for rxresu.me base.json # your exported base resume (template) data/ # persisted runtime artifacts (Docker default) @@ -113,7 +114,7 @@ Install Node deps (both packages): ```bash cd orchestrator && npm install -cd ../job-extractor && npm install +cd ../extractors/gradcracker && npm install ``` Configure the orchestrator env + DB: @@ -153,7 +154,7 @@ Dev URLs: ## Notes / sharp edges -- **Crawl targets**: edit `job-extractor/src/main.ts` to change the Gradcracker location/role matrix. +- **Crawl targets**: edit `extractors/gradcracker/src/main.ts` to change the Gradcracker location/role matrix. - **Notion sync is schema-dependent**: `orchestrator/src/server/services/notion.ts` assumes property names; adjust to match your Notion database. - **Pipeline config knobs**: `POST /api/pipeline/run` accepts `{ topN, minSuitabilityScore }`; `PIPELINE_TOP_N`/`PIPELINE_MIN_SCORE` are used by `npm run pipeline:run` (CLI runner). - **Anti-bot reality**: crawling is headless + "humanized", but sites can still block; expect occasional flakiness. diff --git a/job-extractor/.dockerignore b/extractors/gradcracker/.dockerignore similarity index 100% rename from job-extractor/.dockerignore rename to extractors/gradcracker/.dockerignore diff --git a/job-extractor/.gitignore b/extractors/gradcracker/.gitignore similarity index 100% rename from job-extractor/.gitignore rename to extractors/gradcracker/.gitignore diff --git a/job-extractor/Dockerfile b/extractors/gradcracker/Dockerfile similarity index 100% rename from job-extractor/Dockerfile rename to extractors/gradcracker/Dockerfile diff --git a/job-extractor/README.md b/extractors/gradcracker/README.md similarity index 100% rename from job-extractor/README.md rename to extractors/gradcracker/README.md diff --git a/job-extractor/package-lock.json b/extractors/gradcracker/package-lock.json similarity index 100% rename from job-extractor/package-lock.json rename to extractors/gradcracker/package-lock.json diff --git a/job-extractor/package.json b/extractors/gradcracker/package.json similarity index 100% rename from job-extractor/package.json rename to extractors/gradcracker/package.json diff --git a/job-extractor/src/main.ts b/extractors/gradcracker/src/main.ts similarity index 100% rename from job-extractor/src/main.ts rename to extractors/gradcracker/src/main.ts diff --git a/job-extractor/src/progress.ts b/extractors/gradcracker/src/progress.ts similarity index 100% rename from job-extractor/src/progress.ts rename to extractors/gradcracker/src/progress.ts diff --git a/job-extractor/src/routes.ts b/extractors/gradcracker/src/routes.ts similarity index 100% rename from job-extractor/src/routes.ts rename to extractors/gradcracker/src/routes.ts diff --git a/job-extractor/tsconfig.json b/extractors/gradcracker/tsconfig.json similarity index 100% rename from job-extractor/tsconfig.json rename to extractors/gradcracker/tsconfig.json diff --git a/jobspy-extractor/requirements.txt b/extractors/jobspy/requirements.txt similarity index 100% rename from jobspy-extractor/requirements.txt rename to extractors/jobspy/requirements.txt diff --git a/jobspy-extractor/scrape_jobs.py b/extractors/jobspy/scrape_jobs.py similarity index 100% rename from jobspy-extractor/scrape_jobs.py rename to extractors/jobspy/scrape_jobs.py diff --git a/orchestrator/src/server/services/crawler.ts b/orchestrator/src/server/services/crawler.ts index 7d07f4e..d825c1e 100644 --- a/orchestrator/src/server/services/crawler.ts +++ b/orchestrator/src/server/services/crawler.ts @@ -1,5 +1,5 @@ /** - * Service for running the job crawler (job-extractor). + * Service for running the Gradcracker crawler (extractors/gradcracker). * Wraps the existing Crawlee-based crawler. */ @@ -11,7 +11,7 @@ import { createInterface } from 'readline'; import type { CreateJobInput } from '../../shared/types.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); -const CRAWLER_DIR = join(__dirname, '../../../../job-extractor'); +const CRAWLER_DIR = join(__dirname, '../../../../extractors/gradcracker'); const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default'); const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops'); @@ -29,7 +29,7 @@ export interface RunCrawlerOptions { existingJobUrls?: string[]; /** - * Optional callback for live crawl progress emitted by job-extractor. + * Optional callback for live crawl progress emitted by the Gradcracker extractor. */ onProgress?: (update: JobExtractorProgress) => void; } @@ -57,7 +57,7 @@ async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined): } /** - * Run the job-extractor crawler and return discovered jobs. + * Run the Gradcracker crawler and return discovered jobs. */ export async function runCrawler(options: RunCrawlerOptions = {}): Promise { console.log('🕷️ Starting job crawler...'); diff --git a/orchestrator/src/server/services/jobspy.ts b/orchestrator/src/server/services/jobspy.ts index 2593e81..0846bad 100644 --- a/orchestrator/src/server/services/jobspy.ts +++ b/orchestrator/src/server/services/jobspy.ts @@ -11,7 +11,7 @@ import { fileURLToPath } from 'url'; import type { CreateJobInput, JobSource } from '../../shared/types.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); -const JOBSPY_DIR = join(__dirname, '../../../../jobspy-extractor'); +const JOBSPY_DIR = join(__dirname, '../../../../extractors/jobspy'); const JOBSPY_SCRIPT = join(JOBSPY_DIR, 'scrape_jobs.py'); function getPythonPath(): string {