diff --git a/Dockerfile b/Dockerfile
index d95c108..cbb5f5c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,13 +27,13 @@ RUN pip3 install --no-cache-dir --break-system-packages playwright python-jobspy
# Copy package files first for better caching
COPY orchestrator/package*.json ./orchestrator/
-COPY job-extractor/package*.json ./job-extractor/
+COPY extractors/gradcracker/package*.json ./extractors/gradcracker/
# Install Node.js dependencies
WORKDIR /app/orchestrator
RUN npm install --production=false
-WORKDIR /app/job-extractor
+WORKDIR /app/extractors/gradcracker
RUN npm install --production=false
# Install Camoufox browser (downloads its own Firefox fork)
@@ -42,8 +42,8 @@ RUN npx camoufox fetch
# Copy source code
WORKDIR /app
COPY orchestrator ./orchestrator
-COPY job-extractor ./job-extractor
-COPY jobspy-extractor ./jobspy-extractor
+COPY extractors/gradcracker ./extractors/gradcracker
+COPY extractors/jobspy ./extractors/jobspy
COPY resume-generator ./resume-generator
# Build the orchestrator (client + server)
diff --git a/README.md b/README.md
index 8ad6d60..db4f764 100644
--- a/README.md
+++ b/README.md
@@ -4,8 +4,8 @@ Automated job discovery -> AI suitability scoring -> tailored resume PDFs -> a d
## How it works (pipeline)
-1. **Crawl**: `job-extractor` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs).
-2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`job-extractor/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique).
+1. **Crawl**: `extractors/gradcracker` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs).
+2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`extractors/gradcracker/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique).
3. **Score**: `orchestrator` scores up to 50 unprocessed jobs via OpenRouter (cached as `suitabilityScore`/`suitabilityReason`).
4. **Select**: take the top `N` jobs above `minSuitabilityScore`.
5. **Process**: for each selected job:
@@ -30,7 +30,7 @@ flowchart LR
PDFS[(PDFs
pdfs/)]
end
- subgraph CRAWL["job-extractor (Crawlee/Playwright)"]
+ subgraph CRAWL["extractors/gradcracker (Crawlee/Playwright)"]
C1["Seed search URLs
(locations x roles)"]
C2["Parse list pages
enqueue job pages"]
C3["Parse job pages
extract JD + apply URL"]
@@ -73,7 +73,8 @@ job-ops/
src/server/ # API routes, pipeline, DB, services
src/client/ # UI (polls jobs, listens to SSE progress)
src/shared/ # shared types (Job, PipelineRun, etc.)
- job-extractor/ # Crawlee crawler (Gradcracker)
+ extractors/gradcracker/ # Crawlee crawler (Gradcracker)
+ extractors/jobspy/ # JobSpy wrapper (Indeed/LinkedIn/etc.)
resume-generator/ # Python Playwright automation for rxresu.me
base.json # your exported base resume (template)
data/ # persisted runtime artifacts (Docker default)
@@ -113,7 +114,7 @@ Install Node deps (both packages):
```bash
cd orchestrator && npm install
-cd ../job-extractor && npm install
+cd ../extractors/gradcracker && npm install
```
Configure the orchestrator env + DB:
@@ -153,7 +154,7 @@ Dev URLs:
## Notes / sharp edges
-- **Crawl targets**: edit `job-extractor/src/main.ts` to change the Gradcracker location/role matrix.
+- **Crawl targets**: edit `extractors/gradcracker/src/main.ts` to change the Gradcracker location/role matrix.
- **Notion sync is schema-dependent**: `orchestrator/src/server/services/notion.ts` assumes property names; adjust to match your Notion database.
- **Pipeline config knobs**: `POST /api/pipeline/run` accepts `{ topN, minSuitabilityScore }`; `PIPELINE_TOP_N`/`PIPELINE_MIN_SCORE` are used by `npm run pipeline:run` (CLI runner).
- **Anti-bot reality**: crawling is headless + "humanized", but sites can still block; expect occasional flakiness.
diff --git a/job-extractor/.dockerignore b/extractors/gradcracker/.dockerignore
similarity index 100%
rename from job-extractor/.dockerignore
rename to extractors/gradcracker/.dockerignore
diff --git a/job-extractor/.gitignore b/extractors/gradcracker/.gitignore
similarity index 100%
rename from job-extractor/.gitignore
rename to extractors/gradcracker/.gitignore
diff --git a/job-extractor/Dockerfile b/extractors/gradcracker/Dockerfile
similarity index 100%
rename from job-extractor/Dockerfile
rename to extractors/gradcracker/Dockerfile
diff --git a/job-extractor/README.md b/extractors/gradcracker/README.md
similarity index 100%
rename from job-extractor/README.md
rename to extractors/gradcracker/README.md
diff --git a/job-extractor/package-lock.json b/extractors/gradcracker/package-lock.json
similarity index 100%
rename from job-extractor/package-lock.json
rename to extractors/gradcracker/package-lock.json
diff --git a/job-extractor/package.json b/extractors/gradcracker/package.json
similarity index 100%
rename from job-extractor/package.json
rename to extractors/gradcracker/package.json
diff --git a/job-extractor/src/main.ts b/extractors/gradcracker/src/main.ts
similarity index 100%
rename from job-extractor/src/main.ts
rename to extractors/gradcracker/src/main.ts
diff --git a/job-extractor/src/progress.ts b/extractors/gradcracker/src/progress.ts
similarity index 100%
rename from job-extractor/src/progress.ts
rename to extractors/gradcracker/src/progress.ts
diff --git a/job-extractor/src/routes.ts b/extractors/gradcracker/src/routes.ts
similarity index 100%
rename from job-extractor/src/routes.ts
rename to extractors/gradcracker/src/routes.ts
diff --git a/job-extractor/tsconfig.json b/extractors/gradcracker/tsconfig.json
similarity index 100%
rename from job-extractor/tsconfig.json
rename to extractors/gradcracker/tsconfig.json
diff --git a/jobspy-extractor/requirements.txt b/extractors/jobspy/requirements.txt
similarity index 100%
rename from jobspy-extractor/requirements.txt
rename to extractors/jobspy/requirements.txt
diff --git a/jobspy-extractor/scrape_jobs.py b/extractors/jobspy/scrape_jobs.py
similarity index 100%
rename from jobspy-extractor/scrape_jobs.py
rename to extractors/jobspy/scrape_jobs.py
diff --git a/orchestrator/src/server/services/crawler.ts b/orchestrator/src/server/services/crawler.ts
index 7d07f4e..d825c1e 100644
--- a/orchestrator/src/server/services/crawler.ts
+++ b/orchestrator/src/server/services/crawler.ts
@@ -1,5 +1,5 @@
/**
- * Service for running the job crawler (job-extractor).
+ * Service for running the Gradcracker crawler (extractors/gradcracker).
* Wraps the existing Crawlee-based crawler.
*/
@@ -11,7 +11,7 @@ import { createInterface } from 'readline';
import type { CreateJobInput } from '../../shared/types.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
-const CRAWLER_DIR = join(__dirname, '../../../../job-extractor');
+const CRAWLER_DIR = join(__dirname, '../../../../extractors/gradcracker');
const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default');
const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops');
@@ -29,7 +29,7 @@ export interface RunCrawlerOptions {
existingJobUrls?: string[];
/**
- * Optional callback for live crawl progress emitted by job-extractor.
+ * Optional callback for live crawl progress emitted by the Gradcracker extractor.
*/
onProgress?: (update: JobExtractorProgress) => void;
}
@@ -57,7 +57,7 @@ async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined):
}
/**
- * Run the job-extractor crawler and return discovered jobs.
+ * Run the Gradcracker crawler and return discovered jobs.
*/
export async function runCrawler(options: RunCrawlerOptions = {}): Promise {
console.log('🕷️ Starting job crawler...');
diff --git a/orchestrator/src/server/services/jobspy.ts b/orchestrator/src/server/services/jobspy.ts
index 2593e81..0846bad 100644
--- a/orchestrator/src/server/services/jobspy.ts
+++ b/orchestrator/src/server/services/jobspy.ts
@@ -11,7 +11,7 @@ import { fileURLToPath } from 'url';
import type { CreateJobInput, JobSource } from '../../shared/types.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
-const JOBSPY_DIR = join(__dirname, '../../../../jobspy-extractor');
+const JOBSPY_DIR = join(__dirname, '../../../../extractors/jobspy');
const JOBSPY_SCRIPT = join(JOBSPY_DIR, 'scrape_jobs.py');
function getPythonPath(): string {