Move extractors into their own folder

2025-12-14 22:44:37 +00:00 · 2025-12-14 22:44:37 +00:00 · d24f71ab3d
commit d24f71ab3d
parent cefb75a9ec
16 changed files with 16 additions and 15 deletions
--- a/8
+++ b/8
@@ -27,13 +27,13 @@ RUN pip3 install --no-cache-dir --break-system-packages playwright python-jobspy

 # Copy package files first for better caching
 COPY orchestrator/package*.json ./orchestrator/
-COPY job-extractor/package*.json ./job-extractor/
+COPY extractors/gradcracker/package*.json ./extractors/gradcracker/

 # Install Node.js dependencies
 WORKDIR /app/orchestrator
 RUN npm install --production=false

-WORKDIR /app/job-extractor
+WORKDIR /app/extractors/gradcracker
 RUN npm install --production=false

 # Install Camoufox browser (downloads its own Firefox fork)
@@ -42,8 +42,8 @@ RUN npx camoufox fetch
 # Copy source code
 WORKDIR /app
 COPY orchestrator ./orchestrator
-COPY job-extractor ./job-extractor
-COPY jobspy-extractor ./jobspy-extractor
+COPY extractors/gradcracker ./extractors/gradcracker
+COPY extractors/jobspy ./extractors/jobspy
 COPY resume-generator ./resume-generator

 # Build the orchestrator (client + server)
--- a/README.md
+++ b/README.md
@@ -4,8 +4,8 @@ Automated job discovery -> AI suitability scoring -> tailored resume PDFs -> a d

 ## How it works (pipeline)

-1. **Crawl**: `job-extractor` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs).
-2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`job-extractor/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique).
+1. **Crawl**: `extractors/gradcracker` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs).
+2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`extractors/gradcracker/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique).
 3. **Score**: `orchestrator` scores up to 50 unprocessed jobs via OpenRouter (cached as `suitabilityScore`/`suitabilityReason`).
 4. **Select**: take the top `N` jobs above `minSuitabilityScore`.
 5. **Process**: for each selected job:
@@ -30,7 +30,7 @@ flowchart LR
    PDFS[(PDFs<br/>pdfs/)]
  end

-  subgraph CRAWL["job-extractor (Crawlee/Playwright)"]
+  subgraph CRAWL["extractors/gradcracker (Crawlee/Playwright)"]
    C1["Seed search URLs<br/>(locations x roles)"]
    C2["Parse list pages<br/>enqueue job pages"]
    C3["Parse job pages<br/>extract JD + apply URL"]
@@ -73,7 +73,8 @@ job-ops/
    src/server/                 # API routes, pipeline, DB, services
    src/client/                 # UI (polls jobs, listens to SSE progress)
    src/shared/                 # shared types (Job, PipelineRun, etc.)
-  job-extractor/                # Crawlee crawler (Gradcracker)
+  extractors/gradcracker/       # Crawlee crawler (Gradcracker)
+  extractors/jobspy/            # JobSpy wrapper (Indeed/LinkedIn/etc.)
  resume-generator/             # Python Playwright automation for rxresu.me
    base.json                   # your exported base resume (template)
  data/                         # persisted runtime artifacts (Docker default)
@@ -113,7 +114,7 @@ Install Node deps (both packages):

 ```bash
 cd orchestrator && npm install
-cd ../job-extractor && npm install
+cd ../extractors/gradcracker && npm install
 ```

 Configure the orchestrator env + DB:
@@ -153,7 +154,7 @@ Dev URLs:

 ## Notes / sharp edges

- **Crawl targets**: edit `job-extractor/src/main.ts` to change the Gradcracker location/role matrix.
+- **Crawl targets**: edit `extractors/gradcracker/src/main.ts` to change the Gradcracker location/role matrix.
 - **Notion sync is schema-dependent**: `orchestrator/src/server/services/notion.ts` assumes property names; adjust to match your Notion database.
 - **Pipeline config knobs**: `POST /api/pipeline/run` accepts `{ topN, minSuitabilityScore }`; `PIPELINE_TOP_N`/`PIPELINE_MIN_SCORE` are used by `npm run pipeline:run` (CLI runner).
 - **Anti-bot reality**: crawling is headless + "humanized", but sites can still block; expect occasional flakiness.
--- a/extractors/gradcracker/.dockerignore
+++ b/extractors/gradcracker/.dockerignore
--- a/extractors/gradcracker/.gitignore
+++ b/extractors/gradcracker/.gitignore
--- a/extractors/gradcracker/Dockerfile
+++ b/extractors/gradcracker/Dockerfile
--- a/extractors/gradcracker/README.md
+++ b/extractors/gradcracker/README.md
--- a/extractors/gradcracker/package-lock.json
+++ b/extractors/gradcracker/package-lock.json
--- a/extractors/gradcracker/package.json
+++ b/extractors/gradcracker/package.json
--- a/extractors/gradcracker/src/main.ts
+++ b/extractors/gradcracker/src/main.ts
--- a/extractors/gradcracker/src/progress.ts
+++ b/extractors/gradcracker/src/progress.ts
--- a/extractors/gradcracker/src/routes.ts
+++ b/extractors/gradcracker/src/routes.ts
--- a/extractors/gradcracker/tsconfig.json
+++ b/extractors/gradcracker/tsconfig.json
--- a/extractors/jobspy/requirements.txt
+++ b/extractors/jobspy/requirements.txt
--- a/extractors/jobspy/scrape_jobs.py
+++ b/extractors/jobspy/scrape_jobs.py
--- a/orchestrator/src/server/services/crawler.ts
+++ b/orchestrator/src/server/services/crawler.ts
@@ -1,5 +1,5 @@
 /**
- * Service for running the job crawler (job-extractor).
+ * Service for running the Gradcracker crawler (extractors/gradcracker).
 * Wraps the existing Crawlee-based crawler.
 */

@@ -11,7 +11,7 @@ import { createInterface } from 'readline';
 import type { CreateJobInput } from '../../shared/types.js';

 const __dirname = dirname(fileURLToPath(import.meta.url));
-const CRAWLER_DIR = join(__dirname, '../../../../job-extractor');
+const CRAWLER_DIR = join(__dirname, '../../../../extractors/gradcracker');
 const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default');
 const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops');

@@ -29,7 +29,7 @@ export interface RunCrawlerOptions {
  existingJobUrls?: string[];

  /**
-   * Optional callback for live crawl progress emitted by job-extractor.
+   * Optional callback for live crawl progress emitted by the Gradcracker extractor.
   */
  onProgress?: (update: JobExtractorProgress) => void;
 }
@@ -57,7 +57,7 @@ async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined):
 }

 /**
- * Run the job-extractor crawler and return discovered jobs.
+ * Run the Gradcracker crawler and return discovered jobs.
 */
 export async function runCrawler(options: RunCrawlerOptions = {}): Promise<CrawlerResult> {
  console.log('🕷️ Starting job crawler...');
--- a/orchestrator/src/server/services/jobspy.ts
+++ b/orchestrator/src/server/services/jobspy.ts
@@ -11,7 +11,7 @@ import { fileURLToPath } from 'url';
 import type { CreateJobInput, JobSource } from '../../shared/types.js';

 const __dirname = dirname(fileURLToPath(import.meta.url));
-const JOBSPY_DIR = join(__dirname, '../../../../jobspy-extractor');
+const JOBSPY_DIR = join(__dirname, '../../../../extractors/jobspy');
 const JOBSPY_SCRIPT = join(JOBSPY_DIR, 'scrape_jobs.py');

 function getPythonPath(): string {