Move extractors into their own folder

This commit is contained in:
DaKheera47 2025-12-14 22:44:37 +00:00
parent cefb75a9ec
commit d24f71ab3d
16 changed files with 16 additions and 15 deletions

View File

@ -27,13 +27,13 @@ RUN pip3 install --no-cache-dir --break-system-packages playwright python-jobspy
# Copy package files first for better caching
COPY orchestrator/package*.json ./orchestrator/
COPY job-extractor/package*.json ./job-extractor/
COPY extractors/gradcracker/package*.json ./extractors/gradcracker/
# Install Node.js dependencies
WORKDIR /app/orchestrator
RUN npm install --production=false
WORKDIR /app/job-extractor
WORKDIR /app/extractors/gradcracker
RUN npm install --production=false
# Install Camoufox browser (downloads its own Firefox fork)
@ -42,8 +42,8 @@ RUN npx camoufox fetch
# Copy source code
WORKDIR /app
COPY orchestrator ./orchestrator
COPY job-extractor ./job-extractor
COPY jobspy-extractor ./jobspy-extractor
COPY extractors/gradcracker ./extractors/gradcracker
COPY extractors/jobspy ./extractors/jobspy
COPY resume-generator ./resume-generator
# Build the orchestrator (client + server)

View File

@ -4,8 +4,8 @@ Automated job discovery -> AI suitability scoring -> tailored resume PDFs -> a d
## How it works (pipeline)
1. **Crawl**: `job-extractor` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs).
2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`job-extractor/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique).
1. **Crawl**: `extractors/gradcracker` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs).
2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`extractors/gradcracker/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique).
3. **Score**: `orchestrator` scores up to 50 unprocessed jobs via OpenRouter (cached as `suitabilityScore`/`suitabilityReason`).
4. **Select**: take the top `N` jobs above `minSuitabilityScore`.
5. **Process**: for each selected job:
@ -30,7 +30,7 @@ flowchart LR
PDFS[(PDFs<br/>pdfs/)]
end
subgraph CRAWL["job-extractor (Crawlee/Playwright)"]
subgraph CRAWL["extractors/gradcracker (Crawlee/Playwright)"]
C1["Seed search URLs<br/>(locations x roles)"]
C2["Parse list pages<br/>enqueue job pages"]
C3["Parse job pages<br/>extract JD + apply URL"]
@ -73,7 +73,8 @@ job-ops/
src/server/ # API routes, pipeline, DB, services
src/client/ # UI (polls jobs, listens to SSE progress)
src/shared/ # shared types (Job, PipelineRun, etc.)
job-extractor/ # Crawlee crawler (Gradcracker)
extractors/gradcracker/ # Crawlee crawler (Gradcracker)
extractors/jobspy/ # JobSpy wrapper (Indeed/LinkedIn/etc.)
resume-generator/ # Python Playwright automation for rxresu.me
base.json # your exported base resume (template)
data/ # persisted runtime artifacts (Docker default)
@ -113,7 +114,7 @@ Install Node deps (both packages):
```bash
cd orchestrator && npm install
cd ../job-extractor && npm install
cd ../extractors/gradcracker && npm install
```
Configure the orchestrator env + DB:
@ -153,7 +154,7 @@ Dev URLs:
## Notes / sharp edges
- **Crawl targets**: edit `job-extractor/src/main.ts` to change the Gradcracker location/role matrix.
- **Crawl targets**: edit `extractors/gradcracker/src/main.ts` to change the Gradcracker location/role matrix.
- **Notion sync is schema-dependent**: `orchestrator/src/server/services/notion.ts` assumes property names; adjust to match your Notion database.
- **Pipeline config knobs**: `POST /api/pipeline/run` accepts `{ topN, minSuitabilityScore }`; `PIPELINE_TOP_N`/`PIPELINE_MIN_SCORE` are used by `npm run pipeline:run` (CLI runner).
- **Anti-bot reality**: crawling is headless + "humanized", but sites can still block; expect occasional flakiness.

View File

@ -1,5 +1,5 @@
/**
* Service for running the job crawler (job-extractor).
* Service for running the Gradcracker crawler (extractors/gradcracker).
* Wraps the existing Crawlee-based crawler.
*/
@ -11,7 +11,7 @@ import { createInterface } from 'readline';
import type { CreateJobInput } from '../../shared/types.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
const CRAWLER_DIR = join(__dirname, '../../../../job-extractor');
const CRAWLER_DIR = join(__dirname, '../../../../extractors/gradcracker');
const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default');
const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops');
@ -29,7 +29,7 @@ export interface RunCrawlerOptions {
existingJobUrls?: string[];
/**
* Optional callback for live crawl progress emitted by job-extractor.
* Optional callback for live crawl progress emitted by the Gradcracker extractor.
*/
onProgress?: (update: JobExtractorProgress) => void;
}
@ -57,7 +57,7 @@ async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined):
}
/**
* Run the job-extractor crawler and return discovered jobs.
* Run the Gradcracker crawler and return discovered jobs.
*/
export async function runCrawler(options: RunCrawlerOptions = {}): Promise<CrawlerResult> {
console.log('🕷️ Starting job crawler...');

View File

@ -11,7 +11,7 @@ import { fileURLToPath } from 'url';
import type { CreateJobInput, JobSource } from '../../shared/types.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
const JOBSPY_DIR = join(__dirname, '../../../../jobspy-extractor');
const JOBSPY_DIR = join(__dirname, '../../../../extractors/jobspy');
const JOBSPY_SCRIPT = join(JOBSPY_DIR, 'scrape_jobs.py');
function getPythonPath(): string {