Move extractors into their own folder
This commit is contained in:
parent
cefb75a9ec
commit
d24f71ab3d
@ -27,13 +27,13 @@ RUN pip3 install --no-cache-dir --break-system-packages playwright python-jobspy
|
||||
|
||||
# Copy package files first for better caching
|
||||
COPY orchestrator/package*.json ./orchestrator/
|
||||
COPY job-extractor/package*.json ./job-extractor/
|
||||
COPY extractors/gradcracker/package*.json ./extractors/gradcracker/
|
||||
|
||||
# Install Node.js dependencies
|
||||
WORKDIR /app/orchestrator
|
||||
RUN npm install --production=false
|
||||
|
||||
WORKDIR /app/job-extractor
|
||||
WORKDIR /app/extractors/gradcracker
|
||||
RUN npm install --production=false
|
||||
|
||||
# Install Camoufox browser (downloads its own Firefox fork)
|
||||
@ -42,8 +42,8 @@ RUN npx camoufox fetch
|
||||
# Copy source code
|
||||
WORKDIR /app
|
||||
COPY orchestrator ./orchestrator
|
||||
COPY job-extractor ./job-extractor
|
||||
COPY jobspy-extractor ./jobspy-extractor
|
||||
COPY extractors/gradcracker ./extractors/gradcracker
|
||||
COPY extractors/jobspy ./extractors/jobspy
|
||||
COPY resume-generator ./resume-generator
|
||||
|
||||
# Build the orchestrator (client + server)
|
||||
|
||||
13
README.md
13
README.md
@ -4,8 +4,8 @@ Automated job discovery -> AI suitability scoring -> tailored resume PDFs -> a d
|
||||
|
||||
## How it works (pipeline)
|
||||
|
||||
1. **Crawl**: `job-extractor` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs).
|
||||
2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`job-extractor/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique).
|
||||
1. **Crawl**: `extractors/gradcracker` (Crawlee + Playwright + Camoufox) visits Gradcracker search pages, opens each job page, extracts structured fields + the job description, and captures the real application URL by clicking the apply button (skipped for already-known jobs).
|
||||
2. **Import + dedupe**: `orchestrator` reads the Crawlee dataset (`extractors/gradcracker/storage/datasets/default/*.json`) and inserts new jobs into SQLite (`jobs.job_url` is unique).
|
||||
3. **Score**: `orchestrator` scores up to 50 unprocessed jobs via OpenRouter (cached as `suitabilityScore`/`suitabilityReason`).
|
||||
4. **Select**: take the top `N` jobs above `minSuitabilityScore`.
|
||||
5. **Process**: for each selected job:
|
||||
@ -30,7 +30,7 @@ flowchart LR
|
||||
PDFS[(PDFs<br/>pdfs/)]
|
||||
end
|
||||
|
||||
subgraph CRAWL["job-extractor (Crawlee/Playwright)"]
|
||||
subgraph CRAWL["extractors/gradcracker (Crawlee/Playwright)"]
|
||||
C1["Seed search URLs<br/>(locations x roles)"]
|
||||
C2["Parse list pages<br/>enqueue job pages"]
|
||||
C3["Parse job pages<br/>extract JD + apply URL"]
|
||||
@ -73,7 +73,8 @@ job-ops/
|
||||
src/server/ # API routes, pipeline, DB, services
|
||||
src/client/ # UI (polls jobs, listens to SSE progress)
|
||||
src/shared/ # shared types (Job, PipelineRun, etc.)
|
||||
job-extractor/ # Crawlee crawler (Gradcracker)
|
||||
extractors/gradcracker/ # Crawlee crawler (Gradcracker)
|
||||
extractors/jobspy/ # JobSpy wrapper (Indeed/LinkedIn/etc)
|
||||
resume-generator/ # Python Playwright automation for rxresu.me
|
||||
base.json # your exported base resume (template)
|
||||
data/ # persisted runtime artifacts (Docker default)
|
||||
@ -113,7 +114,7 @@ Install Node deps (both packages):
|
||||
|
||||
```bash
|
||||
cd orchestrator && npm install
|
||||
cd ../job-extractor && npm install
|
||||
cd ../extractors/gradcracker && npm install
|
||||
```
|
||||
|
||||
Configure the orchestrator env + DB:
|
||||
@ -153,7 +154,7 @@ Dev URLs:
|
||||
|
||||
## Notes / sharp edges
|
||||
|
||||
- **Crawl targets**: edit `job-extractor/src/main.ts` to change the Gradcracker location/role matrix.
|
||||
- **Crawl targets**: edit `extractors/gradcracker/src/main.ts` to change the Gradcracker location/role matrix.
|
||||
- **Notion sync is schema-dependent**: `orchestrator/src/server/services/notion.ts` assumes property names; adjust to match your Notion database.
|
||||
- **Pipeline config knobs**: `POST /api/pipeline/run` accepts `{ topN, minSuitabilityScore }`; `PIPELINE_TOP_N`/`PIPELINE_MIN_SCORE` are used by `npm run pipeline:run` (CLI runner).
|
||||
- **Anti-bot reality**: crawling is headless + "humanized", but sites can still block; expect occasional flakiness.
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Service for running the job crawler (job-extractor).
|
||||
* Service for running the Gradcracker crawler (extractors/gradcracker).
|
||||
* Wraps the existing Crawlee-based crawler.
|
||||
*/
|
||||
|
||||
@ -11,7 +11,7 @@ import { createInterface } from 'readline';
|
||||
import type { CreateJobInput } from '../../shared/types.js';
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const CRAWLER_DIR = join(__dirname, '../../../../job-extractor');
|
||||
const CRAWLER_DIR = join(__dirname, '../../../../extractors/gradcracker');
|
||||
const STORAGE_DIR = join(CRAWLER_DIR, 'storage/datasets/default');
|
||||
const JOBOPS_STORAGE_DIR = join(CRAWLER_DIR, 'storage/jobops');
|
||||
|
||||
@ -29,7 +29,7 @@ export interface RunCrawlerOptions {
|
||||
existingJobUrls?: string[];
|
||||
|
||||
/**
|
||||
* Optional callback for live crawl progress emitted by job-extractor.
|
||||
* Optional callback for live crawl progress emitted by the Gradcracker extractor.
|
||||
*/
|
||||
onProgress?: (update: JobExtractorProgress) => void;
|
||||
}
|
||||
@ -57,7 +57,7 @@ async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined):
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the job-extractor crawler and return discovered jobs.
|
||||
* Run the Gradcracker crawler and return discovered jobs.
|
||||
*/
|
||||
export async function runCrawler(options: RunCrawlerOptions = {}): Promise<CrawlerResult> {
|
||||
console.log('🕷️ Starting job crawler...');
|
||||
|
||||
@ -11,7 +11,7 @@ import { fileURLToPath } from 'url';
|
||||
import type { CreateJobInput, JobSource } from '../../shared/types.js';
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const JOBSPY_DIR = join(__dirname, '../../../../jobspy-extractor');
|
||||
const JOBSPY_DIR = join(__dirname, '../../../../extractors/jobspy');
|
||||
const JOBSPY_SCRIPT = join(JOBSPY_DIR, 'scrape_jobs.py');
|
||||
|
||||
function getPythonPath(): string {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user