Dockerization :)

2025-12-11 23:56:07 +00:00 · 2025-12-11 23:56:07 +00:00 · 051c09d943
commit 051c09d943
parent d743aacd1a
14 changed files with 362 additions and 30 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,45 @@
+# Dependencies
+**/node_modules
+**/.venv
+**/__pycache__
+
+# Build outputs
+**/dist
+**/*.pyc
+
+# Data (mounted as volume)
+data/
+
+# Environment files (passed via docker-compose)
+.env
+**/.env
+**/.env.local
+
+# Git
+.git
+.gitignore
+
+# IDE
+.idea
+.vscode
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Logs
+*.log
+npm-debug.log*
+
+# Test files
+**/*.test.ts
+**/*.spec.ts
+**/test/
+**/tests/
+**/__tests__/
+
+# Documentation
+*.md
+!README.md
--- a/.env.example
+++ b/.env.example
@ -0,0 +1,25 @@
+# =============================================================================
+# Job Ops - Environment Variables
+# Copy this file to .env and fill in your values
+# =============================================================================
+
+# OpenRouter API for AI scoring and summaries
+# Get your key at: https://openrouter.ai/keys
+OPENROUTER_API_KEY=your_openrouter_api_key_here
+MODEL=openai/gpt-4o-mini
+
+# RXResume credentials for PDF generation
+# Create an account at: https://rxresu.me
+RXRESUME_EMAIL=your_email@example.com
+RXRESUME_PASSWORD=your_password_here
+
+# Pipeline configuration
+PIPELINE_TOP_N=10
+PIPELINE_MIN_SCORE=50
+
+# Optional: Notion integration for job tracking
+NOTION_API_KEY=
+NOTION_DATABASE_ID=
+
+# Optional: Webhook secret for n8n automation
+WEBHOOK_SECRET=
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,10 @@
+# Environment files
+.env
+*.env.local
+
+# Data directory (bind mount in Docker)
+data/
+
+# OS files
 .DS_Store
+Thumbs.db
--- a/Dockerfile
+++ b/Dockerfile
@ -0,0 +1,60 @@
+# =============================================================================
+# Job Ops - Unified Docker Image
+# Contains: Orchestrator (Node.js), Job Crawler, Resume Generator (Python/Playwright)
+# =============================================================================
+
+FROM mcr.microsoft.com/playwright:v1.49.1-jammy
+
+# Use bash with pipefail so a failed download in `curl | bash` fails the build
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+WORKDIR /app
+
+# Install Node.js 20.x and Python; remove the apt lists in the same layer
+RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
+    apt-get update && apt-get install -y nodejs python3 python3-pip && \
+    rm -rf /var/lib/apt/lists/* && \
+    npm install -g pnpm
+
+# Install Python Playwright, pinned to the base image version so its browsers match
+RUN pip3 install --no-cache-dir playwright==1.49.1
+
+# Copy package files first for better caching
+COPY orchestrator/package*.json ./orchestrator/
+COPY job-extractor/package*.json ./job-extractor/
+
+# Install Node.js dependencies, including devDependencies
+WORKDIR /app/orchestrator
+RUN npm install --include=dev
+
+WORKDIR /app/job-extractor
+RUN npm install --include=dev
+
+# Copy source code
+WORKDIR /app
+COPY orchestrator ./orchestrator
+COPY job-extractor ./job-extractor
+COPY resume-generator ./resume-generator
+
+# Build the orchestrator (client + server)
+WORKDIR /app/orchestrator
+RUN npm run build
+
+# Create data directories
+RUN mkdir -p /app/data/pdfs
+
+# Expose ports
+EXPOSE 3001
+
+# Environment variables (can be overridden)
+ENV NODE_ENV=production
+ENV PORT=3001
+ENV PYTHON_PATH=/usr/bin/python3
+ENV DATA_DIR=/app/data
+ENV RESUME_GEN_DIR=/app/resume-generator
+
+# Health check (shell form, so ${PORT} is expanded when the check runs)
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD curl -f "http://localhost:${PORT}/health" || exit 1
+
+# Run migrations, then exec the server (WORKDIR is still /app/orchestrator)
+CMD ["sh", "-c", "npm run db:migrate && exec npm run start"]
--- a/README.md
+++ b/README.md
@ -0,0 +1,127 @@
+# Job Ops 🚀
+
+Automated job discovery, scoring, and resume generation pipeline.
+
+## Features
+
+- **Job Crawler** - Discovers jobs from Gradcracker and other sources
+- **AI Scoring** - Ranks jobs by suitability using OpenRouter API
+- **Resume Generator** - Creates tailored PDFs via RXResume automation
+- **Dashboard UI** - React-based interface for reviewing and applying
+
+## Quick Start with Docker
+
+### 1. Configure Environment
+
+```bash
+# Copy the example env file
+cp .env.example .env
+
+# Edit with your credentials
+nano .env
+```
+
+Required environment variables:
+- `OPENROUTER_API_KEY` - Get from [openrouter.ai/keys](https://openrouter.ai/keys)
+- `RXRESUME_EMAIL` - Your [rxresu.me](https://rxresu.me) account email
+- `RXRESUME_PASSWORD` - Your RXResume password
+
+### 2. Add Your Base Resume
+
+Place your resume JSON at `resume-generator/base.json`.
+You can export this from RXResume.
+
+### 3. Run
+
+```bash
+# Build and start
+docker compose up -d
+
+# View logs
+docker compose logs -f
+
+# Stop
+docker compose down
+```
+
+### 4. Access
+
+- **Dashboard**: http://localhost:3005 (docker-compose.yml publishes container port 3001 on host port 3005)
+- **API**: http://localhost:3005/api
+- **Health**: http://localhost:3005/health
+
+## Data Persistence
+
+All data is stored in the `./data` directory:
+- `data/jobs.db` - SQLite database
+- `data/pdfs/` - Generated resume PDFs
+
+## Development
+
+### Without Docker
+
+```bash
+# Install dependencies
+cd orchestrator && npm install
+cd ../job-extractor && npm install
+
+# Set up Python environment for resume generator
+cd ../resume-generator
+python3 -m venv .venv
+source .venv/bin/activate
+pip install playwright
+playwright install chromium
+
+# Run orchestrator (from orchestrator folder)
+cd ../orchestrator
+cp .env.example .env  # Configure your env
+npm run db:migrate
+npm run dev
+```
+
+### Build Docker Image
+
+```bash
+docker build -t job-ops:latest .
+```
+
+### Push to Docker Hub
+
+```bash
+docker tag job-ops:latest yourusername/job-ops:latest
+docker push yourusername/job-ops:latest
+```
+
+## API Endpoints
+
+| Method | Endpoint | Description |
+|--------|----------|-------------|
+| GET | `/api/jobs` | List all jobs |
+| GET | `/api/jobs/:id` | Get job details |
+| PATCH | `/api/jobs/:id` | Update job |
+| POST | `/api/jobs/:id/process` | Generate resume for job |
+| POST | `/api/jobs/:id/apply` | Mark as applied |
+| POST | `/api/jobs/:id/reject` | Skip job |
+| POST | `/api/jobs/process-discovered` | Process all discovered jobs |
+| GET | `/api/pipeline/status` | Pipeline status |
+| POST | `/api/pipeline/run` | Trigger pipeline |
+| GET | `/api/pipeline/progress` | SSE progress stream |
+| DELETE | `/api/database` | Clear all data |
+
+## Architecture
+
+```
+job-ops/
+├── orchestrator/       # Node.js backend + React frontend
+│   ├── src/server/    # Express API, services, DB
+│   └── src/client/    # React dashboard
+├── job-extractor/      # Crawlee-based job crawler
+├── resume-generator/   # Python Playwright automation
+├── data/               # SQLite DB + generated PDFs
+├── Dockerfile
+└── docker-compose.yml
+```
+
+## License
+
+MIT
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,55 @@
+# =============================================================================
+# Job Ops - Docker Compose Configuration
+# =============================================================================
+
+services:
+  job-ops:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    image: job-ops:latest
+    container_name: job-ops
+    ports:
+      - "3005:3001"
+    volumes:
+      # Persist database and generated PDFs
+      - ./data:/app/data
+      # Base resume JSON (read-only)
+      - ./resume-generator/base.json:/app/resume-generator/base.json:ro
+    environment:
+      # Server config
+      - NODE_ENV=production
+      - PORT=3001
+
+      # OpenRouter API for AI scoring and summaries
+      - OPENROUTER_API_KEY=${OPENROUTER_API_KEY}
+      - MODEL=${MODEL:-openai/gpt-4o-mini}
+
+      # RXResume credentials for PDF generation
+      - RXRESUME_EMAIL=${RXRESUME_EMAIL}
+      - RXRESUME_PASSWORD=${RXRESUME_PASSWORD}
+
+      # Pipeline configuration
+      - PIPELINE_TOP_N=${PIPELINE_TOP_N:-10}
+      - PIPELINE_MIN_SCORE=${PIPELINE_MIN_SCORE:-50}
+
+      # Optional: Notion integration
+      - NOTION_API_KEY=${NOTION_API_KEY:-}
+      - NOTION_DATABASE_ID=${NOTION_DATABASE_ID:-}
+
+      # Optional: Webhook secret for n8n
+      - WEBHOOK_SECRET=${WEBHOOK_SECRET:-}
+
+      # Python path (uses system python in container)
+      - PYTHON_PATH=/usr/bin/python3
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:3001/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 10s
+
+# Data persistence: the service bind-mounts ./data from the host (see the
+# `volumes` entry above), so no top-level named volume is declared here;
+# an unreferenced `data:` volume would never be mounted into the container.
--- a/job-extractor/src/main.ts
+++ b/job-extractor/src/main.ts
@ -53,7 +53,7 @@ const crawler = new PlaywrightCrawler({
  launchContext: {
    launcher: firefox,
    launchOptions: await launchOptions({
-      headless: false,
+      headless: true,
      humanize: true,
      geoip: true,
    }),
--- a/orchestrator/src/server/db/clear.ts
+++ b/orchestrator/src/server/db/clear.ts
@ -7,7 +7,11 @@ import { join, dirname } from 'path';
 import { fileURLToPath } from 'url';

 const __dirname = dirname(fileURLToPath(import.meta.url));
-const DB_PATH = join(__dirname, '../../../data/jobs.db');
+
+// Database path - can be overridden via env for Docker
+const DB_PATH = process.env.DATA_DIR
+  ? join(process.env.DATA_DIR, 'jobs.db')
+  : join(__dirname, '../../../data/jobs.db');

 /**
 * Clear all data from the database (keeps the schema intact).
--- a/orchestrator/src/server/db/index.ts
+++ b/orchestrator/src/server/db/index.ts
@ -10,7 +10,11 @@ import { existsSync, mkdirSync } from 'fs';
 import * as schema from './schema.js';

 const __dirname = dirname(fileURLToPath(import.meta.url));
-const DB_PATH = join(__dirname, '../../../data/jobs.db');
+
+// Database path - can be overridden via env for Docker
+const DB_PATH = process.env.DATA_DIR
+  ? join(process.env.DATA_DIR, 'jobs.db')
+  : join(__dirname, '../../../data/jobs.db');

 // Ensure data directory exists
 const dataDir = dirname(DB_PATH);
--- a/orchestrator/src/server/db/migrate.ts
+++ b/orchestrator/src/server/db/migrate.ts
@ -8,7 +8,11 @@ import { fileURLToPath } from 'url';
 import { existsSync, mkdirSync } from 'fs';

 const __dirname = dirname(fileURLToPath(import.meta.url));
-const DB_PATH = join(__dirname, '../../../data/jobs.db');
+
+// Database path - can be overridden via env for Docker
+const DB_PATH = process.env.DATA_DIR
+  ? join(process.env.DATA_DIR, 'jobs.db')
+  : join(__dirname, '../../../data/jobs.db');

 // Ensure data directory exists
 const dataDir = dirname(DB_PATH);
--- a/orchestrator/src/server/index.ts
+++ b/orchestrator/src/server/index.ts
@ -34,7 +34,9 @@ app.use((req, res, next) => {
 app.use('/api', apiRouter);

 // Serve static files for generated PDFs
-const pdfDir = join(__dirname, '../../data/pdfs');
+const pdfDir = process.env.DATA_DIR
+  ? join(process.env.DATA_DIR, 'pdfs')
+  : join(__dirname, '../../data/pdfs');
 app.use('/pdfs', express.static(pdfDir));

 // Health check
--- a/orchestrator/src/server/services/pdf.ts
+++ b/orchestrator/src/server/services/pdf.ts
@ -6,12 +6,16 @@
 import { spawn } from 'child_process';
 import { join, dirname } from 'path';
 import { fileURLToPath } from 'url';
-import { readFile, writeFile, copyFile, access, mkdir } from 'fs/promises';
+import { readFile, writeFile, mkdir, access } from 'fs/promises';
 import { existsSync } from 'fs';

 const __dirname = dirname(fileURLToPath(import.meta.url));
-const RESUME_GEN_DIR = join(__dirname, '../../../../resume-generator');
-const OUTPUT_DIR = join(__dirname, '../../../data/pdfs');
+
+// Paths - can be overridden via env for Docker
+const RESUME_GEN_DIR = process.env.RESUME_GEN_DIR || join(__dirname, '../../../../resume-generator');
+const OUTPUT_DIR = process.env.DATA_DIR 
+  ? join(process.env.DATA_DIR, 'pdfs')
+  : join(__dirname, '../../../data/pdfs');

 export interface PdfResult {
  success: boolean;
@ -55,22 +59,11 @@ export async function generatePdf(
    const tempResumePath = join(RESUME_GEN_DIR, `temp_resume_${jobId}.json`);
    await writeFile(tempResumePath, JSON.stringify(baseResume, null, 2));
    
-    // Generate PDF using Python script
+    // Generate PDF using Python script - output directly to our data folder
    const outputFilename = `resume_${jobId}.pdf`;
    const outputPath = join(OUTPUT_DIR, outputFilename);
    
-    await runPythonPdfGenerator(tempResumePath, outputFilename);
-    
-    // Move generated PDF to our output directory
-    const pythonOutputPath = join(RESUME_GEN_DIR, 'resumes', outputFilename);
-    
-    try {
-      await access(pythonOutputPath);
-      await copyFile(pythonOutputPath, outputPath);
-    } catch {
-      // PDF might already be in the right place or script output different location
-      console.warn('PDF not found at expected Python output location');
-    }
+    await runPythonPdfGenerator(tempResumePath, outputFilename, OUTPUT_DIR);
    
    // Cleanup temp file
    try {
@ -94,11 +87,12 @@ export async function generatePdf(
 */
 async function runPythonPdfGenerator(
  jsonPath: string,
-  outputFilename: string
+  outputFilename: string,
+  outputDir: string
 ): Promise<void> {
  return new Promise((resolve, reject) => {
-    // Use the virtual environment's Python
-    const pythonPath = join(RESUME_GEN_DIR, '.venv', 'bin', 'python');
+    // Use the virtual environment's Python (or system python in Docker)
+    const pythonPath = process.env.PYTHON_PATH || join(RESUME_GEN_DIR, '.venv', 'bin', 'python');
    
    const child = spawn(pythonPath, ['rxresume_automation.py'], {
      cwd: RESUME_GEN_DIR,
@ -106,6 +100,7 @@ async function runPythonPdfGenerator(
        ...process.env,
        RESUME_JSON_PATH: jsonPath,
        OUTPUT_FILENAME: outputFilename,
+        OUTPUT_DIR: outputDir,
      },
      stdio: 'inherit',
    });
--- a/orchestrator/tsconfig.server.json
+++ b/orchestrator/tsconfig.server.json
@ -3,8 +3,8 @@
  "compilerOptions": {
    "module": "ESNext",
    "moduleResolution": "bundler",
-    "outDir": "./dist/server",
-    "rootDir": "./src/server"
+    "outDir": "./dist",
+    "rootDir": "./src"
  },
  "include": ["src/server/**/*", "src/shared/**/*"]
 }
--- a/resume-generator/rxresume_automation.py
+++ b/resume-generator/rxresume_automation.py
@ -21,7 +21,9 @@ RESUME_JSON_PATH = (
 _custom_output_filename = os.getenv("OUTPUT_FILENAME")
 OUTPUT_FILENAME = _custom_output_filename if _custom_output_filename else "resume.pdf"

-OUTPUT_DIR = BASE_DIR / "resumes"
+# Output directory - can be overridden by orchestrator
+_custom_output_dir = os.getenv("OUTPUT_DIR")
+OUTPUT_DIR = Path(_custom_output_dir) if _custom_output_dir else BASE_DIR / "resumes"


 def login(page):