From 09ab32e1e8c28554a03ac4e804036aea65336e8a Mon Sep 17 00:00:00 2001 From: ilia Date: Sun, 5 Apr 2026 19:49:01 -0400 Subject: [PATCH] chore: default pipeline includes Glassdoor; document DB copy and cron sources - Add glassdoor to DEFAULT_CONFIG so POST /api/pipeline/run with {} runs JobSpy trio. - jobber-pipeline-telegram.sh: optional JOBBER_PIPELINE_SOURCES for explicit source list. - Deploy doc: WAL checkpoint + rsync jobs.db to VM for profiles/settings/jobs. Made-with: Cursor --- DEPLOY_GITEA_VM_CRON_TELEGRAM.md | 22 +++++++++++++++++++ .../src/server/pipeline/orchestrator.ts | 4 ++-- scripts/jobber-cron.env.example | 4 ++++ scripts/jobber-pipeline-telegram.sh | 10 ++++++++- 4 files changed, 37 insertions(+), 3 deletions(-) diff --git a/DEPLOY_GITEA_VM_CRON_TELEGRAM.md b/DEPLOY_GITEA_VM_CRON_TELEGRAM.md index 0327db3..e859ccd 100644 --- a/DEPLOY_GITEA_VM_CRON_TELEGRAM.md +++ b/DEPLOY_GITEA_VM_CRON_TELEGRAM.md @@ -192,6 +192,28 @@ docker compose up -d --build Wait until `curl -sf http://127.0.0.1:3005/health` succeeds before relying on cron (container needs a few seconds after start). +**5b. Copy your local SQLite to the VM (profiles, settings, jobs)** — optional; use when you want the same **search profile**, `activeProfileId`, and job rows as on your laptop. + +1. **Stop** the app that holds the DB open: local `npm run dev` (Ctrl+C) and on the VM `docker compose stop` (or `docker stop job-ops`). +2. **Checkpoint WAL** on the machine that owns the canonical DB (usually your laptop), so a copy is self-contained: + + ```bash + cd /path/to/Jobber + sqlite3 data/jobs.db "PRAGMA wal_checkpoint(FULL);" + ``` + +3. **Copy** `data/jobs.db` to the VM repo’s `./data/` (same path Docker mounts). Example from your Mac: + + ```bash + rsync -avz --progress ./data/jobs.db YOUR_USER@178:/opt/Jobber/data/jobs.db + ``` + + If you use WAL files and skip checkpointing, copy `jobs.db`, `jobs.db-wal`, and `jobs.db-shm` together while **nothing** is writing to the DB. + +4. 
On the VM: run `docker compose up -d` and verify that `GET /api/settings` (or the Settings UI) shows your profile. + +**Default pipeline sources** (used when `POST /api/pipeline/run` receives an empty JSON body `{}`, e.g. from the cron script) include **Glassdoor**, which runs via JobSpy alongside Indeed and LinkedIn. Glassdoor requests often fail upstream and log errors, but this does not fail the run; LinkedIn/Indeed can still produce rows. To force an explicit list from cron, set `JOBBER_PIPELINE_SOURCES` in `/root/.jobber-cron.env` (see `scripts/jobber-cron.env.example`). + **Security:** Never commit `/root/.jobber-cron.env` or paste bot tokens in Git. Revoke the token in BotFather if it was exposed. ### Option B2 — Minimal curl-only (no wait-for-finish) diff --git a/orchestrator/src/server/pipeline/orchestrator.ts b/orchestrator/src/server/pipeline/orchestrator.ts index a20ad18..b1b111b 100644 --- a/orchestrator/src/server/pipeline/orchestrator.ts +++ b/orchestrator/src/server/pipeline/orchestrator.ts @@ -37,8 +37,8 @@ import { const DEFAULT_CONFIG: PipelineConfig = { topN: 10, minSuitabilityScore: 50, - // Keep Glassdoor opt-in via source picker/settings; do not enable by default. - sources: ["gradcracker", "indeed", "linkedin", "ukvisajobs"], + // Glassdoor runs inside JobSpy with Indeed/LinkedIn; upstream often errors without failing the run. + sources: ["gradcracker", "indeed", "linkedin", "glassdoor", "ukvisajobs"], outputDir: join(getDataDir(), "pdfs"), enableCrawling: true, enableScoring: true, diff --git a/scripts/jobber-cron.env.example b/scripts/jobber-cron.env.example index 7cb95b3..2e59ed2 100644 --- a/scripts/jobber-cron.env.example +++ b/scripts/jobber-cron.env.example @@ -9,6 +9,10 @@ JOBOPS_URL="http://127.0.0.1:3005" # Optional: cap how many job lines (title + link) are appended to the Telegram message (default 25). # JOB_TELEGRAM_MAX_JOBS=25 +# Optional: override POST /api/pipeline/run sources (comma-separated). If unset, the server default applies.
+# Example (matches typical JobSpy bundle + UK sources): +# JOBBER_PIPELINE_SOURCES=gradcracker,indeed,linkedin,glassdoor,ukvisajobs + # Optional — only if BASIC_AUTH_USER / BASIC_AUTH_PASSWORD are set in Jobber .env # BASIC_AUTH_USER="" # BASIC_AUTH_PASSWORD="" diff --git a/scripts/jobber-pipeline-telegram.sh b/scripts/jobber-pipeline-telegram.sh index 22951d2..599662c 100755 --- a/scripts/jobber-pipeline-telegram.sh +++ b/scripts/jobber-pipeline-telegram.sh @@ -164,8 +164,16 @@ if echo "$body" | jq -e '.data.isRunning == true' >/dev/null 2>&1; then exit 0 fi +# Optional: comma-separated sources (see JOBBER_PIPELINE_SOURCES in jobber-cron.env.example). +# If unset, POST body is {} and the server uses its default source list. +run_body='{}' +if [[ -n "${JOBBER_PIPELINE_SOURCES:-}" ]]; then + run_body="$(jq -n --arg s "$JOBBER_PIPELINE_SOURCES" \ + '$s | split(",") | map(gsub("^\\s+|\\s+$";"")) | map(select(. != "")) | {sources: .}')" +fi + resp="$(curl -sS --compressed "${AUTH[@]}" -X POST "${BASE}/api/pipeline/run" \ - -H "Accept: application/json" -H "Content-Type: application/json" -d '{}')" + -H "Accept: application/json" -H "Content-Type: application/json" -d "$run_body")" if ! echo "$resp" | jq -e '.ok == true' >/dev/null 2>&1; then _fail_json="$(echo "$resp" | jq -c . 2>/dev/null || echo "$resp")" send_tg_html "Jobber: POST /api/pipeline/run failed: $(tg_html_escape "$_fail_json")"