From 9576c3d7a156421ad7ee83ab9a8f11b99803cd4d Mon Sep 17 00:00:00 2001 From: ilia Date: Sat, 4 Apr 2026 14:44:52 -0400 Subject: [PATCH] feat: workplace filter, job dedup, company skip docs, deploy notes - Add remote/orchestrator filter by workplace (remote, not remote, unknown) with URL param - Expose isRemote on job list API; canonicalize URLs and source_job_id dedup on import - Onboarding: optional VITE_SKIP_RXRESUME_ONBOARDING for RxResume-free onboarding - Scoring UI + docs for company skip list; pipeline-run dedup note - Vitest: TZ=UTC for stable time-based tests - DEPLOY_GITEA_VM_CRON_TELEGRAM.md for VM/cron/Telegram ops Made-with: Cursor --- .env.example | 4 + DEPLOY_GITEA_VM_CRON_TELEGRAM.md | 179 ++++++++++++++++++ docs-site/docs/features/company-skip-list.md | 53 ++++++ docs-site/docs/features/pipeline-run.md | 12 ++ docs-site/docs/features/settings.md | 3 +- .../src/client/components/OnboardingGate.tsx | 91 ++++++--- .../src/client/pages/OrchestratorPage.tsx | 5 + .../orchestrator/OrchestratorFilters.test.tsx | 12 +- .../orchestrator/OrchestratorFilters.tsx | 58 +++++- .../client/pages/orchestrator/constants.ts | 3 + .../orchestrator/useFilteredJobs.test.ts | 52 +++++ .../pages/orchestrator/useFilteredJobs.ts | 20 +- .../orchestrator/useOrchestratorFilters.ts | 32 ++++ .../components/ScoringSettingsSection.tsx | 4 +- orchestrator/src/server/repositories/jobs.ts | 146 +++++++++++--- orchestrator/vite.config.ts | 2 + shared/src/index.ts | 1 + shared/src/job-url-canonical.test.ts | 27 +++ shared/src/job-url-canonical.ts | 61 ++++++ shared/src/types/jobs.ts | 1 + 20 files changed, 705 insertions(+), 61 deletions(-) create mode 100644 DEPLOY_GITEA_VM_CRON_TELEGRAM.md create mode 100644 docs-site/docs/features/company-skip-list.md create mode 100644 shared/src/job-url-canonical.test.ts create mode 100644 shared/src/job-url-canonical.ts diff --git a/.env.example b/.env.example index 2a59504..baf52bf 100644 --- a/.env.example +++ b/.env.example @@ -23,6 +23,10 
@@ RXRESUME_PASSWORD=your_password_here BASIC_AUTH_USER= BASIC_AUTH_PASSWORD= +# Optional: client build only — skip RxResume steps in the onboarding wizard (search without PDF export). +# Set when running `npm run build:client` / Vite dev server; not read by the Docker Node server. +# VITE_SKIP_RXRESUME_ONBOARDING=true + # Public base URL used to generate tracer links when PDFs are created by # background/pipeline runs (where request host cannot be inferred). # Example: JOBOPS_PUBLIC_BASE_URL=https://jobops.example.com diff --git a/DEPLOY_GITEA_VM_CRON_TELEGRAM.md b/DEPLOY_GITEA_VM_CRON_TELEGRAM.md new file mode 100644 index 0000000..7523ac6 --- /dev/null +++ b/DEPLOY_GITEA_VM_CRON_TELEGRAM.md @@ -0,0 +1,179 @@ +# Deploy on a VM or container, run the pipeline on a schedule, notify Telegram + +This guide assumes you already pushed this repo to Gitea, for example: + +```bash +git remote add gitea gitea@10.0.30.169:ilia/Jobber.git # or: git remote set-url gitea ... +git push -u gitea main +``` + +If you have **uncommitted** changes, commit them first, then push again: + +```bash +git add -A && git commit -m "Your message" && git push gitea main +``` + +--- + +## 1. Deploy on a Linux VM (bare metal or cloud) + +1. Install **Docker** and **Docker Compose** (plugin v2). +2. Clone from your Gitea server (SSH or HTTPS): + + ```bash + git clone gitea@10.0.30.169:ilia/Jobber.git + cd Jobber # or job-ops if you kept that folder name + ``` + +3. Copy and edit environment: + + ```bash + cp .env.example .env + # Edit .env: MODEL / LLM keys, RXRESUME_*, search settings, etc. + ``` + +4. Start the stack: + + ```bash + docker compose up -d + ``` + +5. Open the UI: `http://<vm-ip>:3005` (port mapped in `docker-compose.yml`). + +6. Persist data: the compose file mounts `./data` — back up that directory. + +--- + +## 2. Deploy as a container (same image, any host) + +Same as the VM path: only Docker is required. 
On the VM: + +- Ensure port **3005** (or your chosen host port) is reachable if you use the UI from another machine. +- For **only** API/cron use from localhost, you can bind to `127.0.0.1:3005` by changing the `ports:` line in `docker-compose.yml` if you edit it (e.g. `"127.0.0.1:3005:3001"`). + +Inside the container the app listens on **3001**; the host maps **3005 → 3001** by default. + +**Cron on the host** should call the API on the host: + +- UI: `http://127.0.0.1:3005` (browser) +- **API (orchestrator)**: `http://127.0.0.1:3005` — same port; requests to `/api/...` are served by the app behind the reverse proxy built into the image. + +If your setup exposes the API only on an internal Docker network, use the container name and port `3001` from another container, or publish `3005` on the host and use `127.0.0.1:3005` from cron. + +--- + +## 3. Run the pipeline three times a day (cron) + +`POST /api/pipeline/run` **starts** the pipeline in the **background** and returns immediately (`{ ok: true, data: { message: "Pipeline started" } }`). That is enough for scheduling. + +Example **crontab** entries (host time zone — adjust hours as you like): + +```cron +# 08:00, 14:00, 20:00 daily — trigger JobOps pipeline +0 8,14,20 * * * /usr/local/bin/jobops-pipeline-run.sh >> /var/log/jobops-pipeline.log 2>&1 +``` + +Create `/usr/local/bin/jobops-pipeline-run.sh`: + +```bash +#!/usr/bin/env bash +set -euo pipefail +BASE_URL="${JOBOPS_URL:-http://127.0.0.1:3005}" +# If you set BASIC_AUTH_USER / BASIC_AUTH_PASSWORD in .env, uncomment: +# AUTH=(-u "${BASIC_AUTH_USER:?}:${BASIC_AUTH_PASSWORD:?}") + +curl -sS -X POST "${BASE_URL}/api/pipeline/run" \ + -H "Content-Type: application/json" \ + -d '{}' \ + "${AUTH[@]:-}" \ + | tee -a /var/log/jobops-pipeline.log +echo >> /var/log/jobops-pipeline.log +``` + +```bash +sudo chmod +x /usr/local/bin/jobops-pipeline-run.sh +``` + +Optional: set `JOBOPS_URL` in root’s crontab or in `/etc/environment` if the app is on another host. 
+ +**Basic Auth:** When `BASIC_AUTH_USER` and `BASIC_AUTH_PASSWORD` are set in `.env`, all non-GET API calls need Basic auth — use `curl -u user:pass` as above. + +--- + +## 4. Telegram notifications + +JobOps does **not** send Telegram messages directly. Practical options: + +### Option A — Pipeline webhook (recommended) + +1. In the app: **Settings → Webhooks** (or env `PIPELINE_WEBHOOK_URL` / `WEBHOOK_SECRET`) set a URL that receives JSON when a run **completes or fails**. +2. Point that URL to a **small relay** that translates the JSON into a Telegram `sendMessage` call. + +Telegram API: + +```text +https://api.telegram.org/bot<TOKEN>/sendMessage +``` + +Body (JSON): + +```json +{ + "chat_id": "<CHAT_ID>", + "text": "Pipeline finished: ..." +} +``` + +You can host the relay on the same VM (Flask/FastAPI/Node, or **n8n** / **Webhook.site** + automation). Keep the **bot token** and **chat id** in env vars, not in the JobOps UI if possible. + +Webhook payload shape (sanitized) includes fields like `event`, `pipelineRunId`, `jobsDiscovered`, `jobsProcessed`, `error` — see server code `notify-webhook.ts`. + +### Option B — Cron wrapper: poll status, then Telegram + +Because `/api/pipeline/run` returns before the run finishes, a simple approach: + +1. Cron calls `jobops-pipeline-run.sh` (as above). +2. A **second** script (or same script extended) polls `GET /api/pipeline/status` until `isRunning` is false, then reads `GET /api/pipeline/runs` for the latest run and sends a short message via `curl` to Telegram. + +Example **send** (replace token and chat id): + +```bash +TELEGRAM_BOT_TOKEN="123456:ABC..." +CHAT_ID="your_numeric_chat_id" +MSG="$(printf 'JobOps pipeline finished. 
Check dashboard.')" +curl -sS -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \ + -H "Content-Type: application/json" \ + -d "{\"chat_id\":\"${CHAT_ID}\",\"text\":$(echo "$MSG" | jq -Rs .)}" +``` + +Get **chat_id**: message your bot, then open `https://api.telegram.org/bot<TOKEN>/getUpdates` and read `message.chat.id`. + +### Option C — External automation + +Use **n8n**, **Grafana OnCall**, or similar: trigger on schedule → HTTP POST `<JOBOPS_URL>/api/pipeline/run` → wait/poll → Telegram node. + +--- + +## 5. Security notes + +- Do not commit `.env` or Telegram tokens to Git. +- Prefer **Basic Auth** on the instance if it is reachable from the internet. +- Restrict firewall so only your IP (or VPN) can reach port 3005 if exposed. + +--- + +## 6. Git remotes quick reference + +```bash +git remote -v +git push gitea main # your Gitea +git push origin main # upstream GitHub (if you have rights) +``` + +--- + +## Related project docs + +- Self-hosting: docs site **Self-Hosting** guide (if present in your tree). +- Webhooks: **Settings** documentation for pipeline / job-complete webhooks. +- Optional env: `PIPELINE_WEBHOOK_URL`, `WEBHOOK_SECRET`, `BASIC_AUTH_USER`, `BASIC_AUTH_PASSWORD` in `.env.example`. diff --git a/docs-site/docs/features/company-skip-list.md b/docs-site/docs/features/company-skip-list.md new file mode 100644 index 0000000..e041af1 --- /dev/null +++ b/docs-site/docs/features/company-skip-list.md @@ -0,0 +1,53 @@ +--- +id: company-skip-list +title: Company skip list +description: Block unwanted employers during discovery using blocked company keywords in Settings. +sidebar_position: 6 +--- + +## What it is + +The **company skip list** is the **Blocked company keywords** field in **Settings → Scoring Settings**. Any job whose **employer / company name** contains one of your tokens (case-insensitive substring match) is **dropped during discovery** and is never imported. 
+ +## Why it exists + +You may want to avoid certain agencies, staffing brands, or employers without having to filter them out of every search manually. + +## How to use it + +1. Open **Settings** and expand **Scoring Settings**. +2. Find **Company skip list (blocked keywords)**. +3. Add tokens one at a time, or paste a comma- or newline-separated list. +4. Click **Save Changes**. +5. Run the pipeline again — blocked companies apply to **new discovery only**; they do not remove jobs already in the database. + +### Tips + +- Use substrings that reliably identify the employer on listings, for example `recruit`, `staffing`, or a distinctive part of a brand name. +- Avoid overly short tokens that could match unrelated companies (for example a three-letter acronym shared by many firms). +- The list is capped in Settings validation (max 200 entries, each up to 200 characters). +- To block more precisely, prefer the exact spelling that appears on job posts you see in JobOps. + +### Maintenance + +- **Add** entries when you notice employers you never want to see again. +- **Remove** entries if you blocked too much — save, then run discovery again. +- **Review periodically** — staffing brand names change, and your targets may shift. +- **Existing jobs** are unchanged; use the Jobs UI or **Danger Zone** in Settings if you need to clear old rows. + +## Common problems + +### Blocked companies still appear + +- Confirm you clicked **Save Changes** after editing the list. +- Remember: only **new** runs apply the filter. Old jobs stay until you delete or clear them. + +### Too many jobs disappeared + +- A token may be too broad. Remove or narrow it in Settings. 
+ +## Related pages + +- [Settings](/docs/features/settings) +- [Pipeline Run](/docs/features/pipeline-run) +- [Orchestrator](/docs/features/orchestrator) diff --git a/docs-site/docs/features/pipeline-run.md b/docs-site/docs/features/pipeline-run.md index a150730..edb72ac 100644 --- a/docs-site/docs/features/pipeline-run.md +++ b/docs-site/docs/features/pipeline-run.md @@ -96,6 +96,17 @@ Use it when you already have a specific job description or link and do not want For accepted input formats, inference behavior, and limits, see [Manual Import Extractor](/docs/next/extractors/manual). +## Discovery deduplication + +When new listings are imported, JobOps does not create a second database row if the job is already in your workspace (any status). Matching uses: + +- a **canonical job URL** (normalizes `http`/`https`, `www`, trailing slashes, common tracking query params, and sorts remaining query keys) +- the pair **`source` + `source_job_id`** when the extractor provides an external id + +Existing jobs keep their stored URL; new imports use the canonical form so the same role is not added again under a slightly different link. + +To drop companies before import, configure a **company skip list** (blocked company keywords) in **Settings → Scoring Settings**. See [Company skip list](/docs/features/company-skip-list). 
+ ## Common problems ### Start button stays disabled @@ -128,6 +139,7 @@ For accepted input formats, inference behavior, and limits, see [Manual Import E ## Related pages +- [Company skip list](/docs/features/company-skip-list) - [Find Jobs and Apply Workflow](/docs/next/workflows/find-jobs-and-apply-workflow) - [Manual Import Extractor](/docs/next/extractors/manual) - [Orchestrator](/docs/next/features/orchestrator) diff --git a/docs-site/docs/features/settings.md b/docs-site/docs/features/settings.md index 520da9e..57ebd8e 100644 --- a/docs-site/docs/features/settings.md +++ b/docs-site/docs/features/settings.md @@ -169,7 +169,7 @@ Readiness requires: - Penalize missing salary data - Set penalty amount - Optional auto-skip threshold for low-score jobs -- Block jobs from companies that match configured keyword tokens +- **Company skip list** (blocked company keywords): drop listings during discovery when the employer name contains a token — see [Company skip list](/docs/features/company-skip-list) - Add custom scoring instructions to tell the AI what to weigh more or less ### Danger Zone @@ -261,6 +261,7 @@ curl -X POST "http://localhost:3001/api/backups" ## Related pages +- [Company skip list](/docs/features/company-skip-list) - [Reactive Resume](/docs/next/features/reactive-resume) - [Database Backups](/docs/next/getting-started/database-backups) - [Overview](/docs/next/features/overview) diff --git a/orchestrator/src/client/components/OnboardingGate.tsx b/orchestrator/src/client/components/OnboardingGate.tsx index 36810d1..9e3bb29 100644 --- a/orchestrator/src/client/components/OnboardingGate.tsx +++ b/orchestrator/src/client/components/OnboardingGate.tsx @@ -94,6 +94,9 @@ function getStepPrimaryLabel(input: { } export const OnboardingGate: React.FC = () => { + /** Opt-in: set `VITE_SKIP_RXRESUME_ONBOARDING=true` at build/dev time to skip RxResume steps in onboarding. 
*/ + const skipRxResumeOnboarding = + import.meta.env.VITE_SKIP_RXRESUME_ONBOARDING === "true"; const { settings, isLoading: settingsLoading, @@ -216,14 +219,20 @@ export const OnboardingGate: React.FC = () => { "v5") as RxResumeMode; const hasCheckedValidations = (requiresLlmKey ? llmValidation.checked : true) && - rxresumeValidation.checked && - baseResumeValidation.checked; + (skipRxResumeOnboarding + ? true + : rxresumeValidation.checked && baseResumeValidation.checked); const llmValidated = requiresLlmKey ? llmValidation.valid : true; const shouldOpen = !demoMode && Boolean(settings && !settingsLoading) && hasCheckedValidations && - !(llmValidated && rxresumeValidation.valid && baseResumeValidation.valid); + !( + llmValidated && + (skipRxResumeOnboarding + ? true + : rxresumeValidation.valid && baseResumeValidation.valid) + ); const validateRxresumeVersion = useCallback( async ( @@ -318,30 +327,46 @@ export const OnboardingGate: React.FC = () => { }, [selectedProvider]); const steps = useMemo( - () => [ - { - id: "llm", - label: "LLM Provider", - subtitle: "Provider + credentials", - complete: llmValidated, - disabled: false, - }, - { - id: "rxresume", - label: "Connect Reactive Resume", - subtitle: "Version + credentials", - complete: rxresumeValidation.valid, - disabled: false, - }, - { - id: "baseresume", - label: "Select Template Resume", - subtitle: "Template selection", - complete: baseResumeValidation.valid, - disabled: !rxresumeValidation.valid, - }, + () => + skipRxResumeOnboarding + ? 
[ + { + id: "llm", + label: "LLM Provider", + subtitle: "Provider + credentials", + complete: llmValidated, + disabled: false, + }, + ] + : [ + { + id: "llm", + label: "LLM Provider", + subtitle: "Provider + credentials", + complete: llmValidated, + disabled: false, + }, + { + id: "rxresume", + label: "Connect Reactive Resume", + subtitle: "Version + credentials", + complete: rxresumeValidation.valid, + disabled: false, + }, + { + id: "baseresume", + label: "Select Template Resume", + subtitle: "Template selection", + complete: baseResumeValidation.valid, + disabled: !rxresumeValidation.valid, + }, + ], + [ + skipRxResumeOnboarding, + llmValidated, + rxresumeValidation.valid, + baseResumeValidation.valid, ], - [llmValidated, rxresumeValidation.valid, baseResumeValidation.valid], ); const defaultStep = steps.find((step) => !step.complete)?.id ?? steps[0]?.id; @@ -361,7 +386,12 @@ export const OnboardingGate: React.FC = () => { } else { setLlmValidation({ valid: true, message: null, checked: true }); } - validations.push(validateRxresume(), validateBaseResume()); + if (!skipRxResumeOnboarding) { + validations.push(validateRxresume(), validateBaseResume()); + } else { + setRxresumeValidation({ valid: true, message: null, checked: true }); + setBaseResumeValidation({ valid: true, message: null, checked: true }); + } const results = await Promise.allSettled(validations); @@ -375,6 +405,7 @@ export const OnboardingGate: React.FC = () => { }, [ settings, requiresLlmKey, + skipRxResumeOnboarding, validateLlm, validateRxresume, validateBaseResume, @@ -386,8 +417,9 @@ export const OnboardingGate: React.FC = () => { if (!settings || settingsLoading) return; const needsValidation = (requiresLlmKey ? !llmValidation.checked : false) || - !rxresumeValidation.checked || - !baseResumeValidation.checked; + (skipRxResumeOnboarding + ? 
false + : !rxresumeValidation.checked || !baseResumeValidation.checked); if (!needsValidation) return; void runAllValidations(); }, [ @@ -399,6 +431,7 @@ export const OnboardingGate: React.FC = () => { baseResumeValidation.checked, runAllValidations, demoMode, + skipRxResumeOnboarding, ]); const handleSaveLlm = async (): Promise => { diff --git a/orchestrator/src/client/pages/OrchestratorPage.tsx b/orchestrator/src/client/pages/OrchestratorPage.tsx index ff4617b..d3d9622 100644 --- a/orchestrator/src/client/pages/OrchestratorPage.tsx +++ b/orchestrator/src/client/pages/OrchestratorPage.tsx @@ -39,6 +39,8 @@ export const OrchestratorPage: React.FC = () => { setSourceFilter, sponsorFilter, setSponsorFilter, + workplaceFilter, + setWorkplaceFilter, salaryFilter, setSalaryFilter, sort, @@ -144,6 +146,7 @@ export const OrchestratorPage: React.FC = () => { activeTab, sourceFilter, sponsorFilter, + workplaceFilter, salaryFilter, sort, ); @@ -386,6 +389,8 @@ export const OrchestratorPage: React.FC = () => { onSourceFilterChange={setSourceFilter} sponsorFilter={sponsorFilter} onSponsorFilterChange={setSponsorFilter} + workplaceFilter={workplaceFilter} + onWorkplaceFilterChange={setWorkplaceFilter} salaryFilter={salaryFilter} onSalaryFilterChange={setSalaryFilter} sourcesWithJobs={sourcesWithJobs} diff --git a/orchestrator/src/client/pages/orchestrator/OrchestratorFilters.test.tsx b/orchestrator/src/client/pages/orchestrator/OrchestratorFilters.test.tsx index b01d245..474f29e 100644 --- a/orchestrator/src/client/pages/orchestrator/OrchestratorFilters.test.tsx +++ b/orchestrator/src/client/pages/orchestrator/OrchestratorFilters.test.tsx @@ -2,7 +2,12 @@ import type { JobSource } from "@shared/types.js"; import { fireEvent, render, screen } from "@testing-library/react"; import type { ComponentProps } from "react"; import { afterAll, beforeAll, describe, expect, it, vi } from "vitest"; -import type { FilterTab, JobSort, SponsorFilter } from "./constants"; +import type { + 
FilterTab, + JobSort, + SponsorFilter, + WorkplaceFilter, +} from "./constants"; import { OrchestratorFilters } from "./OrchestratorFilters"; const originalScrollIntoView = HTMLElement.prototype.scrollIntoView; @@ -38,6 +43,8 @@ const renderFilters = ( onSourceFilterChange: vi.fn(), sponsorFilter: "all" as SponsorFilter, onSponsorFilterChange: vi.fn(), + workplaceFilter: "all" as WorkplaceFilter, + onWorkplaceFilterChange: vi.fn(), salaryFilter: { mode: "at_least" as const, min: null, @@ -80,6 +87,9 @@ describe("OrchestratorFilters", () => { fireEvent.click(screen.getByRole("button", { name: "Potential sponsor" })); expect(props.onSponsorFilterChange).toHaveBeenCalledWith("potential"); + fireEvent.click(screen.getByRole("button", { name: "Remote" })); + expect(props.onWorkplaceFilterChange).toHaveBeenCalledWith("remote"); + fireEvent.change(screen.getByLabelText("Minimum"), { target: { value: "65000" }, }); diff --git a/orchestrator/src/client/pages/orchestrator/OrchestratorFilters.tsx b/orchestrator/src/client/pages/orchestrator/OrchestratorFilters.tsx index 741e36e..a37353f 100644 --- a/orchestrator/src/client/pages/orchestrator/OrchestratorFilters.tsx +++ b/orchestrator/src/client/pages/orchestrator/OrchestratorFilters.tsx @@ -32,6 +32,7 @@ import type { SalaryFilter, SalaryFilterMode, SponsorFilter, + WorkplaceFilter, } from "./constants"; import { defaultSortDirection, orderedFilterSources, tabs } from "./constants"; @@ -44,6 +45,8 @@ interface OrchestratorFiltersProps { onSourceFilterChange: (value: JobSource | "all") => void; sponsorFilter: SponsorFilter; onSponsorFilterChange: (value: SponsorFilter) => void; + workplaceFilter: WorkplaceFilter; + onWorkplaceFilterChange: (value: WorkplaceFilter) => void; salaryFilter: SalaryFilter; onSalaryFilterChange: (value: SalaryFilter) => void; sourcesWithJobs: JobSource[]; @@ -55,6 +58,16 @@ interface OrchestratorFiltersProps { onFiltersOpenChange?: (open: boolean) => void; } +const workplaceOptions: Array<{ + value: 
WorkplaceFilter; + label: string; +}> = [ + { value: "all", label: "All" }, + { value: "remote", label: "Remote" }, + { value: "not_remote", label: "Not remote" }, + { value: "unknown", label: "Unknown" }, +]; + const sponsorOptions: Array<{ value: SponsorFilter; label: string; @@ -121,6 +134,8 @@ export const OrchestratorFilters: React.FC = ({ onSourceFilterChange, sponsorFilter, onSponsorFilterChange, + workplaceFilter, + onWorkplaceFilterChange, salaryFilter, onSalaryFilterChange, sourcesWithJobs, @@ -143,11 +158,18 @@ export const OrchestratorFilters: React.FC = ({ () => Number(sourceFilter !== "all") + Number(sponsorFilter !== "all") + + Number(workplaceFilter !== "all") + Number( (typeof salaryFilter.min === "number" && salaryFilter.min > 0) || (typeof salaryFilter.max === "number" && salaryFilter.max > 0), ), - [sourceFilter, sponsorFilter, salaryFilter.min, salaryFilter.max], + [ + sourceFilter, + sponsorFilter, + workplaceFilter, + salaryFilter.min, + salaryFilter.max, + ], ); const showSalaryMin = salaryFilter.mode === "at_least" || salaryFilter.mode === "between"; @@ -224,7 +246,8 @@ export const OrchestratorFilters: React.FC = ({ )} - Refine sources, sponsor status, salary, and sorting. + Refine sources, sponsor status, workplace (remote), salary, + and sorting. @@ -283,6 +306,37 @@ export const OrchestratorFilters: React.FC = ({ + + + Workplace + + +

+ Based on each listing's remote flag. Use Unknown + when the source did not mark remote vs on-site. +

+
+ {workplaceOptions.map((option) => ( + + ))} +
+
+
+ Salary diff --git a/orchestrator/src/client/pages/orchestrator/constants.ts b/orchestrator/src/client/pages/orchestrator/constants.ts index 655741d..7a2dd4d 100644 --- a/orchestrator/src/client/pages/orchestrator/constants.ts +++ b/orchestrator/src/client/pages/orchestrator/constants.ts @@ -88,6 +88,9 @@ export type SponsorFilter = | "potential" | "not_found" | "unknown"; + +/** Filter job list by remote flag from listings (null = unknown / not provided). */ +export type WorkplaceFilter = "all" | "remote" | "not_remote" | "unknown"; export type SalaryFilterMode = "at_least" | "at_most" | "between"; export interface SalaryFilter { diff --git a/orchestrator/src/client/pages/orchestrator/useFilteredJobs.test.ts b/orchestrator/src/client/pages/orchestrator/useFilteredJobs.test.ts index 92ad166..319bbed 100644 --- a/orchestrator/src/client/pages/orchestrator/useFilteredJobs.test.ts +++ b/orchestrator/src/client/pages/orchestrator/useFilteredJobs.test.ts @@ -33,6 +33,7 @@ describe("useFilteredJobs", () => { "all", "all", "all", + "all", { mode: "at_least", min: null, max: null }, { key: "score", @@ -60,6 +61,7 @@ describe("useFilteredJobs", () => { "ready", "all", "all", + "all", { mode: "at_least", min: null, max: null }, { key: "score", @@ -88,6 +90,7 @@ describe("useFilteredJobs", () => { "all", "all", "confirmed", + "all", { mode: "at_least", min: null, max: null }, { key: "score", @@ -113,6 +116,7 @@ describe("useFilteredJobs", () => { "all", "all", "all", + "all", { mode: "between", min: 60000, max: 80000 }, { key: "score", @@ -141,6 +145,7 @@ describe("useFilteredJobs", () => { "all", "all", "all", + "all", { mode: "at_least", min: null, max: null }, { key: "salary", @@ -156,4 +161,51 @@ describe("useFilteredJobs", () => { "none", ]); }); + + it("filters by remote workplace flag", () => { + const jobs: Job[] = [ + { ...baseJob, id: "remote", isRemote: true }, + { ...baseJob, id: "onsite", isRemote: false }, + { ...baseJob, id: "unknown", isRemote: null }, + ]; + 
+ const { result: remoteOnly } = renderHook(() => + useFilteredJobs( + jobs, + "all", + "all", + "all", + "remote", + { mode: "at_least", min: null, max: null }, + { key: "score", direction: "desc" }, + ), + ); + expect(remoteOnly.current.map((j) => j.id)).toEqual(["remote"]); + + const { result: notRemote } = renderHook(() => + useFilteredJobs( + jobs, + "all", + "all", + "all", + "not_remote", + { mode: "at_least", min: null, max: null }, + { key: "score", direction: "desc" }, + ), + ); + expect(notRemote.current.map((j) => j.id)).toEqual(["onsite"]); + + const { result: unknown } = renderHook(() => + useFilteredJobs( + jobs, + "all", + "all", + "all", + "unknown", + { mode: "at_least", min: null, max: null }, + { key: "score", direction: "desc" }, + ), + ); + expect(unknown.current.map((j) => j.id)).toEqual(["unknown"]); + }); }); diff --git a/orchestrator/src/client/pages/orchestrator/useFilteredJobs.ts b/orchestrator/src/client/pages/orchestrator/useFilteredJobs.ts index da9a20e..1d46539 100644 --- a/orchestrator/src/client/pages/orchestrator/useFilteredJobs.ts +++ b/orchestrator/src/client/pages/orchestrator/useFilteredJobs.ts @@ -5,6 +5,7 @@ import type { JobSort, SalaryFilter, SponsorFilter, + WorkplaceFilter, } from "./constants"; import { compareJobs, parseSalaryBounds } from "./utils"; @@ -20,6 +21,7 @@ export const useFilteredJobs = ( activeTab: FilterTab, sourceFilter: JobSource | "all", sponsorFilter: SponsorFilter, + workplaceFilter: WorkplaceFilter, salaryFilter: SalaryFilter, sort: JobSort, ) => @@ -54,6 +56,14 @@ export const useFilteredJobs = ( ); } + if (workplaceFilter !== "all") { + filtered = filtered.filter((job) => { + if (workplaceFilter === "remote") return job.isRemote === true; + if (workplaceFilter === "not_remote") return job.isRemote === false; + return job.isRemote === null; + }); + } + const hasMin = typeof salaryFilter.min === "number" && Number.isFinite(salaryFilter.min) && @@ -93,4 +103,12 @@ export const useFilteredJobs = ( } 
return [...filtered].sort((a, b) => compareJobs(a, b, sort)); - }, [jobs, activeTab, sourceFilter, sponsorFilter, salaryFilter, sort]); + }, [ + jobs, + activeTab, + sourceFilter, + sponsorFilter, + workplaceFilter, + salaryFilter, + sort, + ]); diff --git a/orchestrator/src/client/pages/orchestrator/useOrchestratorFilters.ts b/orchestrator/src/client/pages/orchestrator/useOrchestratorFilters.ts index 92153d6..66b5a23 100644 --- a/orchestrator/src/client/pages/orchestrator/useOrchestratorFilters.ts +++ b/orchestrator/src/client/pages/orchestrator/useOrchestratorFilters.ts @@ -6,6 +6,7 @@ import type { SalaryFilter, SalaryFilterMode, SponsorFilter, + WorkplaceFilter, } from "./constants"; import { DEFAULT_SORT } from "./constants"; @@ -30,6 +31,13 @@ const allowedSortKeys: JobSort["key"][] = [ ]; const allowedSortDirections: JobSort["direction"][] = ["asc", "desc"]; +const allowedWorkplaceFilters: WorkplaceFilter[] = [ + "all", + "remote", + "not_remote", + "unknown", +]; + export const useOrchestratorFilters = () => { const [searchParams, setSearchParams] = useSearchParams(); @@ -81,6 +89,27 @@ export const useOrchestratorFilters = () => { [setSearchParams], ); + const workplaceFilter = useMemo((): WorkplaceFilter => { + const raw = searchParams.get("workplace") ?? "all"; + return allowedWorkplaceFilters.includes(raw as WorkplaceFilter) + ? (raw as WorkplaceFilter) + : "all"; + }, [searchParams]); + + const setWorkplaceFilter = useCallback( + (value: WorkplaceFilter) => { + setSearchParams( + (prev) => { + if (value === "all") prev.delete("workplace"); + else prev.set("workplace", value); + return prev; + }, + { replace: true }, + ); + }, + [setSearchParams], + ); + const salaryFilter = useMemo((): SalaryFilter => { const modeRaw = searchParams.get("salaryMode") ?? 
"at_least"; const mode = allowedSalaryModes.includes(modeRaw as SalaryFilterMode) @@ -164,6 +193,7 @@ export const useOrchestratorFilters = () => { (prev) => { prev.delete("source"); prev.delete("sponsor"); + prev.delete("workplace"); prev.delete("salaryMode"); prev.delete("salaryMin"); prev.delete("salaryMax"); @@ -181,6 +211,8 @@ export const useOrchestratorFilters = () => { setSourceFilter, sponsorFilter, setSponsorFilter, + workplaceFilter, + setWorkplaceFilter, salaryFilter, setSalaryFilter, sort, diff --git a/orchestrator/src/client/pages/settings/components/ScoringSettingsSection.tsx b/orchestrator/src/client/pages/settings/components/ScoringSettingsSection.tsx index 19a8a54..668ba3b 100644 --- a/orchestrator/src/client/pages/settings/components/ScoringSettingsSection.tsx +++ b/orchestrator/src/client/pages/settings/components/ScoringSettingsSection.tsx @@ -213,7 +213,7 @@ export const ScoringSettingsSection: React.FC = ({ htmlFor="blocked-company-keywords" className="text-sm font-medium leading-none" > - Blocked Company Keywords + Company skip list (blocked keywords) = ({ setValue("blockedCompanyKeywords", value, { shouldDirty: true }) } placeholder='e.g. "recruitment", "staffing"' - helperText="Jobs whose company name contains one of these keywords will be dropped during discovery." + helperText="Maintained here and saved with Settings. Each token is a case-insensitive substring match on the employer name. Matching jobs are dropped during discovery (not removed from the database if already imported). 
See docs: /docs/features/company-skip-list" removeLabelPrefix="Remove blocked keyword" disabled={isLoading || isSaving} /> diff --git a/orchestrator/src/server/repositories/jobs.ts b/orchestrator/src/server/repositories/jobs.ts index 8186ca8..4528d7c 100644 --- a/orchestrator/src/server/repositories/jobs.ts +++ b/orchestrator/src/server/repositories/jobs.ts @@ -3,6 +3,7 @@ */ import { randomUUID } from "node:crypto"; +import { canonicalizeJobUrl } from "@shared/job-url-canonical"; import type { CreateJobInput, Job, @@ -16,6 +17,66 @@ import { db, schema } from "../db/index"; const { jobs } = schema; +function normalizeCreateJobInputForDedup(input: CreateJobInput): CreateJobInput { + const jobUrl = canonicalizeJobUrl(input.jobUrl); + if (jobUrl === input.jobUrl) return input; + return { ...input, jobUrl }; +} + +function sourceJobKey(source: string, sourceJobId: string): string { + return `${source}\0${sourceJobId}`; +} + +async function loadJobDedupIndexes(): Promise<{ + existingCanonicalSet: Set; + existingSourceJobKeySet: Set; +}> { + const rows = await db + .select({ + jobUrl: jobs.jobUrl, + source: jobs.source, + sourceJobId: jobs.sourceJobId, + }) + .from(jobs); + + const existingCanonicalSet = new Set( + rows.map((r) => canonicalizeJobUrl(r.jobUrl)), + ); + const existingSourceJobKeySet = new Set( + rows + .filter( + (r) => + r.sourceJobId != null && String(r.sourceJobId).trim().length > 0, + ) + .map((r) => sourceJobKey(r.source, String(r.sourceJobId))), + ); + return { existingCanonicalSet, existingSourceJobKeySet }; +} + +async function findJobByCanonicalUrl(canonical: string): Promise { + const [exact] = await db.select().from(jobs).where(eq(jobs.jobUrl, canonical)); + if (exact) return mapRowToJob(exact); + + const allRows = await db.select().from(jobs); + for (const row of allRows) { + if (canonicalizeJobUrl(row.jobUrl) === canonical) { + return mapRowToJob(row); + } + } + return null; +} + +async function getJobBySourceAndExternalId( + source: string, 
+ sourceJobId: string, +): Promise { + const [row] = await db + .select() + .from(jobs) + .where(and(eq(jobs.source, source), eq(jobs.sourceJobId, sourceJobId))); + return row ? mapRowToJob(row) : null; +} + function normalizeStatusFilter(statuses?: JobStatus[]): string | null { if (!statuses || statuses.length === 0) return null; return Array.from(new Set(statuses)).sort().join(","); @@ -65,6 +126,7 @@ export async function getJobListItems( salaryMinAmount: jobs.salaryMinAmount, salaryMaxAmount: jobs.salaryMaxAmount, salaryCurrency: jobs.salaryCurrency, + isRemote: jobs.isRemote, discoveredAt: jobs.discoveredAt, appliedAt: jobs.appliedAt, updatedAt: jobs.updatedAt, @@ -150,18 +212,19 @@ export async function listJobSummariesByIds(jobIds: string[]): Promise< /** * Get a job by its URL (for deduplication). + * Matches canonical URL equivalence, including legacy rows stored with non-canonical URLs. */ export async function getJobByUrl(jobUrl: string): Promise { - const [row] = await db.select().from(jobs).where(eq(jobs.jobUrl, jobUrl)); - return row ? mapRowToJob(row) : null; + return findJobByCanonicalUrl(canonicalizeJobUrl(jobUrl)); } /** - * Get all known job URLs (for deduplication / crawler optimizations). + * Get all known canonical job URLs (for deduplication / crawler skip lists). 
*/ export async function getAllJobUrls(): Promise { const rows = await db.select({ jobUrl: jobs.jobUrl }).from(jobs); - return rows.map((r) => r.jobUrl); + const canonicals = rows.map((r) => canonicalizeJobUrl(r.jobUrl)); + return Array.from(new Set(canonicals)); } async function insertJob(input: CreateJobInput): Promise { @@ -248,14 +311,42 @@ export async function createJobs( inputOrInputs: CreateJobInput | CreateJobInput[], ): Promise { if (!Array.isArray(inputOrInputs)) { - const inserted = await tryInsertJob(inputOrInputs); + const normalized = normalizeCreateJobInputForDedup(inputOrInputs); + const { existingCanonicalSet, existingSourceJobKeySet } = + await loadJobDedupIndexes(); + + const sid = normalized.sourceJobId?.trim(); + if (sid) { + const sk = sourceJobKey(normalized.source, sid); + if (existingSourceJobKeySet.has(sk)) { + const existing = await getJobBySourceAndExternalId( + normalized.source, + sid, + ); + if (existing) return existing; + } + } + + if (existingCanonicalSet.has(normalized.jobUrl)) { + const existing = await findJobByCanonicalUrl(normalized.jobUrl); + if (existing) return existing; + } + + const inserted = await tryInsertJob(normalized); if (inserted) return inserted; - const existing = await getJobByUrl(inputOrInputs.jobUrl); - if (existing) return existing; + + const existingAfterConflict = + (await findJobByCanonicalUrl(normalized.jobUrl)) ?? + (sid ? 
await getJobBySourceAndExternalId(normalized.source, sid) : null); + if (existingAfterConflict) return existingAfterConflict; + throw new Error("Failed to create or resolve existing job by URL"); } - const byUrl = new Map< + const { existingCanonicalSet, existingSourceJobKeySet } = + await loadJobDedupIndexes(); + + const batchBuckets = new Map< string, { input: CreateJobInput; @@ -263,31 +354,32 @@ export async function createJobs( } >(); - for (const input of inputOrInputs) { - const existing = byUrl.get(input.jobUrl); - if (existing) { - existing.count += 1; + for (const raw of inputOrInputs) { + const normalized = normalizeCreateJobInputForDedup(raw); + const batchKey = normalized.sourceJobId?.trim() + ? `sid:${sourceJobKey(normalized.source, normalized.sourceJobId!)}` + : `url:${normalized.jobUrl}`; + const prev = batchBuckets.get(batchKey); + if (prev) { + prev.count += 1; } else { - byUrl.set(input.jobUrl, { input, count: 1 }); + batchBuckets.set(batchKey, { input: normalized, count: 1 }); } } let created = 0; let skipped = 0; - const uniqueUrls = Array.from(byUrl.keys()); - if (uniqueUrls.length === 0) { - return { created, skipped }; - } + for (const { input, count } of batchBuckets.values()) { + const canonical = input.jobUrl; + const sid = input.sourceJobId?.trim(); + const sk = sid ? 
sourceJobKey(input.source, sid) : null; - const existingRows = await db - .select({ jobUrl: jobs.jobUrl }) - .from(jobs) - .where(inArray(jobs.jobUrl, uniqueUrls)); - const existingUrlSet = new Set(existingRows.map((row) => row.jobUrl)); - - for (const { input, count } of byUrl.values()) { - if (existingUrlSet.has(input.jobUrl)) { + if (sk && existingSourceJobKeySet.has(sk)) { + skipped += count; + continue; + } + if (existingCanonicalSet.has(canonical)) { skipped += count; continue; } @@ -300,6 +392,10 @@ export async function createJobs( created += 1; skipped += count - 1; + existingCanonicalSet.add(canonical); + if (sk) { + existingSourceJobKeySet.add(sk); + } } return { created, skipped }; diff --git a/orchestrator/vite.config.ts b/orchestrator/vite.config.ts index 8511852..a2c2ba9 100644 --- a/orchestrator/vite.config.ts +++ b/orchestrator/vite.config.ts @@ -36,6 +36,8 @@ export default defineConfig({ test: { globals: true, environment: "jsdom", + // Stable local date/time for chart and backup filename tests across machines. 
+ env: { TZ: "UTC" }, setupFiles: "./src/setupTests.ts", maxWorkers: 1, testTimeout: 30_000, diff --git a/shared/src/index.ts b/shared/src/index.ts index 33a0a42..01bd213 100644 --- a/shared/src/index.ts +++ b/shared/src/index.ts @@ -1,4 +1,5 @@ export * from "./extractors"; +export * from "./job-url-canonical"; export * from "./location-support"; export * from "./types"; export * from "./utils/type-conversion"; diff --git a/shared/src/job-url-canonical.test.ts b/shared/src/job-url-canonical.test.ts new file mode 100644 index 0000000..9f7dd59 --- /dev/null +++ b/shared/src/job-url-canonical.test.ts @@ -0,0 +1,27 @@ +import { describe, expect, it } from "vitest"; +import { canonicalizeJobUrl } from "./job-url-canonical"; + +describe("canonicalizeJobUrl", () => { + it("strips tracking query params and normalizes host", () => { + const a = + "https://www.example.com/jobs/123?utm_source=linkedin&role=eng&utm_medium=social"; + const b = "http://example.com/jobs/123?role=eng"; + expect(canonicalizeJobUrl(a)).toBe(canonicalizeJobUrl(b)); + }); + + it("removes trailing slash on path", () => { + expect(canonicalizeJobUrl("https://example.com/path/")).toBe( + "https://example.com/path", + ); + }); + + it("sorts query params for stable comparison", () => { + const a = "https://example.com/x?b=2&a=1"; + const b = "https://example.com/x?a=1&b=2"; + expect(canonicalizeJobUrl(a)).toBe(canonicalizeJobUrl(b)); + }); + + it("returns trimmed non-URL strings unchanged", () => { + expect(canonicalizeJobUrl(" not a url ")).toBe("not a url"); + }); +}); diff --git a/shared/src/job-url-canonical.ts b/shared/src/job-url-canonical.ts new file mode 100644 index 0000000..dafa1a2 --- /dev/null +++ b/shared/src/job-url-canonical.ts @@ -0,0 +1,61 @@ +/** + * Normalize job listing URLs so the same role is not stored twice when only + * tracking params, scheme, or trivial path differences differ. 
+ */ + +const TRACKING_QUERY_PREFIXES = ["utm_", "stm_"] as const; + +const DROP_QUERY_KEYS = new Set([ + "ref", + "src", + "fbclid", + "gclid", + "mc_eid", + "icid", +]); + +export function canonicalizeJobUrl(raw: string): string { + const trimmed = raw.trim(); + if (!trimmed) return trimmed; + + try { + const u = new URL(trimmed); + u.hash = ""; + + let host = u.hostname.toLowerCase(); + if (host.startsWith("www.")) host = host.slice(4); + u.hostname = host; + u.protocol = "https:"; + + for (const key of [...u.searchParams.keys()]) { + const lower = key.toLowerCase(); + if ( + DROP_QUERY_KEYS.has(lower) || + TRACKING_QUERY_PREFIXES.some((prefix) => lower.startsWith(prefix)) + ) { + u.searchParams.delete(key); + } + } + + const sortedKeys = [...u.searchParams.keys()].sort((a, b) => + a.localeCompare(b), + ); + const next = new URLSearchParams(); + for (const k of sortedKeys) { + for (const v of u.searchParams.getAll(k)) { + next.append(k, v); + } + } + u.search = next.toString() ? `?${next.toString()}` : ""; + + let path = u.pathname; + if (path.length > 1 && path.endsWith("/")) { + path = path.slice(0, -1); + } + u.pathname = path || "/"; + + return u.toString(); + } catch { + return trimmed; + } +} diff --git a/shared/src/types/jobs.ts b/shared/src/types/jobs.ts index a625894..d53b8c8 100644 --- a/shared/src/types/jobs.ts +++ b/shared/src/types/jobs.ts @@ -213,6 +213,7 @@ export type JobListItem = Pick< | "salaryMinAmount" | "salaryMaxAmount" | "salaryCurrency" + | "isRemote" | "discoveredAt" | "appliedAt" | "updatedAt"