From c840f289e14976213bff38e7c42c9128b03b95bf Mon Sep 17 00:00:00 2001 From: ilia Date: Fri, 15 May 2026 22:36:23 -0400 Subject: [PATCH] feat(extractors): expand catalog, smoke coverage, and sourcing docs Adds Arc.dev, BC T-Net, Eluta, iCIMS tenants, QAJobsBoard, and SmartRecruiters manifests with registry/settings/UI wiring; registers full extractor list in smoke-extractors and documents supplementary board access paths. Aligns Careerjet v4 with the url query parameter and fixes strict typing in QAJobsBoard. Co-authored-by: Cursor --- .env.example | 44 +++ docs-site/docs/extractors/arcdev.md | 34 ++ .../extractors/canadian-companies-qa-ats.md | 117 +++++++ docs-site/docs/extractors/eluta.md | 44 +++ docs-site/docs/extractors/overview.md | 41 +++ .../extractors/qa-contract-staffing-canada.md | 92 +++++ docs-site/docs/extractors/qajobsboard.md | 36 ++ docs-site/docs/extractors/smartrecruiters.md | 44 +++ .../supplementary-sources-access-notes.md | 73 ++++ docs-site/docs/workflows/add-an-extractor.md | 19 +- docs-site/sidebars.ts | 5 + extractors/arcdev/README.md | 15 + extractors/arcdev/manifest.ts | 329 ++++++++++++++++++ extractors/arcdev/package.json | 17 + extractors/arcdev/tsconfig.json | 17 + extractors/bctenet/README.md | 9 + extractors/bctenet/manifest.ts | 194 +++++++++++ extractors/bctenet/package.json | 17 + extractors/bctenet/tsconfig.json | 17 + extractors/careerjet/manifest.ts | 14 +- extractors/eluta/README.md | 9 + extractors/eluta/manifest.ts | 201 +++++++++++ extractors/eluta/package.json | 17 + extractors/eluta/tsconfig.json | 17 + extractors/icims/README.md | 13 + extractors/icims/manifest.ts | 233 +++++++++++++ extractors/icims/package.json | 17 + extractors/icims/tsconfig.json | 17 + extractors/qajobsboard/README.md | 10 + extractors/qajobsboard/manifest.ts | 217 ++++++++++++ extractors/qajobsboard/package.json | 17 + extractors/qajobsboard/tsconfig.json | 17 + extractors/smartrecruiters/README.md | 11 + extractors/smartrecruiters/manifest.ts | 
287 +++++++++++++++ extractors/smartrecruiters/package.json | 17 + extractors/smartrecruiters/tsconfig.json | 17 + orchestrator/src/client/api/client.ts | 4 +- .../src/client/pages/orchestrator/utils.ts | 25 +- .../src/server/api/routes/settings.ts | 15 +- .../src/server/config/demo-defaults.data.ts | 6 + .../server/pipeline/steps/discover-jobs.ts | 16 +- package-lock.json | 151 ++++++++ package.json | 1 + scripts/smoke-extractors.ts | 291 ++++++++++++---- shared/src/extractors/index.ts | 44 +++ shared/src/location-support.test.ts | 3 + shared/src/location-support.ts | 11 + shared/src/settings-registry.ts | 133 +++++++ shared/src/testing/factories.ts | 20 ++ shared/src/types/settings.ts | 12 + 50 files changed, 2926 insertions(+), 101 deletions(-) create mode 100644 docs-site/docs/extractors/arcdev.md create mode 100644 docs-site/docs/extractors/canadian-companies-qa-ats.md create mode 100644 docs-site/docs/extractors/eluta.md create mode 100644 docs-site/docs/extractors/qa-contract-staffing-canada.md create mode 100644 docs-site/docs/extractors/qajobsboard.md create mode 100644 docs-site/docs/extractors/smartrecruiters.md create mode 100644 docs-site/docs/extractors/supplementary-sources-access-notes.md create mode 100644 extractors/arcdev/README.md create mode 100644 extractors/arcdev/manifest.ts create mode 100644 extractors/arcdev/package.json create mode 100644 extractors/arcdev/tsconfig.json create mode 100644 extractors/bctenet/README.md create mode 100644 extractors/bctenet/manifest.ts create mode 100644 extractors/bctenet/package.json create mode 100644 extractors/bctenet/tsconfig.json create mode 100644 extractors/eluta/README.md create mode 100644 extractors/eluta/manifest.ts create mode 100644 extractors/eluta/package.json create mode 100644 extractors/eluta/tsconfig.json create mode 100644 extractors/icims/README.md create mode 100644 extractors/icims/manifest.ts create mode 100644 extractors/icims/package.json create mode 100644 
extractors/icims/tsconfig.json create mode 100644 extractors/qajobsboard/README.md create mode 100644 extractors/qajobsboard/manifest.ts create mode 100644 extractors/qajobsboard/package.json create mode 100644 extractors/qajobsboard/tsconfig.json create mode 100644 extractors/smartrecruiters/README.md create mode 100644 extractors/smartrecruiters/manifest.ts create mode 100644 extractors/smartrecruiters/package.json create mode 100644 extractors/smartrecruiters/tsconfig.json diff --git a/.env.example b/.env.example index 75efce5..d2cd8c4 100644 --- a/.env.example +++ b/.env.example @@ -200,6 +200,7 @@ ADZUNA_APP_KEY= # LEVER_COMPANIES=netflix,figma # ASHBY_COMPANIES=ramp,linear # GREENHOUSE_COMPANIES=stripe,airbnb +# Canadian QA-employer examples (full table): docs-site/docs/extractors/canadian-companies-qa-ats.md # ============================================================================= # Workday (public career sites) - optional @@ -210,3 +211,46 @@ ADZUNA_APP_KEY= # 2) A JSON object with explicit fields: # {"company":"NVIDIA","tenantUrl":"https://nvidia.wd5.myworkdayjobs.com","tenant":"nvidia","site":"NVIDIAExternalCareerSite","locale":"en-US"} # WORKDAY_TENANTS= + +# ============================================================================= +# SmartRecruiters (public Posting API) - optional +# ============================================================================= +# Comma- or newline-separated company identifiers (API path segment), e.g. +# jobs.smartrecruiters.com/smartrecruiters/... → "smartrecruiters". +# SMARTRECRUITERS_COMPANIES=smartrecruiters +# SMARTRECRUITERS_MAX_JOBS_PER_COMPANY=100 + +# ============================================================================= +# Eluta (Canada, RSS by location) - optional +# ============================================================================= +# Comma- or newline-separated location strings for https://www.eluta.ca/rss?location=... 
+# Example (note the "|" between locations; each location string itself contains a comma): ELUTA_RSS_LOCATIONS=Toronto, ON|Vancouver, BC +# ELUTA_MAX_JOBS_PER_TERM=100 + +# ============================================================================= +# BC T-Net (British Columbia tech jobs RSS) — optional +# ============================================================================= +# Default feed is built into the extractor when this is unset: +# https://www.bctechnology.com/rss/jobs/tnetjobs.xml +# Override with JSON array or newline-separated URLs (custom feeds from T-Net builder). +# BCTENET_RSS_URLS= +# Prefer Settings: bctenetRssUrls (JSON array), bctenetMaxJobsPerTerm (default 400). + +# ============================================================================= +# iCIMS tenant portals (anonymous HTML search) — optional +# ============================================================================= +# Comma- or newline-separated hosts, e.g. careers-example.icims.com +# ICIMS_TENANTS= +# Caps via Settings: icimsMaxJobsPerTenant (default 250), icimsMaxPagesPerSearch (default 10). + +# ============================================================================= +# QAJobsBoard (QA JobBoardly JSON) — optional +# ============================================================================= +# Configure caps via Settings: qajobsboardMaxJobsPerTerm (default 100). + +# ============================================================================= +# Arc.dev remote listings — optional +# ============================================================================= +# Comma-separated paths under https://arc.dev used when seeding defaults (e.g. Playwright + Cypress feeds). +# ARC_REMOTE_JOBS_PATHS=/remote-jobs/playwright,/remote-jobs/cypress +# Prefer Settings for overrides: arcRemoteJobsPaths (JSON array), arcMaxJobsPerPath (default 120). 
diff --git a/docs-site/docs/extractors/arcdev.md b/docs-site/docs/extractors/arcdev.md new file mode 100644 index 0000000..76f99a2 --- /dev/null +++ b/docs-site/docs/extractors/arcdev.md @@ -0,0 +1,34 @@ +--- +id: arcdev +title: Arc.dev Extractor +description: Remote tech roles from Arc.dev listing pages via embedded Next.js data. +sidebar_position: 17 +--- + +## What it is + +[Arc.dev](https://arc.dev) exposes remote job listings on paths such as `/remote-jobs/playwright` and `/remote-jobs/cypress`. The extractor downloads SSR HTML and parses the embedded `__NEXT_DATA__` payload (Arc-managed and external rows). + +Implementation: `extractors/arcdev/manifest.ts`. + +## Why it exists + +Curated remote hiring with explicit tooling-oriented feeds; many roles are open to North America when labeled that way on the site. + +## How to use it + +1. Enable **Arc.dev** in pipeline sources (no credentials). +2. Configure **`arcRemoteJobsPaths`** as a JSON array of path strings (defaults include Playwright and Cypress remote feeds). Optionally seed defaults from **`ARC_REMOTE_JOBS_PATHS`** (comma-separated paths). +3. Set **`arcMaxJobsPerPath`** (default `120`, max `300`) to cap rows per listing URL after deduplication. +4. Align **`searchTerms`** with titles or stacks you care about; empty-term behavior is handled inside the manifest per path. + +## Common problems + +- **HTML changes:** If Arc ships a new payload shape, parsing may need an update; smoke-test with `npx tsx scripts/smoke-extractors.ts arcdev`, or run the full extractor suite with `npx tsx scripts/smoke-extractors.ts`. +- **`Arc talent network` employer:** Some Arc-managed rows omit a company name; the mapper uses that placeholder. 
+ +## Related pages + +- [Extractors overview](/docs/next/extractors/overview) +- [Canadian QA contracting firms](/docs/next/extractors/qa-contract-staffing-canada) +- [Manual Import](/docs/next/extractors/manual) diff --git a/docs-site/docs/extractors/canadian-companies-qa-ats.md b/docs-site/docs/extractors/canadian-companies-qa-ats.md new file mode 100644 index 0000000..cb298b6 --- /dev/null +++ b/docs-site/docs/extractors/canadian-companies-qa-ats.md @@ -0,0 +1,117 @@ +--- +id: canadian-companies-qa-ats +title: Canadian companies — strong QA orgs and scrapable ATS +description: Reference list of Canadian tech employers with solid QA cultures and practical ATS endpoints for JobOps pipelines. +sidebar_position: 41 +--- + +## What it is + +A curated reference of **Canadian-headquartered or Canadian-heavy tech employers** where QA / SDET / test automation is often a first-class function, together with **scrapable ATS endpoints** where they exist. + +Tier 1 targets map cleanly to the shipped **Ashby**, **Greenhouse**, **Lever**, **Workday**, and **SmartRecruiters** extractors. Tier 2 entries need custom scraping, browser automation, or upstream quirks. + +**Verification:** Tier 1 integrations below were probed successfully (**HTTP `200`**, JSON where applicable). **Posting counts change daily** — re-run probes locally when you need exact volumes. + +## Why it exists + +Canada-focused QA sourcing benefits from **employer-direct ATS feeds** (clean titles, real apply URLs) instead of only aggregator noise. This page maps recognizable brands to **exact integration shapes** so you can paste slugs into Settings or env without rediscovering URLs. 
+ +## How to use it + +### Tier 1 — Public ATS APIs (shipped extractors) + +| Company | HQ | ATS | Endpoint / shape (reference) | JobOps wiring | +| --- | --- | --- | --- | --- | +| Wealthsimple | Toronto | Ashby | `GET https://api.ashbyhq.com/posting-api/job-board/wealthsimple` | Add **`wealthsimple`** to **`ashbyCompanies`** / `ASHBY_COMPANIES`. | +| 1Password | Toronto (remote-first) | Ashby | `.../job-board/1password` | Add **`1password`**. | +| Jobber | Edmonton / Toronto | Ashby | `.../job-board/jobber` | Add **`jobber`**. | +| Nylas | Toronto / SF | Ashby | `.../job-board/nylas` | Add **`nylas`**. | +| Hootsuite | Vancouver | Greenhouse | `GET https://boards-api.greenhouse.io/v1/boards/hootsuite/jobs?content=true` | Add **`hootsuite`** to **`greenhouseCompanies`**. | +| Faire | Waterloo / SF | Greenhouse | `.../boards/faire/jobs?content=true` | Add **`faire`**. | +| PointClickCare | Mississauga | Lever | `GET https://api.lever.co/v0/postings/pointclickcare?mode=json` | Add **`pointclickcare`** to **`leverCompanies`**. | +| Clio | Burnaby / Calgary / Toronto | Workday | `POST https://clio.wd3.myworkdayjobs.com/wday/cxs/clio/ClioCareerSite/jobs` (`limit`, `offset`, `searchText`) | Add **`https://clio.wd3.myworkdayjobs.com/en-US/ClioCareerSite`** to **`workdayTenants`** / `WORKDAY_TENANTS`. | +| Coveo | Quebec City / Montreal | SmartRecruiters | `GET https://api.smartrecruiters.com/v1/companies/Coveo/postings` | Add **`Coveo`** to **`smartrecruitersCompanies`**. API stays **`200`**; **`totalFound`** may be **zero** between hiring waves. | + +Optional Ashby query parameter `?includeCompensation=true` works in browsers and `curl` for richer payloads; the bundled Ashby extractor calls the **same path without that query** and still returns full job lists. 
+ +**Example Settings JSON (merge with your existing lists):** + +```json +["wealthsimple", "1password", "jobber", "nylas"] +``` + +```json +["hootsuite", "faire"] +``` + +```json +["pointclickcare"] +``` + +```json +["https://clio.wd3.myworkdayjobs.com/en-US/ClioCareerSite"] +``` + +```json +["Coveo"] +``` + +### Tier 2 — Harder or custom surfaces + +| Company | HQ | ATS | Notes | +| --- | --- | --- | --- | +| Shopify | Ottawa / remote | Ashby (custom) | Hosted board / GraphQL (`jobs.ashbyhq.com/api/non-user-graphql`, `organizationHostedJobsPageName: "shopify"`) or parse careers HTML — not covered by the slug-based Ashby extractor today. | +| Lightspeed Commerce | Montreal | Custom (often Cloudflare) | Careers HTML at `https://www.lightspeedhq.com/careers/openings/` — browser or tolerant fetcher; no shipped extractor. | +| RBC Borealis | Toronto / Montreal | Greenhouse (embedded) | `boards-api` path **`rbcborealis`** returned **404** when probed — scrape `https://rbcborealis.com/careers/` or rediscover the active board slug before using Greenhouse JSON. | +| Vidyard | Kitchener–Waterloo | JS-heavy site | `https://careers.vidyard.com/` — Playwright/Puppeteer if automating. | +| Loblaw Digital | Toronto | Workday (parent) | Parent Workday host may need the correct site segment; careers marketing site often lists roles — browser-backed discovery may be more reliable than guessing CXS paths. 
| + +### Ready-to-use CLI filters (QA-oriented titles) + +Ashby (example: Wealthsimple): + +```bash +curl -s 'https://api.ashbyhq.com/posting-api/job-board/wealthsimple?includeCompensation=true' \ + | jq '.jobs[] | select(.title | test("QA|SDET|Test|Automation"; "i")) | {title, location, url: .jobUrl}' +``` + +Greenhouse (example: Hootsuite): + +```bash +curl -s 'https://boards-api.greenhouse.io/v1/boards/hootsuite/jobs?content=true' \ + | jq '.jobs[] | select(.title | test("QA|SDET|Test|Automation"; "i")) | {title, location: .location.name, url: .absolute_url}' +``` + +Lever (PointClickCare): + +```bash +curl -s 'https://api.lever.co/v0/postings/pointclickcare?mode=json' \ + | jq '.[] | select(.text | test("QA|SDET|Test|Automation"; "i")) | {title: .text, location: .categories.location, url: .hostedUrl}' +``` + +Workday (Clio — QA search text): + +```bash +curl -s -X POST 'https://clio.wd3.myworkdayjobs.com/wday/cxs/clio/ClioCareerSite/jobs' \ + -H 'Content-Type: application/json' \ + -d '{"limit":50,"offset":0,"searchText":"QA"}' \ + | jq '.jobPostings[] | select(.title | test("QA|SDET|Test|Automation"; "i")) | {title, location: .locationsText}' +``` + +### Other strong-QA Canadian employers (ATS not deep-verified here) + +Worth manual checks or Eluta / LinkedIn cross-reference: **Wattpad**, **Knix**, **Ada**, **Hopper**, **Plusgrade**, **D2L**, **Kinaxis**, **TELUS Digital / Mirum**, **Trulioo**, **OpenText / Hubdoc**. + +## Common problems + +- **Ashby counts vs `includeCompensation`:** Omitting the query param still returns jobs; compensation fields may be sparser. +- **Greenhouse board slug drift:** If `boards-api` returns `404`, the employer may have renamed the board — inspect their careers page embed or HTML source for the current board id. +- **SmartRecruiters zero postings:** Still a valid integration; don’t treat empty arrays as a broken extractor. 
+ +## Related pages + +- [Extractors overview](/docs/next/extractors/overview) +- [Canadian / NA QA contracting firms](/docs/next/extractors/qa-contract-staffing-canada) +- [Eluta](/docs/next/extractors/eluta) +- [Manual Import](/docs/next/extractors/manual) diff --git a/docs-site/docs/extractors/eluta.md b/docs-site/docs/extractors/eluta.md new file mode 100644 index 0000000..252d1b2 --- /dev/null +++ b/docs-site/docs/extractors/eluta.md @@ -0,0 +1,44 @@ +--- +id: eluta +title: Eluta Extractor +description: Canadian job discovery via Eluta.ca public RSS feeds. +sidebar_position: 15 +--- + +## What it is + +Original site: [eluta.ca](https://www.eluta.ca) + +The extractor lives in `extractors/eluta/manifest.ts`. It requests one or more public RSS URLs of the form `https://www.eluta.ca/rss?location=...`, parses items (title, employer, location, link, description), filters by pipeline search terms, and merges feeds while de-duplicating by `guid` / URL. + +## Why it exists + +Eluta surfaces Canadian roles indexed directly from employer career sites, often with less aggregator noise than generic job search. RSS provides a stable, low-auth integration compared to scraping HTML. + +## How to use it + +1. Choose **location strings** Eluta accepts in the RSS `location` query parameter (for example `Toronto, ON`, `Vancouver, BC`). Very broad values such as a whole country may return empty feeds; prefer metros or provinces. +2. In **Settings**, set **Eluta RSS locations** (`elutaRssLocations`) as a JSON array or comma/newline-separated list, or set `ELUTA_RSS_LOCATIONS` in the environment (for example `Toronto, ON|Montreal, QC`). +3. Optionally set **Eluta max jobs per term** (`elutaMaxJobsPerTerm`, default `100`). +4. Set your search geography to **Canada** — Eluta is **Canada-only** and is skipped automatically when the resolved pipeline country is not Canada. +5. Enable **Eluta** in pipeline sources and run the pipeline. 
+ +## Common problems + +### Eluta is skipped for my run + +- Search geography is not Canada (city/country/Indeed country resolution). Align geography to Canada or disable Eluta for non-Canada profiles. + +### Empty feeds + +- The `location` string may be too broad or spelled differently than Eluta expects. Try a major city plus province (e.g. `Calgary, AB`). + +### RSS HTTP errors + +- Eluta may block unusual clients; the extractor sends a conventional User-Agent. Retry later or reduce the number of location feeds per run. + +## Related pages + +- [Extractors Overview](/docs/next/extractors/overview) +- [Add an Extractor](/docs/next/workflows/add-an-extractor) +- [Settings](/docs/next/features/settings) diff --git a/docs-site/docs/extractors/overview.md b/docs-site/docs/extractors/overview.md index cd7655c..2a7d1bd 100644 --- a/docs-site/docs/extractors/overview.md +++ b/docs-site/docs/extractors/overview.md @@ -19,6 +19,12 @@ Extractor integrations are now registered through manifests and loaded automatic | [Hiring Cafe](/docs/next/extractors/hiring-cafe) | Browser-backed discovery using Hiring Cafe search APIs | Subject to upstream anti-bot checks; uses browser context and encoded search-state payloads | `HIRING_CAFE_SEARCH_TERMS`, `HIRING_CAFE_COUNTRY`, `HIRING_CAFE_MAX_JOBS_PER_TERM`, `HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS` | Uses existing pipeline term/country/budget knobs and maps directly to normalized jobs | | [startup.jobs](/docs/next/extractors/startup-jobs) | Startup-focused discovery through the published `startup-jobs-scraper` package | No credentials required; detail enrichment depends on Playwright browser binaries being installed | existing pipeline `searchTerms`, selected country/cities, `jobspyResultsWanted`; `npx playwright install` for fresh environments | Algolia-backed search plus detail-page enrichment via package import; orchestrator maps normalized records and de-duplicates by `jobUrl` | | [UKVisaJobs](/docs/next/extractors/ukvisajobs) | UK 
visa sponsorship-focused roles | Requires authenticated session and periodic token/cookie refresh | `UKVISAJOBS_EMAIL`, `UKVISAJOBS_PASSWORD`, `UKVISAJOBS_MAX_JOBS`, `UKVISAJOBS_SEARCH_KEYWORD` | API pagination + dataset output; orchestrator de-dupes and may fetch missing descriptions | +| [SmartRecruiters](/docs/next/extractors/smartrecruiters) | Enterprise employers on SmartRecruiters public boards | No auth; needs configured company identifiers; one HTTP round-trip per posting for apply URLs + descriptions | `SMARTRECRUITERS_COMPANIES`, `SMARTRECRUITERS_MAX_JOBS_PER_COMPANY` | Paginates the public Posting API, filters by pipeline terms, normalizes to `CreateJobInput` | +| iCIMS tenants (HTML) | Large employers on iCIMS portals | No auth; HTML search varies by tenant — maintain explicit tenant hosts | `ICIMS_TENANTS`, Settings: `icimsTenants`, `icimsMaxJobsPerTenant`, `icimsMaxPagesPerSearch` | Fetches `/jobs/search` with iframe-style params, parses listing links, caps per tenant | +| BC T-Net (RSS) | BC tech aggregate via T-Net | Canada-only; free RSS (default feed built-in); optional extra feeds | `BCTENET_RSS_URLS`, Settings: `bctenetRssUrls`, `bctenetMaxJobsPerTerm` | Fetches RSS item blocks, normalizes quirky CDATA link fragments, filters by pipeline terms | +| [Eluta](/docs/next/extractors/eluta) | Canadian listings aggregated from employer career sites (RSS) | Canada-only source (skipped when search geography is not Canada); RSS `location` strings must be set | `ELUTA_RSS_LOCATIONS`, `ELUTA_MAX_JOBS_PER_TERM` | Fetches one or more `eluta.ca` RSS feeds, filters by terms, de-duplicates by guid/URL | +| [QAJobsBoard](/docs/next/extractors/qajobsboard) | QA / SDET / automation-heavy board (global JSON feed) | No auth; geography skew is manual/filter downstream | `qajobsboardMaxJobsPerTerm` | Fetches JobBoardly JSON, filters by pipeline terms | +| [Arc.dev](/docs/next/extractors/arcdev) | Remote roles from Arc.dev listing pages (tool-tagged paths) | Parses SSR 
`__NEXT_DATA__`; relies on stable Next payload | `ARC_REMOTE_JOBS_PATHS` (seeds defaults), `arcRemoteJobsPaths`, `arcMaxJobsPerPath` | Merges Arc-managed + external rows; dedupes by URL | | [Manual Import](/docs/next/extractors/manual) | One-off jobs not covered by scrapers | Inference quality depends on model/provider and input quality; some URLs cannot be fetched reliably | App/API endpoints (`/api/manual-jobs/infer`, `/api/manual-jobs/import`) | Accepts text/HTML/URL, runs inference, then saves and scores job after review | ## Which extractor should I use? @@ -29,10 +35,38 @@ Extractor integrations are now registered through manifests and loaded automatic - Use **startup.jobs** when you want startup-heavy listings without maintaining another scraper locally. - Use **Gradcracker** when targeting graduate pipelines in the UK. - Use **UKVisaJobs** for sponsorship-specific UK searches. +- Use **SmartRecruiters** when you can list target employers’ public SmartRecruiters company identifiers. +- Use **iCIMS tenants** when you can list target `*.icims.com` career hosts (anonymous portal HTML search). +- Use **BC T-Net** for British Columbia tech RSS listings (runs only when search geography is Canada). +- Use **Eluta** for Canadian employer-direct listings via RSS (set metro/province `location` strings). +- Use **QAJobsBoard** or **Arc.dev** when you want QA- or remote-stack-focused feeds without extra credentials. - Use **Manual Import** when you already have a specific posting and need direct import. Many runs combine sources: broad discovery first, then manual import for high-priority jobs that scraping misses. +### QA-focused boards (shipped extractors) + +- **[QAJobsBoard](/docs/next/extractors/qajobsboard)** — Large QA-oriented index via public JSON; filter geography downstream. +- **[Arc.dev](/docs/next/extractors/arcdev)** — Remote feeds (e.g. Playwright / Cypress paths); good for vetted remote slices. 
+ +### Canadian QA contracting firms (reference) + +Staffing and consultancy firms that frequently post QA automation contracts — scrape hints and CLI probes: **[Canadian / NA QA contracting firms](/docs/next/extractors/qa-contract-staffing-canada)**. + +### Canadian employers — QA-strong ATS (reference) + +Direct ATS JSON / extractor wiring for well-known Canadian tech brands (Ashby, Greenhouse, Lever, Workday, SmartRecruiters): **[Canadian companies — strong QA orgs and scrapable ATS](/docs/next/extractors/canadian-companies-qa-ats)**. + +## Supplementary job boards + +Some boards are **credential-gated**, **approval-gated**, or **scraping-hostile** — see **[Supplementary sources — access notes](/docs/next/extractors/supplementary-sources-access-notes)** for realistic paths (Careerjet, Reed, Job Bank XML policy, sponsorship data sources, etc.). + +JobOps ships **BC T-Net** and **iCIMS tenant HTML** extractors for two cases that are usually workable without vendor contracts; everything else in the old “long tail” list still lands best via **[Manual Import](/docs/next/extractors/manual)** until someone promotes it to a manifest. + +### Still common manual-import targets + +- **Wellfound** (formerly AngelList), **Otta**, **Welcome to the Jungle**, **Dice**, **Job Bank** (unless you qualify for syndication), regional boards without stable feeds — use Manual Import or an external tool, then normalize here. 
+ ## Related extractor docs - [Gradcracker](/docs/next/extractors/gradcracker) @@ -41,5 +75,12 @@ Many runs combine sources: broad discovery first, then manual import for high-pr - [Hiring Cafe](/docs/next/extractors/hiring-cafe) - [startup.jobs](/docs/next/extractors/startup-jobs) - [UKVisaJobs](/docs/next/extractors/ukvisajobs) +- [SmartRecruiters](/docs/next/extractors/smartrecruiters) +- [Supplementary sources — access notes](/docs/next/extractors/supplementary-sources-access-notes) +- [Eluta](/docs/next/extractors/eluta) +- [QAJobsBoard](/docs/next/extractors/qajobsboard) +- [Arc.dev](/docs/next/extractors/arcdev) +- [Canadian / NA QA contracting firms](/docs/next/extractors/qa-contract-staffing-canada) +- [Canadian companies — QA-strong ATS](/docs/next/extractors/canadian-companies-qa-ats) - [Manual Import](/docs/next/extractors/manual) - [Add an Extractor](/docs/next/workflows/add-an-extractor) diff --git a/docs-site/docs/extractors/qa-contract-staffing-canada.md b/docs-site/docs/extractors/qa-contract-staffing-canada.md new file mode 100644 index 0000000..e09f0be --- /dev/null +++ b/docs-site/docs/extractors/qa-contract-staffing-canada.md @@ -0,0 +1,92 @@ +--- +id: qa-contract-staffing-canada +title: Canadian / NA QA contracting firms +description: Staffing and consultancy boards that often post QA automation contracts, with JobOps wiring notes and scrape hints. +sidebar_position: 40 +--- + +## What it is + +A curated list of Canadian and North American **staffing firms and consultancies** that regularly carry **QA / SDET / test automation** contract roles. Coverage emphasizes targets that are **live**, **contract-heavy**, and roughly ordered by **scraping ease** for automation. + +This is **not** an extractor implementation checklist: several firms need HTML or browser automation. Use native JobOps extractors where they apply, and [Manual Import](/docs/next/extractors/manual) elsewhere. 
+ +## Why it exists + +Contract QA pipelines often come through agencies before they appear on Indeed or LinkedIn runs. Mapping firms to **ATS type** (Workday, Greenhouse, Lever, custom API, JS-rendered site) saves weeks of one-off research. + +Counts and role titles **change daily** — re-verify listings before relying on them for outreach. + +## How to use it + +### Tier 1 — Confirmed live QA contracts + scrapable + +| Firm | HQ | Where to scrape | Confirmed QA / contract notes | +| --- | --- | --- | --- | +| Procom | Toronto | [Find a job](https://procomservices.com/en-ca/find-a-job/) (~230+ roles when checked; paginated HTML). Titles such as “QA/QC Analyst”, “Automation QA Analyst”, “Sr QA Analyst” appear regularly. | Strong banking-sector volume; typical **4–6 month** contracts. No shipped extractor — HTML or browser automation. | +| S.i. Systems | Calgary / Toronto | [Search IT jobs (`q=QA`)](https://www.sisystems.com/search-it-jobs/?q=QA) — custom web API behind the UI; inspect DevTools network for JSON. | Frequent **Sr QA**, **Mobile QE** (WebdriverIO / Appium) and similar; lots of **GTA** roles; postings refresh often. | +| Synechron | NYC / Toronto / Montreal | Workday CXS: `POST https://synechron.wd1.myworkdayjobs.com/wday/cxs/synechron/SynechronCareers/jobs` with e.g. `{"limit":20,"offset":0,"searchText":"QA automation"}`. | Often **20+** QA automation-facing rows when searched; includes **Playwright** and related stacks. Add **`https://synechron.wd1.myworkdayjobs.com/en-US/SynechronCareers`** to **`workdayTenants`** / `WORKDAY_TENANTS` and use QA search terms (bundled extractor sends empty facets — see CLI snippet below for Canada facet example). | +| Capco | London / Toronto | Greenhouse: [boards-api capco](https://boards-api.greenhouse.io/v1/boards/capco/jobs?content=true) | Large board (**700+** roles when checked); dozens of QA-related titles, but many are based in **India / Poland** — **filter by location** (Toronto / Canada metros). 
Add **`capco`** to **`greenhouseCompanies`**. | +| Foilcon | Toronto | [CVViz — Foilcon](https://jobs.cvviz.com/foilcon) | Lower volume; roles such as **Systems Testing QA Specialist** show up when hiring. | +| Robert Half (Technology) | Toronto / intl | [Jobs — QA automation keyword](https://www.roberthalf.com/ca/en/jobs?keywords=qa+automation) (often Workday-backed listings) | Discover tenant/host patterns from network tab if you want bulk Workday-style pulls; otherwise HTML/manual. | +| Hays Canada | Toronto | [QA automation search](https://www.hays.ca/job-search/qa-automation) — `/job-detail/...` permalinks | Custom HTML board; manual import or custom crawler. | +| Randstad Digital | Toronto / Montreal | [Randstad Canada jobs](https://www.randstad.ca/jobs/) | Site listings plus heavy **LinkedIn** contract volume under the Randstad Digital brand; often manual cross-check. | +| Compunnel | NJ / Toronto | [Job search](https://www.compunnel.com/job-search/) — **JS-rendered** | **Quality Assurance Automation Engineer** (and similar) titles recur; use **Playwright** if automating. | +| Pyramid Consulting | Atlanta / Toronto | [Job openings](https://www.pyramidci.com/careers/job-openings/) | **QA Automation Engineer** style roles often cross-posted on **LinkedIn** under their brand. | +| Qualitest Group | NYC / global | [Careers](https://careers.qualitestgroup.com) | Pure-play QA consultancy — schema varies by region; inspect ATS per locale. | + +### Tier 2 — Active in Canadian QA contracting but smaller / harder to scrape + +| Firm | Notes | +| --- | --- | +| Jarvis Consulting Group (`jrvs.ca`) | Toronto; [jobs.jrvs.ca/home/](https://jobs.jrvs.ca/home/) is **JS-rendered** — **Playwright** if scraping; SDET roles also surface on LinkedIn. | +| Electric Mind (formerly Intelliware) | Toronto. 
Lever: [`electricmind` postings JSON](https://api.lever.co/v0/postings/electricmind?mode=json) — add **`electricmind`** to **`leverCompanies`** when hiring opens; volume can be **small** (e.g. only a handful of open reqs) and sometimes **no QA** until project demand spikes. | +| Light Consulting (LightCI) | Toronto. Lever: [`lightci` postings JSON](https://api.lever.co/v0/postings/lightci?mode=json). Often **empty** publicly — they may prefer direct outreach. | +| Yoush Consulting | Toronto IT staffing; **no structured job board** — SDET contracts often only on **LinkedIn** company presence. | +| Accenture / Deloitte / EY / KPMG / PwC | Big Four firms plus Accenture: **Canadian banking-tech QA** contract pools; usually **Workday** per brand (e.g. **EY** → `ey.wd3.myworkdayjobs.com`). Add each tenant you care about to **`workdayTenants`**. | +| CGI | Montreal (large gov / enterprise contractor). [cgi.com/en/careers](https://www.cgi.com/en/careers) — HTML / embedded ATS; inspect for stable patterns. | +| TEKsystems | US-based with Canadian offices — [Find a job (Canada)](https://www.teksystems.com/en-ca/careers/find-a-job). Enterprise staffing; HTML/search UX varies. | +| Robertson & Company | Often referenced from **LinkedIn** QA contract threads — treat as manual / referral-led unless you find a stable feed. | +| Plan A Technologies | Legitimate firm — [planatechnologies.com](https://www.planatechnologies.com); often **no public job board**; roles surface via **LinkedIn / referrals**. 
| + +### Ready-to-use CLI probes + +Greenhouse (Capco) — Canada-oriented QA filter (tweak city regex as needed): + +```bash +curl -s 'https://boards-api.greenhouse.io/v1/boards/capco/jobs?content=true' \ + | jq '.jobs[] + | select(.title | test("QA|SDET|Test|Automation|Quality"; "i")) + | select(.location.name | test("Toronto|Canada|Montreal|Vancouver|Calgary|Ottawa"; "i")) + | {title, location: .location.name, url: .absolute_url}' +``` + +Synechron Workday — `QA automation` search plus example **country facet** (`appliedFacets.locationCountry`). Facet IDs are **tenant-specific** and **expire or change** — if this stops matching, capture a fresh id from the career site’s network panel. + +```bash +curl -s 'https://synechron.wd1.myworkdayjobs.com/wday/cxs/synechron/SynechronCareers/jobs' \ + -X POST \ + -H 'Content-Type: application/json' \ + -d '{"limit":50,"offset":0,"searchText":"QA automation","appliedFacets":{"locationCountry":["bc33aa3152ec42d4995f4791a106ed09"]}}' \ + | jq '.jobPostings[] + | {title, location: .locationsText, url: ("https://synechron.wd1.myworkdayjobs.com" + .externalPath)}' +``` + +### Heads-up: Ionosphere and “Reqd” + +- **Ionosphere Inc.** — May only show a **LinkedIn** presence plus legacy **Google Sites** (e.g. `sites.google.com/a/ionosphereinc.com/...`) **without** a real careers board or steady postings — **not** a dependable scrape target unless you confirm a dedicated jobs URL. (“Ionosphere” is also used by unrelated firms — disambiguate by legal name and domain.) +- **Reqd** — No widely confirmed Canadian IT staffing brand spelled exactly **Reqd**. Possible leads to double-check: **Recroot**, **Reqroute**, **Recruitio**, **Required Technologies**, etc. If you have a **website or sample posting URL**, verify before adding to any automation list. + +## Common problems + +- **Workday without facets:** The bundled Workday extractor posts `appliedFacets: {}`. 
You still get roles via **`searchText`**; use tighter terms or post-filter for Canada. +- **Greenhouse volume:** Boards like Capco are large — always filter by title regex and location text before importing hundreds of rows. + +## Related pages + +- [Extractors overview](/docs/next/extractors/overview) +- [Canadian companies — QA-strong ATS](/docs/next/extractors/canadian-companies-qa-ats) +- [Eluta](/docs/next/extractors/eluta) (Canada employer-direct RSS) +- [QAJobsBoard](/docs/next/extractors/qajobsboard) +- [Arc.dev](/docs/next/extractors/arcdev) +- [Manual Import](/docs/next/extractors/manual) diff --git a/docs-site/docs/extractors/qajobsboard.md b/docs-site/docs/extractors/qajobsboard.md new file mode 100644 index 0000000..03c4c66 --- /dev/null +++ b/docs-site/docs/extractors/qajobsboard.md @@ -0,0 +1,36 @@ +--- +id: qajobsboard +title: QAJobsBoard Extractor +description: QA and automation-focused listings via the board’s public JSON feed. +sidebar_position: 16 +--- + +## What it is + +[QAJobsBoard](https://www.qajobsboard.com) publishes postings through JobBoardly. The extractor calls: + +`GET https://qajobsboard.jobboardly.com/jobs.json` + +Implementation: `extractors/qajobsboard/manifest.ts`. + +## Why it exists + +Dense QA / SDET / automation signal versus generic boards; categories often reflect tooling (Playwright, Cypress, Selenium). Geography skews India-remote unless you combine region filtering downstream. + +## How to use it + +1. Enable **QAJobsBoard** in pipeline sources (no credentials). +2. Set **`qajobsboardMaxJobsPerTerm`** (default `100`) to cap mapped rows after term filtering. +3. Tune **`searchTerms`** for QA-focused phrases (`QA automation`, `SDET`, `Playwright`, etc.). +4. Optional: narrow by geography using orchestrator city/country filters where applicable. + +## Common problems + +- **Few or no rows:** Terms may be too narrow; broaden titles or temporarily remove strict city filters. 
+- **Irrelevant locales:** The feed is global; pair with geography or employer filters in your pipeline profile. + +## Related pages + +- [Extractors overview](/docs/next/extractors/overview) +- [Canadian QA contracting firms](/docs/next/extractors/qa-contract-staffing-canada) +- [Manual Import](/docs/next/extractors/manual) diff --git a/docs-site/docs/extractors/smartrecruiters.md b/docs-site/docs/extractors/smartrecruiters.md new file mode 100644 index 0000000..283741b --- /dev/null +++ b/docs-site/docs/extractors/smartrecruiters.md @@ -0,0 +1,44 @@ +--- +id: smartrecruiters +title: SmartRecruiters Extractor +description: Public SmartRecruiters Posting API discovery with per-company identifiers. +sidebar_position: 14 +--- + +## What it is + +Original API: [SmartRecruiters Posting API](https://developers.smartrecruiters.com/reference/v1listpostings) + +The extractor lives in `extractors/smartrecruiters/manifest.ts`. It calls the public JSON endpoints (no API key for public boards), paginates active **PUBLIC** postings per configured company, optionally matches pipeline search terms against title and location, then loads each posting’s detail document so `jobUrl` / `applicationLink` and HTML descriptions resolve to the same URLs candidates see on `jobs.smartrecruiters.com`. + +## Why it exists + +Many large employers (including a significant share in Canada and the EU) publish on SmartRecruiters. This source complements Greenhouse, Lever, Ashby, and Workday by covering another major ATS with a predictable public API. + +## How to use it + +1. Find each employer’s **company identifier** — the path segment in their public board URL (for example `jobs.smartrecruiters.com/smartrecruiters/...` → `smartrecruiters`). +2. In **Settings**, set **SmartRecruiters companies** (`smartrecruitersCompanies`) to a JSON array or comma/newline-separated list of those identifiers, or set `SMARTRECRUITERS_COMPANIES` in the environment. +3. 
Optionally set **SmartRecruiters max jobs per company** (`smartrecruitersMaxJobsPerCompany`, default `100`, max `500`) to cap pagination after term filtering. +4. Set your pipeline **search geography** and **search terms** as usual; terms filter postings by title, location text, and company display name. +5. Enable **SmartRecruiters** in pipeline sources and run the pipeline. + +## Common problems + +### SmartRecruiters never appears in source toggles + +- No companies are configured (`smartrecruitersCompanies` / `SMARTRECRUITERS_COMPANIES` is empty). + +### Zero jobs for a slug I know is correct + +- The identifier must match the **public Posting API** path, not necessarily the marketing site name. Confirm listings exist on `jobs.smartrecruiters.com/{companyIdentifier}/`. + +### Rate limiting or intermittent HTTP errors + +- Reduce `smartrecruitersMaxJobsPerCompany` or the number of configured companies; each kept posting triggers a detail request after the list pass. + +## Related pages + +- [Extractors Overview](/docs/next/extractors/overview) +- [Add an Extractor](/docs/next/workflows/add-an-extractor) +- [Settings](/docs/next/features/settings) diff --git a/docs-site/docs/extractors/supplementary-sources-access-notes.md b/docs-site/docs/extractors/supplementary-sources-access-notes.md new file mode 100644 index 0000000..99827d7 --- /dev/null +++ b/docs-site/docs/extractors/supplementary-sources-access-notes.md @@ -0,0 +1,73 @@ +--- +id: supplementary-sources-access-notes +title: Supplementary sources — access notes +description: Credential gates, sources to skip, and practical alternatives for boards without a native JobOps extractor. +sidebar_position: 15 +--- + +This page captures **verified access paths** and realistic integration effort for boards that are not fully wired as pipeline extractors. Pair it with [Extractors overview](/docs/next/extractors/overview), [Manual Import](/docs/next/extractors/manual), and [Add an Extractor](/docs/next/workflows/add-an-extractor).
+ +## Credential-gated APIs (usually straightforward) + +### Careerjet (v4) + +- **Sign-up:** [careerjet.com/partners](https://www.careerjet.com/partners) → add your site → Access API → register **server egress IP(s)**. +- **Endpoint:** `https://search.api.careerjet.net/v4/query` +- **Important parameters:** `affid` (publisher key), **`user_ip`** (documented as end-user IP; for headless/server runs use an IP you allowlisted — fraud-checked), **`user_agent`**, **`url`** (referrer URL where results would appear — maps to `CAREERJET_REFERER` / query `url` + `Referer` header). Missing `user_ip` or `user_agent` tends to yield **403**. +- **Tip:** Official Python client: [`careerjet/careerjet-api-client-python`](https://github.com/careerjet/careerjet-api-client-python). + +### Reed + +- **Sign-up:** [reed.co.uk/developers](https://www.reed.co.uk/developers) — API key is issued via their contact flow (often ~1–2 business days). +- **Endpoint:** e.g. `https://www.reed.co.uk/api/1.0/search?keywords=...&locationName=...&resultsToTake=100` +- **Auth:** HTTP Basic — username = API key, password empty (`curl -u "YOUR_API_KEY:" ...`). +- **Pagination:** `resultsToTake` max **100** per request; advance with `resultsToSkip`. +- **Scope:** UK-centric; still useful for remote UK employers. + +## Usually not worth scraping yourself + +### Job Bank (Canada) + +XML syndication is **manual approval**: active Canadian Business Number, established Canadian-facing employment site. No simple public JSON/RSS for arbitrary candidates. HTML exists but is heavy JSF / anti-bot — treat as **skip** unless you qualify for the feed. + +### Jobboom / Workopolis / BCJobs + +No stable public API/RSS documented for generic job discovery. Third-party scrapers often need **residential proxies** and paid runtime — **skip or pay** for a maintained provider. + +### Jobillico + +Employer-oriented XML/OAuth API (posting and limited pull). Needs a **business account** — not a candidate discovery API. 
+ +### MyVisaJobs / H1BGrader + +No practical public API for their enriched UX. Alternatives: **DOL OFLC LCA disclosure** quarterly CSVs (public, bulk), then join to your own job corpus; paid marketplace scrapers if you accept cost/compliance tradeoffs. Browser extensions may still be useful **personally**. + +### Untapped (Jopwell) + +Closed candidate platform — **no public job-posting API** for arbitrary ingestion. + +## Practical additions in JobOps + +### iCIMS (per-tenant HTML) + +Many tenants expose anonymous search HTML suitable for stable scraping patterns, e.g.: + +`https://{tenant}.icims.com/jobs/search?ss=1&searchKeyword=…&in_iframe=1` + +Pagination often uses `pr=`; job URLs commonly follow `/jobs/{id}/{slug}/job`. Maintain a **tenant host list** (similar to Greenhouse/Lever company lists). This is **not** the authenticated iCIMS Job Portal API. + +Shipped extractor: **iCIMS tenants (HTML)** — configure `icimsTenants` (+ caps in Settings). + +### BC T-Net RSS + +Free aggregate RSS (example): `https://www.bctechnology.com/rss/jobs/tnetjobs.xml` — useful for **BC / Vancouver** tech roles; custom slices via the site’s RSS builder. + +Shipped extractor: **BC T-Net (RSS)** — Canada geography only; optional `bctenetRssUrls` overrides default feed. 
+ +## Related pages + +- [Extractors overview](/docs/next/extractors/overview) +- [Eluta](/docs/next/extractors/eluta) (Canada RSS) +- [SmartRecruiters](/docs/next/extractors/smartrecruiters) +- [Canadian companies — QA-strong ATS](/docs/next/extractors/canadian-companies-qa-ats) +- [Manual Import](/docs/next/extractors/manual) diff --git a/docs-site/docs/workflows/add-an-extractor.md b/docs-site/docs/workflows/add-an-extractor.md index e03e4e0..774f501 100644 --- a/docs-site/docs/workflows/add-an-extractor.md +++ b/docs-site/docs/workflows/add-an-extractor.md @@ -41,7 +41,8 @@ That keeps runtime wiring dynamic while preserving compile-time safety in API an - append to `EXTRACTOR_SOURCE_IDS` - add an entry in `EXTRACTOR_SOURCE_METADATA` 5. Ensure your extractor maps output to `CreateJobInput[]`. -6. Run the full CI checks. +6. Register it in `scripts/smoke-extractors.ts` (`ALL_TARGETS`): add one row per manifest so `npx tsx scripts/smoke-extractors.ts` exercises every shipped extractor (keyed sources `SKIP` until env vars exist). +7. Run the full CI checks. Example manifest: @@ -77,6 +78,22 @@ Subprocess extractors are supported. Keep subprocess spawning inside `run(contex - Add the new source id to `shared/src/extractors/index.ts`. - Confirm metadata exists for that source id. +### Smoke connectivity + +After wiring settings/env, run: + +```bash +npx tsx scripts/smoke-extractors.ts myextractor +``` + +Or the full suite (may take several minutes — JobSpy invokes Python, Hiring Cafe / startup.jobs may need browser deps): + +```bash +npx tsx scripts/smoke-extractors.ts +``` + +Keep `ALL_TARGETS` in that script aligned with manifests under each `extractors/{name}/` package (`manifest.ts` or `src/manifest.ts`). + ### Source appears in shared catalog but is unavailable at runtime - The manifest was not loaded successfully.
diff --git a/docs-site/sidebars.ts b/docs-site/sidebars.ts index b59f2f9..89df7fe 100644 --- a/docs-site/sidebars.ts +++ b/docs-site/sidebars.ts @@ -46,6 +46,11 @@ const sidebars: SidebarsConfig = { label: "Extractors", items: [ "extractors/overview", + "extractors/supplementary-sources-access-notes", + "extractors/qajobsboard", + "extractors/arcdev", + "extractors/qa-contract-staffing-canada", + "extractors/canadian-companies-qa-ats", "extractors/gradcracker", "extractors/jobspy", "extractors/adzuna", diff --git a/extractors/arcdev/README.md b/extractors/arcdev/README.md new file mode 100644 index 0000000..09bee1c --- /dev/null +++ b/extractors/arcdev/README.md @@ -0,0 +1,15 @@ +# arcdev-extractor + +Reads Arc remote-job listings from **SSR HTML**: each page embeds `__NEXT_DATA__` with `arcJobs` (Arc talent network) and `externalJobs` (partner postings). + +Configure **`arcRemoteJobsPaths`** as URL paths on `https://arc.dev`, for example: + +- `/remote-jobs/playwright` +- `/remote-jobs/cypress` +- `/remote-jobs/selenium` + +Or set `ARC_REMOTE_JOBS_PATHS` (comma/newline-separated). Defaults include Playwright and Cypress stacks. + +**Employer names:** External jobs include `company.name`. Arc-managed listings omit company names in the payload — those rows use employer `"Arc talent network"` while preserving titles and skill categories. + +Cap merged matches per configuration fetch via `arcMaxJobsPerPath` (applied separately per path, default `120`). diff --git a/extractors/arcdev/manifest.ts b/extractors/arcdev/manifest.ts new file mode 100644 index 0000000..c4f7e30 --- /dev/null +++ b/extractors/arcdev/manifest.ts @@ -0,0 +1,329 @@ +/** + * Arc.dev remote jobs — parse embedded Next.js __NEXT_DATA__ from SSR HTML. 
+ * + * Listing URLs look like https://arc.dev/remote-jobs/playwright + */ + +import type { + ExtractorManifest, + ExtractorRunResult, +} from "@shared/types/extractors"; +import type { CreateJobInput } from "@shared/types/jobs"; + +const ORIGIN = "https://arc.dev"; + +interface ArcCategory { + name?: string; + urlString?: string; +} + +interface ArcCompanyJson { + randomKey?: string | null; + urlString?: string; + name?: string; +} + +interface ArcJobJson { + randomKey?: string; + title?: string; + jobType?: string; + jobRole?: string; + urlString?: string; + postedAt?: number; + company?: ArcCompanyJson; + categories?: ArcCategory[]; + requiredCountries?: string[]; + minAnnualSalary?: number | null; + maxAnnualSalary?: number | null; + minHourlyRate?: number | null; + maxHourlyRate?: number | null; + timeZone?: string | null; + positionType?: string; + experienceLevel?: string; + experienceLevels?: string[]; +} + +function readPaths(raw: string | undefined): string[] { + if (!raw) return []; + try { + const parsed = JSON.parse(raw); + if (Array.isArray(parsed)) { + return parsed + .map((entry) => (typeof entry === "string" ? entry.trim() : "")) + .filter(Boolean); + } + } catch { + // fall through + } + return raw + .split(/[\n,;|]+/) + .map((entry) => entry.trim()) + .filter(Boolean); +} + +function defaultArcPaths(): string[] { + const raw = + typeof process !== "undefined" ? process.env.ARC_REMOTE_JOBS_PATHS : ""; + const parsed = readPaths(raw); + return parsed.length > 0 + ? parsed + : ["/remote-jobs/playwright", "/remote-jobs/cypress"]; +} + +function asString(value: unknown): string | undefined { + if (typeof value !== "string") return undefined; + const t = value.trim(); + return t ? t : undefined; +} + +function categoryHaystack(job: ArcJobJson): string { + if (!Array.isArray(job.categories)) return ""; + return job.categories + .map((c) => `${c.name ?? ""} ${c.urlString ?? 
""}`) + .join(" ") + .toLowerCase(); +} + +function matchesTerm(job: ArcJobJson, term: string): boolean { + const lower = term.toLowerCase(); + if (job.title?.toLowerCase().includes(lower)) return true; + if (categoryHaystack(job).includes(lower)) return true; + if (job.jobRole?.toLowerCase().includes(lower)) return true; + if (job.positionType?.toLowerCase().includes(lower)) return true; + if ( + Array.isArray(job.experienceLevels) && + job.experienceLevels.some((l) => l.toLowerCase().includes(lower)) + ) + return true; + if (job.experienceLevel?.toLowerCase().includes(lower)) return true; + return false; +} + +function salaryParts(job: ArcJobJson): string | undefined { + const bits: string[] = []; + if ( + typeof job.minAnnualSalary === "number" && + typeof job.maxAnnualSalary === "number" + ) { + bits.push(`USD ${job.minAnnualSalary}–${job.maxAnnualSalary} / yr`); + } else if (typeof job.minAnnualSalary === "number") { + bits.push(`USD ${job.minAnnualSalary}+ / yr`); + } + if ( + typeof job.minHourlyRate === "number" || + typeof job.maxHourlyRate === "number" + ) { + bits.push(`$${job.minHourlyRate ?? "?"}–${job.maxHourlyRate ?? "?"} / hr`); + } + return bits.length > 0 ? bits.join("; ") : undefined; +} + +function locationLine(job: ArcJobJson): string { + if ( + Array.isArray(job.requiredCountries) && + job.requiredCountries.length > 0 + ) { + return job.requiredCountries.join(", "); + } + if (job.timeZone) return job.timeZone; + return "Remote"; +} + +function postedIso(postedAt: number | undefined): string | undefined { + if (typeof postedAt !== "number" || !Number.isFinite(postedAt)) + return undefined; + return new Date(postedAt * 1000).toISOString(); +} + +function parseNextPageProps(html: string): { + arcJobs: ArcJobJson[]; + externalJobs: ArcJobJson[]; +} | null { + const match = html.match( + /