diff --git a/docs-site/docs/extractors/arcdev.md b/docs-site/docs/extractors/arcdev.md index 76f99a2..4e4251c 100644 --- a/docs-site/docs/extractors/arcdev.md +++ b/docs-site/docs/extractors/arcdev.md @@ -24,7 +24,7 @@ Curated remote hiring with explicit tooling-oriented feeds; many roles are open ## Common problems -- **HTML changes:** If Arc ships a new payload shape, parsing may need an update; smoke-test with `npx tsx scripts/smoke-extractors.ts arcdev`, or run the full extractor suite with `npx tsx scripts/smoke-extractors.ts`. +- **HTML changes:** If Arc ships a new payload shape, parsing may need an update; smoke-test with `npm run smoke:extractors -- arcdev`, or run the full suite with `npm run smoke:extractors`. - **`Arc talent network` employer:** Some Arc-managed rows omit a company name; the mapper uses that placeholder. ## Related pages diff --git a/docs-site/docs/features/blocked-countries.md b/docs-site/docs/features/blocked-countries.md new file mode 100644 index 0000000..8870252 --- /dev/null +++ b/docs-site/docs/features/blocked-countries.md @@ -0,0 +1,56 @@ +--- +id: blocked-countries +title: Blocked countries +description: Drop jobs during discovery when the listing location mentions a blocked country. +sidebar_position: 7 +--- + +## What it is + +**Blocked countries** is the **Blocked countries (location filter)** field in **Settings → Scoring Settings**. Any job whose **location** string mentions a supported country you listed (for example `India`, `UK`, `Poland`) is **dropped during discovery** and is never imported. + +Country tokens are normalized to canonical keys (for example `India` → `india`, `UK` → `united kingdom`). + +## Why it exists + +Global and remote boards often return roles tagged to countries you do not want (for example India-remote QA listings while you target Canada). This filter applies at import time so those rows never enter your **Discovered** queue. + +## How to use it + +1. 
Open **Settings** and expand **Scoring Settings**. +2. Find **Blocked countries (location filter)**. +3. Add country names or aliases one at a time, or paste a comma- or newline-separated list. +4. Click **Save Changes**. +5. Run the pipeline again — blocked countries apply to **new discovery only**; they do **not** delete jobs already in the database. + +### Tips + +- Use country names as they appear on listings: `India`, `Poland`, `United Kingdom`, or aliases `UK`, `USA`. +- Only supported countries are kept: a token that is not a recognized country name or alias (for example a misspelling) is discarded when the list is normalized, so it will not block anything. +- Listings with **no recognizable country** in the location field (for example `Remote` only) are **kept**, not blocked. +- The list is capped in Settings validation (max 50 entries, each up to 100 characters). +- Pair with **Search cities** / **country** settings to narrow what extractors query; blocked countries filter what still comes back from broad boards. + +### Maintenance + +- **Add** countries when you see unwanted geography on new runs. +- **Remove** a country if you blocked too aggressively — save, then run discovery again. +- **Existing jobs** are unchanged; use Jobs filters or **Danger Zone** only if you want to clear old rows. + +## Common problems + +### Blocked countries still appear + +- Confirm you clicked **Save Changes**. +- Only **new** pipeline runs apply the filter. Jobs already imported stay in **Discovered** until you skip or delete them. +- If the listing location does not mention the country (for example only `Remote`), the job is intentionally kept. + +### Too many jobs disappeared + +- Check whether a broad alias matched unexpectedly. Remove or narrow the token. 
+ +## Related pages + +- [Company skip list](./company-skip-list) +- [Settings](/docs/features/settings) +- [Pipeline Run](/docs/features/pipeline-run) +- [Orchestrator](/docs/features/orchestrator) diff --git a/docs-site/docs/features/company-skip-list.md b/docs-site/docs/features/company-skip-list.md index e041af1..246b27e 100644 --- a/docs-site/docs/features/company-skip-list.md +++ b/docs-site/docs/features/company-skip-list.md @@ -48,6 +48,7 @@ You may want to avoid certain agencies, staffing brands, or employers without ha ## Related pages +- [Blocked countries](./blocked-countries) - [Settings](/docs/features/settings) - [Pipeline Run](/docs/features/pipeline-run) - [Orchestrator](/docs/features/orchestrator) diff --git a/docs-site/docs/features/pipeline-run.md b/docs-site/docs/features/pipeline-run.md index cd8175b..5bf3799 100644 --- a/docs-site/docs/features/pipeline-run.md +++ b/docs-site/docs/features/pipeline-run.md @@ -105,7 +105,10 @@ When new listings are imported, JobOps does not create a second database row if Existing jobs keep their stored URL; new imports use the canonical form so the same role is not added again under a slightly different link. -To drop companies before import, configure a **company skip list** (blocked company keywords) in **Settings → Scoring Settings**. See [Company skip list](./company-skip-list). 
+To drop listings before import, use **Settings → Scoring Settings**: + +- [Company skip list](./company-skip-list) — blocked **employer** keywords +- [Blocked countries](./blocked-countries) — drop jobs whose **location** mentions a country you list (for example India) ## Common problems @@ -140,6 +143,7 @@ To drop companies before import, configure a **company skip list** (blocked comp ## Related pages - [Company skip list](./company-skip-list) +- [Blocked countries](./blocked-countries) - [Find Jobs and Apply Workflow](/docs/next/workflows/find-jobs-and-apply-workflow) - [Manual Import Extractor](/docs/next/extractors/manual) - [Orchestrator](/docs/next/features/orchestrator) diff --git a/docs-site/docs/features/settings.md b/docs-site/docs/features/settings.md index e03a844..14757dc 100644 --- a/docs-site/docs/features/settings.md +++ b/docs-site/docs/features/settings.md @@ -170,6 +170,7 @@ Readiness requires: - Set penalty amount - Optional auto-skip threshold for low-score jobs - **Company skip list** (blocked company keywords): drop listings during discovery when the employer name contains a token — see [Company skip list](./company-skip-list) +- **Blocked countries** (location filter): drop listings when the job location mentions a listed country — see [Blocked countries](./blocked-countries) - Add custom scoring instructions to tell the AI what to weigh more or less ### Danger Zone @@ -262,6 +263,7 @@ curl -X POST "http://localhost:3001/api/backups" ## Related pages - [Company skip list](./company-skip-list) +- [Blocked countries](./blocked-countries) - [Reactive Resume](/docs/next/features/reactive-resume) - [Database Backups](/docs/next/getting-started/database-backups) - [Overview](/docs/next/features/overview) diff --git a/docs-site/docs/workflows/add-an-extractor.md b/docs-site/docs/workflows/add-an-extractor.md index 774f501..959b53b 100644 --- a/docs-site/docs/workflows/add-an-extractor.md +++ b/docs-site/docs/workflows/add-an-extractor.md @@ 
-83,13 +83,13 @@ Subprocess extractors are supported. Keep subprocess spawning inside `run(contex After wiring settings/env, run: ```bash -npx tsx scripts/smoke-extractors.ts myextractor +npm run smoke:extractors -- myextractor ``` -Or the full suite (may take several minutes — JobSpy invokes Python, Hiring Cafe / startup.jobs may need browser deps): +Or the full suite (may take several minutes — JobSpy invokes Python; Gradcracker / Hiring Cafe need Camoufox: `npx camoufox-js fetch`): ```bash -npx tsx scripts/smoke-extractors.ts +npm run smoke:extractors ``` Keep `ALL_TARGETS` in that script aligned with manifests under each `extractors//` package (`manifest.ts` or `src/manifest.ts`). diff --git a/docs-site/sidebars.ts b/docs-site/sidebars.ts index 89df7fe..9fa64a2 100644 --- a/docs-site/sidebars.ts +++ b/docs-site/sidebars.ts @@ -33,6 +33,7 @@ const sidebars: SidebarsConfig = { "features/orchestrator", "features/settings", "features/company-skip-list", + "features/blocked-countries", "features/reactive-resume", "features/in-progress-board", "features/ghostwriter", diff --git a/extractors/adzuna/src/run.ts b/extractors/adzuna/src/run.ts index 6b117b4..c4eaf8c 100644 --- a/extractors/adzuna/src/run.ts +++ b/extractors/adzuna/src/run.ts @@ -4,6 +4,7 @@ import { createRequire } from "node:module"; import { dirname, join } from "node:path"; import { createInterface } from "node:readline"; import { fileURLToPath } from "node:url"; +import { envForExtractorSubprocess } from "@shared/extractor-subprocess-env.js"; import { normalizeCountryKey } from "@shared/location-support.js"; import { resolveSearchCities, @@ -210,7 +211,7 @@ export async function runAdzuna( location !== null && shouldApplyStrictCityFilter(location, countryKey); await new Promise((resolve, reject) => { - const extractorEnv = { + const extractorEnv = envForExtractorSubprocess({ ...process.env, JOBOPS_EMIT_PROGRESS: "1", ADZUNA_APP_ID: appId, @@ -220,7 +221,7 @@ export async function runAdzuna( 
ADZUNA_SEARCH_TERMS: JSON.stringify(searchTerms), ADZUNA_OUTPUT_JSON: DATASET_PATH, ADZUNA_LOCATION_QUERY: strictLocationFilter ? location : "", - }; + }); const child = useNpmCommand ? spawn("npm", ["run", "start"], { cwd: EXTRACTOR_DIR, diff --git a/extractors/gradcracker/src/run.ts b/extractors/gradcracker/src/run.ts index 5890bd7..9a794f9 100644 --- a/extractors/gradcracker/src/run.ts +++ b/extractors/gradcracker/src/run.ts @@ -3,6 +3,7 @@ import { mkdir, readdir, readFile, rm, writeFile } from "node:fs/promises"; import { dirname, join } from "node:path"; import { createInterface } from "node:readline"; import { fileURLToPath } from "node:url"; +import { envForExtractorSubprocess } from "@shared/extractor-subprocess-env.js"; type CreateJobInput = { source: "gradcracker"; @@ -75,7 +76,7 @@ export async function runCrawler( cwd: EXTRACTOR_DIR, shell: true, stdio: ["ignore", "pipe", "pipe"], - env: { + env: envForExtractorSubprocess({ ...process.env, JOBOPS_SKIP_APPLY_FOR_EXISTING: "1", JOBOPS_EMIT_PROGRESS: "1", @@ -88,7 +89,7 @@ export async function runCrawler( ...(existingJobUrlsFile ? 
{ JOBOPS_EXISTING_JOB_URLS_FILE: existingJobUrlsFile } : {}), - }, + }), }); const handleLine = (line: string, stream: NodeJS.WriteStream) => { diff --git a/extractors/hiringcafe/src/run.ts b/extractors/hiringcafe/src/run.ts index 04df97e..6e76201 100644 --- a/extractors/hiringcafe/src/run.ts +++ b/extractors/hiringcafe/src/run.ts @@ -4,6 +4,7 @@ import { createRequire } from "node:module"; import { dirname, join } from "node:path"; import { createInterface } from "node:readline"; import { fileURLToPath } from "node:url"; +import { envForExtractorSubprocess } from "@shared/extractor-subprocess-env.js"; import { normalizeCountryKey } from "@shared/location-support.js"; import { resolveSearchCities, @@ -212,7 +213,7 @@ export async function runHiringCafe( await clearStorageDataset(); await new Promise((resolve, reject) => { - const extractorEnv = { + const extractorEnv = envForExtractorSubprocess({ ...process.env, JOBOPS_EMIT_PROGRESS: "1", HIRING_CAFE_SEARCH_TERMS: JSON.stringify(searchTerms), @@ -226,7 +227,7 @@ export async function runHiringCafe( HIRING_CAFE_LOCATION_RADIUS_MILES: strictLocationFilter ? String(locationRadiusMiles) : "", - }; + }); const child = useNpmCommand ? spawn("npm", ["run", "start"], { diff --git a/extractors/ukvisajobs/src/run.ts b/extractors/ukvisajobs/src/run.ts index e4e2e40..b3c4569 100644 --- a/extractors/ukvisajobs/src/run.ts +++ b/extractors/ukvisajobs/src/run.ts @@ -22,6 +22,7 @@ type CreateJobInput = { jobLevel?: string; }; +import { envForExtractorSubprocess } from "@shared/extractor-subprocess-env.js"; import { toNumberOrNull, toStringOrNull, @@ -299,12 +300,12 @@ export async function runUkVisaJobs( const child = spawn("npx", ["tsx", "src/main.ts"], { cwd: EXTRACTOR_DIR, stdio: ["ignore", "pipe", "pipe"], - env: { + env: envForExtractorSubprocess({ ...process.env, JOBOPS_EMIT_PROGRESS: "1", UKVISAJOBS_MAX_JOBS: String(options.maxJobs ?? 
50), UKVISAJOBS_SEARCH_KEYWORD: term, - }, + }), }); const handleLine = (line: string, stream: NodeJS.WriteStream) => { diff --git a/orchestrator/src/client/pages/SettingsPage.tsx b/orchestrator/src/client/pages/SettingsPage.tsx index 87ca846..3ed0753 100644 --- a/orchestrator/src/client/pages/SettingsPage.tsx +++ b/orchestrator/src/client/pages/SettingsPage.tsx @@ -31,6 +31,7 @@ import { resumeProjectsEqual, } from "@client/pages/settings/utils"; import { zodResolver } from "@hookform/resolvers/zod"; +import { normalizeBlockedCountryTokens } from "@shared/blocked-countries.js"; import { normalizeStringArray } from "@shared/normalize-string-array.js"; import { type UpdateSettingsInput, @@ -102,6 +103,7 @@ const DEFAULT_FORM_VALUES: UpdateSettingsInput = { missingSalaryPenalty: null, autoSkipScoreThreshold: null, blockedCompanyKeywords: [], + blockedCountries: [], scoringInstructions: "", }; @@ -182,6 +184,7 @@ const NULL_SETTINGS_PAYLOAD: UpdateSettingsInput = { missingSalaryPenalty: null, autoSkipScoreThreshold: null, blockedCompanyKeywords: null, + blockedCountries: null, scoringInstructions: null, }; @@ -228,6 +231,7 @@ const mapSettingsToForm = (data: AppSettings): UpdateSettingsInput => ({ missingSalaryPenalty: data.missingSalaryPenalty.override, autoSkipScoreThreshold: data.autoSkipScoreThreshold.override, blockedCompanyKeywords: data.blockedCompanyKeywords.override ?? [], + blockedCountries: data.blockedCountries.override ?? [], scoringInstructions: data.scoringInstructions.override ?? "", }); @@ -398,6 +402,10 @@ const getDerivedSettings = (settings: AppSettings | null) => { effective: settings?.blockedCompanyKeywords?.value ?? [], default: settings?.blockedCompanyKeywords?.default ?? [], }, + blockedCountries: { + effective: settings?.blockedCountries?.value ?? [], + default: settings?.blockedCountries?.default ?? [], + }, scoringInstructions: { effective: settings?.scoringInstructions?.value ?? "", default: settings?.scoringInstructions?.default ?? 
"", @@ -928,6 +936,17 @@ export const SettingsPage: React.FC = () => { ? null : normalized; })(), + blockedCountries: (() => { + const normalized = normalizeBlockedCountryTokens( + normalizeStringArray(data.blockedCountries), + ); + const normalizedDefault = normalizeBlockedCountryTokens( + scoring.blockedCountries.default, + ); + return stringArraysEqual(normalized, normalizedDefault) + ? null + : normalized; + })(), scoringInstructions: nullIfSame( normalizeString(data.scoringInstructions), scoring.scoringInstructions.default, diff --git a/orchestrator/src/client/pages/settings/components/ScoringSettingsSection.tsx b/orchestrator/src/client/pages/settings/components/ScoringSettingsSection.tsx index 668ba3b..ae4baa4 100644 --- a/orchestrator/src/client/pages/settings/components/ScoringSettingsSection.tsx +++ b/orchestrator/src/client/pages/settings/components/ScoringSettingsSection.tsx @@ -1,6 +1,7 @@ import { TokenizedInput } from "@client/pages/orchestrator/TokenizedInput"; import { SettingsInput } from "@client/pages/settings/components/SettingsInput"; import type { ScoringValues } from "@client/pages/settings/types"; +import { formatCountryLabel } from "@shared/location-support"; import type { UpdateSettingsInput } from "@shared/settings-schema.js"; import type React from "react"; import { useState } from "react"; @@ -37,11 +38,13 @@ export const ScoringSettingsSection: React.FC = ({ missingSalaryPenalty, autoSkipScoreThreshold, blockedCompanyKeywords, + blockedCountries, scoringInstructions, } = values; const { control, watch, setValue } = useFormContext(); const [blockedCompanyKeywordDraft, setBlockedCompanyKeywordDraft] = useState(""); + const [blockedCountryDraft, setBlockedCountryDraft] = useState(""); // Watch the current form value to conditionally show/hide penalty input const currentPenalizeEnabled = @@ -51,6 +54,13 @@ export const ScoringSettingsSection: React.FC = ({ const currentAutoSkipThreshold = watch("autoSkipScoreThreshold"); const 
blockedCompanyKeywordValues = watch("blockedCompanyKeywords") ?? blockedCompanyKeywords.default; + const blockedCountryValues = + watch("blockedCountries") ?? blockedCountries.default; + + const formatCountryList = (keys: string[]) => + keys.length > 0 + ? keys.map((key) => formatCountryLabel(key)).join(", ") + : "None"; return ( @@ -243,6 +253,35 @@ export const ScoringSettingsSection: React.FC = ({ +
+ + + setValue("blockedCountries", value, { shouldDirty: true }) + } + placeholder='e.g. "India", "Poland"' + helperText="Country names or aliases (UK, USA). Jobs whose location mentions a listed country are dropped during discovery. Listings with no recognizable country in the location field are kept." + removeLabelPrefix="Remove blocked country" + disabled={isLoading || isSaving} + /> +
+ Effective: {formatCountryList(blockedCountryValues)} | Default:{" "} + {formatCountryList(blockedCountries.default)} +
+
+ + + {/* Effective/Default values display */}
diff --git a/orchestrator/src/client/pages/settings/types.ts b/orchestrator/src/client/pages/settings/types.ts index fa4bb17..33083d9 100644 --- a/orchestrator/src/client/pages/settings/types.ts +++ b/orchestrator/src/client/pages/settings/types.ts @@ -59,5 +59,6 @@ export type ScoringValues = { missingSalaryPenalty: EffectiveDefault; autoSkipScoreThreshold: EffectiveDefault; blockedCompanyKeywords: EffectiveDefault; + blockedCountries: EffectiveDefault; scoringInstructions: EffectiveDefault; }; diff --git a/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts b/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts index 26d7a7c..c12322b 100644 --- a/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts +++ b/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts @@ -333,6 +333,67 @@ describe("discoverJobsStep", () => { expect(result.discoveredJobs).toHaveLength(0); }); + it("drops discovered jobs when location is in a blocked country", async () => { + const settingsRepo = await import("@server/repositories/settings"); + const registryModule = await import("@server/extractors/registry"); + + const jobspyManifest = { + id: "jobspy", + displayName: "JobSpy", + providesSources: ["linkedin"], + run: vi.fn().mockResolvedValue({ + success: true, + jobs: [ + { + source: "linkedin", + title: "SDET", + employer: "Acme", + location: "Bangalore, India", + jobUrl: "https://example.com/job-in", + }, + { + source: "linkedin", + title: "SDET", + employer: "Contoso", + location: "Toronto, ON, Canada", + jobUrl: "https://example.com/job-ca", + }, + { + source: "linkedin", + title: "SDET", + employer: "Remote Co", + location: "Remote", + jobUrl: "https://example.com/job-remote", + }, + ], + }), + }; + + vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({ + searchTerms: JSON.stringify(["sdet"]), + blockedCountries: JSON.stringify(["india"]), + } as any); + + vi.mocked(registryModule.getExtractorRegistry).mockResolvedValue({ + manifests: new 
Map([["jobspy", jobspyManifest as any]]), + manifestBySource: new Map([["linkedin", jobspyManifest as any]]), + availableSources: ["linkedin"], + } as any); + + const result = await discoverJobsStep({ + mergedConfig: { + ...baseConfig, + sources: ["linkedin"], + }, + }); + + expect(result.discoveredJobs).toHaveLength(2); + expect(result.discoveredJobs.map((job) => job.jobUrl)).toEqual([ + "https://example.com/job-ca", + "https://example.com/job-remote", + ]); + }); + it("applies shared city filtering for sources without native city filtering", async () => { const settingsRepo = await import("@server/repositories/settings"); const registryModule = await import("@server/extractors/registry"); diff --git a/orchestrator/src/server/pipeline/steps/discover-jobs.ts b/orchestrator/src/server/pipeline/steps/discover-jobs.ts index 7a5ea02..10a55f0 100644 --- a/orchestrator/src/server/pipeline/steps/discover-jobs.ts +++ b/orchestrator/src/server/pipeline/steps/discover-jobs.ts @@ -6,6 +6,10 @@ import { getAllJobUrls } from "@server/repositories/jobs"; import { getProfileById } from "@server/repositories/profiles"; import * as settingsRepo from "@server/repositories/settings"; import { asyncPool } from "@server/utils/async-pool"; +import { + jobMatchesBlockedCountries, + resolveBlockedCountriesFromStoredString, +} from "@shared/blocked-countries.js"; import { formatCountryLabel, isSourceAllowedForCountry, @@ -520,19 +524,20 @@ export async function discoverJobsStep(args: { const blockedKeywordsLowerCase = blockedCompanyKeywords.map((value) => value.toLowerCase(), ); - const filteredDiscoveredJobs = cityFilteredJobs.filter( + const afterCompanyFilter = cityFilteredJobs.filter( (job) => !isBlockedEmployer(job.employer, blockedKeywordsLowerCase), ); - const droppedCount = cityFilteredJobs.length - filteredDiscoveredJobs.length; + const companyDroppedCount = + cityFilteredJobs.length - afterCompanyFilter.length; - if (droppedCount > 0) { + if (companyDroppedCount > 0) { const 
blockedCompanyKeywordsPreview = blockedCompanyKeywords.slice(0, 10); const blockedCompanyKeywordsTruncated = blockedCompanyKeywordsPreview.length < blockedCompanyKeywords.length; logger.info("Dropped discovered jobs matching blocked company keywords", { step: "discover-jobs", - droppedCount, + droppedCount: companyDroppedCount, blockedKeywordCount: blockedCompanyKeywords.length, blockedCompanyKeywordsPreview, blockedCompanyKeywordsTruncated, @@ -544,6 +549,34 @@ export async function discoverJobsStep(args: { }); } + const blockedCountryKeys = resolveBlockedCountriesFromStoredString( + settings.blockedCountries, + ); + const filteredDiscoveredJobs = afterCompanyFilter.filter( + (job) => !jobMatchesBlockedCountries(job.location, blockedCountryKeys), + ); + const countryDroppedCount = + afterCompanyFilter.length - filteredDiscoveredJobs.length; + + if (countryDroppedCount > 0) { + const blockedCountriesPreview = blockedCountryKeys.slice(0, 10); + const blockedCountriesTruncated = + blockedCountriesPreview.length < blockedCountryKeys.length; + + logger.info("Dropped discovered jobs in blocked countries", { + step: "discover-jobs", + droppedCount: countryDroppedCount, + blockedCountryCount: blockedCountryKeys.length, + blockedCountriesPreview, + blockedCountriesTruncated, + }); + + logger.debug("Full blocked countries used for filtering", { + step: "discover-jobs", + blockedCountryKeys, + }); + } + if (args.shouldCancel?.()) { return { discoveredJobs: filteredDiscoveredJobs, sourceErrors }; } diff --git a/package.json b/package.json index 98bf87e..cb8833d 100644 --- a/package.json +++ b/package.json @@ -21,7 +21,8 @@ "docs:serve": "npm --workspace docs-site run serve", "docs:version": "npm --workspace docs-site run docs:version", "release:set-version": "node ./scripts/set-orchestrator-version.mjs", - "knip": "knip" + "knip": "knip", + "smoke:extractors": "node scripts/run-smoke-extractors.mjs" }, "devDependencies": { "dotenv": "^17.2.3", diff --git 
import { spawnSync } from "node:child_process";
import path from "node:path";
import { fileURLToPath } from "node:url";

// Launcher behind `npm run smoke:extractors`.
// Runs scripts/smoke-extractors.ts through the repo-local tsx binary with
// scripts/tsconfig.smoke.json, forwarding any extra CLI arguments
// (e.g. `npm run smoke:extractors -- arcdev,icims`).
const scriptsDir = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.join(scriptsDir, "..");
const tsx = path.join(repoRoot, "node_modules", ".bin", "tsx");
const tsconfig = path.join(scriptsDir, "tsconfig.smoke.json");
const smokeScript = path.join(scriptsDir, "smoke-extractors.ts");

const result = spawnSync(
  tsx,
  ["--tsconfig", tsconfig, smokeScript, ...process.argv.slice(2)],
  { cwd: repoRoot, stdio: "inherit", env: process.env },
);

// spawnSync reports a launch failure (for example ENOENT when dependencies are
// not installed) via `error` and leaves `status` null. Surface it instead of
// exiting 1 with no output.
if (result.error) {
  console.error(`Failed to launch tsx at ${tsx}: ${result.error.message}`);
  console.error("Install dependencies at the repo root (npm install) and retry.");
  process.exit(1);
}

// A child killed by a signal also has a null status; say so before failing.
if (result.signal) {
  console.error(`smoke-extractors was terminated by signal ${result.signal}`);
  process.exit(1);
}

// Mirror the child's exit code so CI and npm see the real result.
process.exit(result.status ?? 1);
/**
 * Reports whether a Camoufox browser download appears to be present, by
 * looking for its `version.json` marker in the per-user cache directory.
 *
 * Used to skip Camoufox-backed smoke targets with a helpful message instead of
 * failing when the browser has not been fetched (`npx camoufox-js fetch`).
 *
 * @param home Home directory to probe. Defaults to the current user's home;
 *   overridable so the check can be exercised against a scratch directory.
 * @returns `true` when any known cache location contains `version.json`.
 */
function camoufoxInstalled(home: string = homedir()): boolean {
  const candidates = [
    // macOS cache location
    path.join(home, "Library", "Caches", "camoufox", "version.json"),
    // Linux cache location
    path.join(home, ".cache", "camoufox", "version.json"),
    // Windows cache location. Without it the Camoufox targets are always
    // skipped on Windows, even after a successful fetch.
    // NOTE(review): layout assumed to follow the platformdirs-style
    // <LocalAppData>/<app>/<app>/Cache convention — confirm against camoufox-js.
    path.join(
      home,
      "AppData",
      "Local",
      "camoufox",
      "camoufox",
      "Cache",
      "version.json",
    ),
  ];
  return candidates.some((filePath) => existsSync(filePath));
}
/**
 * ESM resolve hook used by the extractor smoke test.
 *
 * Specifiers beginning with `@shared/` are mapped onto files under
 * `shared/src`: a trailing `.js` is stripped, then `<subpath>.ts`,
 * `<subpath>.tsx` and `<subpath>/index.ts` are tried in that order. The first
 * one that exists is handed to the next resolver as a file URL. Every other
 * specifier — and any `@shared/` specifier with no matching file — is passed
 * through unchanged.
 */
export async function resolve(specifier, context, nextResolve) {
  const aliasPrefix = "@shared/";
  if (!specifier.startsWith(aliasPrefix)) {
    return nextResolve(specifier, context);
  }

  const subpath = specifier.slice(aliasPrefix.length).replace(/\.js$/i, "");
  const resolvedFile = [
    join(sharedSrc, `${subpath}.ts`),
    join(sharedSrc, `${subpath}.tsx`),
    join(sharedSrc, subpath, "index.ts"),
  ].find((filePath) => existsSync(filePath));

  // No source file matched: let the default resolution deal with the alias.
  if (resolvedFile === undefined) {
    return nextResolve(specifier, context);
  }
  return nextResolve(pathToFileURL(resolvedFile).href, context);
}
["ES2022"], + "types": ["node"], + "baseUrl": "..", + "paths": { + "@shared/*": ["shared/src/*"] + } + }, + "include": ["./smoke-extractors.ts", "../extractors/**/manifest.ts", "../extractors/**/src/**/*.ts"] +} diff --git a/shared/src/blocked-countries.test.ts b/shared/src/blocked-countries.test.ts new file mode 100644 index 0000000..876b372 --- /dev/null +++ b/shared/src/blocked-countries.test.ts @@ -0,0 +1,37 @@ +import { describe, expect, it } from "vitest"; +import { + jobMatchesBlockedCountries, + normalizeBlockedCountryTokens, + resolveBlockedCountriesFromStoredString, +} from "./blocked-countries.js"; + +describe("blocked-countries", () => { + it("normalizes country tokens to canonical keys", () => { + expect(normalizeBlockedCountryTokens(["India", "UK", "bogus"])).toEqual([ + "india", + "united kingdom", + ]); + }); + + it("parses JSON and legacy comma-separated storage", () => { + expect( + resolveBlockedCountriesFromStoredString(JSON.stringify(["India"])), + ).toEqual(["india"]); + expect(resolveBlockedCountriesFromStoredString("india, Poland")).toEqual([ + "india", + "poland", + ]); + }); + + it("matches jobs whose location includes a blocked country", () => { + const blocked = resolveBlockedCountriesFromStoredString('["india"]'); + expect( + jobMatchesBlockedCountries("Bangalore, Karnataka, India", blocked), + ).toBe(true); + expect(jobMatchesBlockedCountries("Toronto, ON, Canada", blocked)).toBe( + false, + ); + expect(jobMatchesBlockedCountries("Remote", blocked)).toBe(false); + expect(jobMatchesBlockedCountries(null, blocked)).toBe(false); + }); +}); diff --git a/shared/src/blocked-countries.ts b/shared/src/blocked-countries.ts new file mode 100644 index 0000000..cb498ec --- /dev/null +++ b/shared/src/blocked-countries.ts @@ -0,0 +1,56 @@ +import { + normalizeCountryKey, + SUPPORTED_COUNTRY_KEYS, +} from "./location-support.js"; +import { normalizeStringArray } from "./normalize-string-array.js"; +import { inferCountryKeysFromJobLocation } from 
// Lookup set of canonical country keys; tokens outside it are ignored.
const supportedCountryKeySet = new Set<string>(SUPPORTED_COUNTRY_KEYS);

/**
 * Turns the stored blocked-countries setting into canonical country keys.
 *
 * The value is normally a JSON string array; non-string array members are
 * ignored. Anything that is not a JSON array (legacy plain text, or JSON that
 * fails to parse) is split on commas and newlines instead. Either way the
 * tokens go through {@link normalizeBlockedCountryTokens}.
 *
 * @param raw Stored setting value; missing or blank yields an empty list.
 * @returns Canonical country keys (e.g. "India" → "india").
 */
export function resolveBlockedCountriesFromStoredString(
  raw: string | undefined,
): string[] {
  if (!raw?.trim()) {
    return [];
  }

  try {
    const parsed: unknown = JSON.parse(raw);
    if (Array.isArray(parsed)) {
      const stringItems = parsed.filter(
        (item): item is string => typeof item === "string",
      );
      return normalizeBlockedCountryTokens(stringItems);
    }
  } catch {
    // Not usable as JSON: fall through and read it as a plain token list.
  }

  const legacyTokens = raw
    .split(/[\n,]/g)
    .map((segment) => segment.trim())
    .filter(Boolean);
  return normalizeBlockedCountryTokens(legacyTokens);
}

/**
 * Maps free-form country tokens to canonical keys, keeping only supported
 * countries and removing duplicates (first occurrence wins the position).
 */
export function normalizeBlockedCountryTokens(tokens: string[]): string[] {
  const supportedKeys = normalizeStringArray(tokens)
    .map((token) => normalizeCountryKey(token))
    .filter((key) => supportedCountryKeySet.has(key));
  return Array.from(new Set<string>(supportedKeys));
}

/**
 * True when at least one country inferred from the job location is blocked.
 * An empty block list, or a location with no inferable country, never matches,
 * so such jobs are kept.
 */
export function jobMatchesBlockedCountries(
  location: string | null | undefined,
  blockedCountryKeys: readonly string[],
): boolean {
  if (blockedCountryKeys.length === 0) {
    return false;
  }
  const blockedLookup = new Set(blockedCountryKeys);
  return inferCountryKeysFromJobLocation(location).some((countryKey) =>
    blockedLookup.has(countryKey),
  );
}
/**
 * Builds the environment for a nested extractor subprocess.
 *
 * Returns a copy of `base` with the smoke-test tsconfig settings removed, so a
 * nested `tsx` run falls back to the extractor package's own tsconfig:
 * - `TSX_TSCONFIG` and `TSX_TSCONFIG_PATH` are deleted;
 * - every `NODE_OPTIONS` token mentioning `tsconfig.smoke` is dropped. When the
 *   dropped token is a bare value (`--flag value` form), the flag before it is
 *   dropped too, so no option is left without its argument;
 * - `NODE_OPTIONS` is removed entirely if nothing remains.
 *
 * `base` itself is never mutated. `NODE_OPTIONS` is left verbatim when it does
 * not mention `tsconfig.smoke`.
 *
 * @param base Environment to derive from (typically `process.env` plus overrides).
 * @returns A new environment object safe to pass to `spawn`.
 */
export function envForExtractorSubprocess(
  base: NodeJS.ProcessEnv,
): NodeJS.ProcessEnv {
  const marker = "tsconfig.smoke";
  const env = { ...base };
  delete env.TSX_TSCONFIG;
  delete env.TSX_TSCONFIG_PATH;

  const nodeOptions = env.NODE_OPTIONS;
  if (typeof nodeOptions !== "string" || !nodeOptions.includes(marker)) {
    return env;
  }

  const kept: string[] = [];
  for (const part of nodeOptions.split(/\s+/).filter(Boolean)) {
    if (!part.includes(marker)) {
      kept.push(part);
      continue;
    }
    // A dropped bare value belongs to the preceding `--flag`. Keeping that
    // flag alone would make the child Node process reject its options.
    const previous = kept[kept.length - 1];
    if (
      !part.startsWith("-") &&
      previous !== undefined &&
      previous.startsWith("-") &&
      !previous.includes("=")
    ) {
      kept.pop();
    }
  }

  if (kept.length > 0) {
    env.NODE_OPTIONS = kept.join(" ");
  } else {
    // Nothing left worth passing on; drop the variable rather than export "".
    delete env.NODE_OPTIONS;
  }
  return env;
}