From 4e1ea28301d170f0eda3b0eda57b8c383e1ad48f Mon Sep 17 00:00:00 2001
From: Shaheer Sarfaraz <53654735+DaKheera47@users.noreply.github.com>
Date: Tue, 10 Feb 2026 17:57:49 +0000
Subject: [PATCH] Enable Glassdoor as a JobSpy source (#126)
* feat(shared): add glassdoor to job source model
* feat(jobspy): support glassdoor site in scraper and discovery
* feat(pipeline): include glassdoor in source selection and API schema
* feat(ui): add glassdoor toggle to jobspy settings and run estimates
* test/docs: cover glassdoor jobspy integration end-to-end
* fix(jobspy): make glassdoor always-on without settings toggle
* fix(jobspy): fallback glassdoor when location is country-level
* refactor(jobspy): drop direct pandas usage in wrapper
* feat(pipeline): gate glassdoor by supported countries
* fix(jobspy): restore pandas output and keep glassdoor disable copy
* fix(jobspy): map country-level glassdoor searches to city fallbacks
* feat(ui): require glassdoor city for country-level runs
---
README.md | 2 +-
documentation/extractors/jobspy.md | 4 +-
extractors/jobspy/requirements.txt | 1 +
extractors/jobspy/scrape_jobs.py | 130 ++++++++++++++++--
.../src/client/pages/OrchestratorPage.tsx | 6 +-
.../src/client/pages/SettingsPage.tsx | 18 ++-
.../orchestrator/AutomaticRunTab.test.tsx | 65 +++++++++
.../pages/orchestrator/AutomaticRunTab.tsx | 92 ++++++++++---
.../pages/orchestrator/automatic-run.test.ts | 4 +-
.../pages/orchestrator/automatic-run.ts | 8 +-
.../client/pages/orchestrator/constants.ts | 1 +
.../src/client/pages/orchestrator/utils.ts | 20 ++-
.../components/JobspySection.test.tsx | 7 +-
.../settings/components/JobspySection.tsx | 12 +-
orchestrator/src/lib/utils.ts | 1 +
.../src/server/api/routes/pipeline.test.ts | 11 ++
.../src/server/api/routes/pipeline.ts | 4 +-
.../src/server/config/demo-defaults.data.ts | 3 +-
orchestrator/src/server/db/schema.ts | 9 +-
.../src/server/pipeline/orchestrator.ts | 1 +
.../pipeline/steps/discover-jobs.test.ts | 86 ++++++++++++
.../server/pipeline/steps/discover-jobs.ts | 8 +-
orchestrator/src/server/services/jobspy.ts | 7 +-
.../services/settings-conversion.test.ts | 20 +++
.../server/services/settings-conversion.ts | 28 +++-
shared/src/location-support.test.ts | 15 +-
shared/src/location-support.ts | 35 ++++-
shared/src/types.ts | 1 +
28 files changed, 530 insertions(+), 69 deletions(-)
diff --git a/README.md b/README.md
index 3209922..f3e57f3 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ AI-powered job discovery and application pipeline. Automatically finds jobs, sco
## Workflow
-1. **Search**: Scrapes Gradcracker, Indeed, LinkedIn, and UK Visa Sponsorship jobs.
+1. **Search**: Scrapes Gradcracker, Indeed, LinkedIn, Glassdoor, and UK Visa Sponsorship jobs.
2. **Score**: AI ranks jobs by suitability using the configured LLM provider (OpenRouter by default).
3. **Tailor**: Generates a custom resume summary for top-tier matches.
4. **Export**: Uses [RxResume v4](https://v4.rxresu.me) to create tailored PDFs.
diff --git a/documentation/extractors/jobspy.md b/documentation/extractors/jobspy.md
index 488a5dd..b54a07c 100644
--- a/documentation/extractors/jobspy.md
+++ b/documentation/extractors/jobspy.md
@@ -1,6 +1,6 @@
# JobSpy Extractor (How It Works)
-This is a simple walkthrough of the JobSpy extractor used for Indeed and LinkedIn.
+This is a simple walkthrough of the JobSpy extractor used for Indeed, LinkedIn, and Glassdoor.
## Big picture
@@ -34,7 +34,7 @@ The Node service (`orchestrator/src/server/services/jobspy.ts`) controls the run
The mapper normalizes fields like salary ranges, converts empty values to null, and keeps extra metadata (skills, company rating, remote flag, etc.) when available.
-If a row is missing a valid site (`indeed` or `linkedin`) or a job URL, it gets skipped.
+If a row is missing a valid site (`indeed`, `linkedin`, or `glassdoor`) or a job URL, it gets skipped.
## Notes
diff --git a/extractors/jobspy/requirements.txt b/extractors/jobspy/requirements.txt
index 8c5560a..45fa5b4 100644
--- a/extractors/jobspy/requirements.txt
+++ b/extractors/jobspy/requirements.txt
@@ -1 +1,2 @@
python-jobspy
+pandas
diff --git a/extractors/jobspy/scrape_jobs.py b/extractors/jobspy/scrape_jobs.py
index 81bfe47..2c85bf1 100644
--- a/extractors/jobspy/scrape_jobs.py
+++ b/extractors/jobspy/scrape_jobs.py
@@ -3,9 +3,41 @@ import json
import os
from pathlib import Path
+import pandas as pd
from jobspy import scrape_jobs
PROGRESS_PREFIX = "JOBOPS_PROGRESS "
+# Maps lowercase country spellings/abbreviations to the canonical lowercase
+# country name. Consulted by _normalize_country_token(); inputs without an
+# entry here are used in their normalized form unchanged.
+COUNTRY_ALIASES = {
+ "uk": "united kingdom",
+ "united kingdom": "united kingdom",
+ "us": "united states",
+ "usa": "united states",
+ "united states": "united states",
+ "türkiye": "turkey",
+ "czech republic": "czechia",
+}
+# City substituted as the Glassdoor search location when the requested
+# location is only a country (see the "glassdoor" branch in main()).
+# Keys are canonical lowercase country names as returned by
+# _normalize_country_token().
+# NOTE(review): the same countries are listed in GLASSDOOR_SUPPORTED_COUNTRIES
+# in shared/src/location-support.ts; the two lists look intended to stay in
+# sync - confirm when adding or removing a country.
+GLASSDOOR_COUNTRY_TO_CITY = {
+ "australia": "Sydney",
+ "austria": "Vienna",
+ "belgium": "Brussels",
+ "brazil": "Sao Paulo",
+ "canada": "Toronto",
+ "france": "Paris",
+ "germany": "Berlin",
+ "hong kong": "Hong Kong",
+ "india": "Bengaluru",
+ "ireland": "Dublin",
+ "italy": "Milan",
+ "mexico": "Mexico City",
+ "netherlands": "Amsterdam",
+ "new zealand": "Auckland",
+ "singapore": "Singapore",
+ "spain": "Madrid",
+ "switzerland": "Zurich",
+ "united kingdom": "London",
+ "united states": "New York",
+ "vietnam": "Ho Chi Minh City",
+}
def _env_str(name: str, default: str) -> str:
@@ -39,6 +71,47 @@ def _parse_sites(raw: str) -> list[str]:
return [s.strip() for s in raw.split(",") if s.strip()]
+def _normalize_country_token(value: str) -> str:
+ """Return a canonical lowercase country token for ``value``.
+
+ The input is trimmed, lowercased and has whitespace runs collapsed to
+ single spaces; the result is then mapped through COUNTRY_ALIASES, falling
+ back to the normalized text itself when no alias exists.
+ """
+ normalized = " ".join(value.strip().lower().split())
+ return COUNTRY_ALIASES.get(normalized, normalized)
+
+
+def _is_country_level_location(location: str, country_indeed: str) -> bool:
+ """Return True when ``location`` names the same country as ``country_indeed``.
+
+ Both values are compared after _normalize_country_token(). Returns False
+ when either value is empty or whitespace-only.
+ """
+ if not location.strip() or not country_indeed.strip():
+ return False
+ return _normalize_country_token(location) == _normalize_country_token(country_indeed)
+
+
+def _glassdoor_city_for_country(country_indeed: str, location: str) -> str | None:
+ """Look up the Glassdoor fallback city for a country.
+
+ ``country_indeed`` is used as the country when it is non-empty, otherwise
+ ``location`` is. Returns None when the normalized country has no entry in
+ GLASSDOOR_COUNTRY_TO_CITY.
+ """
+ country_key = _normalize_country_token(country_indeed or location)
+ return GLASSDOOR_COUNTRY_TO_CITY.get(country_key)
+
+
+def _scrape_for_sites(
+ *,
+ sites: list[str],
+ search_term: str,
+ location: str | None,
+ results_wanted: int,
+ hours_old: int,
+ country_indeed: str,
+ linkedin_fetch_description: bool,
+ is_remote: bool,
+) -> pd.DataFrame:
+ """Call jobspy's scrape_jobs() for ``sites`` and return its result.
+
+ All arguments are keyword-only and forwarded to scrape_jobs() (``sites``
+ is passed as ``site_name``). ``location`` is left out of the call entirely
+ when it is None, empty or whitespace-only.
+ """
+ kwargs: dict[str, object] = {
+ "site_name": sites,
+ "search_term": search_term,
+ "results_wanted": results_wanted,
+ "hours_old": hours_old,
+ "country_indeed": country_indeed,
+ "linkedin_fetch_description": linkedin_fetch_description,
+ "is_remote": is_remote,
+ }
+ # Only pass a location when there is one, instead of sending None/"".
+ if location and location.strip():
+ kwargs["location"] = location
+ return scrape_jobs(**kwargs)
+
+
def main() -> int:
sites = _parse_sites(_env_str("JOBSPY_SITES", "indeed,linkedin"))
search_term = _env_str("JOBSPY_SEARCH_TERM", "web developer")
@@ -68,16 +141,52 @@ def main() -> int:
"searchTerm": search_term,
},
)
- jobs = scrape_jobs(
- site_name=sites,
- search_term=search_term,
- location=location,
- results_wanted=results_wanted,
- hours_old=hours_old,
- country_indeed=country_indeed,
- linkedin_fetch_description=linkedin_fetch_description,
- is_remote=is_remote,
- )
+ frames: list[pd.DataFrame] = []
+ non_glassdoor_sites = [site for site in sites if site != "glassdoor"]
+
+ if non_glassdoor_sites:
+ frames.append(
+ _scrape_for_sites(
+ sites=non_glassdoor_sites,
+ search_term=search_term,
+ location=location,
+ results_wanted=results_wanted,
+ hours_old=hours_old,
+ country_indeed=country_indeed,
+ linkedin_fetch_description=linkedin_fetch_description,
+ is_remote=is_remote,
+ )
+ )
+
+ if "glassdoor" in sites:
+ glassdoor_location = location
+ if _is_country_level_location(location, country_indeed):
+ # Glassdoor works best with city-level location terms.
+ fallback_city = _glassdoor_city_for_country(country_indeed, location)
+ if fallback_city:
+ glassdoor_location = fallback_city
+ print(
+ "jobspy: Glassdoor location matched country; using city fallback "
+ f"({fallback_city})"
+ )
+ else:
+ print(
+ "jobspy: Glassdoor location matched country; keeping original location"
+ )
+ frames.append(
+ _scrape_for_sites(
+ sites=["glassdoor"],
+ search_term=search_term,
+ location=glassdoor_location,
+ results_wanted=results_wanted,
+ hours_old=hours_old,
+ country_indeed=country_indeed,
+ linkedin_fetch_description=linkedin_fetch_description,
+ is_remote=is_remote,
+ )
+ )
+
+ jobs = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(f"Found {len(jobs)} jobs")
_emit_progress(
@@ -96,7 +205,6 @@ def main() -> int:
escapechar="\\",
index=False,
)
-
jobs.to_json(output_json, orient="records", force_ascii=False)
print(f"Wrote CSV: {output_csv}")
diff --git a/orchestrator/src/client/pages/OrchestratorPage.tsx b/orchestrator/src/client/pages/OrchestratorPage.tsx
index c262866..507cb30 100644
--- a/orchestrator/src/client/pages/OrchestratorPage.tsx
+++ b/orchestrator/src/client/pages/OrchestratorPage.tsx
@@ -257,13 +257,17 @@ export const OrchestratorPage: React.FC = () => {
searchTerms: values.searchTerms,
sources: compatibleSources,
});
+ const jobspyLocation = compatibleSources.includes("glassdoor")
+ ? (values.glassdoorLocation ?? "").trim() ||
+ formatCountryLabel(values.country)
+ : formatCountryLabel(values.country);
await api.updateSettings({
searchTerms: values.searchTerms,
jobspyResultsWanted: limits.jobspyResultsWanted,
gradcrackerMaxJobsPerTerm: limits.gradcrackerMaxJobsPerTerm,
ukvisajobsMaxJobs: limits.ukvisajobsMaxJobs,
jobspyCountryIndeed: values.country,
- jobspyLocation: formatCountryLabel(values.country),
+ jobspyLocation,
});
await refreshSettings();
await startPipelineRun({
diff --git a/orchestrator/src/client/pages/SettingsPage.tsx b/orchestrator/src/client/pages/SettingsPage.tsx
index 3c2d9f0..e355f9d 100644
--- a/orchestrator/src/client/pages/SettingsPage.tsx
+++ b/orchestrator/src/client/pages/SettingsPage.tsx
@@ -236,6 +236,14 @@ const nullIfSameSortedList = (
defaultValue: string[],
) => (isSameSortedStringList(value, defaultValue) ? null : (value ?? null));
+/**
+ * Returns the given JobSpy sites with falsy entries and duplicates removed
+ * and "glassdoor" guaranteed to be present (appended when missing).
+ * A null or undefined input yields ["glassdoor"].
+ */
+const withAlwaysOnGlassdoor = (
+ sites: string[] | null | undefined,
+): string[] => {
+ const unique = new Set((sites ?? []).filter(Boolean));
+ unique.add("glassdoor");
+ return Array.from(unique);
+};
+
const getDerivedSettings = (settings: AppSettings | null) => {
const profileProjects = settings?.profileProjects ?? [];
@@ -289,8 +297,12 @@ const getDerivedSettings = (settings: AppSettings | null) => {
default: settings?.defaultJobspyCountryIndeed ?? "",
},
sites: {
- effective: settings?.jobspySites ?? ["indeed", "linkedin"],
- default: settings?.defaultJobspySites ?? ["indeed", "linkedin"],
+ effective: withAlwaysOnGlassdoor(
+ settings?.jobspySites ?? ["indeed", "linkedin", "glassdoor"],
+ ),
+ default: withAlwaysOnGlassdoor(
+ settings?.defaultJobspySites ?? ["indeed", "linkedin", "glassdoor"],
+ ),
},
linkedinFetchDescription: {
effective: settings?.jobspyLinkedinFetchDescription ?? true,
@@ -691,7 +703,7 @@ export const SettingsPage: React.FC = () => {
jobspy.countryIndeed.default,
),
jobspySites: nullIfSameSortedList(
- data.jobspySites,
+ withAlwaysOnGlassdoor(data.jobspySites),
jobspy.sites.default,
),
jobspyLinkedinFetchDescription: nullIfSame(
diff --git a/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.test.tsx b/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.test.tsx
index 8183fcc..c50141c 100644
--- a/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.test.tsx
+++ b/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.test.tsx
@@ -96,4 +96,69 @@ describe("AutomaticRunTab", () => {
),
).toBeInTheDocument();
});
+
+ it("disables glassdoor for unsupported countries with guidance copy", async () => {
+ const onSetPipelineSources = vi.fn();
+
+ render(
+ ,
+ );
+
+ await waitFor(() => {
+ expect(onSetPipelineSources).toHaveBeenCalledWith(["linkedin"]);
+ });
+
+ const glassdoorButton = screen.getByRole("button", { name: "Glassdoor" });
+ expect(glassdoorButton).toBeDisabled();
+ expect(glassdoorButton.getAttribute("title")).toContain(
+ "Glassdoor is not available for the selected country.",
+ );
+ });
+
+ it("disables glassdoor for supported countries until city is provided", async () => {
+ const onSetPipelineSources = vi.fn();
+
+ render(
+ ,
+ );
+
+ await waitFor(() => {
+ expect(onSetPipelineSources).toHaveBeenCalledWith(["linkedin"]);
+ });
+
+ const glassdoorButton = screen.getByRole("button", { name: "Glassdoor" });
+ expect(glassdoorButton).toBeDisabled();
+ expect(glassdoorButton.getAttribute("title")).toContain(
+ "Set a Glassdoor city in Advanced settings to enable Glassdoor.",
+ );
+ });
});
diff --git a/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.tsx b/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.tsx
index c4db0c3..351c95a 100644
--- a/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.tsx
+++ b/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.tsx
@@ -1,14 +1,13 @@
import * as PopoverPrimitive from "@radix-ui/react-popover";
import {
formatCountryLabel,
- getCompatibleSourcesForCountry,
isSourceAllowedForCountry,
normalizeCountryKey,
SUPPORTED_COUNTRY_KEYS,
} from "@shared/location-support.js";
import type { AppSettings, JobSource } from "@shared/types";
import { Check, ChevronsUpDown, Loader2, Sparkles, X } from "lucide-react";
-import { useEffect, useMemo, useState } from "react";
+import { useCallback, useEffect, useMemo, useState } from "react";
import { useForm } from "react-hook-form";
import {
Accordion,
@@ -71,12 +70,18 @@ interface AutomaticRunFormValues {
minSuitabilityScore: string;
runBudget: string;
country: string;
+ glassdoorLocation: string;
searchTerms: string[];
searchTermDraft: string;
}
type AutomaticPresetSelection = AutomaticPresetId | "custom";
+const GLASSDOOR_COUNTRY_REASON =
+ "Glassdoor is not available for the selected country.";
+const GLASSDOOR_LOCATION_REASON =
+ "Set a Glassdoor city in Advanced settings to enable Glassdoor.";
+
function toNumber(input: string, min: number, max: number, fallback: number) {
const parsed = Number.parseInt(input, 10);
if (Number.isNaN(parsed)) return fallback;
@@ -134,6 +139,7 @@ export const AutomaticRunTab: React.FC = ({
minSuitabilityScore: String(DEFAULT_VALUES.minSuitabilityScore),
runBudget: String(DEFAULT_VALUES.runBudget),
country: DEFAULT_VALUES.country,
+ glassdoorLocation: "",
searchTerms: DEFAULT_VALUES.searchTerms,
searchTermDraft: "",
},
@@ -144,6 +150,7 @@ export const AutomaticRunTab: React.FC = ({
const minScoreInput = watch("minSuitabilityScore");
const runBudgetInput = watch("runBudget");
const countryInput = watch("country");
+ const glassdoorLocationInput = watch("glassdoorLocation");
const searchTerms = watch("searchTerms");
const searchTermDraft = watch("searchTermDraft");
@@ -164,12 +171,24 @@ export const AutomaticRunTab: React.FC = ({
settings?.jobspyLocation ??
DEFAULT_VALUES.country,
);
+ const rememberedCountryKey = rememberedCountry || DEFAULT_VALUES.country;
+ const rememberedLocationRaw = settings?.jobspyLocation?.trim() ?? "";
+ const rememberedLocationNormalized = normalizeCountryKey(
+ rememberedLocationRaw,
+ );
+ const rememberedGlassdoorLocation =
+ rememberedLocationRaw &&
+ rememberedLocationNormalized &&
+ rememberedLocationNormalized !== normalizeCountryKey(rememberedCountryKey)
+ ? rememberedLocationRaw
+ : "";
reset({
topN: String(topN),
minSuitabilityScore: String(minSuitabilityScore),
runBudget: String(rememberedRunBudget),
country: rememberedCountry || DEFAULT_VALUES.country,
+ glassdoorLocation: rememberedGlassdoorLocation,
searchTerms: settings?.searchTerms ?? DEFAULT_VALUES.searchTerms,
searchTermDraft: "",
});
@@ -200,27 +219,40 @@ export const AutomaticRunTab: React.FC = ({
),
runBudget: toNumber(runBudgetInput, 1, 1000, DEFAULT_VALUES.runBudget),
country: normalizedCountry || DEFAULT_VALUES.country,
+ glassdoorLocation: glassdoorLocationInput.trim() || undefined,
searchTerms,
};
- }, [topNInput, minScoreInput, runBudgetInput, countryInput, searchTerms]);
+ }, [
+ topNInput,
+ minScoreInput,
+ runBudgetInput,
+ countryInput,
+ glassdoorLocationInput,
+ searchTerms,
+ ]);
+
+ const isSourceAvailableForRun = useCallback(
+ (source: JobSource) => {
+ if (!isSourceAllowedForCountry(source, values.country)) return false;
+ if (source === "glassdoor" && !values.glassdoorLocation) return false;
+ return true;
+ },
+ [values.country, values.glassdoorLocation],
+ );
const compatibleEnabledSources = useMemo(
- () =>
- enabledSources.filter((source) =>
- isSourceAllowedForCountry(source, values.country),
- ),
- [enabledSources, values.country],
+ () => enabledSources.filter((source) => isSourceAvailableForRun(source)),
+ [enabledSources, isSourceAvailableForRun],
);
const compatiblePipelineSources = useMemo(
- () => getCompatibleSourcesForCountry(pipelineSources, values.country),
- [pipelineSources, values.country],
+ () => pipelineSources.filter((source) => isSourceAvailableForRun(source)),
+ [pipelineSources, isSourceAvailableForRun],
);
useEffect(() => {
- const filtered = getCompatibleSourcesForCountry(
- pipelineSources,
- values.country,
+ const filtered = pipelineSources.filter((source) =>
+ isSourceAvailableForRun(source),
);
if (filtered.length === pipelineSources.length) return;
if (filtered.length > 0) {
@@ -232,9 +264,9 @@ export const AutomaticRunTab: React.FC = ({
}
}, [
compatibleEnabledSources,
+ isSourceAvailableForRun,
onSetPipelineSources,
pipelineSources,
- values.country,
]);
const estimate = useMemo(
@@ -441,6 +473,23 @@ export const AutomaticRunTab: React.FC = ({
}
/>
+
+
+
+ setValue("glassdoorLocation", event.target.value, {
+ shouldDirty: true,
+ })
+ }
+ placeholder='e.g. "London"'
+ />
+
+ Required only for Glassdoor. Use a city (not country) to
+ keep results localized.
+
+
@@ -526,12 +575,18 @@ export const AutomaticRunTab: React.FC = ({
{enabledSources.map((source) => {
- const allowed = isSourceAllowedForCountry(
+ const countryAllowed = isSourceAllowedForCountry(
source,
values.country,
);
+ const allowed = isSourceAvailableForRun(source);
const selected = compatiblePipelineSources.includes(source);
- const disabledReason = `${sourceLabel[source]} is available only when country is United Kingdom.`;
+ const disabledReason =
+ source === "glassdoor"
+ ? countryAllowed
+ ? GLASSDOOR_LOCATION_REASON
+ : GLASSDOOR_COUNTRY_REASON
+ : `${sourceLabel[source]} is available only when country is United Kingdom.`;
const button = (
)}
- Select which sites JobSpy should scrape.
+ Select configurable sites JobSpy should scrape.
- Effective: {(sites.effective || []).join(", ") || "None"}
+ Effective: {configurableEffectiveSites.join(", ") || "None"}
- Default: {(sites.default || []).join(", ")}
+ Default: {configurableDefaultSites.join(", ")}
diff --git a/orchestrator/src/lib/utils.ts b/orchestrator/src/lib/utils.ts
index f0f0119..cda1b95 100644
--- a/orchestrator/src/lib/utils.ts
+++ b/orchestrator/src/lib/utils.ts
@@ -138,6 +138,7 @@ export const sourceLabel: Record = {
gradcracker: "Gradcracker",
indeed: "Indeed",
linkedin: "LinkedIn",
+ glassdoor: "Glassdoor",
ukvisajobs: "UK Visa Jobs",
manual: "Manual",
};
diff --git a/orchestrator/src/server/api/routes/pipeline.test.ts b/orchestrator/src/server/api/routes/pipeline.test.ts
index c485954..da98bc4 100644
--- a/orchestrator/src/server/api/routes/pipeline.test.ts
+++ b/orchestrator/src/server/api/routes/pipeline.test.ts
@@ -44,6 +44,17 @@ describe.sequential("Pipeline API routes", () => {
topN: 5,
sources: ["gradcracker"],
});
+
+ const glassdoorRunRes = await fetch(`${baseUrl}/api/pipeline/run`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ sources: ["glassdoor"] }),
+ });
+ const glassdoorRunBody = await glassdoorRunRes.json();
+ expect(glassdoorRunBody.ok).toBe(true);
+ expect(runPipeline).toHaveBeenNthCalledWith(2, {
+ sources: ["glassdoor"],
+ });
});
it("returns conflict when cancelling with no active pipeline", async () => {
diff --git a/orchestrator/src/server/api/routes/pipeline.ts b/orchestrator/src/server/api/routes/pipeline.ts
index 7b873a3..22fa85b 100644
--- a/orchestrator/src/server/api/routes/pipeline.ts
+++ b/orchestrator/src/server/api/routes/pipeline.ts
@@ -98,7 +98,9 @@ const runPipelineSchema = z.object({
topN: z.number().min(1).max(50).optional(),
minSuitabilityScore: z.number().min(0).max(100).optional(),
sources: z
- .array(z.enum(["gradcracker", "indeed", "linkedin", "ukvisajobs"]))
+ .array(
+ z.enum(["gradcracker", "indeed", "linkedin", "glassdoor", "ukvisajobs"]),
+ )
.min(1)
.optional(),
});
diff --git a/orchestrator/src/server/config/demo-defaults.data.ts b/orchestrator/src/server/config/demo-defaults.data.ts
index 241920d..d4db8b4 100644
--- a/orchestrator/src/server/config/demo-defaults.data.ts
+++ b/orchestrator/src/server/config/demo-defaults.data.ts
@@ -28,7 +28,7 @@ export const DEMO_DEFAULT_SETTINGS: DemoDefaultSettings = {
jobspyResultsWanted: "25",
jobspyHoursOld: "72",
jobspyCountryIndeed: "US",
- jobspySites: JSON.stringify(["linkedin", "indeed"]),
+ jobspySites: JSON.stringify(["linkedin", "indeed", "glassdoor"]),
jobspyLinkedinFetchDescription: "1",
jobspyIsRemote: "0",
resumeProjects: JSON.stringify({
@@ -253,6 +253,7 @@ export const COMPANY_SUFFIXES = [
export const DEMO_SOURCE_BASE_URLS: Record = {
linkedin: "https://www.linkedin.com",
indeed: "https://www.indeed.com",
+ glassdoor: "https://www.glassdoor.com",
gradcracker: "https://www.gradcracker.com",
ukvisajobs: "https://www.ukvisajobs.com",
manual: "https://example.com",
diff --git a/orchestrator/src/server/db/schema.ts b/orchestrator/src/server/db/schema.ts
index 6a41bfa..cf44920 100644
--- a/orchestrator/src/server/db/schema.ts
+++ b/orchestrator/src/server/db/schema.ts
@@ -17,7 +17,14 @@ export const jobs = sqliteTable("jobs", {
// From crawler
source: text("source", {
- enum: ["gradcracker", "indeed", "linkedin", "ukvisajobs", "manual"],
+ enum: [
+ "gradcracker",
+ "indeed",
+ "linkedin",
+ "glassdoor",
+ "ukvisajobs",
+ "manual",
+ ],
})
.notNull()
.default("gradcracker"),
diff --git a/orchestrator/src/server/pipeline/orchestrator.ts b/orchestrator/src/server/pipeline/orchestrator.ts
index 4dc4132..eac9d8f 100644
--- a/orchestrator/src/server/pipeline/orchestrator.ts
+++ b/orchestrator/src/server/pipeline/orchestrator.ts
@@ -37,6 +37,7 @@ import {
const DEFAULT_CONFIG: PipelineConfig = {
topN: 10,
minSuitabilityScore: 50,
+ // Glassdoor is opt-in per run via the source picker; keep it out of the default sources.
sources: ["gradcracker", "indeed", "linkedin", "ukvisajobs"],
outputDir: join(getDataDir(), "pdfs"),
enableCrawling: true,
diff --git a/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts b/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts
index 20fbb82..9b15a21 100644
--- a/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts
+++ b/orchestrator/src/server/pipeline/steps/discover-jobs.test.ts
@@ -76,6 +76,92 @@ describe("discoverJobsStep", () => {
);
});
+ // Glassdoor is both the selected pipeline source and listed in the
+ // jobspySites setting, so it must reach runJobSpy as a site.
+ it("passes glassdoor through to JobSpy when selected", async () => {
+ const settingsRepo = await import("../../repositories/settings");
+ const jobSpy = await import("../../services/jobspy");
+
+ vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
+ searchTerms: JSON.stringify(["engineer"]),
+ jobspySites: JSON.stringify(["glassdoor"]),
+ } as any);
+
+ vi.mocked(jobSpy.runJobSpy).mockResolvedValue({
+ success: true,
+ jobs: [
+ {
+ source: "glassdoor",
+ title: "Engineer",
+ employer: "ACME",
+ jobUrl: "https://example.com/job",
+ },
+ ],
+ } as any);
+
+ const result = await discoverJobsStep({
+ mergedConfig: {
+ ...config,
+ sources: ["glassdoor"],
+ },
+ });
+
+ expect(result.discoveredJobs).toHaveLength(1);
+ expect(vi.mocked(jobSpy.runJobSpy)).toHaveBeenCalledWith(
+ expect.objectContaining({ sites: ["glassdoor"] }),
+ );
+ });
+
+ // The jobspySites override lists only linkedin, but discoverJobsStep exempts
+ // glassdoor from that allow-list filter, so both selected sites are passed.
+ it("keeps glassdoor enabled even when jobspySites override omits it", async () => {
+ const settingsRepo = await import("../../repositories/settings");
+ const jobSpy = await import("../../services/jobspy");
+
+ vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
+ searchTerms: JSON.stringify(["engineer"]),
+ jobspySites: JSON.stringify(["linkedin"]),
+ } as any);
+
+ vi.mocked(jobSpy.runJobSpy).mockResolvedValue({
+ success: true,
+ jobs: [],
+ } as any);
+
+ await discoverJobsStep({
+ mergedConfig: {
+ ...config,
+ sources: ["glassdoor", "linkedin"],
+ },
+ });
+
+ expect(vi.mocked(jobSpy.runJobSpy)).toHaveBeenCalledWith(
+ expect.objectContaining({ sites: ["glassdoor", "linkedin"] }),
+ );
+ });
+
+ // With jobspyCountryIndeed set to "japan", glassdoor is expected to be
+ // dropped from the sites handed to runJobSpy while linkedin is kept.
+ it("filters out glassdoor for unsupported countries", async () => {
+ const settingsRepo = await import("../../repositories/settings");
+ const jobSpy = await import("../../services/jobspy");
+
+ vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
+ searchTerms: JSON.stringify(["engineer"]),
+ jobspyCountryIndeed: "japan",
+ } as any);
+
+ vi.mocked(jobSpy.runJobSpy).mockResolvedValue({
+ success: true,
+ jobs: [],
+ } as any);
+
+ await discoverJobsStep({
+ mergedConfig: {
+ ...config,
+ sources: ["glassdoor", "linkedin"],
+ },
+ });
+
+ expect(vi.mocked(jobSpy.runJobSpy)).toHaveBeenCalledWith(
+ expect.objectContaining({ sites: ["linkedin"] }),
+ );
+ });
+
it("throws when all enabled sources fail", async () => {
const settingsRepo = await import("../../repositories/settings");
const ukVisa = await import("../../services/ukvisajobs");
diff --git a/orchestrator/src/server/pipeline/steps/discover-jobs.ts b/orchestrator/src/server/pipeline/steps/discover-jobs.ts
index f2f7183..7a32d79 100644
--- a/orchestrator/src/server/pipeline/steps/discover-jobs.ts
+++ b/orchestrator/src/server/pipeline/steps/discover-jobs.ts
@@ -67,8 +67,8 @@ export async function discoverJobsStep(args: {
}
let jobSpySites = compatibleSources.filter(
- (source): source is "indeed" | "linkedin" =>
- source === "indeed" || source === "linkedin",
+ (source): source is "indeed" | "linkedin" | "glassdoor" =>
+ source === "indeed" || source === "linkedin" || source === "glassdoor",
);
const jobspySitesSettingRaw = settings.jobspySites;
@@ -76,7 +76,9 @@ export async function discoverJobsStep(args: {
try {
const allowed = JSON.parse(jobspySitesSettingRaw);
if (Array.isArray(allowed)) {
- jobSpySites = jobSpySites.filter((site) => allowed.includes(site));
+ jobSpySites = jobSpySites.filter(
+ (site) => site === "glassdoor" || allowed.includes(site),
+ );
}
} catch {
// ignore JSON parse error
diff --git a/orchestrator/src/server/services/jobspy.ts b/orchestrator/src/server/services/jobspy.ts
index f72370d..8b7a415 100644
--- a/orchestrator/src/server/services/jobspy.ts
+++ b/orchestrator/src/server/services/jobspy.ts
@@ -105,6 +105,7 @@ function toJobSource(site: unknown): JobSource | null {
if (raw === "gradcracker") return "gradcracker";
if (raw === "indeed") return "indeed";
if (raw === "linkedin") return "linkedin";
+ if (raw === "glassdoor") return "glassdoor";
return null;
}
@@ -164,8 +165,8 @@ export async function runJobSpy(
const outputDir = join(dataDir, "imports");
await mkdir(outputDir, { recursive: true });
- const sites = (options.sites ?? ["indeed", "linkedin"])
- .filter((s) => s === "indeed" || s === "linkedin")
+ const sites = (options.sites ?? ["indeed", "linkedin", "glassdoor"])
+ .filter((s) => s === "indeed" || s === "linkedin" || s === "glassdoor")
.join(",");
const searchTerms = resolveSearchTerms(options);
@@ -191,7 +192,7 @@ export async function runJobSpy(
stdio: ["ignore", "pipe", "pipe"],
env: {
...process.env,
- JOBSPY_SITES: sites || "indeed,linkedin",
+ JOBSPY_SITES: sites || "indeed,linkedin,glassdoor",
JOBSPY_SEARCH_TERM: searchTerm,
JOBSPY_TERM_INDEX: String(i + 1),
JOBSPY_TERM_TOTAL: String(searchTerms.length),
diff --git a/orchestrator/src/server/services/settings-conversion.test.ts b/orchestrator/src/server/services/settings-conversion.test.ts
index ba661c7..44daf96 100644
--- a/orchestrator/src/server/services/settings-conversion.test.ts
+++ b/orchestrator/src/server/services/settings-conversion.test.ts
@@ -79,6 +79,26 @@ describe("settings-conversion", () => {
expect(malformedOverride.value).toEqual(["web developer"]);
});
+ it("always includes glassdoor in resolved jobspySites", () => {
+   // This test rewrites process.env.JOBSPY_SITES; remember the incoming value
+   // and restore it in `finally` so the change cannot leak into other tests,
+   // even when an assertion below throws.
+   const originalJobspySites = process.env.JOBSPY_SITES;
+   try {
+     // No env value: the built-in default list is used.
+     delete process.env.JOBSPY_SITES;
+     expect(resolveSettingValue("jobspySites", undefined).value).toEqual([
+       "indeed",
+       "linkedin",
+       "glassdoor",
+     ]);
+
+     // Env value without glassdoor: glassdoor is appended to the default.
+     process.env.JOBSPY_SITES = "indeed,linkedin";
+     expect(resolveSettingValue("jobspySites", undefined).value).toEqual([
+       "indeed",
+       "linkedin",
+       "glassdoor",
+     ]);
+
+     // Stored override without glassdoor: glassdoor is appended to the override.
+     expect(
+       resolveSettingValue("jobspySites", JSON.stringify(["linkedin"])).value,
+     ).toEqual(["linkedin", "glassdoor"]);
+   } finally {
+     if (originalJobspySites === undefined) {
+       delete process.env.JOBSPY_SITES;
+     } else {
+       process.env.JOBSPY_SITES = originalJobspySites;
+     }
+   }
+ });
+
it("round-trips penalizeMissingSalary boolean setting", () => {
expect(serializeSettingValue("penalizeMissingSalary", true)).toBe("1");
expect(serializeSettingValue("penalizeMissingSalary", false)).toBe("0");
diff --git a/orchestrator/src/server/services/settings-conversion.ts b/orchestrator/src/server/services/settings-conversion.ts
index ebdbb3e..d086773 100644
--- a/orchestrator/src/server/services/settings-conversion.ts
+++ b/orchestrator/src/server/services/settings-conversion.ts
@@ -57,6 +57,24 @@ function parseJsonArrayOrNull(raw: string | undefined): string[] | null {
}
}
+/**
+ * Normalizes a JobSpy site list: trims each entry, drops empty and duplicate
+ * entries while keeping first-seen order, and appends "glassdoor" when it is
+ * not already present.
+ */
+function normalizeJobspySites(value: string[]): string[] {
+ const seen = new Set();
+ const normalized: string[] = [];
+
+ for (const site of value) {
+ const trimmed = site.trim();
+ if (!trimmed || seen.has(trimmed)) continue;
+ seen.add(trimmed);
+ normalized.push(trimmed);
+ }
+
+ // Glassdoor is always part of the resolved list, even if the input omits it.
+ if (!seen.has("glassdoor")) {
+ normalized.push("glassdoor");
+ }
+
+ return normalized;
+}
+
function parseBitBoolOrNull(raw: string | undefined): boolean | null {
if (!raw) return null;
return raw === "true" || raw === "1";
@@ -143,13 +161,13 @@ export const settingsConversionMetadata: SettingsConversionMetadata = {
},
jobspySites: {
defaultValue: () =>
- (process.env.JOBSPY_SITES || "indeed,linkedin")
- .split(",")
- .map((value) => value.trim())
- .filter(Boolean),
+ normalizeJobspySites(
+ (process.env.JOBSPY_SITES || "indeed,linkedin,glassdoor").split(","),
+ ),
parseOverride: parseJsonArrayOrNull,
serialize: serializeNullableJsonArray,
- resolve: resolveWithNullishFallback,
+ resolve: ({ defaultValue, overrideValue }) =>
+ normalizeJobspySites(overrideValue ?? defaultValue),
},
jobspyLinkedinFetchDescription: {
defaultValue: () =>
diff --git a/shared/src/location-support.test.ts b/shared/src/location-support.test.ts
index 1b1156a..281f90c 100644
--- a/shared/src/location-support.test.ts
+++ b/shared/src/location-support.test.ts
@@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest";
import {
formatCountryLabel,
getCompatibleSourcesForCountry,
+ isGlassdoorCountry,
isSourceAllowedForCountry,
isUkCountry,
normalizeCountryKey,
@@ -49,14 +50,24 @@ describe("location-support", () => {
expect(isSourceAllowedForCountry("ukvisajobs", "worldwide")).toBe(false);
expect(isSourceAllowedForCountry("indeed", "united states")).toBe(true);
expect(isSourceAllowedForCountry("linkedin", "worldwide")).toBe(true);
+ expect(isSourceAllowedForCountry("glassdoor", "united states")).toBe(true);
+ expect(isSourceAllowedForCountry("glassdoor", "japan")).toBe(false);
});
it("filters incompatible sources while preserving compatible order", () => {
expect(
getCompatibleSourcesForCountry(
- ["gradcracker", "indeed", "ukvisajobs", "linkedin"],
+ ["gradcracker", "indeed", "glassdoor", "ukvisajobs", "linkedin"],
"united states",
),
- ).toEqual(["indeed", "linkedin"]);
+ ).toEqual(["indeed", "glassdoor", "linkedin"]);
+ });
+
+ it("supports glassdoor only in explicitly supported countries", () => {
+ expect(isGlassdoorCountry("united kingdom")).toBe(true);
+ expect(isGlassdoorCountry("uk")).toBe(true);
+ expect(isGlassdoorCountry("usa")).toBe(true);
+ expect(isGlassdoorCountry("japan")).toBe(false);
+ expect(isGlassdoorCountry("worldwide")).toBe(false);
});
});
diff --git a/shared/src/location-support.ts b/shared/src/location-support.ts
index fd91baf..eeef084 100644
--- a/shared/src/location-support.ts
+++ b/shared/src/location-support.ts
@@ -100,6 +100,30 @@ export const SUPPORTED_COUNTRY_INPUTS = [
] as const;
const UK_ONLY_SOURCES = new Set(["gradcracker", "ukvisajobs"]);
+// Countries for which the Glassdoor source is allowed, stored as normalized
+// country keys; membership is checked by isGlassdoorCountry().
+// NOTE(review): built at module load by calling normalizeCountryKey(), which
+// is declared further down as a function declaration - confirm nothing it
+// reads is initialized after this constant.
+// NOTE(review): the same countries appear in GLASSDOOR_COUNTRY_TO_CITY in
+// extractors/jobspy/scrape_jobs.py; the two lists look intended to stay in sync.
+const GLASSDOOR_SUPPORTED_COUNTRIES = new Set(
+ [
+ "australia",
+ "austria",
+ "belgium",
+ "brazil",
+ "canada",
+ "france",
+ "germany",
+ "hong kong",
+ "india",
+ "ireland",
+ "italy",
+ "mexico",
+ "netherlands",
+ "new zealand",
+ "singapore",
+ "spain",
+ "switzerland",
+ "united kingdom",
+ "united states",
+ "vietnam",
+ ].map((country) => normalizeCountryKey(country)),
+);
export function normalizeCountryKey(value: string | null | undefined): string {
const normalized = value?.trim().toLowerCase() ?? "";
@@ -125,12 +149,19 @@ export function isUkCountry(country: string | null | undefined): boolean {
return normalizeCountryKey(country) === "united kingdom";
}
+/**
+ * Returns true when the country, after normalizeCountryKey(), is one of the
+ * entries in GLASSDOOR_SUPPORTED_COUNTRIES.
+ */
+export function isGlassdoorCountry(
+ country: string | null | undefined,
+): boolean {
+ return GLASSDOOR_SUPPORTED_COUNTRIES.has(normalizeCountryKey(country));
+}
+
+/**
+ * Returns whether a job source may be used for the given country:
+ * sources in UK_ONLY_SOURCES require a UK country, "glassdoor" requires a
+ * country accepted by isGlassdoorCountry(), and every other source is
+ * allowed for any country.
+ */
export function isSourceAllowedForCountry(
source: JobSource,
country: string | null | undefined,
): boolean {
- if (!UK_ONLY_SOURCES.has(source)) return true;
- return isUkCountry(country);
+ if (UK_ONLY_SOURCES.has(source)) return isUkCountry(country);
+ if (source === "glassdoor") return isGlassdoorCountry(country);
+ return true;
}
export function getCompatibleSourcesForCountry(
diff --git a/shared/src/types.ts b/shared/src/types.ts
index 857bc8b..8a8dce1 100644
--- a/shared/src/types.ts
+++ b/shared/src/types.ts
@@ -122,6 +122,7 @@ export type JobSource =
| "gradcracker"
| "indeed"
| "linkedin"
+ | "glassdoor"
| "ukvisajobs"
| "manual";