Enable Glassdoor as a JobSpy source (#126)

* feat(shared): add glassdoor to job source model

* feat(jobspy): support glassdoor site in scraper and discovery

* feat(pipeline): include glassdoor in source selection and API schema

* feat(ui): add glassdoor toggle to jobspy settings and run estimates

* test/docs: cover glassdoor jobspy integration end-to-end

* fix(jobspy): make glassdoor always-on without settings toggle

* fix(jobspy): fallback glassdoor when location is country-level

* refactor(jobspy): drop direct pandas usage in wrapper

* feat(pipeline): gate glassdoor by supported countries

* fix(jobspy): restore pandas output and keep glassdoor disable copy

* fix(jobspy): map country-level glassdoor searches to city fallbacks

* feat(ui): require glassdoor city for country-level runs
This commit is contained in:
Shaheer Sarfaraz 2026-02-10 17:57:49 +00:00 committed by GitHub
parent 2c8de6c92e
commit 4e1ea28301
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
28 changed files with 530 additions and 69 deletions

View File

@ -4,7 +4,7 @@ AI-powered job discovery and application pipeline. Automatically finds jobs, sco
## Workflow ## Workflow
1. **Search**: Scrapes Gradcracker, Indeed, LinkedIn, and UK Visa Sponsorship jobs. 1. **Search**: Scrapes Gradcracker, Indeed, LinkedIn, Glassdoor, and UK Visa Sponsorship jobs.
2. **Score**: AI ranks jobs by suitability using the configured LLM provider (OpenRouter by default). 2. **Score**: AI ranks jobs by suitability using the configured LLM provider (OpenRouter by default).
3. **Tailor**: Generates a custom resume summary for top-tier matches. 3. **Tailor**: Generates a custom resume summary for top-tier matches.
4. **Export**: Uses [RxResume v4](https://v4.rxresu.me) to create tailored PDFs. 4. **Export**: Uses [RxResume v4](https://v4.rxresu.me) to create tailored PDFs.

View File

@ -1,6 +1,6 @@
# JobSpy Extractor (How It Works) # JobSpy Extractor (How It Works)
This is a simple walkthrough of the JobSpy extractor used for Indeed and LinkedIn. This is a simple walkthrough of the JobSpy extractor used for Indeed, LinkedIn, and Glassdoor.
## Big picture ## Big picture
@ -34,7 +34,7 @@ The Node service (`orchestrator/src/server/services/jobspy.ts`) controls the run
The mapper normalizes fields like salary ranges, converts empty values to null, and keeps extra metadata (skills, company rating, remote flag, etc.) when available. The mapper normalizes fields like salary ranges, converts empty values to null, and keeps extra metadata (skills, company rating, remote flag, etc.) when available.
If a row is missing a valid site (`indeed` or `linkedin`) or a job URL, it gets skipped. If a row is missing a valid site (`indeed`, `linkedin`, or `glassdoor`) or a job URL, it gets skipped.
## Notes ## Notes

View File

@ -1 +1,2 @@
python-jobspy python-jobspy
pandas

View File

@ -3,9 +3,41 @@ import json
import os import os
from pathlib import Path from pathlib import Path
import pandas as pd
from jobspy import scrape_jobs from jobspy import scrape_jobs
PROGRESS_PREFIX = "JOBOPS_PROGRESS " PROGRESS_PREFIX = "JOBOPS_PROGRESS "
# Alternate country spellings mapped to the canonical lowercase name.
# Keys are expected in lowercased, single-spaced form, which is what
# _normalize_country_token produces before it consults this table.
COUNTRY_ALIASES = {
"uk": "united kingdom",
"united kingdom": "united kingdom",
"us": "united states",
"usa": "united states",
"united states": "united states",
"türkiye": "turkey",
"czech republic": "czechia",
}
# Fallback city for each country, keyed by canonical lowercase country name.
# _glassdoor_city_for_country looks a country up here so that a Glassdoor
# search whose location is only a country can be run against a city instead.
GLASSDOOR_COUNTRY_TO_CITY = {
"australia": "Sydney",
"austria": "Vienna",
"belgium": "Brussels",
"brazil": "Sao Paulo",
"canada": "Toronto",
"france": "Paris",
"germany": "Berlin",
"hong kong": "Hong Kong",
"india": "Bengaluru",
"ireland": "Dublin",
"italy": "Milan",
"mexico": "Mexico City",
"netherlands": "Amsterdam",
"new zealand": "Auckland",
"singapore": "Singapore",
"spain": "Madrid",
"switzerland": "Zurich",
"united kingdom": "London",
"united states": "New York",
"vietnam": "Ho Chi Minh City",
}
def _env_str(name: str, default: str) -> str: def _env_str(name: str, default: str) -> str:
@ -39,6 +71,47 @@ def _parse_sites(raw: str) -> list[str]:
return [s.strip() for s in raw.split(",") if s.strip()] return [s.strip() for s in raw.split(",") if s.strip()]
def _normalize_country_token(value: str) -> str:
    """Return the canonical lowercase form of a country name.

    The text is trimmed, lowercased, and has internal whitespace runs
    collapsed to single spaces. If the cleaned token is a key of
    ``COUNTRY_ALIASES`` the mapped canonical name is returned; otherwise the
    cleaned token itself is returned.
    """
    words = value.strip().lower().split()
    token = " ".join(words)
    if token in COUNTRY_ALIASES:
        return COUNTRY_ALIASES[token]
    return token
def _is_country_level_location(location: str, country_indeed: str) -> bool:
if not location.strip() or not country_indeed.strip():
return False
return _normalize_country_token(location) == _normalize_country_token(country_indeed)
def _glassdoor_city_for_country(country_indeed: str, location: str) -> str | None:
    """Look up the fallback Glassdoor city for a country.

    ``country_indeed`` is used when it is truthy, otherwise ``location`` is
    used. The chosen value is normalized and looked up in
    ``GLASSDOOR_COUNTRY_TO_CITY``.

    Returns:
        The mapped city name, or ``None`` when the country has no entry.
    """
    source = country_indeed if country_indeed else location
    canonical = _normalize_country_token(source)
    if canonical in GLASSDOOR_COUNTRY_TO_CITY:
        return GLASSDOOR_COUNTRY_TO_CITY[canonical]
    return None
def _scrape_for_sites(
    *,
    sites: list[str],
    search_term: str,
    location: str | None,
    results_wanted: int,
    hours_old: int,
    country_indeed: str,
    linkedin_fetch_description: bool,
    is_remote: bool,
) -> pd.DataFrame:
    """Call ``scrape_jobs`` for the given sites and return its result.

    All arguments are keyword-only and are forwarded to ``scrape_jobs``
    (``sites`` is passed as ``site_name``). ``location`` is forwarded
    unchanged, but only when it is neither ``None`` nor blank; otherwise the
    ``location`` keyword is left out of the call entirely.
    """
    has_location = bool(location and location.strip())

    request: dict[str, object] = dict(
        site_name=sites,
        search_term=search_term,
        results_wanted=results_wanted,
        hours_old=hours_old,
        country_indeed=country_indeed,
        linkedin_fetch_description=linkedin_fetch_description,
        is_remote=is_remote,
    )
    if has_location:
        request["location"] = location

    return scrape_jobs(**request)
def main() -> int: def main() -> int:
sites = _parse_sites(_env_str("JOBSPY_SITES", "indeed,linkedin")) sites = _parse_sites(_env_str("JOBSPY_SITES", "indeed,linkedin"))
search_term = _env_str("JOBSPY_SEARCH_TERM", "web developer") search_term = _env_str("JOBSPY_SEARCH_TERM", "web developer")
@ -68,16 +141,52 @@ def main() -> int:
"searchTerm": search_term, "searchTerm": search_term,
}, },
) )
jobs = scrape_jobs( frames: list[pd.DataFrame] = []
site_name=sites, non_glassdoor_sites = [site for site in sites if site != "glassdoor"]
search_term=search_term,
location=location, if non_glassdoor_sites:
results_wanted=results_wanted, frames.append(
hours_old=hours_old, _scrape_for_sites(
country_indeed=country_indeed, sites=non_glassdoor_sites,
linkedin_fetch_description=linkedin_fetch_description, search_term=search_term,
is_remote=is_remote, location=location,
) results_wanted=results_wanted,
hours_old=hours_old,
country_indeed=country_indeed,
linkedin_fetch_description=linkedin_fetch_description,
is_remote=is_remote,
)
)
if "glassdoor" in sites:
glassdoor_location = location
if _is_country_level_location(location, country_indeed):
# Glassdoor works best with city-level location terms.
fallback_city = _glassdoor_city_for_country(country_indeed, location)
if fallback_city:
glassdoor_location = fallback_city
print(
"jobspy: Glassdoor location matched country; using city fallback "
f"({fallback_city})"
)
else:
print(
"jobspy: Glassdoor location matched country; keeping original location"
)
frames.append(
_scrape_for_sites(
sites=["glassdoor"],
search_term=search_term,
location=glassdoor_location,
results_wanted=results_wanted,
hours_old=hours_old,
country_indeed=country_indeed,
linkedin_fetch_description=linkedin_fetch_description,
is_remote=is_remote,
)
)
jobs = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(f"Found {len(jobs)} jobs") print(f"Found {len(jobs)} jobs")
_emit_progress( _emit_progress(
@ -96,7 +205,6 @@ def main() -> int:
escapechar="\\", escapechar="\\",
index=False, index=False,
) )
jobs.to_json(output_json, orient="records", force_ascii=False) jobs.to_json(output_json, orient="records", force_ascii=False)
print(f"Wrote CSV: {output_csv}") print(f"Wrote CSV: {output_csv}")

View File

@ -257,13 +257,17 @@ export const OrchestratorPage: React.FC = () => {
searchTerms: values.searchTerms, searchTerms: values.searchTerms,
sources: compatibleSources, sources: compatibleSources,
}); });
const jobspyLocation = compatibleSources.includes("glassdoor")
? (values.glassdoorLocation ?? "").trim() ||
formatCountryLabel(values.country)
: formatCountryLabel(values.country);
await api.updateSettings({ await api.updateSettings({
searchTerms: values.searchTerms, searchTerms: values.searchTerms,
jobspyResultsWanted: limits.jobspyResultsWanted, jobspyResultsWanted: limits.jobspyResultsWanted,
gradcrackerMaxJobsPerTerm: limits.gradcrackerMaxJobsPerTerm, gradcrackerMaxJobsPerTerm: limits.gradcrackerMaxJobsPerTerm,
ukvisajobsMaxJobs: limits.ukvisajobsMaxJobs, ukvisajobsMaxJobs: limits.ukvisajobsMaxJobs,
jobspyCountryIndeed: values.country, jobspyCountryIndeed: values.country,
jobspyLocation: formatCountryLabel(values.country), jobspyLocation,
}); });
await refreshSettings(); await refreshSettings();
await startPipelineRun({ await startPipelineRun({

View File

@ -236,6 +236,14 @@ const nullIfSameSortedList = (
defaultValue: string[], defaultValue: string[],
) => (isSameSortedStringList(value, defaultValue) ? null : (value ?? null)); ) => (isSameSortedStringList(value, defaultValue) ? null : (value ?? null));
/**
 * Returns the given JobSpy site list with "glassdoor" guaranteed to be present.
 *
 * Falsy entries are dropped and duplicates are removed while keeping
 * first-seen order. "glassdoor" is appended at the end only when it is not
 * already in the list. A null or undefined input yields `["glassdoor"]`.
 */
const withAlwaysOnGlassdoor = (
  sites: string[] | null | undefined,
): string[] => {
  const result: string[] = [];
  for (const site of [...(sites ?? []), "glassdoor"]) {
    if (site && !result.includes(site)) {
      result.push(site);
    }
  }
  return result;
};
const getDerivedSettings = (settings: AppSettings | null) => { const getDerivedSettings = (settings: AppSettings | null) => {
const profileProjects = settings?.profileProjects ?? []; const profileProjects = settings?.profileProjects ?? [];
@ -289,8 +297,12 @@ const getDerivedSettings = (settings: AppSettings | null) => {
default: settings?.defaultJobspyCountryIndeed ?? "", default: settings?.defaultJobspyCountryIndeed ?? "",
}, },
sites: { sites: {
effective: settings?.jobspySites ?? ["indeed", "linkedin"], effective: withAlwaysOnGlassdoor(
default: settings?.defaultJobspySites ?? ["indeed", "linkedin"], settings?.jobspySites ?? ["indeed", "linkedin", "glassdoor"],
),
default: withAlwaysOnGlassdoor(
settings?.defaultJobspySites ?? ["indeed", "linkedin", "glassdoor"],
),
}, },
linkedinFetchDescription: { linkedinFetchDescription: {
effective: settings?.jobspyLinkedinFetchDescription ?? true, effective: settings?.jobspyLinkedinFetchDescription ?? true,
@ -691,7 +703,7 @@ export const SettingsPage: React.FC = () => {
jobspy.countryIndeed.default, jobspy.countryIndeed.default,
), ),
jobspySites: nullIfSameSortedList( jobspySites: nullIfSameSortedList(
data.jobspySites, withAlwaysOnGlassdoor(data.jobspySites),
jobspy.sites.default, jobspy.sites.default,
), ),
jobspyLinkedinFetchDescription: nullIfSame( jobspyLinkedinFetchDescription: nullIfSame(

View File

@ -96,4 +96,69 @@ describe("AutomaticRunTab", () => {
), ),
).toBeInTheDocument(); ).toBeInTheDocument();
}); });
it("disables glassdoor for unsupported countries with guidance copy", async () => {
const onSetPipelineSources = vi.fn();
render(
<AutomaticRunTab
open
settings={
{
searchTerms: ["backend engineer"],
jobspyCountryIndeed: "japan",
} as AppSettings
}
enabledSources={["linkedin", "glassdoor"]}
pipelineSources={["linkedin", "glassdoor"]}
onToggleSource={vi.fn()}
onSetPipelineSources={onSetPipelineSources}
isPipelineRunning={false}
onSaveAndRun={vi.fn().mockResolvedValue(undefined)}
/>,
);
await waitFor(() => {
expect(onSetPipelineSources).toHaveBeenCalledWith(["linkedin"]);
});
const glassdoorButton = screen.getByRole("button", { name: "Glassdoor" });
expect(glassdoorButton).toBeDisabled();
expect(glassdoorButton.getAttribute("title")).toContain(
"Glassdoor is not available for the selected country.",
);
});
it("disables glassdoor for supported countries until city is provided", async () => {
const onSetPipelineSources = vi.fn();
render(
<AutomaticRunTab
open
settings={
{
searchTerms: ["backend engineer"],
jobspyCountryIndeed: "united kingdom",
jobspyLocation: "United Kingdom",
} as AppSettings
}
enabledSources={["linkedin", "glassdoor"]}
pipelineSources={["linkedin", "glassdoor"]}
onToggleSource={vi.fn()}
onSetPipelineSources={onSetPipelineSources}
isPipelineRunning={false}
onSaveAndRun={vi.fn().mockResolvedValue(undefined)}
/>,
);
await waitFor(() => {
expect(onSetPipelineSources).toHaveBeenCalledWith(["linkedin"]);
});
const glassdoorButton = screen.getByRole("button", { name: "Glassdoor" });
expect(glassdoorButton).toBeDisabled();
expect(glassdoorButton.getAttribute("title")).toContain(
"Set a Glassdoor city in Advanced settings to enable Glassdoor.",
);
});
}); });

View File

@ -1,14 +1,13 @@
import * as PopoverPrimitive from "@radix-ui/react-popover"; import * as PopoverPrimitive from "@radix-ui/react-popover";
import { import {
formatCountryLabel, formatCountryLabel,
getCompatibleSourcesForCountry,
isSourceAllowedForCountry, isSourceAllowedForCountry,
normalizeCountryKey, normalizeCountryKey,
SUPPORTED_COUNTRY_KEYS, SUPPORTED_COUNTRY_KEYS,
} from "@shared/location-support.js"; } from "@shared/location-support.js";
import type { AppSettings, JobSource } from "@shared/types"; import type { AppSettings, JobSource } from "@shared/types";
import { Check, ChevronsUpDown, Loader2, Sparkles, X } from "lucide-react"; import { Check, ChevronsUpDown, Loader2, Sparkles, X } from "lucide-react";
import { useEffect, useMemo, useState } from "react"; import { useCallback, useEffect, useMemo, useState } from "react";
import { useForm } from "react-hook-form"; import { useForm } from "react-hook-form";
import { import {
Accordion, Accordion,
@ -71,12 +70,18 @@ interface AutomaticRunFormValues {
minSuitabilityScore: string; minSuitabilityScore: string;
runBudget: string; runBudget: string;
country: string; country: string;
glassdoorLocation: string;
searchTerms: string[]; searchTerms: string[];
searchTermDraft: string; searchTermDraft: string;
} }
type AutomaticPresetSelection = AutomaticPresetId | "custom"; type AutomaticPresetSelection = AutomaticPresetId | "custom";
// Explanations shown (button title and tooltip) when the Glassdoor source
// button is disabled.
// Used when the selected country does not allow Glassdoor at all.
const GLASSDOOR_COUNTRY_REASON =
"Glassdoor is not available for the selected country.";
// Used when the country allows Glassdoor but no Glassdoor city has been set.
const GLASSDOOR_LOCATION_REASON =
"Set a Glassdoor city in Advanced settings to enable Glassdoor.";
function toNumber(input: string, min: number, max: number, fallback: number) { function toNumber(input: string, min: number, max: number, fallback: number) {
const parsed = Number.parseInt(input, 10); const parsed = Number.parseInt(input, 10);
if (Number.isNaN(parsed)) return fallback; if (Number.isNaN(parsed)) return fallback;
@ -134,6 +139,7 @@ export const AutomaticRunTab: React.FC<AutomaticRunTabProps> = ({
minSuitabilityScore: String(DEFAULT_VALUES.minSuitabilityScore), minSuitabilityScore: String(DEFAULT_VALUES.minSuitabilityScore),
runBudget: String(DEFAULT_VALUES.runBudget), runBudget: String(DEFAULT_VALUES.runBudget),
country: DEFAULT_VALUES.country, country: DEFAULT_VALUES.country,
glassdoorLocation: "",
searchTerms: DEFAULT_VALUES.searchTerms, searchTerms: DEFAULT_VALUES.searchTerms,
searchTermDraft: "", searchTermDraft: "",
}, },
@ -144,6 +150,7 @@ export const AutomaticRunTab: React.FC<AutomaticRunTabProps> = ({
const minScoreInput = watch("minSuitabilityScore"); const minScoreInput = watch("minSuitabilityScore");
const runBudgetInput = watch("runBudget"); const runBudgetInput = watch("runBudget");
const countryInput = watch("country"); const countryInput = watch("country");
const glassdoorLocationInput = watch("glassdoorLocation");
const searchTerms = watch("searchTerms"); const searchTerms = watch("searchTerms");
const searchTermDraft = watch("searchTermDraft"); const searchTermDraft = watch("searchTermDraft");
@ -164,12 +171,24 @@ export const AutomaticRunTab: React.FC<AutomaticRunTabProps> = ({
settings?.jobspyLocation ?? settings?.jobspyLocation ??
DEFAULT_VALUES.country, DEFAULT_VALUES.country,
); );
const rememberedCountryKey = rememberedCountry || DEFAULT_VALUES.country;
const rememberedLocationRaw = settings?.jobspyLocation?.trim() ?? "";
const rememberedLocationNormalized = normalizeCountryKey(
rememberedLocationRaw,
);
const rememberedGlassdoorLocation =
rememberedLocationRaw &&
rememberedLocationNormalized &&
rememberedLocationNormalized !== normalizeCountryKey(rememberedCountryKey)
? rememberedLocationRaw
: "";
reset({ reset({
topN: String(topN), topN: String(topN),
minSuitabilityScore: String(minSuitabilityScore), minSuitabilityScore: String(minSuitabilityScore),
runBudget: String(rememberedRunBudget), runBudget: String(rememberedRunBudget),
country: rememberedCountry || DEFAULT_VALUES.country, country: rememberedCountry || DEFAULT_VALUES.country,
glassdoorLocation: rememberedGlassdoorLocation,
searchTerms: settings?.searchTerms ?? DEFAULT_VALUES.searchTerms, searchTerms: settings?.searchTerms ?? DEFAULT_VALUES.searchTerms,
searchTermDraft: "", searchTermDraft: "",
}); });
@ -200,27 +219,40 @@ export const AutomaticRunTab: React.FC<AutomaticRunTabProps> = ({
), ),
runBudget: toNumber(runBudgetInput, 1, 1000, DEFAULT_VALUES.runBudget), runBudget: toNumber(runBudgetInput, 1, 1000, DEFAULT_VALUES.runBudget),
country: normalizedCountry || DEFAULT_VALUES.country, country: normalizedCountry || DEFAULT_VALUES.country,
glassdoorLocation: glassdoorLocationInput.trim() || undefined,
searchTerms, searchTerms,
}; };
}, [topNInput, minScoreInput, runBudgetInput, countryInput, searchTerms]); }, [
topNInput,
minScoreInput,
runBudgetInput,
countryInput,
glassdoorLocationInput,
searchTerms,
]);
// Decides whether a source can be used for the current run: it must be
// allowed for the selected country, and Glassdoor additionally requires a
// non-empty city. Memoized on the two form-derived values it reads.
const isSourceAvailableForRun = useCallback(
(source: JobSource) => {
if (!isSourceAllowedForCountry(source, values.country)) return false;
// values.glassdoorLocation is undefined when the city input is blank.
if (source === "glassdoor" && !values.glassdoorLocation) return false;
return true;
},
[values.country, values.glassdoorLocation],
);
const compatibleEnabledSources = useMemo( const compatibleEnabledSources = useMemo(
() => () => enabledSources.filter((source) => isSourceAvailableForRun(source)),
enabledSources.filter((source) => [enabledSources, isSourceAvailableForRun],
isSourceAllowedForCountry(source, values.country),
),
[enabledSources, values.country],
); );
const compatiblePipelineSources = useMemo( const compatiblePipelineSources = useMemo(
() => getCompatibleSourcesForCountry(pipelineSources, values.country), () => pipelineSources.filter((source) => isSourceAvailableForRun(source)),
[pipelineSources, values.country], [pipelineSources, isSourceAvailableForRun],
); );
useEffect(() => { useEffect(() => {
const filtered = getCompatibleSourcesForCountry( const filtered = pipelineSources.filter((source) =>
pipelineSources, isSourceAvailableForRun(source),
values.country,
); );
if (filtered.length === pipelineSources.length) return; if (filtered.length === pipelineSources.length) return;
if (filtered.length > 0) { if (filtered.length > 0) {
@ -232,9 +264,9 @@ export const AutomaticRunTab: React.FC<AutomaticRunTabProps> = ({
} }
}, [ }, [
compatibleEnabledSources, compatibleEnabledSources,
isSourceAvailableForRun,
onSetPipelineSources, onSetPipelineSources,
pipelineSources, pipelineSources,
values.country,
]); ]);
const estimate = useMemo( const estimate = useMemo(
@ -441,6 +473,23 @@ export const AutomaticRunTab: React.FC<AutomaticRunTabProps> = ({
} }
/> />
</div> </div>
<div className="space-y-2 md:col-span-3">
<Label htmlFor="glassdoor-location">Glassdoor city</Label>
<Input
id="glassdoor-location"
value={glassdoorLocationInput}
onChange={(event) =>
setValue("glassdoorLocation", event.target.value, {
shouldDirty: true,
})
}
placeholder='e.g. "London"'
/>
<p className="text-xs text-muted-foreground">
Required only for Glassdoor. Use a city (not country) to
keep results localized.
</p>
</div>
</div> </div>
</AccordionContent> </AccordionContent>
</AccordionItem> </AccordionItem>
@ -526,12 +575,18 @@ export const AutomaticRunTab: React.FC<AutomaticRunTabProps> = ({
<CardContent className="flex flex-wrap gap-2"> <CardContent className="flex flex-wrap gap-2">
<TooltipProvider> <TooltipProvider>
{enabledSources.map((source) => { {enabledSources.map((source) => {
const allowed = isSourceAllowedForCountry( const countryAllowed = isSourceAllowedForCountry(
source, source,
values.country, values.country,
); );
const allowed = isSourceAvailableForRun(source);
const selected = compatiblePipelineSources.includes(source); const selected = compatiblePipelineSources.includes(source);
const disabledReason = `${sourceLabel[source]} is available only when country is United Kingdom.`; const disabledReason =
source === "glassdoor"
? countryAllowed
? GLASSDOOR_LOCATION_REASON
: GLASSDOOR_COUNTRY_REASON
: `${sourceLabel[source]} is available only when country is United Kingdom.`;
const button = ( const button = (
<Button <Button
@ -540,6 +595,7 @@ export const AutomaticRunTab: React.FC<AutomaticRunTabProps> = ({
size="sm" size="sm"
variant={selected ? "default" : "outline"} variant={selected ? "default" : "outline"}
disabled={!allowed} disabled={!allowed}
title={!allowed ? disabledReason : undefined}
onClick={() => onToggleSource(source, !selected)} onClick={() => onToggleSource(source, !selected)}
> >
{sourceLabel[source]} {sourceLabel[source]}
@ -553,9 +609,7 @@ export const AutomaticRunTab: React.FC<AutomaticRunTabProps> = ({
return ( return (
<Tooltip key={source}> <Tooltip key={source}>
<TooltipTrigger asChild> <TooltipTrigger asChild>
<span className="inline-flex" title={disabledReason}> <span className="inline-flex">{button}</span>
{button}
</span>
</TooltipTrigger> </TooltipTrigger>
<TooltipContent side="top">{disabledReason}</TooltipContent> <TooltipContent side="top">{disabledReason}</TooltipContent>
</Tooltip> </Tooltip>

View File

@ -42,11 +42,11 @@ describe("automatic-run utilities", () => {
const limits = deriveExtractorLimits({ const limits = deriveExtractorLimits({
budget: 750, budget: 750,
searchTerms: ["a", "b", "c"], searchTerms: ["a", "b", "c"],
sources: ["indeed", "linkedin", "gradcracker"], sources: ["indeed", "linkedin", "glassdoor", "gradcracker"],
}); });
const cap = const cap =
2 * limits.jobspyResultsWanted * 3 + limits.gradcrackerMaxJobsPerTerm * 3; 3 * limits.jobspyResultsWanted * 3 + limits.gradcrackerMaxJobsPerTerm * 3;
expect(cap).toBeLessThanOrEqual(750); expect(cap).toBeLessThanOrEqual(750);
}); });

View File

@ -8,6 +8,7 @@ export interface AutomaticRunValues {
searchTerms: string[]; searchTerms: string[];
runBudget: number; runBudget: number;
country: string; country: string;
glassdoorLocation?: string;
} }
export interface AutomaticPresetValues { export interface AutomaticPresetValues {
@ -71,12 +72,14 @@ export function deriveExtractorLimits(args: {
const termCount = Math.max(1, args.searchTerms.length); const termCount = Math.max(1, args.searchTerms.length);
const includesIndeed = args.sources.includes("indeed"); const includesIndeed = args.sources.includes("indeed");
const includesLinkedIn = args.sources.includes("linkedin"); const includesLinkedIn = args.sources.includes("linkedin");
const includesGlassdoor = args.sources.includes("glassdoor");
const includesGradcracker = args.sources.includes("gradcracker"); const includesGradcracker = args.sources.includes("gradcracker");
const includesUkVisaJobs = args.sources.includes("ukvisajobs"); const includesUkVisaJobs = args.sources.includes("ukvisajobs");
const weightedContributors = const weightedContributors =
(includesIndeed ? termCount : 0) + (includesIndeed ? termCount : 0) +
(includesLinkedIn ? termCount : 0) + (includesLinkedIn ? termCount : 0) +
(includesGlassdoor ? termCount : 0) +
(includesGradcracker ? termCount : 0) + (includesGradcracker ? termCount : 0) +
(includesUkVisaJobs ? 1 : 0); (includesUkVisaJobs ? 1 : 0);
@ -133,13 +136,16 @@ export function calculateAutomaticEstimate(args: {
const hasUkVisaJobs = sources.includes("ukvisajobs"); const hasUkVisaJobs = sources.includes("ukvisajobs");
const hasIndeed = sources.includes("indeed"); const hasIndeed = sources.includes("indeed");
const hasLinkedIn = sources.includes("linkedin"); const hasLinkedIn = sources.includes("linkedin");
const hasGlassdoor = sources.includes("glassdoor");
const limits = deriveExtractorLimits({ const limits = deriveExtractorLimits({
budget: values.runBudget, budget: values.runBudget,
searchTerms: values.searchTerms, searchTerms: values.searchTerms,
sources, sources,
}); });
const jobspySitesCount = [hasIndeed, hasLinkedIn].filter(Boolean).length; const jobspySitesCount = [hasIndeed, hasLinkedIn, hasGlassdoor].filter(
Boolean,
).length;
const jobspyCap = jobspySitesCount * limits.jobspyResultsWanted * termCount; const jobspyCap = jobspySitesCount * limits.jobspyResultsWanted * termCount;
const gradcrackerCap = hasGradcracker const gradcrackerCap = hasGradcracker
? limits.gradcrackerMaxJobsPerTerm * termCount ? limits.gradcrackerMaxJobsPerTerm * termCount

View File

@ -12,6 +12,7 @@ export const orderedSources: JobSource[] = [
"gradcracker", "gradcracker",
"indeed", "indeed",
"linkedin", "linkedin",
"glassdoor",
"ukvisajobs", "ukvisajobs",
]; ];
export const orderedFilterSources: JobSource[] = [...orderedSources, "manual"]; export const orderedFilterSources: JobSource[] = [...orderedSources, "manual"];

View File

@ -1,6 +1,10 @@
import type { AppSettings, JobListItem, JobSource } from "@shared/types"; import type { AppSettings, JobListItem, JobSource } from "@shared/types";
import type { FilterTab, JobSort } from "./constants"; import type { FilterTab, JobSort } from "./constants";
import { orderedFilterSources, orderedSources } from "./constants"; import {
DEFAULT_PIPELINE_SOURCES,
orderedFilterSources,
orderedSources,
} from "./constants";
const dateValue = (value: string | null) => { const dateValue = (value: string | null) => {
if (!value) return null; if (!value) return null;
@ -159,7 +163,7 @@ export const getSourcesWithJobs = (jobs: JobListItem[]): JobSource[] => {
export const getEnabledSources = ( export const getEnabledSources = (
settings: AppSettings | null, settings: AppSettings | null,
): JobSource[] => { ): JobSource[] => {
if (!settings) return [...orderedSources]; if (!settings) return [...DEFAULT_PIPELINE_SOURCES, "glassdoor"];
const enabled: JobSource[] = []; const enabled: JobSource[] = [];
const jobspySites = settings.jobspySites ?? []; const jobspySites = settings.jobspySites ?? [];
@ -176,10 +180,16 @@ export const getEnabledSources = (
if (hasUkVisaJobsAuth) enabled.push(source); if (hasUkVisaJobsAuth) enabled.push(source);
continue; continue;
} }
if (source === "indeed" || source === "linkedin") { if (
if (jobspySites.includes(source)) enabled.push(source); source === "indeed" ||
source === "linkedin" ||
source === "glassdoor"
) {
if (source === "glassdoor" || jobspySites.includes(source)) {
enabled.push(source);
}
} }
} }
return enabled.length > 0 ? enabled : [...orderedSources]; return enabled.length > 0 ? enabled : [...DEFAULT_PIPELINE_SOURCES];
}; };

View File

@ -8,7 +8,7 @@ import { JobspySection } from "./JobspySection";
const JobspyHarness = () => { const JobspyHarness = () => {
const methods = useForm<UpdateSettingsInput>({ const methods = useForm<UpdateSettingsInput>({
defaultValues: { defaultValues: {
jobspySites: ["indeed", "linkedin"], jobspySites: ["indeed", "linkedin", "glassdoor"],
jobspyLocation: "UK", jobspyLocation: "UK",
jobspyResultsWanted: 200, jobspyResultsWanted: 200,
jobspyHoursOld: 72, jobspyHoursOld: 72,
@ -24,8 +24,8 @@ const JobspyHarness = () => {
<JobspySection <JobspySection
values={{ values={{
sites: { sites: {
default: ["indeed", "linkedin"], default: ["indeed", "linkedin", "glassdoor"],
effective: ["indeed", "linkedin"], effective: ["indeed", "linkedin", "glassdoor"],
}, },
location: { default: "UK", effective: "UK" }, location: { default: "UK", effective: "UK" },
resultsWanted: { default: 200, effective: 200 }, resultsWanted: { default: 200, effective: 200 },
@ -51,6 +51,7 @@ describe("JobspySection", () => {
expect(indeedCheckbox).toBeChecked(); expect(indeedCheckbox).toBeChecked();
expect(linkedinCheckbox).toBeChecked(); expect(linkedinCheckbox).toBeChecked();
expect(screen.queryByLabelText(/glassdoor/i)).not.toBeInTheDocument();
fireEvent.click(indeedCheckbox); fireEvent.click(indeedCheckbox);
expect(indeedCheckbox).not.toBeChecked(); expect(indeedCheckbox).not.toBeChecked();

View File

@ -43,6 +43,12 @@ export const JobspySection: React.FC<JobspySectionProps> = ({
linkedinFetchDescription, linkedinFetchDescription,
isRemote, isRemote,
} = values; } = values;
const configurableDefaultSites = sites.default.filter(
(site) => site !== "glassdoor",
);
const configurableEffectiveSites = sites.effective.filter(
(site) => site !== "glassdoor",
);
const { const {
control, control,
register, register,
@ -130,13 +136,13 @@ export const JobspySection: React.FC<JobspySectionProps> = ({
</p> </p>
)} )}
<div className="text-xs text-muted-foreground"> <div className="text-xs text-muted-foreground">
Select which sites JobSpy should scrape. Select configurable sites JobSpy should scrape.
</div> </div>
<div className="flex gap-2 text-xs text-muted-foreground"> <div className="flex gap-2 text-xs text-muted-foreground">
<span> <span>
Effective: {(sites.effective || []).join(", ") || "None"} Effective: {configurableEffectiveSites.join(", ") || "None"}
</span> </span>
<span>Default: {(sites.default || []).join(", ")}</span> <span>Default: {configurableDefaultSites.join(", ")}</span>
</div> </div>
</div> </div>

View File

@ -138,6 +138,7 @@ export const sourceLabel: Record<Job["source"], string> = {
gradcracker: "Gradcracker", gradcracker: "Gradcracker",
indeed: "Indeed", indeed: "Indeed",
linkedin: "LinkedIn", linkedin: "LinkedIn",
glassdoor: "Glassdoor",
ukvisajobs: "UK Visa Jobs", ukvisajobs: "UK Visa Jobs",
manual: "Manual", manual: "Manual",
}; };

View File

@ -44,6 +44,17 @@ describe.sequential("Pipeline API routes", () => {
topN: 5, topN: 5,
sources: ["gradcracker"], sources: ["gradcracker"],
}); });
const glassdoorRunRes = await fetch(`${baseUrl}/api/pipeline/run`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ sources: ["glassdoor"] }),
});
const glassdoorRunBody = await glassdoorRunRes.json();
expect(glassdoorRunBody.ok).toBe(true);
expect(runPipeline).toHaveBeenNthCalledWith(2, {
sources: ["glassdoor"],
});
}); });
it("returns conflict when cancelling with no active pipeline", async () => { it("returns conflict when cancelling with no active pipeline", async () => {

View File

@ -98,7 +98,9 @@ const runPipelineSchema = z.object({
topN: z.number().min(1).max(50).optional(), topN: z.number().min(1).max(50).optional(),
minSuitabilityScore: z.number().min(0).max(100).optional(), minSuitabilityScore: z.number().min(0).max(100).optional(),
sources: z sources: z
.array(z.enum(["gradcracker", "indeed", "linkedin", "ukvisajobs"])) .array(
z.enum(["gradcracker", "indeed", "linkedin", "glassdoor", "ukvisajobs"]),
)
.min(1) .min(1)
.optional(), .optional(),
}); });

View File

@ -28,7 +28,7 @@ export const DEMO_DEFAULT_SETTINGS: DemoDefaultSettings = {
jobspyResultsWanted: "25", jobspyResultsWanted: "25",
jobspyHoursOld: "72", jobspyHoursOld: "72",
jobspyCountryIndeed: "US", jobspyCountryIndeed: "US",
jobspySites: JSON.stringify(["linkedin", "indeed"]), jobspySites: JSON.stringify(["linkedin", "indeed", "glassdoor"]),
jobspyLinkedinFetchDescription: "1", jobspyLinkedinFetchDescription: "1",
jobspyIsRemote: "0", jobspyIsRemote: "0",
resumeProjects: JSON.stringify({ resumeProjects: JSON.stringify({
@ -253,6 +253,7 @@ export const COMPANY_SUFFIXES = [
export const DEMO_SOURCE_BASE_URLS: Record<JobSource, string> = { export const DEMO_SOURCE_BASE_URLS: Record<JobSource, string> = {
linkedin: "https://www.linkedin.com", linkedin: "https://www.linkedin.com",
indeed: "https://www.indeed.com", indeed: "https://www.indeed.com",
glassdoor: "https://www.glassdoor.com",
gradcracker: "https://www.gradcracker.com", gradcracker: "https://www.gradcracker.com",
ukvisajobs: "https://www.ukvisajobs.com", ukvisajobs: "https://www.ukvisajobs.com",
manual: "https://example.com", manual: "https://example.com",

View File

@ -17,7 +17,14 @@ export const jobs = sqliteTable("jobs", {
// From crawler // From crawler
source: text("source", { source: text("source", {
enum: ["gradcracker", "indeed", "linkedin", "ukvisajobs", "manual"], enum: [
"gradcracker",
"indeed",
"linkedin",
"glassdoor",
"ukvisajobs",
"manual",
],
}) })
.notNull() .notNull()
.default("gradcracker"), .default("gradcracker"),

View File

@ -37,6 +37,7 @@ import {
const DEFAULT_CONFIG: PipelineConfig = { const DEFAULT_CONFIG: PipelineConfig = {
topN: 10, topN: 10,
minSuitabilityScore: 50, minSuitabilityScore: 50,
// Keep Glassdoor opt-in via source picker/settings; do not enable by default.
sources: ["gradcracker", "indeed", "linkedin", "ukvisajobs"], sources: ["gradcracker", "indeed", "linkedin", "ukvisajobs"],
outputDir: join(getDataDir(), "pdfs"), outputDir: join(getDataDir(), "pdfs"),
enableCrawling: true, enableCrawling: true,

View File

@ -76,6 +76,92 @@ describe("discoverJobsStep", () => {
); );
}); });
// Checks that a run whose only selected source is glassdoor calls runJobSpy
// with sites ["glassdoor"] and surfaces the single mocked job.
it("passes glassdoor through to JobSpy when selected", async () => {
const settingsRepo = await import("../../repositories/settings");
const jobSpy = await import("../../services/jobspy");
// Stored settings: one search term, and jobspySites listing only glassdoor.
vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
searchTerms: JSON.stringify(["engineer"]),
jobspySites: JSON.stringify(["glassdoor"]),
} as any);
// runJobSpy is mocked to succeed with exactly one glassdoor job.
vi.mocked(jobSpy.runJobSpy).mockResolvedValue({
success: true,
jobs: [
{
source: "glassdoor",
title: "Engineer",
employer: "ACME",
jobUrl: "https://example.com/job",
},
],
} as any);
// Override only the sources of the shared config fixture.
const result = await discoverJobsStep({
mergedConfig: {
...config,
sources: ["glassdoor"],
},
});
expect(result.discoveredJobs).toHaveLength(1);
expect(vi.mocked(jobSpy.runJobSpy)).toHaveBeenCalledWith(
expect.objectContaining({ sites: ["glassdoor"] }),
);
});
// Checks that glassdoor still reaches runJobSpy when the stored jobspySites
// setting lists only linkedin but the run's sources include glassdoor.
it("keeps glassdoor enabled even when jobspySites override omits it", async () => {
const settingsRepo = await import("../../repositories/settings");
const jobSpy = await import("../../services/jobspy");
// Stored jobspySites deliberately omits glassdoor.
vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
searchTerms: JSON.stringify(["engineer"]),
jobspySites: JSON.stringify(["linkedin"]),
} as any);
// No jobs are needed; only the arguments passed to runJobSpy are asserted.
vi.mocked(jobSpy.runJobSpy).mockResolvedValue({
success: true,
jobs: [],
} as any);
await discoverJobsStep({
mergedConfig: {
...config,
sources: ["glassdoor", "linkedin"],
},
});
// Both requested sites are expected, in the order given in sources.
expect(vi.mocked(jobSpy.runJobSpy)).toHaveBeenCalledWith(
expect.objectContaining({ sites: ["glassdoor", "linkedin"] }),
);
});
// Checks that glassdoor is dropped from the sites passed to runJobSpy when
// the jobspyCountryIndeed setting is "japan", while linkedin is kept.
it("filters out glassdoor for unsupported countries", async () => {
const settingsRepo = await import("../../repositories/settings");
const jobSpy = await import("../../services/jobspy");
// No jobspySites setting here; only the country setting is provided.
vi.mocked(settingsRepo.getAllSettings).mockResolvedValue({
searchTerms: JSON.stringify(["engineer"]),
jobspyCountryIndeed: "japan",
} as any);
// No jobs are needed; only the arguments passed to runJobSpy are asserted.
vi.mocked(jobSpy.runJobSpy).mockResolvedValue({
success: true,
jobs: [],
} as any);
await discoverJobsStep({
mergedConfig: {
...config,
sources: ["glassdoor", "linkedin"],
},
});
// Only linkedin is expected to remain.
expect(vi.mocked(jobSpy.runJobSpy)).toHaveBeenCalledWith(
expect.objectContaining({ sites: ["linkedin"] }),
);
});
it("throws when all enabled sources fail", async () => { it("throws when all enabled sources fail", async () => {
const settingsRepo = await import("../../repositories/settings"); const settingsRepo = await import("../../repositories/settings");
const ukVisa = await import("../../services/ukvisajobs"); const ukVisa = await import("../../services/ukvisajobs");

View File

@ -67,8 +67,8 @@ export async function discoverJobsStep(args: {
} }
let jobSpySites = compatibleSources.filter( let jobSpySites = compatibleSources.filter(
(source): source is "indeed" | "linkedin" => (source): source is "indeed" | "linkedin" | "glassdoor" =>
source === "indeed" || source === "linkedin", source === "indeed" || source === "linkedin" || source === "glassdoor",
); );
const jobspySitesSettingRaw = settings.jobspySites; const jobspySitesSettingRaw = settings.jobspySites;
@ -76,7 +76,9 @@ export async function discoverJobsStep(args: {
try { try {
const allowed = JSON.parse(jobspySitesSettingRaw); const allowed = JSON.parse(jobspySitesSettingRaw);
if (Array.isArray(allowed)) { if (Array.isArray(allowed)) {
jobSpySites = jobSpySites.filter((site) => allowed.includes(site)); jobSpySites = jobSpySites.filter(
(site) => site === "glassdoor" || allowed.includes(site),
);
} }
} catch { } catch {
// ignore JSON parse error // ignore JSON parse error

View File

@ -105,6 +105,7 @@ function toJobSource(site: unknown): JobSource | null {
if (raw === "gradcracker") return "gradcracker"; if (raw === "gradcracker") return "gradcracker";
if (raw === "indeed") return "indeed"; if (raw === "indeed") return "indeed";
if (raw === "linkedin") return "linkedin"; if (raw === "linkedin") return "linkedin";
if (raw === "glassdoor") return "glassdoor";
return null; return null;
} }
@ -164,8 +165,8 @@ export async function runJobSpy(
const outputDir = join(dataDir, "imports"); const outputDir = join(dataDir, "imports");
await mkdir(outputDir, { recursive: true }); await mkdir(outputDir, { recursive: true });
const sites = (options.sites ?? ["indeed", "linkedin"]) const sites = (options.sites ?? ["indeed", "linkedin", "glassdoor"])
.filter((s) => s === "indeed" || s === "linkedin") .filter((s) => s === "indeed" || s === "linkedin" || s === "glassdoor")
.join(","); .join(",");
const searchTerms = resolveSearchTerms(options); const searchTerms = resolveSearchTerms(options);
@ -191,7 +192,7 @@ export async function runJobSpy(
stdio: ["ignore", "pipe", "pipe"], stdio: ["ignore", "pipe", "pipe"],
env: { env: {
...process.env, ...process.env,
JOBSPY_SITES: sites || "indeed,linkedin", JOBSPY_SITES: sites || "indeed,linkedin,glassdoor",
JOBSPY_SEARCH_TERM: searchTerm, JOBSPY_SEARCH_TERM: searchTerm,
JOBSPY_TERM_INDEX: String(i + 1), JOBSPY_TERM_INDEX: String(i + 1),
JOBSPY_TERM_TOTAL: String(searchTerms.length), JOBSPY_TERM_TOTAL: String(searchTerms.length),

View File

@ -79,6 +79,26 @@ describe("settings-conversion", () => {
expect(malformedOverride.value).toEqual(["web developer"]); expect(malformedOverride.value).toEqual(["web developer"]);
}); });
// Checks that the resolved jobspySites value always contains "glassdoor":
// with no env var, with an env var that omits it, and with a stored override
// that omits it.
it("always includes glassdoor in resolved jobspySites", () => {
  // Snapshot the env var so the mutations below do not leak into other tests.
  const originalJobspySites = process.env.JOBSPY_SITES;
  try {
    // No env var and no override: the built-in default list is used.
    delete process.env.JOBSPY_SITES;
    expect(resolveSettingValue("jobspySites", undefined).value).toEqual([
      "indeed",
      "linkedin",
      "glassdoor",
    ]);

    // Env var without glassdoor: glassdoor is appended to the env-provided list.
    process.env.JOBSPY_SITES = "indeed,linkedin";
    expect(resolveSettingValue("jobspySites", undefined).value).toEqual([
      "indeed",
      "linkedin",
      "glassdoor",
    ]);

    // Stored override without glassdoor: glassdoor is appended to the override.
    expect(
      resolveSettingValue("jobspySites", JSON.stringify(["linkedin"])).value,
    ).toEqual(["linkedin", "glassdoor"]);
  } finally {
    // Assigning undefined to process.env would store the string "undefined",
    // so an originally unset variable must be deleted instead.
    if (originalJobspySites === undefined) {
      delete process.env.JOBSPY_SITES;
    } else {
      process.env.JOBSPY_SITES = originalJobspySites;
    }
  }
});
it("round-trips penalizeMissingSalary boolean setting", () => { it("round-trips penalizeMissingSalary boolean setting", () => {
expect(serializeSettingValue("penalizeMissingSalary", true)).toBe("1"); expect(serializeSettingValue("penalizeMissingSalary", true)).toBe("1");
expect(serializeSettingValue("penalizeMissingSalary", false)).toBe("0"); expect(serializeSettingValue("penalizeMissingSalary", false)).toBe("0");

View File

@ -57,6 +57,24 @@ function parseJsonArrayOrNull(raw: string | undefined): string[] | null {
} }
} }
/**
 * Cleans a list of JobSpy site names and guarantees "glassdoor" is present.
 *
 * Each entry is trimmed; blank entries and repeats are dropped, keeping the
 * first occurrence's position. "glassdoor" is appended at the end only when
 * no entry already equals it after trimming.
 *
 * @param value - Raw site names.
 * @returns A new array of trimmed, unique site names that includes "glassdoor".
 */
function normalizeJobspySites(value: string[]): string[] {
  const cleaned = value
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);

  // A Set keeps first-insertion order, so re-adding an existing name is a no-op
  // and a missing "glassdoor" lands at the end.
  const unique = new Set<string>(cleaned);
  unique.add("glassdoor");

  return Array.from(unique);
}
function parseBitBoolOrNull(raw: string | undefined): boolean | null { function parseBitBoolOrNull(raw: string | undefined): boolean | null {
if (!raw) return null; if (!raw) return null;
return raw === "true" || raw === "1"; return raw === "true" || raw === "1";
@ -143,13 +161,13 @@ export const settingsConversionMetadata: SettingsConversionMetadata = {
}, },
jobspySites: { jobspySites: {
defaultValue: () => defaultValue: () =>
(process.env.JOBSPY_SITES || "indeed,linkedin") normalizeJobspySites(
.split(",") (process.env.JOBSPY_SITES || "indeed,linkedin,glassdoor").split(","),
.map((value) => value.trim()) ),
.filter(Boolean),
parseOverride: parseJsonArrayOrNull, parseOverride: parseJsonArrayOrNull,
serialize: serializeNullableJsonArray, serialize: serializeNullableJsonArray,
resolve: resolveWithNullishFallback, resolve: ({ defaultValue, overrideValue }) =>
normalizeJobspySites(overrideValue ?? defaultValue),
}, },
jobspyLinkedinFetchDescription: { jobspyLinkedinFetchDescription: {
defaultValue: () => defaultValue: () =>

View File

@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest";
import { import {
formatCountryLabel, formatCountryLabel,
getCompatibleSourcesForCountry, getCompatibleSourcesForCountry,
isGlassdoorCountry,
isSourceAllowedForCountry, isSourceAllowedForCountry,
isUkCountry, isUkCountry,
normalizeCountryKey, normalizeCountryKey,
@ -49,14 +50,24 @@ describe("location-support", () => {
expect(isSourceAllowedForCountry("ukvisajobs", "worldwide")).toBe(false); expect(isSourceAllowedForCountry("ukvisajobs", "worldwide")).toBe(false);
expect(isSourceAllowedForCountry("indeed", "united states")).toBe(true); expect(isSourceAllowedForCountry("indeed", "united states")).toBe(true);
expect(isSourceAllowedForCountry("linkedin", "worldwide")).toBe(true); expect(isSourceAllowedForCountry("linkedin", "worldwide")).toBe(true);
expect(isSourceAllowedForCountry("glassdoor", "united states")).toBe(true);
expect(isSourceAllowedForCountry("glassdoor", "japan")).toBe(false);
}); });
it("filters incompatible sources while preserving compatible order", () => { it("filters incompatible sources while preserving compatible order", () => {
expect( expect(
getCompatibleSourcesForCountry( getCompatibleSourcesForCountry(
["gradcracker", "indeed", "ukvisajobs", "linkedin"], ["gradcracker", "indeed", "glassdoor", "ukvisajobs", "linkedin"],
"united states", "united states",
), ),
).toEqual(["indeed", "linkedin"]); ).toEqual(["indeed", "glassdoor", "linkedin"]);
});
it("supports glassdoor only in explicitly supported countries", () => {
expect(isGlassdoorCountry("united kingdom")).toBe(true);
expect(isGlassdoorCountry("uk")).toBe(true);
expect(isGlassdoorCountry("usa")).toBe(true);
expect(isGlassdoorCountry("japan")).toBe(false);
expect(isGlassdoorCountry("worldwide")).toBe(false);
}); });
}); });

View File

@ -100,6 +100,30 @@ export const SUPPORTED_COUNTRY_INPUTS = [
] as const; ] as const;
const UK_ONLY_SOURCES = new Set<JobSource>(["gradcracker", "ukvisajobs"]); const UK_ONLY_SOURCES = new Set<JobSource>(["gradcracker", "ukvisajobs"]);
/**
 * Country keys for which Glassdoor is treated as an available source.
 * Every name is passed through normalizeCountryKey when the set is built, so
 * lookups should use normalized keys too.
 */
const GLASSDOOR_SUPPORTED_COUNTRIES = new Set<string>(
  [
    "australia",
    "austria",
    "belgium",
    "brazil",
    "canada",
    "france",
    "germany",
    "hong kong",
    "india",
    "ireland",
    "italy",
    "mexico",
    "netherlands",
    "new zealand",
    "singapore",
    "spain",
    "switzerland",
    "united kingdom",
    "united states",
    "vietnam",
  ].map(normalizeCountryKey),
);
export function normalizeCountryKey(value: string | null | undefined): string { export function normalizeCountryKey(value: string | null | undefined): string {
const normalized = value?.trim().toLowerCase() ?? ""; const normalized = value?.trim().toLowerCase() ?? "";
@ -125,12 +149,19 @@ export function isUkCountry(country: string | null | undefined): boolean {
return normalizeCountryKey(country) === "united kingdom"; return normalizeCountryKey(country) === "united kingdom";
} }
/**
 * Reports whether Glassdoor is supported for the given country.
 *
 * @param country - Country input; it is normalized with normalizeCountryKey
 *   before the lookup, and null/undefined are accepted.
 * @returns true when the normalized key is in GLASSDOOR_SUPPORTED_COUNTRIES.
 */
export function isGlassdoorCountry(
  country: string | null | undefined,
): boolean {
  const countryKey = normalizeCountryKey(country);
  return GLASSDOOR_SUPPORTED_COUNTRIES.has(countryKey);
}
export function isSourceAllowedForCountry( export function isSourceAllowedForCountry(
source: JobSource, source: JobSource,
country: string | null | undefined, country: string | null | undefined,
): boolean { ): boolean {
if (!UK_ONLY_SOURCES.has(source)) return true; if (UK_ONLY_SOURCES.has(source)) return isUkCountry(country);
return isUkCountry(country); if (source === "glassdoor") return isGlassdoorCountry(country);
return true;
} }
export function getCompatibleSourcesForCountry( export function getCompatibleSourcesForCountry(

View File

@ -122,6 +122,7 @@ export type JobSource =
| "gradcracker" | "gradcracker"
| "indeed" | "indeed"
| "linkedin" | "linkedin"
| "glassdoor"
| "ukvisajobs" | "ukvisajobs"
| "manual"; | "manual";