diff --git a/README.md b/README.md index 3209922..f3e57f3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ AI-powered job discovery and application pipeline. Automatically finds jobs, sco ## Workflow -1. **Search**: Scrapes Gradcracker, Indeed, LinkedIn, and UK Visa Sponsorship jobs. +1. **Search**: Scrapes Gradcracker, Indeed, LinkedIn, Glassdoor, and UK Visa Sponsorship jobs. 2. **Score**: AI ranks jobs by suitability using the configured LLM provider (OpenRouter by default). 3. **Tailor**: Generates a custom resume summary for top-tier matches. 4. **Export**: Uses [RxResume v4](https://v4.rxresu.me) to create tailored PDFs. diff --git a/documentation/extractors/jobspy.md b/documentation/extractors/jobspy.md index 488a5dd..b54a07c 100644 --- a/documentation/extractors/jobspy.md +++ b/documentation/extractors/jobspy.md @@ -1,6 +1,6 @@ # JobSpy Extractor (How It Works) -This is a simple walkthrough of the JobSpy extractor used for Indeed and LinkedIn. +This is a simple walkthrough of the JobSpy extractor used for Indeed, LinkedIn, and Glassdoor. ## Big picture @@ -34,7 +34,7 @@ The Node service (`orchestrator/src/server/services/jobspy.ts`) controls the run The mapper normalizes fields like salary ranges, converts empty values to null, and keeps extra metadata (skills, company rating, remote flag, etc.) when available. -If a row is missing a valid site (`indeed` or `linkedin`) or a job URL, it gets skipped. +If a row is missing a valid site (`indeed`, `linkedin`, or `glassdoor`) or a job URL, it gets skipped. ## Notes diff --git a/extractors/jobspy/requirements.txt b/extractors/jobspy/requirements.txt index 8c5560a..45fa5b4 100644 --- a/extractors/jobspy/requirements.txt +++ b/extractors/jobspy/requirements.txt @@ -1 +1,2 @@ python-jobspy +pandas diff --git a/extractors/jobspy/scrape_jobs.py b/extractors/jobspy/scrape_jobs.py index 81bfe47..2c85bf1 100644 --- a/extractors/jobspy/scrape_jobs.py +++ b/extractors/jobspy/scrape_jobs.py @@ -3,9 +3,41 @@ import json import os from pathlib import Path +import pandas as pd from jobspy import scrape_jobs PROGRESS_PREFIX = "JOBOPS_PROGRESS " +COUNTRY_ALIASES = { + "uk": "united kingdom", + "united kingdom": "united kingdom", + "us": "united states", + "usa": "united states", + "united states": "united states", + "türkiye": "turkey", + "czech republic": "czechia", +} +GLASSDOOR_COUNTRY_TO_CITY = { + "australia": "Sydney", + "austria": "Vienna", + "belgium": "Brussels", + "brazil": "Sao Paulo", + "canada": "Toronto", + "france": "Paris", + "germany": "Berlin", + "hong kong": "Hong Kong", + "india": "Bengaluru", + "ireland": "Dublin", + "italy": "Milan", + "mexico": "Mexico City", + "netherlands": "Amsterdam", + "new zealand": "Auckland", + "singapore": "Singapore", + "spain": "Madrid", + "switzerland": "Zurich", + "united kingdom": "London", + "united states": "New York", + "vietnam": "Ho Chi Minh City", +} def _env_str(name: str, default: str) -> str: @@ -39,6 +71,47 @@ def _parse_sites(raw: str) -> list[str]: return [s.strip() for s in raw.split(",") if s.strip()] +def _normalize_country_token(value: str) -> str: + normalized = " ".join(value.strip().lower().split()) + return COUNTRY_ALIASES.get(normalized, normalized) + + +def _is_country_level_location(location: str, country_indeed: str) -> bool: + if not location.strip() or not country_indeed.strip(): + return False + return _normalize_country_token(location) == _normalize_country_token(country_indeed) + + +def _glassdoor_city_for_country(country_indeed: str, location: str) -> str | None: + country_key = _normalize_country_token(country_indeed or location) + return GLASSDOOR_COUNTRY_TO_CITY.get(country_key) + + +def _scrape_for_sites( + *, + sites: list[str], + search_term: str, + location: str | None, + results_wanted: int, + hours_old: int, + country_indeed: str, + linkedin_fetch_description: bool, + is_remote: bool, +) -> pd.DataFrame: + kwargs: dict[str, object] = { + "site_name": sites, + "search_term": search_term, + "results_wanted": results_wanted, + "hours_old": hours_old, + "country_indeed": country_indeed, + "linkedin_fetch_description": linkedin_fetch_description, + "is_remote": is_remote, + } + if location and location.strip(): + kwargs["location"] = location + return scrape_jobs(**kwargs) + + def main() -> int: sites = _parse_sites(_env_str("JOBSPY_SITES", "indeed,linkedin")) search_term = _env_str("JOBSPY_SEARCH_TERM", "web developer") @@ -68,16 +141,52 @@ def main() -> int: "searchTerm": search_term, }, ) - jobs = scrape_jobs( - site_name=sites, - search_term=search_term, - location=location, - results_wanted=results_wanted, - hours_old=hours_old, - country_indeed=country_indeed, - linkedin_fetch_description=linkedin_fetch_description, - is_remote=is_remote, - ) + frames: list[pd.DataFrame] = [] + non_glassdoor_sites = [site for site in sites if site != "glassdoor"] + + if non_glassdoor_sites: + frames.append( + _scrape_for_sites( + sites=non_glassdoor_sites, + search_term=search_term, + location=location, + results_wanted=results_wanted, + hours_old=hours_old, + country_indeed=country_indeed, + linkedin_fetch_description=linkedin_fetch_description, + is_remote=is_remote, + ) + ) + + if "glassdoor" in sites: + glassdoor_location = location + if _is_country_level_location(location, country_indeed): + # Glassdoor works best with city-level location terms. + fallback_city = _glassdoor_city_for_country(country_indeed, location) + if fallback_city: + glassdoor_location = fallback_city + print( + "jobspy: Glassdoor location matched country; using city fallback " + f"({fallback_city})" + ) + else: + print( + "jobspy: Glassdoor location matched country; keeping original location" + ) + frames.append( + _scrape_for_sites( + sites=["glassdoor"], + search_term=search_term, + location=glassdoor_location, + results_wanted=results_wanted, + hours_old=hours_old, + country_indeed=country_indeed, + linkedin_fetch_description=linkedin_fetch_description, + is_remote=is_remote, + ) + ) + + jobs = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame() print(f"Found {len(jobs)} jobs") _emit_progress( @@ -96,7 +205,6 @@ def main() -> int: escapechar="\\", index=False, ) - jobs.to_json(output_json, orient="records", force_ascii=False) print(f"Wrote CSV: {output_csv}") diff --git a/orchestrator/src/client/pages/OrchestratorPage.tsx b/orchestrator/src/client/pages/OrchestratorPage.tsx index c262866..507cb30 100644 --- a/orchestrator/src/client/pages/OrchestratorPage.tsx +++ b/orchestrator/src/client/pages/OrchestratorPage.tsx @@ -257,13 +257,17 @@ export const OrchestratorPage: React.FC = () => { searchTerms: values.searchTerms, sources: compatibleSources, }); + const jobspyLocation = compatibleSources.includes("glassdoor") + ? (values.glassdoorLocation ?? "").trim() || + formatCountryLabel(values.country) + : formatCountryLabel(values.country); await api.updateSettings({ searchTerms: values.searchTerms, jobspyResultsWanted: limits.jobspyResultsWanted, gradcrackerMaxJobsPerTerm: limits.gradcrackerMaxJobsPerTerm, ukvisajobsMaxJobs: limits.ukvisajobsMaxJobs, jobspyCountryIndeed: values.country, - jobspyLocation: formatCountryLabel(values.country), + jobspyLocation, }); await refreshSettings(); await startPipelineRun({ diff --git a/orchestrator/src/client/pages/SettingsPage.tsx b/orchestrator/src/client/pages/SettingsPage.tsx index 3c2d9f0..e355f9d 100644 --- a/orchestrator/src/client/pages/SettingsPage.tsx +++ b/orchestrator/src/client/pages/SettingsPage.tsx @@ -236,6 +236,14 @@ const nullIfSameSortedList = ( defaultValue: string[], ) => (isSameSortedStringList(value, defaultValue) ? null : (value ?? null)); +const withAlwaysOnGlassdoor = ( + sites: string[] | null | undefined, +): string[] => { + const unique = new Set((sites ?? []).filter(Boolean)); + unique.add("glassdoor"); + return Array.from(unique); +}; + const getDerivedSettings = (settings: AppSettings | null) => { const profileProjects = settings?.profileProjects ?? []; @@ -289,8 +297,12 @@ const getDerivedSettings = (settings: AppSettings | null) => { default: settings?.defaultJobspyCountryIndeed ?? "", }, sites: { - effective: settings?.jobspySites ?? ["indeed", "linkedin"], - default: settings?.defaultJobspySites ?? ["indeed", "linkedin"], + effective: withAlwaysOnGlassdoor( + settings?.jobspySites ?? ["indeed", "linkedin", "glassdoor"], + ), + default: withAlwaysOnGlassdoor( + settings?.defaultJobspySites ?? ["indeed", "linkedin", "glassdoor"], + ), }, linkedinFetchDescription: { effective: settings?.jobspyLinkedinFetchDescription ?? true, @@ -691,7 +703,7 @@ export const SettingsPage: React.FC = () => { jobspy.countryIndeed.default, ), jobspySites: nullIfSameSortedList( - data.jobspySites, + withAlwaysOnGlassdoor(data.jobspySites), jobspy.sites.default, ), jobspyLinkedinFetchDescription: nullIfSame( diff --git a/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.test.tsx b/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.test.tsx index 8183fcc..c50141c 100644 --- a/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.test.tsx +++ b/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.test.tsx @@ -96,4 +96,69 @@ describe("AutomaticRunTab", () => { ), ).toBeInTheDocument(); }); + + it("disables glassdoor for unsupported countries with guidance copy", async () => { + const onSetPipelineSources = vi.fn(); + + render( + , + ); + + await waitFor(() => { + expect(onSetPipelineSources).toHaveBeenCalledWith(["linkedin"]); + }); + + const glassdoorButton = screen.getByRole("button", { name: "Glassdoor" }); + expect(glassdoorButton).toBeDisabled(); + expect(glassdoorButton.getAttribute("title")).toContain( + "Glassdoor is not available for the selected country.", + ); + }); + + it("disables glassdoor for supported countries until city is provided", async () => { + const onSetPipelineSources = vi.fn(); + + render( + , + ); + + await waitFor(() => { + expect(onSetPipelineSources).toHaveBeenCalledWith(["linkedin"]); + }); + + const glassdoorButton = screen.getByRole("button", { name: "Glassdoor" }); + expect(glassdoorButton).toBeDisabled(); + expect(glassdoorButton.getAttribute("title")).toContain( + "Set a Glassdoor city in Advanced settings to enable Glassdoor.", + ); + }); }); diff --git a/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.tsx b/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.tsx index c4db0c3..351c95a 100644 --- a/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.tsx +++ b/orchestrator/src/client/pages/orchestrator/AutomaticRunTab.tsx @@ -1,14 +1,13 @@ import * as PopoverPrimitive from "@radix-ui/react-popover"; import { formatCountryLabel, - getCompatibleSourcesForCountry, isSourceAllowedForCountry, normalizeCountryKey, SUPPORTED_COUNTRY_KEYS, } from "@shared/location-support.js"; import type { AppSettings, JobSource } from "@shared/types"; import { Check, ChevronsUpDown, Loader2, Sparkles, X } from "lucide-react"; -import { useEffect, useMemo, useState } from "react"; +import { useCallback, useEffect, useMemo, useState } from "react"; import { useForm } from "react-hook-form"; import { Accordion, @@ -71,12 +70,18 @@ interface AutomaticRunFormValues { minSuitabilityScore: string; runBudget: string; country: string; + glassdoorLocation: string; searchTerms: string[]; searchTermDraft: string; } type AutomaticPresetSelection = AutomaticPresetId | "custom"; +const GLASSDOOR_COUNTRY_REASON = + "Glassdoor is not available for the selected country."; +const GLASSDOOR_LOCATION_REASON = + "Set a Glassdoor city in Advanced settings to enable Glassdoor."; + function toNumber(input: string, min: number, max: number, fallback: number) { const parsed = Number.parseInt(input, 10); if (Number.isNaN(parsed)) return fallback; @@ -134,6 +139,7 @@ export const AutomaticRunTab: React.FC = ({ minSuitabilityScore: String(DEFAULT_VALUES.minSuitabilityScore), runBudget: String(DEFAULT_VALUES.runBudget), country: DEFAULT_VALUES.country, + glassdoorLocation: "", searchTerms: DEFAULT_VALUES.searchTerms, searchTermDraft: "", }, @@ -144,6 +150,7 @@ export const AutomaticRunTab: React.FC = ({ const minScoreInput = watch("minSuitabilityScore"); const runBudgetInput = watch("runBudget"); const countryInput = watch("country"); + const glassdoorLocationInput = watch("glassdoorLocation"); const searchTerms = watch("searchTerms"); const searchTermDraft = watch("searchTermDraft"); @@ -164,12 +171,24 @@ export const AutomaticRunTab: React.FC = ({ settings?.jobspyLocation ?? DEFAULT_VALUES.country, ); + const rememberedCountryKey = rememberedCountry || DEFAULT_VALUES.country; + const rememberedLocationRaw = settings?.jobspyLocation?.trim() ?? ""; + const rememberedLocationNormalized = normalizeCountryKey( + rememberedLocationRaw, + ); + const rememberedGlassdoorLocation = + rememberedLocationRaw && + rememberedLocationNormalized && + rememberedLocationNormalized !== normalizeCountryKey(rememberedCountryKey) + ? rememberedLocationRaw + : ""; reset({ topN: String(topN), minSuitabilityScore: String(minSuitabilityScore), runBudget: String(rememberedRunBudget), country: rememberedCountry || DEFAULT_VALUES.country, + glassdoorLocation: rememberedGlassdoorLocation, searchTerms: settings?.searchTerms ?? DEFAULT_VALUES.searchTerms, searchTermDraft: "", }); @@ -200,27 +219,40 @@ export const AutomaticRunTab: React.FC = ({ ), runBudget: toNumber(runBudgetInput, 1, 1000, DEFAULT_VALUES.runBudget), country: normalizedCountry || DEFAULT_VALUES.country, + glassdoorLocation: glassdoorLocationInput.trim() || undefined, searchTerms, }; - }, [topNInput, minScoreInput, runBudgetInput, countryInput, searchTerms]); + }, [ + topNInput, + minScoreInput, + runBudgetInput, + countryInput, + glassdoorLocationInput, + searchTerms, + ]); + + const isSourceAvailableForRun = useCallback( + (source: JobSource) => { + if (!isSourceAllowedForCountry(source, values.country)) return false; + if (source === "glassdoor" && !values.glassdoorLocation) return false; + return true; + }, + [values.country, values.glassdoorLocation], + ); const compatibleEnabledSources = useMemo( - () => - enabledSources.filter((source) => - isSourceAllowedForCountry(source, values.country), - ), - [enabledSources, values.country], + () => enabledSources.filter((source) => isSourceAvailableForRun(source)), + [enabledSources, isSourceAvailableForRun], ); const compatiblePipelineSources = useMemo( - () => getCompatibleSourcesForCountry(pipelineSources, values.country), - [pipelineSources, values.country], + () => pipelineSources.filter((source) => isSourceAvailableForRun(source)), + [pipelineSources, isSourceAvailableForRun], ); useEffect(() => { - const filtered = getCompatibleSourcesForCountry( - pipelineSources, - values.country, + const filtered = pipelineSources.filter((source) => + isSourceAvailableForRun(source), ); if (filtered.length === pipelineSources.length) return; if (filtered.length > 0) { @@ -232,9 +264,9 @@ export const AutomaticRunTab: React.FC = ({ } }, [ compatibleEnabledSources, + isSourceAvailableForRun, onSetPipelineSources, pipelineSources, - values.country, ]); const estimate = useMemo( @@ -441,6 +473,23 @@ export const AutomaticRunTab: React.FC = ({ } /> +
+ + + setValue("glassdoorLocation", event.target.value, { + shouldDirty: true, + }) + } + placeholder='e.g. "London"' + /> +

+ Required only for Glassdoor. Use a city (not country) to + keep results localized. +

+
@@ -526,12 +575,18 @@ export const AutomaticRunTab: React.FC = ({ {enabledSources.map((source) => { - const allowed = isSourceAllowedForCountry( + const countryAllowed = isSourceAllowedForCountry( source, values.country, ); + const allowed = isSourceAvailableForRun(source); const selected = compatiblePipelineSources.includes(source); - const disabledReason = `${sourceLabel[source]} is available only when country is United Kingdom.`; + const disabledReason = + source === "glassdoor" + ? countryAllowed + ? GLASSDOOR_LOCATION_REASON + : GLASSDOOR_COUNTRY_REASON + : `${sourceLabel[source]} is available only when country is United Kingdom.`; const button = (