# Jobber/extractors/jobspy/scrape_jobs.py

import csv
import json
import os
from pathlib import Path

import pandas as pd
from jobspy import scrape_jobs

# Marker prepended to every JSON progress line that _emit_progress prints to
# stdout.
PROGRESS_PREFIX = "JOBOPS_PROGRESS "
# Alternate spellings of a country mapped to one canonical, lower-case token.
# _normalize_country_token looks names up here after lower-casing them and
# collapsing whitespace; a name with no entry is used as its own token.
COUNTRY_ALIASES = {
    "uk": "united kingdom",
    "united kingdom": "united kingdom",
    "us": "united states",
    "usa": "united states",
    "united states": "united states",
    "türkiye": "turkey",
    "czech republic": "czechia",
}
# Canonical country token -> city that main() substitutes as the Glassdoor
# search location when the configured location names a whole country.
# Keys must be in the canonical form produced by _normalize_country_token
# (e.g. "united kingdom", not "uk"), because that is what the lookup uses.
GLASSDOOR_COUNTRY_TO_CITY = {
    "australia": "Sydney",
    "austria": "Vienna",
    "belgium": "Brussels",
    "brazil": "Sao Paulo",
    "canada": "Toronto",
    "france": "Paris",
    "germany": "Berlin",
    "hong kong": "Hong Kong",
    "india": "Bengaluru",
    "ireland": "Dublin",
    "italy": "Milan",
    "mexico": "Mexico City",
    "netherlands": "Amsterdam",
    "new zealand": "Auckland",
    "singapore": "Singapore",
    "spain": "Madrid",
    "switzerland": "Zurich",
    "united kingdom": "London",
    "united states": "New York",
    "vietnam": "Ho Chi Minh City",
}


def _env_str(name: str, default: str) -> str:
    """Return environment variable *name*, or *default* if it is unusable.

    The variable counts as unusable when it is unset, empty, or made up of
    whitespace only. A usable value is returned exactly as stored, including
    any surrounding whitespace.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    return raw


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an integer.

    Falls back to *default* when the variable is unset, blank, or cannot be
    parsed by ``int()``.
    """
    raw = os.environ.get(name, "")
    if not raw.strip():
        return default
    try:
        parsed = int(raw)
    except ValueError:
        # Malformed numbers are treated the same as a missing setting.
        return default
    return parsed


def _env_bool(name: str, default: bool) -> bool:
    """Return environment variable *name* interpreted as a boolean flag.

    An unset or blank variable yields *default*. Otherwise the result is True
    only for "1", "true", "yes", "y" or "on" (case-insensitive, surrounding
    whitespace ignored); any other text yields False regardless of *default*.
    """
    token = (os.environ.get(name) or "").strip()
    if not token:
        return default
    return token.lower() in {"1", "true", "yes", "y", "on"}


def _emit_progress(event: str, payload: dict) -> None:
    """Print one progress record to stdout as a prefixed, ASCII-only JSON line.

    The record is *payload* with an ``"event"`` key placed first; a payload
    that carries its own ``"event"`` entry overrides the *event* argument.
    The line is flushed immediately.
    """
    record = {"event": event}
    record.update(payload)
    line = PROGRESS_PREFIX + json.dumps(record, ensure_ascii=True)
    print(line, flush=True)


def _parse_sites(raw: str) -> list[str]:
    """Split a comma-separated site list into normalized site names.

    Each entry is stripped of surrounding whitespace and lower-cased, empty
    entries are discarded, and repeated names are collapsed to their first
    occurrence (order is otherwise preserved).

    Lower-casing matters because ``main`` singles out Glassdoor by comparing
    against the literal ``"glassdoor"``; without it an entry such as
    ``"Glassdoor"`` would skip the Glassdoor-specific location handling.
    """
    names = (part.strip().lower() for part in raw.split(","))
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(name for name in names if name))


def _normalize_country_token(value: str) -> str:
    """Reduce a country name to the canonical token used for comparisons.

    The name is lower-cased and its whitespace runs are collapsed to single
    spaces (leading/trailing whitespace removed); the result is then mapped
    through ``COUNTRY_ALIASES``, falling back to itself when no alias exists.
    """
    # str.split() with no separator already drops leading/trailing whitespace.
    token = " ".join(value.lower().split())
    if token in COUNTRY_ALIASES:
        return COUNTRY_ALIASES[token]
    return token


def _is_country_level_location(location: str, country_indeed: str) -> bool:
    """Tell whether *location* names the same country as *country_indeed*.

    Both values are compared after ``_normalize_country_token``. If either
    one is blank the answer is always False.
    """
    if location.strip() and country_indeed.strip():
        location_token = _normalize_country_token(location)
        country_token = _normalize_country_token(country_indeed)
        return location_token == country_token
    return False


def _glassdoor_city_for_country(country_indeed: str, location: str) -> str | None:
    """Look up the Glassdoor fallback city for a country.

    *country_indeed* is used as the country name when it is non-empty;
    otherwise *location* is used. Returns None when the normalized country
    has no entry in ``GLASSDOOR_COUNTRY_TO_CITY``.
    """
    country_name = country_indeed if country_indeed else location
    return GLASSDOOR_COUNTRY_TO_CITY.get(_normalize_country_token(country_name))


def _scrape_for_sites(
    *,
    sites: list[str],
    search_term: str,
    location: str | None,
    results_wanted: int,
    hours_old: int,
    country_indeed: str,
    linkedin_fetch_description: bool,
    is_remote: bool,
) -> pd.DataFrame:
    """Run a single ``scrape_jobs`` call for *sites* and return its result.

    Each argument is forwarded under the keyword of the same name, except
    *sites*, which is passed as ``site_name``. *location* is forwarded only
    when it contains non-whitespace text; otherwise the ``location`` keyword
    is left out of the call entirely.
    """
    request: dict[str, object] = dict(
        site_name=sites,
        search_term=search_term,
        results_wanted=results_wanted,
        hours_old=hours_old,
        country_indeed=country_indeed,
        linkedin_fetch_description=linkedin_fetch_description,
        is_remote=is_remote,
    )
    has_location = location is not None and location.strip() != ""
    if has_location:
        request["location"] = location
    return scrape_jobs(**request)


def main() -> int:
    """Scrape one search term as configured by ``JOBSPY_*`` environment variables.

    Settings (defaults in parentheses): ``JOBSPY_SITES`` ("indeed,linkedin"),
    ``JOBSPY_SEARCH_TERM`` ("web developer"), ``JOBSPY_LOCATION`` ("UK"),
    ``JOBSPY_RESULTS_WANTED`` (200), ``JOBSPY_HOURS_OLD`` (72),
    ``JOBSPY_COUNTRY_INDEED`` ("UK"), ``JOBSPY_LINKEDIN_FETCH_DESCRIPTION``
    (true), ``JOBSPY_IS_REMOTE`` (false), ``JOBSPY_TERM_INDEX`` (1),
    ``JOBSPY_TERM_TOTAL`` (1), ``JOBSPY_OUTPUT_CSV`` ("jobs.csv") and
    ``JOBSPY_OUTPUT_JSON`` (the CSV path with a ``.json`` suffix).

    Glassdoor is scraped in a separate call from the other sites so that a
    country-level location can be swapped for a city for that site only.
    ``term_start`` and ``term_complete`` progress events are emitted around
    the scraping, then the combined rows are written to the CSV and JSON
    paths (parent directories are created if missing).

    Returns:
        Always 0; the ``__main__`` guard uses it as the process exit status.
    """
    site_names = _parse_sites(_env_str("JOBSPY_SITES", "indeed,linkedin"))
    search_term = _env_str("JOBSPY_SEARCH_TERM", "web developer")
    location = _env_str("JOBSPY_LOCATION", "UK")
    results_wanted = _env_int("JOBSPY_RESULTS_WANTED", 200)
    hours_old = _env_int("JOBSPY_HOURS_OLD", 72)
    country_indeed = _env_str("JOBSPY_COUNTRY_INDEED", "UK")
    linkedin_fetch_description = _env_bool("JOBSPY_LINKEDIN_FETCH_DESCRIPTION", True)
    is_remote = _env_bool("JOBSPY_IS_REMOTE", False)
    term_index = _env_int("JOBSPY_TERM_INDEX", 1)
    term_total = _env_int("JOBSPY_TERM_TOTAL", 1)

    output_csv = Path(_env_str("JOBSPY_OUTPUT_CSV", "jobs.csv"))
    default_json = str(output_csv.with_suffix(".json"))
    output_json = Path(_env_str("JOBSPY_OUTPUT_JSON", default_json))
    for target in (output_csv, output_json):
        target.parent.mkdir(parents=True, exist_ok=True)

    # Fields common to both progress events for this term.
    term_info = {
        "termIndex": term_index,
        "termTotal": term_total,
        "searchTerm": search_term,
    }
    print(f"jobspy: Search term: {search_term}")
    _emit_progress("term_start", term_info)

    # Everything except the site list and location is identical across calls.
    shared_args = {
        "search_term": search_term,
        "results_wanted": results_wanted,
        "hours_old": hours_old,
        "country_indeed": country_indeed,
        "linkedin_fetch_description": linkedin_fetch_description,
        "is_remote": is_remote,
    }
    collected: list[pd.DataFrame] = []

    other_sites = [name for name in site_names if name != "glassdoor"]
    if other_sites:
        batch = _scrape_for_sites(sites=other_sites, location=location, **shared_args)
        collected.append(batch)

    if "glassdoor" in site_names:
        # Glassdoor works best with city-level location terms.
        country_level = _is_country_level_location(location, country_indeed)
        fallback_city = (
            _glassdoor_city_for_country(country_indeed, location)
            if country_level
            else None
        )
        if fallback_city:
            print(
                "jobspy: Glassdoor location matched country; using city fallback "
                f"({fallback_city})"
            )
        elif country_level:
            print(
                "jobspy: Glassdoor location matched country; keeping original location"
            )
        glassdoor_location = fallback_city or location
        collected.append(
            _scrape_for_sites(
                sites=["glassdoor"],
                location=glassdoor_location,
                **shared_args,
            )
        )

    if collected:
        results = pd.concat(collected, ignore_index=True)
    else:
        results = pd.DataFrame()

    found = len(results)
    print(f"Found {found} jobs")
    _emit_progress("term_complete", {**term_info, "jobsFoundTerm": found})

    results.to_csv(
        output_csv,
        quoting=csv.QUOTE_NONNUMERIC,
        escapechar="\\",
        index=False,
    )
    results.to_json(output_json, orient="records", force_ascii=False)

    print(f"Wrote CSV:  {output_csv}")
    print(f"Wrote JSON: {output_json}")
    return 0


# Script entry point: run main() and exit with the status code it returns.
if __name__ == "__main__":
    raise SystemExit(main())