"""Deduplicate events across multiple providers.""" from __future__ import annotations import logging import re from difflib import SequenceMatcher from src.models import NormalizedEvent logger = logging.getLogger(__name__) # Cross-provider titles for the same show often differ slightly. _NAME_SIMILARITY_MIN = 0.78 # Venue strings vary (suffixes, punctuation); stricter than names. _VENUE_SIMILARITY_MIN = 0.88 # Same calendar slot at the same venue for the same pro team (e.g. two Ticketmaster # listings for one Jays game: full title vs promo night). _TEAM_SLOT_KEYS: tuple[str, ...] = ( "blue jays", "raptors", "maple leafs", "toronto marlies", "marlies", "toronto fc", "argonauts", ) # Prefer the cleaner listing when merging promo variants of the same game. _PROMO_VARIANT_HINTS: tuple[str, ...] = ( "loonie", "theme night", "special event", "bobblehead", "giveaway", ) _WS_RE = re.compile(r"\s+") def _collapse_ws(s: str) -> str: return _WS_RE.sub(" ", s.strip().lower()) def _similarity(a: str, b: str) -> float: if not a or not b: return 0.0 ca, cb = _collapse_ws(a), _collapse_ws(b) if ca == cb: return 1.0 return SequenceMatcher(None, ca, cb).ratio() def _is_same_event(a: NormalizedEvent, b: NormalizedEvent) -> bool: if a.event_date != b.event_date: return False if _similarity(a.venue, b.venue) < _VENUE_SIMILARITY_MIN: return False if _similarity(a.name, b.name) < _NAME_SIMILARITY_MIN: return False return True def _team_keys_in(name: str) -> frozenset[str]: n = name.lower() return frozenset(k for k in _TEAM_SLOT_KEYS if k in n) def _is_same_game_slot(a: NormalizedEvent, b: NormalizedEvent) -> bool: if a.event_date != b.event_date: return False if _similarity(a.venue, b.venue) < _VENUE_SIMILARITY_MIN: return False ka, kb = _team_keys_in(a.name), _team_keys_in(b.name) if not ka or not kb: return False return bool(ka & kb) def _promo_variant_penalty(name: str) -> int: n = name.lower() return sum(1 for h in _PROMO_VARIANT_HINTS if h in n) def _pick_representative(cluster: list[NormalizedEvent]) -> NormalizedEvent: """Prefer richer records when merging duplicates (pre-scoring).""" source_rank = {"ticketmaster": 2, "seatgeek": 1} def key(e: NormalizedEvent) -> tuple: return ( _promo_variant_penalty(e.name), not bool(e.url), -source_rank.get(e.source, 0), -len(e.name), e.name, ) return min(cluster, key=key) def deduplicate(events: list[NormalizedEvent]) -> list[NormalizedEvent]: """Remove duplicate events across providers. Strategy: same calendar day + fuzzy venue + fuzzy event name. Exact ``dedup_key`` matches are a subset and merge into one cluster. """ if not events: return [] clusters: list[list[NormalizedEvent]] = [] for e in events: for cluster in clusters: if any(_is_same_event(x, e) or _is_same_game_slot(x, e) for x in cluster): cluster.append(e) break else: clusters.append([e]) deduped = [_pick_representative(c) for c in clusters] removed = len(events) - len(deduped) if removed: logger.info("Deduplication removed %d duplicate(s)", removed) return deduped