Tests cover providers, dedup, Telegram, scoring, main runner, and Airbnb stubs. Ticketmaster and SeatGeek use configurable lat/lon/radius (Thornhill default). Pipeline filters noise listings, merges same-day sports duplicates, optional MIN_ALERT_SCORE, and Telegram severity summary. Made-with: Cursor
125 lines
3.4 KiB
Python
125 lines
3.4 KiB
Python
"""Deduplicate events across multiple providers."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from difflib import SequenceMatcher
|
|
|
|
from src.models import NormalizedEvent
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Cross-provider titles for the same show often differ slightly.
|
|
_NAME_SIMILARITY_MIN = 0.78
|
|
# Venue strings vary (suffixes, punctuation); stricter than names.
|
|
_VENUE_SIMILARITY_MIN = 0.88
|
|
|
|
# Same calendar slot at the same venue for the same pro team (e.g. two Ticketmaster
|
|
# listings for one Jays game: full title vs promo night).
|
|
_TEAM_SLOT_KEYS: tuple[str, ...] = (
|
|
"blue jays",
|
|
"raptors",
|
|
"maple leafs",
|
|
"toronto marlies",
|
|
"marlies",
|
|
"toronto fc",
|
|
"argonauts",
|
|
)
|
|
|
|
# Prefer the cleaner listing when merging promo variants of the same game.
|
|
_PROMO_VARIANT_HINTS: tuple[str, ...] = (
|
|
"loonie",
|
|
"theme night",
|
|
"special event",
|
|
"bobblehead",
|
|
"giveaway",
|
|
)
|
|
|
|
_WS_RE = re.compile(r"\s+")
|
|
|
|
|
|
def _collapse_ws(s: str) -> str:
    """Return *s* trimmed, lower-cased, with each whitespace run squeezed to one space."""
    trimmed = s.strip()
    lowered = trimmed.lower()
    return _WS_RE.sub(" ", lowered)
|
|
|
|
|
|
def _similarity(a: str, b: str) -> float:
    """Return a similarity score between 0.0 and 1.0 for two free-text labels.

    Both labels are normalised with ``_collapse_ws`` (trim, lower-case, squeeze
    whitespace) before they are compared.

    Args:
        a: First label (e.g. an event name or venue string).
        b: Second label.

    Returns:
        0.0 if either label is empty, or is blank once normalised, so that
        records with a missing field never match each other; 1.0 if the
        normalised labels are identical; otherwise the
        ``difflib.SequenceMatcher`` ratio of the normalised labels.
    """
    # Cheap early exit; also keeps falsy non-string values away from
    # ``_collapse_ws``.
    if not a or not b:
        return 0.0
    ca, cb = _collapse_ws(a), _collapse_ws(b)
    # Whitespace-only labels normalise to "". Without this check two blank
    # labels would hit the equality shortcut below and score a perfect 1.0,
    # contradicting the empty-string rule above.
    if not ca or not cb:
        return 0.0
    if ca == cb:
        return 1.0
    return SequenceMatcher(None, ca, cb).ratio()
|
|
|
|
|
|
def _is_same_event(a: NormalizedEvent, b: NormalizedEvent) -> bool:
    """Return True when *a* and *b* look like one event listed twice.

    All three conditions must hold, checked in this order: equal
    ``event_date``, venue similarity of at least ``_VENUE_SIMILARITY_MIN``, and
    name similarity of at least ``_NAME_SIMILARITY_MIN``.
    """
    return (
        a.event_date == b.event_date
        and _similarity(a.venue, b.venue) >= _VENUE_SIMILARITY_MIN
        and _similarity(a.name, b.name) >= _NAME_SIMILARITY_MIN
    )
|
|
|
|
|
|
def _team_keys_in(name: str) -> frozenset[str]:
    """Return every ``_TEAM_SLOT_KEYS`` entry found as a substring of lower-cased *name*."""
    lowered = name.lower()
    matches = [team for team in _TEAM_SLOT_KEYS if team in lowered]
    return frozenset(matches)
|
|
|
|
|
|
def _is_same_game_slot(a: NormalizedEvent, b: NormalizedEvent) -> bool:
    """Return True when *a* and *b* are same-day, same-venue listings for a shared team.

    Unlike ``_is_same_event`` the names are not compared for similarity; the
    two names only need to contain at least one common ``_TEAM_SLOT_KEYS``
    entry.
    """
    if a.event_date != b.event_date:
        return False

    venue_score = _similarity(a.venue, b.venue)
    if venue_score < _VENUE_SIMILARITY_MIN:
        return False

    teams_a = _team_keys_in(a.name)
    teams_b = _team_keys_in(b.name)
    # An empty key set on either side can never overlap, so no separate
    # emptiness check is needed.
    return not teams_a.isdisjoint(teams_b)
|
|
|
|
|
|
def _promo_variant_penalty(name: str) -> int:
    """Return how many ``_PROMO_VARIANT_HINTS`` phrases occur in lower-cased *name*."""
    lowered = name.lower()
    hits = [hint for hint in _PROMO_VARIANT_HINTS if hint in lowered]
    return len(hits)
|
|
|
|
|
|
def _pick_representative(cluster: list[NormalizedEvent]) -> NormalizedEvent:
    """Choose the one event that stands in for a cluster of duplicates.

    Candidates are ranked by, in order: fewest promo-variant hints in the name,
    having a URL, provider preference (ticketmaster, then seatgeek, then any
    other source), longest name, and finally the name itself as a tie-break.
    """
    provider_priority = {"ticketmaster": 2, "seatgeek": 1}

    def _rank(event: NormalizedEvent) -> tuple:
        # Every component is arranged so that a smaller value means "better",
        # which lets ``min`` pick the preferred record.
        promo_hits = _promo_variant_penalty(event.name)
        missing_url = not event.url
        priority = provider_priority.get(event.source, 0)
        return (promo_hits, missing_url, -priority, -len(event.name), event.name)

    return min(cluster, key=_rank)
|
|
|
|
|
|
def deduplicate(events: list[NormalizedEvent]) -> list[NormalizedEvent]:
    """Collapse duplicate listings into one representative per event.

    Events are processed in input order. Each event joins the first existing
    cluster that contains a member it matches, through either
    ``_is_same_event`` (same day, fuzzy venue, fuzzy name) or
    ``_is_same_game_slot`` (same day, fuzzy venue, shared team keyword);
    otherwise it starts a new cluster. Every cluster is then reduced to a
    single event with ``_pick_representative``.

    Returns:
        One event per cluster, in the order the clusters were created. An
        empty input yields an empty list. When anything was merged, the number
        of removed events is logged at INFO level.
    """
    if not events:
        return []

    def _matches(group: list[NormalizedEvent], candidate: NormalizedEvent) -> bool:
        # A single matching member is enough to admit the candidate.
        return any(
            _is_same_event(member, candidate) or _is_same_game_slot(member, candidate)
            for member in group
        )

    groups: list[list[NormalizedEvent]] = []
    for event in events:
        # First cluster that accepts the event wins; later clusters are not tried.
        home = next((group for group in groups if _matches(group, event)), None)
        if home is None:
            groups.append([event])
        else:
            home.append(event)

    survivors = [_pick_representative(group) for group in groups]
    dropped = len(events) - len(survivors)
    if dropped:
        logger.info("Deduplication removed %d duplicate(s)", dropped)
    return survivors
|