AtAnyRate/src/dedup.py
ilia 1a7298f755 Initial commit: EventRate pipeline, fuzzy dedup, Airbnb retries
Wire up Ticketmaster, SeatGeek, Telegram, scoring, Playwright stubs.
Deduplicate events with fuzzy venue/name matching. Retry calendar
updates on transient failures. Backlog tasks marked complete.

Made-with: Cursor
2026-04-04 12:31:53 -04:00

82 lines
2.2 KiB
Python

"""Deduplicate events across multiple providers."""
from __future__ import annotations
import logging
import re
from difflib import SequenceMatcher
from src.models import NormalizedEvent
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Minimum ``_similarity`` score for two event names to be treated as the same
# show. Cross-provider titles for the same show often differ slightly.
_NAME_SIMILARITY_MIN = 0.78
# Minimum ``_similarity`` score for two venues to be treated as the same place.
# Venue strings vary (suffixes, punctuation); stricter than names.
_VENUE_SIMILARITY_MIN = 0.88
# Matches a run of one or more whitespace characters.
_WS_RE = re.compile(r"\s+")
def _collapse_ws(s: str) -> str:
    """Normalise *s* for comparison.

    Leading/trailing whitespace is trimmed, the text is lowercased, and every
    internal run of whitespace is replaced by a single space.
    """
    trimmed = s.strip()
    lowered = trimmed.lower()
    return _WS_RE.sub(" ", lowered)
def _similarity(a: str, b: str) -> float:
    """Return a similarity score between 0.0 and 1.0 for two text fields.

    Both inputs are normalised with ``_collapse_ws`` (trim, lowercase,
    collapsed whitespace) before they are compared.

    Args:
        a: First string; may be empty or ``None``.
        b: Second string; may be empty or ``None``.

    Returns:
        ``0.0`` when either side is missing or blank, ``1.0`` when the
        normalised strings are equal, otherwise the ``SequenceMatcher``
        ratio of the normalised strings. The result does not depend on the
        order of the arguments.
    """
    if not a or not b:
        return 0.0
    ca, cb = _collapse_ws(a), _collapse_ws(b)
    # Whitespace-only input normalises to "", which is just as uninformative
    # as a missing value; without this check two blank fields would score 1.0.
    if not ca or not cb:
        return 0.0
    if ca == cb:
        return 1.0
    # SequenceMatcher.ratio() may depend on argument order; comparing in a
    # canonical (sorted) order keeps the score symmetric.
    first, second = sorted((ca, cb))
    # autojunk is disabled so that long strings (200+ characters) are not
    # scored with frequently occurring characters treated as junk.
    return SequenceMatcher(None, first, second, autojunk=False).ratio()
def _is_same_event(a: NormalizedEvent, b: NormalizedEvent) -> bool:
    """Decide whether two provider records describe the same event.

    The records must share an ``event_date``, and both their venues and their
    names must reach the respective minimum similarity thresholds. The venue
    is checked before the name; the name is only compared when the venue
    passes.
    """
    if a.event_date != b.event_date:
        return False
    venue_ok = _similarity(a.venue, b.venue) >= _VENUE_SIMILARITY_MIN
    return venue_ok and _similarity(a.name, b.name) >= _NAME_SIMILARITY_MIN
def _pick_representative(cluster: list[NormalizedEvent]) -> NormalizedEvent:
    """Choose the record that stands in for a cluster of duplicates.

    Records are ranked, in order of importance, by: having a URL, provider
    priority (ticketmaster over seatgeek over anything else), longer name,
    and finally the name itself. On a complete tie the earliest record in
    *cluster* wins.
    """
    provider_priority = {"ticketmaster": 2, "seatgeek": 1}

    def richness(event: NormalizedEvent) -> tuple:
        has_link = bool(event.url)
        priority = provider_priority.get(event.source, 0)
        title = event.name
        return has_link, priority, len(title), title

    return max(cluster, key=richness)
def deduplicate(events: list[NormalizedEvent]) -> list[NormalizedEvent]:
    """Remove duplicate events across providers.

    Strategy: same calendar day + fuzzy venue + fuzzy event name, as decided
    by ``_is_same_event``. Events are grouped into clusters in which every
    member matches at least one other member, and one representative per
    cluster (chosen by ``_pick_representative``) is returned.

    An event that matches members of several existing clusters links them
    together, so those clusters are merged. Without the merge such an event
    would join only the first cluster and the remaining duplicates would
    survive, making the result depend on the order of ``events``.

    NOTE(review): ``dedup_key`` is not consulted here; exact-key duplicates
    are merged only insofar as ``_is_same_event`` accepts them — confirm.

    Args:
        events: Events collected from all providers.

    Returns:
        One representative per cluster, ordered by the position of each
        cluster's earliest member in ``events``. Empty input yields ``[]``.
    """
    if not events:
        return []

    clusters: list[list[NormalizedEvent]] = []
    for event in events:
        matches = [
            cluster
            for cluster in clusters
            if any(_is_same_event(member, event) for member in cluster)
        ]
        if not matches:
            clusters.append([event])
            continue

        # Fold every bridged cluster into the earliest one so the output
        # order still follows first appearance.
        home, *bridged = matches
        for other in bridged:
            home.extend(other)
        home.append(event)
        if bridged:
            # Drop absorbed clusters by identity: clusters that merely hold
            # equal-looking events must not be mistaken for one another.
            absorbed = {id(other) for other in bridged}
            clusters = [c for c in clusters if id(c) not in absorbed]

    deduped = [_pick_representative(c) for c in clusters]
    removed = len(events) - len(deduped)
    if removed:
        logger.info("Deduplication removed %d duplicate(s)", removed)
    return deduped