""" Metrics collection for LLM services. Tracks request counts, latency, errors, and other metrics. """ import time from collections import defaultdict from datetime import datetime, timedelta from typing import Any, Dict, List, Optional from dataclasses import dataclass, asdict import json from pathlib import Path # Metrics storage METRICS_DIR = Path(__file__).parent.parent / "data" / "metrics" METRICS_DIR.mkdir(parents=True, exist_ok=True) @dataclass class AgentMetrics: """Metrics for a single agent.""" agent_type: str total_requests: int = 0 successful_requests: int = 0 failed_requests: int = 0 total_latency_ms: float = 0.0 total_tokens_in: int = 0 total_tokens_out: int = 0 tools_called_count: int = 0 last_request_time: Optional[str] = None class MetricsCollector: """Collects and aggregates metrics.""" def __init__(self): """Initialize metrics collector.""" self.metrics: Dict[str, AgentMetrics] = { "work": AgentMetrics(agent_type="work"), "family": AgentMetrics(agent_type="family") } self._hourly_stats: Dict[str, List[Dict[str, Any]]] = defaultdict(list) def record_request(self, agent_type: str, success: bool, latency_ms: float, tokens_in: Optional[int] = None, tokens_out: Optional[int] = None, tools_called: int = 0): """Record a request metric.""" if agent_type not in self.metrics: self.metrics[agent_type] = AgentMetrics(agent_type=agent_type) metrics = self.metrics[agent_type] metrics.total_requests += 1 if success: metrics.successful_requests += 1 else: metrics.failed_requests += 1 metrics.total_latency_ms += latency_ms metrics.total_tokens_in += tokens_in or 0 metrics.total_tokens_out += tokens_out or 0 metrics.tools_called_count += tools_called metrics.last_request_time = datetime.now().isoformat() # Record hourly stat hour_key = datetime.now().strftime("%Y-%m-%d-%H") self._hourly_stats[hour_key].append({ "timestamp": datetime.now().isoformat(), "agent_type": agent_type, "success": success, "latency_ms": latency_ms, "tokens_in": tokens_in, "tokens_out": tokens_out, "tools_called": tools_called }) def get_metrics(self, agent_type: Optional[str] = None) -> Dict[str, Any]: """Get current metrics.""" if agent_type: if agent_type in self.metrics: metrics = self.metrics[agent_type] return { "agent_type": metrics.agent_type, "total_requests": metrics.total_requests, "successful_requests": metrics.successful_requests, "failed_requests": metrics.failed_requests, "average_latency_ms": round( metrics.total_latency_ms / metrics.total_requests, 2 ) if metrics.total_requests > 0 else 0, "total_tokens_in": metrics.total_tokens_in, "total_tokens_out": metrics.total_tokens_out, "total_tokens": metrics.total_tokens_in + metrics.total_tokens_out, "tools_called_count": metrics.tools_called_count, "last_request_time": metrics.last_request_time } return {} # Return all metrics result = {} for agent, metrics in self.metrics.items(): result[agent] = { "agent_type": metrics.agent_type, "total_requests": metrics.total_requests, "successful_requests": metrics.successful_requests, "failed_requests": metrics.failed_requests, "average_latency_ms": round( metrics.total_latency_ms / metrics.total_requests, 2 ) if metrics.total_requests > 0 else 0, "total_tokens_in": metrics.total_tokens_in, "total_tokens_out": metrics.total_tokens_out, "total_tokens": metrics.total_tokens_in + metrics.total_tokens_out, "tools_called_count": metrics.tools_called_count, "last_request_time": metrics.last_request_time } return result def save_metrics(self): """Save metrics to file.""" metrics_file = METRICS_DIR / f"metrics_{datetime.now().strftime('%Y%m%d')}.json" data = { "timestamp": datetime.now().isoformat(), "metrics": self.get_metrics(), "hourly_stats": {k: v[-100:] for k, v in self._hourly_stats.items()} # Keep last 100 per hour } metrics_file.write_text(json.dumps(data, indent=2)) def get_recent_stats(self, hours: int = 24) -> List[Dict[str, Any]]: """Get recent statistics for the last N hours.""" cutoff = datetime.now() - timedelta(hours=hours) recent = [] for hour_key, stats in self._hourly_stats.items(): # Parse hour from key try: hour_time = datetime.strptime(hour_key, "%Y-%m-%d-%H") if hour_time >= cutoff: recent.extend(stats) except ValueError: continue return sorted(recent, key=lambda x: x["timestamp"]) # Global metrics collector _metrics_collector = MetricsCollector() def get_metrics_collector() -> MetricsCollector: """Get the global metrics collector instance.""" return _metrics_collector