#!/usr/bin/env python3
"""
ASR Service using faster-whisper.

Provides HTTP and WebSocket endpoints for speech-to-text transcription.
"""

import logging
import io
import asyncio
import numpy as np
from typing import Optional, Dict, Any
from pathlib import Path

# faster-whisper is optional at import time; ASRService.__init__ enforces it.
try:
    from faster_whisper import WhisperModel
    HAS_FASTER_WHISPER = True
except ImportError:
    HAS_FASTER_WHISPER = False
    logging.warning("faster-whisper not available. Install with: pip install faster-whisper")

# soundfile is only needed by transcribe_file(), which checks this flag.
try:
    import soundfile as sf
    HAS_SOUNDFILE = True
except ImportError:
    HAS_SOUNDFILE = False

logger = logging.getLogger(__name__)


def _collect_segments(segments):
    """Collect whisper segments into serializable dicts plus a list of texts.

    Args:
        segments: Iterable of segment objects exposing ``.start``, ``.end``
            and ``.text`` attributes (as yielded by WhisperModel.transcribe).

    Returns:
        Tuple ``(text_segments, full_text)`` where ``text_segments`` is a
        list of ``{"start", "end", "text"}`` dicts (text stripped) and
        ``full_text`` is the list of stripped segment texts in order.
    """
    text_segments = []
    full_text = []
    for segment in segments:
        text = segment.text.strip()
        text_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": text
        })
        full_text.append(text)
    return text_segments, full_text


class ASRService:
    """ASR service using faster-whisper."""

    def __init__(
        self,
        model_size: str = "small",
        device: str = "cpu",
        compute_type: str = "int8",
        language: Optional[str] = "en"
    ):
        """
        Initialize ASR service and load the Whisper model.

        Args:
            model_size: Model size (tiny, base, small, medium, large)
            device: Device to use (cpu, cuda)
            compute_type: Compute type (int8, int8_float16, float16, float32)
            language: Language code (None for auto-detect)

        Raises:
            ImportError: If faster-whisper is not installed.
            Exception: Propagated from WhisperModel if the model fails to load.
        """
        if not HAS_FASTER_WHISPER:
            raise ImportError("faster-whisper not installed. Install with: pip install faster-whisper")

        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.language = language

        logger.info(f"Loading Whisper model: {model_size} on {device}")
        try:
            self.model = WhisperModel(
                model_size,
                device=device,
                compute_type=compute_type
            )
            logger.info("ASR model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading ASR model: {e}")
            raise

    def transcribe_file(
        self,
        audio_file: bytes,
        format: str = "json",
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Transcribe an audio file.

        Args:
            audio_file: Audio file bytes (any format soundfile can decode)
            format: Response format ("text" or "json")
            language: Language code (None falls back to the service default)

        Returns:
            ``{"text": ...}`` when format == "text", otherwise a dict with
            "text", "segments", "language" and "duration" keys.

        Raises:
            ImportError: If soundfile is not installed.
            Exception: Propagated decode/transcription errors.
        """
        # Fail with a clear message instead of a NameError on `sf` below.
        if not HAS_SOUNDFILE:
            raise ImportError("soundfile not installed. Install with: pip install soundfile")

        try:
            # Decode audio from the in-memory bytes.
            audio_data, sample_rate = sf.read(io.BytesIO(audio_file))

            # Convert to mono if stereo (average across channels).
            if len(audio_data.shape) > 1:
                audio_data = np.mean(audio_data, axis=1)

            # faster-whisper expects float32; sf.read returns float64 by default.
            audio_data = audio_data.astype(np.float32)

            # Transcribe
            segments, info = self.model.transcribe(
                audio_data,
                language=language or self.language,
                beam_size=5
            )

            text_segments, full_text = _collect_segments(segments)
            full_text = " ".join(full_text)

            if format == "text":
                return {"text": full_text}

            return {
                "text": full_text,
                "segments": text_segments,
                "language": info.language,
                "duration": info.duration
            }
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            raise

    def transcribe_stream(
        self,
        audio_chunks: list,
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Transcribe streaming audio chunks.

        Args:
            audio_chunks: List of audio chunks (numpy float32 arrays —
                presumably mono at the model's expected sample rate; confirm
                against the caller)
            language: Language code (None falls back to the service default)

        Returns:
            Dict with "text", "segments" and "language" keys. An empty
            chunk list yields an empty transcription rather than an error.

        Raises:
            Exception: Propagated transcription errors.
        """
        # Guard: np.concatenate raises ValueError on an empty list.
        if not audio_chunks:
            return {
                "text": "",
                "segments": [],
                "language": language or self.language
            }

        try:
            # Stitch the chunks into one contiguous signal.
            audio_data = np.concatenate(audio_chunks)

            # Transcribe
            segments, info = self.model.transcribe(
                audio_data,
                language=language or self.language,
                beam_size=5
            )

            text_segments, full_text = _collect_segments(segments)

            return {
                "text": " ".join(full_text),
                "segments": text_segments,
                "language": info.language
            }
        except Exception as e:
            logger.error(f"Streaming transcription error: {e}")
            raise


# Global service instance (lazily created by get_service).
_service: Optional[ASRService] = None


def get_service() -> ASRService:
    """Get or create the singleton ASR service instance.

    Raises:
        ImportError: If faster-whisper is not installed (from ASRService).
    """
    global _service
    if _service is None:
        _service = ASRService(
            model_size="small",
            device="cpu",  # Can be "cuda" if GPU available
            compute_type="int8",
            language="en"
        )
    return _service