diff --git a/README.md b/README.md index ab1f947..ec73b51 100644 --- a/README.md +++ b/README.md @@ -329,6 +329,7 @@ nanobot/ ## 🗺️ Roadmap +- [x] **Voice Transcription** — Support for Groq Whisper (Issue #13) - [ ] **Multi-modal** — See and hear (images, voice, video) - [ ] **Long-term memory** — Never forget important context - [ ] **Better reasoning** — Multi-step planning and reflection diff --git a/bridge/src/whatsapp.ts b/bridge/src/whatsapp.ts index 4185632..a3a82fc 100644 --- a/bridge/src/whatsapp.ts +++ b/bridge/src/whatsapp.ts @@ -160,6 +160,11 @@ export class WhatsAppClient { return `[Document] ${message.documentMessage.caption}`; } + // Voice/Audio message + if (message.audioMessage) { + return `[Voice Message]`; + } + return null; } diff --git a/nanobot/channels/manager.py b/nanobot/channels/manager.py index 04abf5f..c32aa3d 100644 --- a/nanobot/channels/manager.py +++ b/nanobot/channels/manager.py @@ -36,6 +36,8 @@ class ChannelManager: if self.config.channels.telegram.enabled: try: from nanobot.channels.telegram import TelegramChannel + # Inject parent config for access to providers + self.config.channels.telegram.parent = self.config self.channels["telegram"] = TelegramChannel( self.config.channels.telegram, self.bus ) @@ -47,6 +49,8 @@ class ChannelManager: if self.config.channels.whatsapp.enabled: try: from nanobot.channels.whatsapp import WhatsAppChannel + # Inject parent config for access to providers + self.config.channels.whatsapp.parent = self.config self.channels["whatsapp"] = WhatsAppChannel( self.config.channels.whatsapp, self.bus ) diff --git a/nanobot/channels/telegram.py b/nanobot/channels/telegram.py index 840c250..dc2f77c 100644 --- a/nanobot/channels/telegram.py +++ b/nanobot/channels/telegram.py @@ -247,7 +247,25 @@ class TelegramChannel(BaseChannel): await file.download_to_drive(str(file_path)) media_paths.append(str(file_path)) - content_parts.append(f"[{media_type}: {file_path}]") + + # Handle voice transcription + 
if media_type == "voice" or media_type == "audio": + from nanobot.providers.transcription import GroqTranscriptionProvider + # Try to get Groq API key from config + groq_key = None + if hasattr(self.config, 'parent') and hasattr(self.config.parent, 'providers'): + groq_key = self.config.parent.providers.groq.api_key + + transcriber = GroqTranscriptionProvider(api_key=groq_key) + transcription = await transcriber.transcribe(file_path) + if transcription: + logger.info(f"Transcribed {media_type}: {transcription[:50]}...") + content_parts.append(f"[transcription: {transcription}]") + else: + content_parts.append(f"[{media_type}: {file_path}]") + else: + content_parts.append(f"[{media_type}: {file_path}]") + logger.debug(f"Downloaded {media_type} to {file_path}") except Exception as e: logger.error(f"Failed to download media: {e}") diff --git a/nanobot/channels/whatsapp.py b/nanobot/channels/whatsapp.py index efbd3e1..c14a6c3 100644 --- a/nanobot/channels/whatsapp.py +++ b/nanobot/channels/whatsapp.py @@ -107,6 +107,11 @@ class WhatsAppChannel(BaseChannel): # Extract just the phone number as chat_id chat_id = sender.split("@")[0] if "@" in sender else sender + # Handle voice transcription if it's a voice message + if content == "[Voice Message]": + logger.info(f"Voice message received from {chat_id}, but direct download from bridge is not yet supported.") + content = "[Voice Message: Transcription not available for WhatsApp yet]" + await self._handle_message( sender_id=chat_id, chat_id=sender, # Use full JID for replies diff --git a/nanobot/config/schema.py b/nanobot/config/schema.py index e30fbb2..ee245f1 100644 --- a/nanobot/config/schema.py +++ b/nanobot/config/schema.py @@ -50,6 +50,7 @@ class ProvidersConfig(BaseModel): anthropic: ProviderConfig = Field(default_factory=ProviderConfig) openai: ProviderConfig = Field(default_factory=ProviderConfig) openrouter: ProviderConfig = Field(default_factory=ProviderConfig) + groq: ProviderConfig = 
Field(default_factory=ProviderConfig) vllm: ProviderConfig = Field(default_factory=ProviderConfig) @@ -89,11 +90,12 @@ class Config(BaseSettings): return Path(self.agents.defaults.workspace).expanduser() def get_api_key(self) -> str | None: - """Get API key in priority order: OpenRouter > Anthropic > OpenAI > vLLM.""" + """Get API key in priority order: OpenRouter > Anthropic > OpenAI > Groq > vLLM.""" return ( self.providers.openrouter.api_key or self.providers.anthropic.api_key or self.providers.openai.api_key or + self.providers.groq.api_key or self.providers.vllm.api_key or None ) diff --git a/nanobot/providers/litellm_provider.py b/nanobot/providers/litellm_provider.py index 4e7305b..f8e8456 100644 --- a/nanobot/providers/litellm_provider.py +++ b/nanobot/providers/litellm_provider.py @@ -47,6 +47,8 @@ class LiteLLMProvider(LLMProvider): os.environ.setdefault("ANTHROPIC_API_KEY", api_key) elif "openai" in default_model or "gpt" in default_model: os.environ.setdefault("OPENAI_API_KEY", api_key) + elif "groq" in default_model: + os.environ.setdefault("GROQ_API_KEY", api_key) if api_base: litellm.api_base = api_base diff --git a/nanobot/providers/transcription.py b/nanobot/providers/transcription.py new file mode 100644 index 0000000..8ce909b --- /dev/null +++ b/nanobot/providers/transcription.py @@ -0,0 +1,65 @@ +"""Voice transcription provider using Groq.""" + +import os +from pathlib import Path +from typing import Any + +import httpx +from loguru import logger + + +class GroqTranscriptionProvider: + """ + Voice transcription provider using Groq's Whisper API. + + Groq offers extremely fast transcription with a generous free tier. + """ + + def __init__(self, api_key: str | None = None): + self.api_key = api_key or os.environ.get("GROQ_API_KEY") + self.api_url = "https://api.groq.com/openai/v1/audio/transcriptions" + + async def transcribe(self, file_path: str | Path) -> str: + """ + Transcribe an audio file using Groq. 
+ + Args: + file_path: Path to the audio file. + + Returns: + Transcribed text. + """ + if not self.api_key: + logger.warning("Groq API key not configured for transcription") + return "" + + path = Path(file_path) + if not path.exists(): + logger.error(f"Audio file not found: {file_path}") + return "" + + try: + async with httpx.AsyncClient() as client: + with open(path, "rb") as f: + files = { + "file": (path.name, f), + "model": (None, "whisper-large-v3"), + } + headers = { + "Authorization": f"Bearer {self.api_key}", + } + + response = await client.post( + self.api_url, + headers=headers, + files=files, + timeout=60.0 + ) + + response.raise_for_status() + data = response.json() + return data.get("text", "") + + except Exception as e: + logger.error(f"Groq transcription error: {e}") + return ""