Merge PR #17: add voice transcription support with Groq
This commit is contained in:
commit
9bd051f744
@ -60,6 +60,12 @@
|
|||||||
|
|
||||||
## 📦 Install
|
## 📦 Install
|
||||||
|
|
||||||
|
**Install with [uv](https://github.com/astral-sh/uv)** (recommended for speed)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv tool install nanobot-ai
|
||||||
|
```
|
||||||
|
|
||||||
**Install from PyPI**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -335,6 +341,7 @@ PRs welcome! The codebase is intentionally small and readable. 🤗
|
|||||||
|
|
||||||
**Roadmap** — Pick an item and [open a PR](https://github.com/HKUDS/nanobot/pulls)!
|
**Roadmap** — Pick an item and [open a PR](https://github.com/HKUDS/nanobot/pulls)!
|
||||||
|
|
||||||
|
- [x] **Voice Transcription** — Support for Groq Whisper (Issue #13)
|
||||||
- [ ] **Multi-modal** — See and hear (images, voice, video)
|
- [ ] **Multi-modal** — See and hear (images, voice, video)
|
||||||
- [ ] **Long-term memory** — Never forget important context
|
- [ ] **Long-term memory** — Never forget important context
|
||||||
- [ ] **Better reasoning** — Multi-step planning and reflection
|
- [ ] **Better reasoning** — Multi-step planning and reflection
|
||||||
|
|||||||
@ -160,6 +160,11 @@ export class WhatsAppClient {
|
|||||||
return `[Document] ${message.documentMessage.caption}`;
|
return `[Document] ${message.documentMessage.caption}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Voice/Audio message
|
||||||
|
if (message.audioMessage) {
|
||||||
|
return `[Voice Message]`;
|
||||||
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -37,7 +37,9 @@ class ChannelManager:
|
|||||||
try:
|
try:
|
||||||
from nanobot.channels.telegram import TelegramChannel
|
from nanobot.channels.telegram import TelegramChannel
|
||||||
self.channels["telegram"] = TelegramChannel(
|
self.channels["telegram"] = TelegramChannel(
|
||||||
self.config.channels.telegram, self.bus
|
self.config.channels.telegram,
|
||||||
|
self.bus,
|
||||||
|
groq_api_key=self.config.providers.groq.api_key,
|
||||||
)
|
)
|
||||||
logger.info("Telegram channel enabled")
|
logger.info("Telegram channel enabled")
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
|
|||||||
@ -85,9 +85,10 @@ class TelegramChannel(BaseChannel):
|
|||||||
|
|
||||||
name = "telegram"
|
name = "telegram"
|
||||||
|
|
||||||
def __init__(self, config: TelegramConfig, bus: MessageBus):
|
def __init__(self, config: TelegramConfig, bus: MessageBus, groq_api_key: str = ""):
|
||||||
super().__init__(config, bus)
|
super().__init__(config, bus)
|
||||||
self.config: TelegramConfig = config
|
self.config: TelegramConfig = config
|
||||||
|
self.groq_api_key = groq_api_key
|
||||||
self._app: Application | None = None
|
self._app: Application | None = None
|
||||||
self._chat_ids: dict[str, int] = {} # Map sender_id to chat_id for replies
|
self._chat_ids: dict[str, int] = {} # Map sender_id to chat_id for replies
|
||||||
|
|
||||||
@ -249,7 +250,20 @@ class TelegramChannel(BaseChannel):
|
|||||||
await file.download_to_drive(str(file_path))
|
await file.download_to_drive(str(file_path))
|
||||||
|
|
||||||
media_paths.append(str(file_path))
|
media_paths.append(str(file_path))
|
||||||
content_parts.append(f"[{media_type}: {file_path}]")
|
|
||||||
|
# Handle voice transcription
|
||||||
|
if media_type == "voice" or media_type == "audio":
|
||||||
|
from nanobot.providers.transcription import GroqTranscriptionProvider
|
||||||
|
transcriber = GroqTranscriptionProvider(api_key=self.groq_api_key)
|
||||||
|
transcription = await transcriber.transcribe(file_path)
|
||||||
|
if transcription:
|
||||||
|
logger.info(f"Transcribed {media_type}: {transcription[:50]}...")
|
||||||
|
content_parts.append(f"[transcription: {transcription}]")
|
||||||
|
else:
|
||||||
|
content_parts.append(f"[{media_type}: {file_path}]")
|
||||||
|
else:
|
||||||
|
content_parts.append(f"[{media_type}: {file_path}]")
|
||||||
|
|
||||||
logger.debug(f"Downloaded {media_type} to {file_path}")
|
logger.debug(f"Downloaded {media_type} to {file_path}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to download media: {e}")
|
logger.error(f"Failed to download media: {e}")
|
||||||
|
|||||||
@ -107,6 +107,11 @@ class WhatsAppChannel(BaseChannel):
|
|||||||
# Extract just the phone number as chat_id
|
# Extract just the phone number as chat_id
|
||||||
chat_id = sender.split("@")[0] if "@" in sender else sender
|
chat_id = sender.split("@")[0] if "@" in sender else sender
|
||||||
|
|
||||||
|
# Handle voice transcription if it's a voice message
|
||||||
|
if content == "[Voice Message]":
|
||||||
|
logger.info(f"Voice message received from {chat_id}, but direct download from bridge is not yet supported.")
|
||||||
|
content = "[Voice Message: Transcription not available for WhatsApp yet]"
|
||||||
|
|
||||||
await self._handle_message(
|
await self._handle_message(
|
||||||
sender_id=chat_id,
|
sender_id=chat_id,
|
||||||
chat_id=sender, # Use full JID for replies
|
chat_id=sender, # Use full JID for replies
|
||||||
|
|||||||
@ -50,6 +50,7 @@ class ProvidersConfig(BaseModel):
|
|||||||
anthropic: ProviderConfig = Field(default_factory=ProviderConfig)
|
anthropic: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
openai: ProviderConfig = Field(default_factory=ProviderConfig)
|
openai: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
openrouter: ProviderConfig = Field(default_factory=ProviderConfig)
|
openrouter: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
|
groq: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
zhipu: ProviderConfig = Field(default_factory=ProviderConfig)
|
zhipu: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
vllm: ProviderConfig = Field(default_factory=ProviderConfig)
|
vllm: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
gemini: ProviderConfig = Field(default_factory=ProviderConfig)
|
gemini: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
@ -91,13 +92,14 @@ class Config(BaseSettings):
|
|||||||
return Path(self.agents.defaults.workspace).expanduser()
|
return Path(self.agents.defaults.workspace).expanduser()
|
||||||
|
|
||||||
def get_api_key(self) -> str | None:
|
def get_api_key(self) -> str | None:
|
||||||
"""Get API key in priority order: OpenRouter > Anthropic > OpenAI > Gemini > Zhipu > vLLM."""
|
"""Get API key in priority order: OpenRouter > Anthropic > OpenAI > Gemini > Zhipu > Groq > vLLM."""
|
||||||
return (
|
return (
|
||||||
self.providers.openrouter.api_key or
|
self.providers.openrouter.api_key or
|
||||||
self.providers.anthropic.api_key or
|
self.providers.anthropic.api_key or
|
||||||
self.providers.openai.api_key or
|
self.providers.openai.api_key or
|
||||||
self.providers.gemini.api_key or
|
self.providers.gemini.api_key or
|
||||||
self.providers.zhipu.api_key or
|
self.providers.zhipu.api_key or
|
||||||
|
self.providers.groq.api_key or
|
||||||
self.providers.vllm.api_key or
|
self.providers.vllm.api_key or
|
||||||
None
|
None
|
||||||
)
|
)
|
||||||
|
|||||||
@ -51,6 +51,8 @@ class LiteLLMProvider(LLMProvider):
|
|||||||
os.environ.setdefault("GEMINI_API_KEY", api_key)
|
os.environ.setdefault("GEMINI_API_KEY", api_key)
|
||||||
elif "zhipu" in default_model or "glm" in default_model or "zai" in default_model:
|
elif "zhipu" in default_model or "glm" in default_model or "zai" in default_model:
|
||||||
os.environ.setdefault("ZHIPUAI_API_KEY", api_key)
|
os.environ.setdefault("ZHIPUAI_API_KEY", api_key)
|
||||||
|
elif "groq" in default_model:
|
||||||
|
os.environ.setdefault("GROQ_API_KEY", api_key)
|
||||||
|
|
||||||
if api_base:
|
if api_base:
|
||||||
litellm.api_base = api_base
|
litellm.api_base = api_base
|
||||||
|
|||||||
65
nanobot/providers/transcription.py
Normal file
65
nanobot/providers/transcription.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
"""Voice transcription provider using Groq."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
|
||||||
|
class GroqTranscriptionProvider:
|
||||||
|
"""
|
||||||
|
Voice transcription provider using Groq's Whisper API.
|
||||||
|
|
||||||
|
Groq offers extremely fast transcription with a generous free tier.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, api_key: str | None = None):
|
||||||
|
self.api_key = api_key or os.environ.get("GROQ_API_KEY")
|
||||||
|
self.api_url = "https://api.groq.com/openai/v1/audio/transcriptions"
|
||||||
|
|
||||||
|
async def transcribe(self, file_path: str | Path) -> str:
|
||||||
|
"""
|
||||||
|
Transcribe an audio file using Groq.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the audio file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Transcribed text.
|
||||||
|
"""
|
||||||
|
if not self.api_key:
|
||||||
|
logger.warning("Groq API key not configured for transcription")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
path = Path(file_path)
|
||||||
|
if not path.exists():
|
||||||
|
logger.error(f"Audio file not found: {file_path}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
files = {
|
||||||
|
"file": (path.name, f),
|
||||||
|
"model": (None, "whisper-large-v3"),
|
||||||
|
}
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {self.api_key}",
|
||||||
|
}
|
||||||
|
|
||||||
|
response = await client.post(
|
||||||
|
self.api_url,
|
||||||
|
headers=headers,
|
||||||
|
files=files,
|
||||||
|
timeout=60.0
|
||||||
|
)
|
||||||
|
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
return data.get("text", "")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Groq transcription error: {e}")
|
||||||
|
return ""
|
||||||
Loading…
x
Reference in New Issue
Block a user