Merge PR #17: add voice transcription support with Groq
This commit is contained in:
commit
9bd051f744
@ -60,6 +60,12 @@
|
|||||||
|
|
||||||
## 📦 Install
|
## 📦 Install
|
||||||
|
|
||||||
|
**Install with [uv](https://github.com/astral-sh/uv)** (recommended for speed)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv tool install nanobot-ai
|
||||||
|
```
|
||||||
|
|
||||||
**Install from PyPI**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -335,6 +341,7 @@ PRs welcome! The codebase is intentionally small and readable. 🤗
|
|||||||
|
|
||||||
**Roadmap** — Pick an item and [open a PR](https://github.com/HKUDS/nanobot/pulls)!
|
**Roadmap** — Pick an item and [open a PR](https://github.com/HKUDS/nanobot/pulls)!
|
||||||
|
|
||||||
|
- [x] **Voice Transcription** — Support for Groq Whisper (Issue #13)
|
||||||
- [ ] **Multi-modal** — See and hear (images, voice, video)
|
- [ ] **Multi-modal** — See and hear (images, voice, video)
|
||||||
- [ ] **Long-term memory** — Never forget important context
|
- [ ] **Long-term memory** — Never forget important context
|
||||||
- [ ] **Better reasoning** — Multi-step planning and reflection
|
- [ ] **Better reasoning** — Multi-step planning and reflection
|
||||||
|
|||||||
@ -160,6 +160,11 @@ export class WhatsAppClient {
|
|||||||
return `[Document] ${message.documentMessage.caption}`;
|
return `[Document] ${message.documentMessage.caption}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Voice/Audio message
|
||||||
|
if (message.audioMessage) {
|
||||||
|
return `[Voice Message]`;
|
||||||
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -37,7 +37,9 @@ class ChannelManager:
|
|||||||
try:
|
try:
|
||||||
from nanobot.channels.telegram import TelegramChannel
|
from nanobot.channels.telegram import TelegramChannel
|
||||||
self.channels["telegram"] = TelegramChannel(
|
self.channels["telegram"] = TelegramChannel(
|
||||||
self.config.channels.telegram, self.bus
|
self.config.channels.telegram,
|
||||||
|
self.bus,
|
||||||
|
groq_api_key=self.config.providers.groq.api_key,
|
||||||
)
|
)
|
||||||
logger.info("Telegram channel enabled")
|
logger.info("Telegram channel enabled")
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
|
|||||||
@ -85,9 +85,10 @@ class TelegramChannel(BaseChannel):
|
|||||||
|
|
||||||
name = "telegram"
|
name = "telegram"
|
||||||
|
|
||||||
def __init__(self, config: TelegramConfig, bus: MessageBus):
|
def __init__(self, config: TelegramConfig, bus: MessageBus, groq_api_key: str = ""):
|
||||||
super().__init__(config, bus)
|
super().__init__(config, bus)
|
||||||
self.config: TelegramConfig = config
|
self.config: TelegramConfig = config
|
||||||
|
self.groq_api_key = groq_api_key
|
||||||
self._app: Application | None = None
|
self._app: Application | None = None
|
||||||
self._chat_ids: dict[str, int] = {} # Map sender_id to chat_id for replies
|
self._chat_ids: dict[str, int] = {} # Map sender_id to chat_id for replies
|
||||||
|
|
||||||
@ -249,7 +250,20 @@ class TelegramChannel(BaseChannel):
|
|||||||
await file.download_to_drive(str(file_path))
|
await file.download_to_drive(str(file_path))
|
||||||
|
|
||||||
media_paths.append(str(file_path))
|
media_paths.append(str(file_path))
|
||||||
content_parts.append(f"[{media_type}: {file_path}]")
|
|
||||||
|
# Handle voice transcription
|
||||||
|
if media_type == "voice" or media_type == "audio":
|
||||||
|
from nanobot.providers.transcription import GroqTranscriptionProvider
|
||||||
|
transcriber = GroqTranscriptionProvider(api_key=self.groq_api_key)
|
||||||
|
transcription = await transcriber.transcribe(file_path)
|
||||||
|
if transcription:
|
||||||
|
logger.info(f"Transcribed {media_type}: {transcription[:50]}...")
|
||||||
|
content_parts.append(f"[transcription: {transcription}]")
|
||||||
|
else:
|
||||||
|
content_parts.append(f"[{media_type}: {file_path}]")
|
||||||
|
else:
|
||||||
|
content_parts.append(f"[{media_type}: {file_path}]")
|
||||||
|
|
||||||
logger.debug(f"Downloaded {media_type} to {file_path}")
|
logger.debug(f"Downloaded {media_type} to {file_path}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to download media: {e}")
|
logger.error(f"Failed to download media: {e}")
|
||||||
|
|||||||
@ -107,6 +107,11 @@ class WhatsAppChannel(BaseChannel):
|
|||||||
# Extract just the phone number as chat_id
|
# Extract just the phone number as chat_id
|
||||||
chat_id = sender.split("@")[0] if "@" in sender else sender
|
chat_id = sender.split("@")[0] if "@" in sender else sender
|
||||||
|
|
||||||
|
# Handle voice transcription if it's a voice message
|
||||||
|
if content == "[Voice Message]":
|
||||||
|
logger.info(f"Voice message received from {chat_id}, but direct download from bridge is not yet supported.")
|
||||||
|
content = "[Voice Message: Transcription not available for WhatsApp yet]"
|
||||||
|
|
||||||
await self._handle_message(
|
await self._handle_message(
|
||||||
sender_id=chat_id,
|
sender_id=chat_id,
|
||||||
chat_id=sender, # Use full JID for replies
|
chat_id=sender, # Use full JID for replies
|
||||||
|
|||||||
@ -50,6 +50,7 @@ class ProvidersConfig(BaseModel):
|
|||||||
anthropic: ProviderConfig = Field(default_factory=ProviderConfig)
|
anthropic: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
openai: ProviderConfig = Field(default_factory=ProviderConfig)
|
openai: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
openrouter: ProviderConfig = Field(default_factory=ProviderConfig)
|
openrouter: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
|
groq: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
zhipu: ProviderConfig = Field(default_factory=ProviderConfig)
|
zhipu: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
vllm: ProviderConfig = Field(default_factory=ProviderConfig)
|
vllm: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
gemini: ProviderConfig = Field(default_factory=ProviderConfig)
|
gemini: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
@ -91,13 +92,14 @@ class Config(BaseSettings):
|
|||||||
return Path(self.agents.defaults.workspace).expanduser()
|
return Path(self.agents.defaults.workspace).expanduser()
|
||||||
|
|
||||||
def get_api_key(self) -> str | None:
|
def get_api_key(self) -> str | None:
|
||||||
"""Get API key in priority order: OpenRouter > Anthropic > OpenAI > Gemini > Zhipu > vLLM."""
|
"""Get API key in priority order: OpenRouter > Anthropic > OpenAI > Gemini > Zhipu > Groq > vLLM."""
|
||||||
return (
|
return (
|
||||||
self.providers.openrouter.api_key or
|
self.providers.openrouter.api_key or
|
||||||
self.providers.anthropic.api_key or
|
self.providers.anthropic.api_key or
|
||||||
self.providers.openai.api_key or
|
self.providers.openai.api_key or
|
||||||
self.providers.gemini.api_key or
|
self.providers.gemini.api_key or
|
||||||
self.providers.zhipu.api_key or
|
self.providers.zhipu.api_key or
|
||||||
|
self.providers.groq.api_key or
|
||||||
self.providers.vllm.api_key or
|
self.providers.vllm.api_key or
|
||||||
None
|
None
|
||||||
)
|
)
|
||||||
|
|||||||
@ -51,6 +51,8 @@ class LiteLLMProvider(LLMProvider):
|
|||||||
os.environ.setdefault("GEMINI_API_KEY", api_key)
|
os.environ.setdefault("GEMINI_API_KEY", api_key)
|
||||||
elif "zhipu" in default_model or "glm" in default_model or "zai" in default_model:
|
elif "zhipu" in default_model or "glm" in default_model or "zai" in default_model:
|
||||||
os.environ.setdefault("ZHIPUAI_API_KEY", api_key)
|
os.environ.setdefault("ZHIPUAI_API_KEY", api_key)
|
||||||
|
elif "groq" in default_model:
|
||||||
|
os.environ.setdefault("GROQ_API_KEY", api_key)
|
||||||
|
|
||||||
if api_base:
|
if api_base:
|
||||||
litellm.api_base = api_base
|
litellm.api_base = api_base
|
||||||
|
|||||||
65
nanobot/providers/transcription.py
Normal file
65
nanobot/providers/transcription.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
"""Voice transcription provider using Groq."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
|
||||||
|
class GroqTranscriptionProvider:
|
||||||
|
"""
|
||||||
|
Voice transcription provider using Groq's Whisper API.
|
||||||
|
|
||||||
|
Groq offers extremely fast transcription with a generous free tier.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, api_key: str | None = None):
|
||||||
|
self.api_key = api_key or os.environ.get("GROQ_API_KEY")
|
||||||
|
self.api_url = "https://api.groq.com/openai/v1/audio/transcriptions"
|
||||||
|
|
||||||
|
async def transcribe(self, file_path: str | Path) -> str:
|
||||||
|
"""
|
||||||
|
Transcribe an audio file using Groq.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the audio file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Transcribed text.
|
||||||
|
"""
|
||||||
|
if not self.api_key:
|
||||||
|
logger.warning("Groq API key not configured for transcription")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
path = Path(file_path)
|
||||||
|
if not path.exists():
|
||||||
|
logger.error(f"Audio file not found: {file_path}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
files = {
|
||||||
|
"file": (path.name, f),
|
||||||
|
"model": (None, "whisper-large-v3"),
|
||||||
|
}
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {self.api_key}",
|
||||||
|
}
|
||||||
|
|
||||||
|
response = await client.post(
|
||||||
|
self.api_url,
|
||||||
|
headers=headers,
|
||||||
|
files=files,
|
||||||
|
timeout=60.0
|
||||||
|
)
|
||||||
|
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
return data.get("text", "")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Groq transcription error: {e}")
|
||||||
|
return ""
|
||||||
Loading…
x
Reference in New Issue
Block a user