feat: add vision support for image recognition in Telegram

This commit is contained in:
Yitong Li 2026-02-02 15:32:12 +08:00
parent 229fde021a
commit f4b081b83f
2 changed files with 63 additions and 10 deletions

View File

@ -1,8 +1,12 @@
"""Context builder for assembling agent prompts."""
import base64
import mimetypes
from pathlib import Path
from typing import Any
from loguru import logger
from nanobot.agent.memory import MemoryStore
from nanobot.agent.skills import SkillsLoader
@ -114,32 +118,80 @@ When remembering something, write to {workspace_path}/memory/MEMORY.md"""
self,
history: list[dict[str, Any]],
current_message: str,
skill_names: list[str] | None = None
skill_names: list[str] | None = None,
media: list[str] | None = None,
) -> list[dict[str, Any]]:
"""
Build the complete message list for an LLM call.
Args:
history: Previous conversation messages.
current_message: The new user message.
skill_names: Optional skills to include.
media: Optional list of local file paths for images/media.
Returns:
List of messages including system prompt.
"""
messages = []
# System prompt
system_prompt = self.build_system_prompt(skill_names)
messages.append({"role": "system", "content": system_prompt})
# History
messages.extend(history)
# Current message
messages.append({"role": "user", "content": current_message})
# Current message (with optional image attachments)
user_content = self._build_user_content(current_message, media)
messages.append({"role": "user", "content": user_content})
return messages
def _build_user_content(
self, text: str, media: list[str] | None
) -> str | list[dict[str, Any]]:
"""
Build user message content, optionally with images.
Returns a plain string if no media, or a multimodal content list
with base64-encoded images.
"""
if not media:
return text
content: list[dict[str, Any]] = []
for path in media:
encoded = self._encode_image(path)
if encoded:
content.append(encoded)
if not content:
return text
content.append({"type": "text", "text": text})
return content
@staticmethod
def _encode_image(file_path: str) -> dict[str, Any] | None:
"""Encode a local image file to a base64 data URL for the LLM."""
path = Path(file_path)
if not path.is_file():
logger.warning(f"Media file not found: {file_path}")
return None
mime, _ = mimetypes.guess_type(file_path)
if not mime or not mime.startswith("image/"):
logger.warning(f"Unsupported media type for {file_path}: {mime}")
return None
data = path.read_bytes()
b64 = base64.b64encode(data).decode("utf-8")
return {
"type": "image_url",
"image_url": {"url": f"data:{mime};base64,{b64}"},
}
def add_tool_result(
self,

View File

@ -152,7 +152,8 @@ class AgentLoop:
# Build initial messages (use get_history for LLM-formatted messages)
messages = self.context.build_messages(
history=session.get_history(),
current_message=msg.content
current_message=msg.content,
media=msg.media if msg.media else None,
)
# Agent loop