feat: add vision support for image recognition in Telegram

This commit is contained in:
Yitong Li 2026-02-02 15:32:12 +08:00
parent 229fde021a
commit f4b081b83f
2 changed files with 63 additions and 10 deletions

View File

@ -1,8 +1,12 @@
"""Context builder for assembling agent prompts.""" """Context builder for assembling agent prompts."""
import base64
import mimetypes
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from loguru import logger
from nanobot.agent.memory import MemoryStore from nanobot.agent.memory import MemoryStore
from nanobot.agent.skills import SkillsLoader from nanobot.agent.skills import SkillsLoader
@ -114,32 +118,80 @@ When remembering something, write to {workspace_path}/memory/MEMORY.md"""
self, self,
history: list[dict[str, Any]], history: list[dict[str, Any]],
current_message: str, current_message: str,
skill_names: list[str] | None = None skill_names: list[str] | None = None,
media: list[str] | None = None,
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
""" """
Build the complete message list for an LLM call. Build the complete message list for an LLM call.
Args: Args:
history: Previous conversation messages. history: Previous conversation messages.
current_message: The new user message. current_message: The new user message.
skill_names: Optional skills to include. skill_names: Optional skills to include.
media: Optional list of local file paths for images/media.
Returns: Returns:
List of messages including system prompt. List of messages including system prompt.
""" """
messages = [] messages = []
# System prompt # System prompt
system_prompt = self.build_system_prompt(skill_names) system_prompt = self.build_system_prompt(skill_names)
messages.append({"role": "system", "content": system_prompt}) messages.append({"role": "system", "content": system_prompt})
# History # History
messages.extend(history) messages.extend(history)
# Current message # Current message (with optional image attachments)
messages.append({"role": "user", "content": current_message}) user_content = self._build_user_content(current_message, media)
messages.append({"role": "user", "content": user_content})
return messages return messages
def _build_user_content(
self, text: str, media: list[str] | None
) -> str | list[dict[str, Any]]:
"""
Build user message content, optionally with images.
Returns a plain string if no media, or a multimodal content list
with base64-encoded images.
"""
if not media:
return text
content: list[dict[str, Any]] = []
for path in media:
encoded = self._encode_image(path)
if encoded:
content.append(encoded)
if not content:
return text
content.append({"type": "text", "text": text})
return content
@staticmethod
def _encode_image(file_path: str) -> dict[str, Any] | None:
"""Encode a local image file to a base64 data URL for the LLM."""
path = Path(file_path)
if not path.is_file():
logger.warning(f"Media file not found: {file_path}")
return None
mime, _ = mimetypes.guess_type(file_path)
if not mime or not mime.startswith("image/"):
logger.warning(f"Unsupported media type for {file_path}: {mime}")
return None
data = path.read_bytes()
b64 = base64.b64encode(data).decode("utf-8")
return {
"type": "image_url",
"image_url": {"url": f"data:{mime};base64,{b64}"},
}
def add_tool_result( def add_tool_result(
self, self,

View File

@ -152,7 +152,8 @@ class AgentLoop:
# Build initial messages (use get_history for LLM-formatted messages) # Build initial messages (use get_history for LLM-formatted messages)
messages = self.context.build_messages( messages = self.context.build_messages(
history=session.get_history(), history=session.get_history(),
current_message=msg.content current_message=msg.content,
media=msg.media if msg.media else None,
) )
# Agent loop # Agent loop