feat: add vision support for image recognition in Telegram

2026-02-02 15:32:12 +08:00 · 2026-02-02 15:32:12 +08:00 · f4b081b83f
commit f4b081b83f
parent 229fde021a
2 changed files with 63 additions and 10 deletions
--- a/nanobot/agent/context.py
+++ b/nanobot/agent/context.py
@ -1,8 +1,12 @@
 """Context builder for assembling agent prompts."""
 import base64
 import mimetypes
 from pathlib import Path
 from typing import Any
 from loguru import logger
 from nanobot.agent.memory import MemoryStore
 from nanobot.agent.skills import SkillsLoader
@ -114,32 +118,80 @@ When remembering something, write to {workspace_path}/memory/MEMORY.md"""
        self,
        history: list[dict[str, Any]],
        current_message: str,
-        skill_names: list[str] | None = None
+        skill_names: list[str] | None = None,
        media: list[str] | None = None,
    ) -> list[dict[str, Any]]:
        """
        Build the complete message list for an LLM call.
-        
+
        Args:
            history: Previous conversation messages.
            current_message: The new user message.
            skill_names: Optional skills to include.
-        
+            media: Optional list of local file paths for images/media.
        Returns:
            List of messages including system prompt.
        """
        messages = []
-        
+
        # System prompt
        system_prompt = self.build_system_prompt(skill_names)
        messages.append({"role": "system", "content": system_prompt})
-        
+
        # History
        messages.extend(history)
-        
+
-        # Current message
+        # Current message (with optional image attachments)
-        messages.append({"role": "user", "content": current_message})
+        user_content = self._build_user_content(current_message, media)
-        
+        messages.append({"role": "user", "content": user_content})
        return messages
    def _build_user_content(
        self, text: str, media: list[str] | None
    ) -> str | list[dict[str, Any]]:
        """
        Build user message content, optionally with images.
        Returns a plain string if no media, or a multimodal content list
        with base64-encoded images.
        """
        if not media:
            return text
        content: list[dict[str, Any]] = []
        for path in media:
            encoded = self._encode_image(path)
            if encoded:
                content.append(encoded)
        if not content:
            return text
        content.append({"type": "text", "text": text})
        return content
    @staticmethod
    def _encode_image(file_path: str) -> dict[str, Any] | None:
        """Encode a local image file to a base64 data URL for the LLM."""
        path = Path(file_path)
        if not path.is_file():
            logger.warning(f"Media file not found: {file_path}")
            return None
        mime, _ = mimetypes.guess_type(file_path)
        if not mime or not mime.startswith("image/"):
            logger.warning(f"Unsupported media type for {file_path}: {mime}")
            return None
        data = path.read_bytes()
        b64 = base64.b64encode(data).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {"url": f"data:{mime};base64,{b64}"},
        }
    def add_tool_result(
        self,
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@ -152,7 +152,8 @@ class AgentLoop:
        # Build initial messages (use get_history for LLM-formatted messages)
        messages = self.context.build_messages(
            history=session.get_history(),
-            current_message=msg.content
+            current_message=msg.content,
            media=msg.media if msg.media else None,
        )
        # Agent loop