From f4b081b83f28014c07a63b3a1b39e5d3f75398fc Mon Sep 17 00:00:00 2001
From: Yitong Li <yitongli0814@gmail.com>
Date: Mon, 2 Feb 2026 15:32:12 +0800
Subject: [PATCH] feat: add vision support for image recognition in Telegram

---
 nanobot/agent/context.py | 70 ++++++++++++++++++++++++++++++++++------
 nanobot/agent/loop.py    |  3 +-
 2 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/nanobot/agent/context.py b/nanobot/agent/context.py
index aaba890..32585f5 100644
--- a/nanobot/agent/context.py
+++ b/nanobot/agent/context.py
@@ -1,8 +1,12 @@
 """Context builder for assembling agent prompts."""
 
+import base64
+import mimetypes
 from pathlib import Path
 from typing import Any
 
+from loguru import logger
+
 from nanobot.agent.memory import MemoryStore
 from nanobot.agent.skills import SkillsLoader
 
@@ -114,32 +118,80 @@ When remembering something, write to {workspace_path}/memory/MEMORY.md"""
         self,
         history: list[dict[str, Any]],
         current_message: str,
-        skill_names: list[str] | None = None
+        skill_names: list[str] | None = None,
+        media: list[str] | None = None,
     ) -> list[dict[str, Any]]:
         """
         Build the complete message list for an LLM call.
-        
+
         Args:
             history: Previous conversation messages.
             current_message: The new user message.
             skill_names: Optional skills to include.
-        
+            media: Optional list of local file paths for images/media.
+
         Returns:
             List of messages including system prompt.
         """
         messages = []
-        
+
         # System prompt
         system_prompt = self.build_system_prompt(skill_names)
         messages.append({"role": "system", "content": system_prompt})
-        
+
         # History
         messages.extend(history)
-        
-        # Current message
-        messages.append({"role": "user", "content": current_message})
-        
+
+        # Current message (with optional image attachments)
+        user_content = self._build_user_content(current_message, media)
+        messages.append({"role": "user", "content": user_content})
+
         return messages
+
+    def _build_user_content(
+        self, text: str, media: list[str] | None
+    ) -> str | list[dict[str, Any]]:
+        """
+        Build the content of the user message, attaching images if given.
+
+        Args:
+            text: The user's message text.
+            media: Optional local file paths; each goes through _encode_image.
+
+        Returns:
+            ``text`` unchanged when no path yields an image part; otherwise a
+            content list of image parts followed by a text part (the text part
+            is omitted when ``text`` is empty).
+        """
+        # _encode_image returns None for paths it rejects; drop those.
+        encoded = (self._encode_image(path) for path in media or [])
+        parts: list[dict[str, Any]] = [part for part in encoded if part]
+        if not parts:
+            return text
+        if text:
+            # Empty text parts are rejected by some LLM APIs; attach only if present.
+            parts.append({"type": "text", "text": text})
+        return parts
+
+    @staticmethod
+    def _encode_image(file_path: str) -> dict[str, Any] | None:
+        """Encode a local image as a base64 data-URL part, or None (logged) on failure."""
+        path = Path(file_path)
+        if not path.is_file():
+            logger.warning(f"Media file not found: {file_path}")
+            return None
+
+        mime, _ = mimetypes.guess_type(file_path)  # guessed from the file name only
+        if not mime or not mime.startswith("image/"):
+            logger.warning(f"Unsupported media type for {file_path}: {mime}")
+            return None
+
+        try:  # the file can vanish or be unreadable even after the is_file() check
+            b64 = base64.b64encode(path.read_bytes()).decode("utf-8")
+        except OSError as exc:
+            logger.warning(f"Failed to read media file {file_path}: {exc}")
+            return None
+        return {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}
     
     def add_tool_result(
         self,
diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py
index 6fe2cfd..3925a44 100644
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@@ -152,7 +152,8 @@ class AgentLoop:
         # Build initial messages (use get_history for LLM-formatted messages)
         messages = self.context.build_messages(
             history=session.get_history(),
-            current_message=msg.content
+            current_message=msg.content,
+            media=msg.media if msg.media else None,
         )
         
         # Agent loop