From f4b081b83f28014c07a63b3a1b39e5d3f75398fc Mon Sep 17 00:00:00 2001
From: Yitong Li
Date: Mon, 2 Feb 2026 15:32:12 +0800
Subject: [PATCH] feat: add vision support for image recognition in Telegram

---
Hand-edited after `git format-patch`: _encode_image now tolerates read
errors, and the helper docstrings were expanded. The post-image blob ids on
the index lines are those of the original commit.

 nanobot/agent/context.py | 93 ++++++++++++++++++++++++++++++++++++----
 nanobot/agent/loop.py    |  3 +-
 2 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/nanobot/agent/context.py b/nanobot/agent/context.py
index aaba890..32585f5 100644
--- a/nanobot/agent/context.py
+++ b/nanobot/agent/context.py
@@ -1,8 +1,12 @@
 """Context builder for assembling agent prompts."""
 
+import base64
+import mimetypes
 from pathlib import Path
 from typing import Any
 
+from loguru import logger
+
 from nanobot.agent.memory import MemoryStore
 from nanobot.agent.skills import SkillsLoader
 
@@ -114,32 +118,103 @@ When remembering something, write to {workspace_path}/memory/MEMORY.md"""
         self,
         history: list[dict[str, Any]],
         current_message: str,
-        skill_names: list[str] | None = None
+        skill_names: list[str] | None = None,
+        media: list[str] | None = None,
     ) -> list[dict[str, Any]]:
         """
         Build the complete message list for an LLM call.
-        
+
         Args:
             history: Previous conversation messages.
             current_message: The new user message.
             skill_names: Optional skills to include.
-        
+            media: Optional list of local file paths for images/media.
+
         Returns:
             List of messages including system prompt.
         """
         messages = []
-        
+
         # System prompt
         system_prompt = self.build_system_prompt(skill_names)
         messages.append({"role": "system", "content": system_prompt})
-        
+
         # History
         messages.extend(history)
-        
-        # Current message
-        messages.append({"role": "user", "content": current_message})
-        
+
+        # Current message (with optional image attachments)
+        user_content = self._build_user_content(current_message, media)
+        messages.append({"role": "user", "content": user_content})
+
         return messages
+
+    def _build_user_content(
+        self, text: str, media: list[str] | None
+    ) -> str | list[dict[str, Any]]:
+        """
+        Build user message content, optionally with images.
+
+        Args:
+            text: The user's message text.
+            media: Optional list of local file paths to attach as images.
+
+        Returns:
+            The plain text if no media could be attached, otherwise a
+            multimodal content list: one base64-encoded image part per
+            usable file, followed by a single text part.
+        """
+        if not media:
+            return text
+
+        content: list[dict[str, Any]] = []
+
+        for path in media:
+            encoded = self._encode_image(path)
+            if encoded:
+                content.append(encoded)
+
+        if not content:
+            return text
+
+        content.append({"type": "text", "text": text})
+        return content
+
+    @staticmethod
+    def _encode_image(file_path: str) -> dict[str, Any] | None:
+        """
+        Encode a local image file to a base64 data URL for the LLM.
+
+        Args:
+            file_path: Path to a local file; its MIME type is guessed
+                from the file name.
+
+        Returns:
+            An ``image_url`` content part, or None (with a warning logged)
+            if the file is missing, is not an image, or cannot be read.
+        """
+        path = Path(file_path)
+        if not path.is_file():
+            logger.warning(f"Media file not found: {file_path}")
+            return None
+
+        mime, _ = mimetypes.guess_type(file_path)
+        if not mime or not mime.startswith("image/"):
+            logger.warning(f"Unsupported media type for {file_path}: {mime}")
+            return None
+
+        # The file can vanish or be unreadable even after is_file() passed;
+        # skip it rather than abort the whole message build.
+        try:
+            data = path.read_bytes()
+        except OSError as exc:
+            logger.warning(f"Failed to read media file {file_path}: {exc}")
+            return None
+
+        b64 = base64.b64encode(data).decode("utf-8")
+        return {
+            "type": "image_url",
+            "image_url": {"url": f"data:{mime};base64,{b64}"},
+        }
     
     def add_tool_result(
         self,
diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py
index 6fe2cfd..3925a44 100644
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@@ -152,7 +152,8 @@ class AgentLoop:
         # Build initial messages (use get_history for LLM-formatted messages)
         messages = self.context.build_messages(
             history=session.get_history(),
-            current_message=msg.content
+            current_message=msg.content,
+            media=msg.media or None,
         )
         
         # Agent loop