feat: add vision support for image recognition in Telegram
This commit is contained in:
parent
229fde021a
commit
f4b081b83f
@ -1,8 +1,12 @@
|
|||||||
"""Context builder for assembling agent prompts."""
|
"""Context builder for assembling agent prompts."""
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import mimetypes
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
from nanobot.agent.memory import MemoryStore
|
from nanobot.agent.memory import MemoryStore
|
||||||
from nanobot.agent.skills import SkillsLoader
|
from nanobot.agent.skills import SkillsLoader
|
||||||
|
|
||||||
@ -114,32 +118,80 @@ When remembering something, write to {workspace_path}/memory/MEMORY.md"""
|
|||||||
self,
|
self,
|
||||||
history: list[dict[str, Any]],
|
history: list[dict[str, Any]],
|
||||||
current_message: str,
|
current_message: str,
|
||||||
skill_names: list[str] | None = None
|
skill_names: list[str] | None = None,
|
||||||
|
media: list[str] | None = None,
|
||||||
) -> list[dict[str, Any]]:
|
) -> list[dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Build the complete message list for an LLM call.
|
Build the complete message list for an LLM call.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
history: Previous conversation messages.
|
history: Previous conversation messages.
|
||||||
current_message: The new user message.
|
current_message: The new user message.
|
||||||
skill_names: Optional skills to include.
|
skill_names: Optional skills to include.
|
||||||
|
media: Optional list of local file paths for images/media.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of messages including system prompt.
|
List of messages including system prompt.
|
||||||
"""
|
"""
|
||||||
messages = []
|
messages = []
|
||||||
|
|
||||||
# System prompt
|
# System prompt
|
||||||
system_prompt = self.build_system_prompt(skill_names)
|
system_prompt = self.build_system_prompt(skill_names)
|
||||||
messages.append({"role": "system", "content": system_prompt})
|
messages.append({"role": "system", "content": system_prompt})
|
||||||
|
|
||||||
# History
|
# History
|
||||||
messages.extend(history)
|
messages.extend(history)
|
||||||
|
|
||||||
# Current message
|
# Current message (with optional image attachments)
|
||||||
messages.append({"role": "user", "content": current_message})
|
user_content = self._build_user_content(current_message, media)
|
||||||
|
messages.append({"role": "user", "content": user_content})
|
||||||
|
|
||||||
return messages
|
return messages
|
||||||
|
|
||||||
|
def _build_user_content(
|
||||||
|
self, text: str, media: list[str] | None
|
||||||
|
) -> str | list[dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Build user message content, optionally with images.
|
||||||
|
|
||||||
|
Returns a plain string if no media, or a multimodal content list
|
||||||
|
with base64-encoded images.
|
||||||
|
"""
|
||||||
|
if not media:
|
||||||
|
return text
|
||||||
|
|
||||||
|
content: list[dict[str, Any]] = []
|
||||||
|
|
||||||
|
for path in media:
|
||||||
|
encoded = self._encode_image(path)
|
||||||
|
if encoded:
|
||||||
|
content.append(encoded)
|
||||||
|
|
||||||
|
if not content:
|
||||||
|
return text
|
||||||
|
|
||||||
|
content.append({"type": "text", "text": text})
|
||||||
|
return content
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _encode_image(file_path: str) -> dict[str, Any] | None:
|
||||||
|
"""Encode a local image file to a base64 data URL for the LLM."""
|
||||||
|
path = Path(file_path)
|
||||||
|
if not path.is_file():
|
||||||
|
logger.warning(f"Media file not found: {file_path}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
mime, _ = mimetypes.guess_type(file_path)
|
||||||
|
if not mime or not mime.startswith("image/"):
|
||||||
|
logger.warning(f"Unsupported media type for {file_path}: {mime}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
data = path.read_bytes()
|
||||||
|
b64 = base64.b64encode(data).decode("utf-8")
|
||||||
|
return {
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {"url": f"data:{mime};base64,{b64}"},
|
||||||
|
}
|
||||||
|
|
||||||
def add_tool_result(
|
def add_tool_result(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -152,7 +152,8 @@ class AgentLoop:
|
|||||||
# Build initial messages (use get_history for LLM-formatted messages)
|
# Build initial messages (use get_history for LLM-formatted messages)
|
||||||
messages = self.context.build_messages(
|
messages = self.context.build_messages(
|
||||||
history=session.get_history(),
|
history=session.get_history(),
|
||||||
current_message=msg.content
|
current_message=msg.content,
|
||||||
|
media=msg.media if msg.media else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Agent loop
|
# Agent loop
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user