"""Helpers to load and format uploaded markdown docs as prompt context.""" from __future__ import annotations import re from typing import Optional, List from . import documents def _normalize_filename_for_matching(filename: str) -> str: """Normalize filename for matching @filename references.""" # Convert to lowercase, replace spaces/underscores/hyphens with single underscore normalized = filename.lower() normalized = re.sub(r'[_\s\-]+', '_', normalized) # Remove .md extension for matching normalized = normalized.replace('.md', '') return normalized def _extract_filename_references(text: str) -> List[str]: """Extract @filename references from text.""" # Match @filename patterns (with or without .md extension) pattern = r'@([a-zA-Z0-9_\s\-\+\.]+)' matches = re.findall(pattern, text) # Normalize each match return [_normalize_filename_for_matching(m) for m in matches] def _extract_numeric_references(text: str) -> List[int]: """Extract numeric document references like @1, @2, @3 from text.""" # Match @ followed by digits pattern = r'@(\d+)' matches = re.findall(pattern, text) # Convert to integers (1-indexed, will be converted to 0-indexed when used) return [int(m) for m in matches] def build_docs_context( conversation_id: str, user_query: Optional[str] = None, *, max_chars: int = 8000, max_docs: int = 5 ) -> Optional[str]: """ Return a single markdown string containing (truncated) docs for a conversation. If user_query is provided and contains references: - @1, @2, @3 etc. (numeric): Include documents by their numbered position (1-indexed) - @filename (text): Include documents whose filenames match (fuzzy matching) - If both are present, numeric references take precedence Otherwise, include all documents up to max_docs. """ all_metas = documents.list_documents(conversation_id) if not all_metas: return None # Check for numeric references first (e.g., @1, @2, @3) if user_query: numeric_refs = _extract_numeric_references(user_query) if numeric_refs: # Convert 1-indexed to 0-indexed and filter filtered_metas = [] for num in numeric_refs: idx = num - 1 # Convert to 0-indexed if 0 <= idx < len(all_metas): filtered_metas.append(all_metas[idx]) if filtered_metas: all_metas = filtered_metas else: # If no numeric refs, check for filename references refs = _extract_filename_references(user_query) if refs: filtered_metas = [] for meta in all_metas: normalized = _normalize_filename_for_matching(meta.filename) # Check if any reference matches this filename if any(ref in normalized or normalized in ref for ref in refs): filtered_metas.append(meta) if filtered_metas: all_metas = filtered_metas # Limit to max_docs metas = all_metas[:max_docs] if not metas: return None chunks = [] remaining = max_chars for meta in metas: if remaining <= 0: break text = documents.read_document_text(conversation_id, meta.id) header = f"\n\n---\nDOC: {meta.filename} ({meta.bytes} bytes)\n---\n" body = text if len(header) >= remaining: break remaining -= len(header) if len(body) > remaining: body = body[: max(0, remaining - 3)] + "..." remaining -= len(body) chunks.append(header + body) return "".join(chunks).strip() if chunks else None