# llm_council/backend/docs_context.py

"""Helpers to load and format uploaded markdown docs as prompt context."""

from __future__ import annotations
import re
from typing import Optional, List

from . import documents


def _normalize_filename_for_matching(filename: str) -> str:
    """Normalize filename for matching @filename references."""
    # Convert to lowercase, replace spaces/underscores/hyphens with single underscore
    normalized = filename.lower()
    normalized = re.sub(r'[_\s\-]+', '_', normalized)
    # Remove .md extension for matching
    normalized = normalized.replace('.md', '')
    return normalized


def _extract_filename_references(text: str) -> List[str]:
    """Extract normalized @filename references from *text*.

    Every ``@`` followed by a run of letters, digits, whitespace, ``_``, ``-``,
    ``+`` or ``.`` is treated as a reference and passed through
    ``_normalize_filename_for_matching``. References that contain no letter
    or digit after normalization (for example ``"@ "``, ``"@-"`` or ``"@.md"``)
    are dropped, because an empty or punctuation-only reference would match
    nearly every filename in a substring comparison.

    Args:
        text: Free-form user text that may contain @filename references.

    Returns:
        Normalized reference strings in order of appearance (possibly empty).
    """
    # NOTE: whitespace is part of the character class so that filenames with
    # spaces can be referenced; as a consequence a match also swallows any
    # following words up to the next character outside the class.
    pattern = r'@([a-zA-Z0-9_\s\-\+\.]+)'
    refs = []
    for raw in re.findall(pattern, text):
        # Trim before normalizing so trailing spaces/newlines do not become a
        # trailing underscore in the normalized reference.
        ref = _normalize_filename_for_matching(raw.strip())
        if any(ch.isalnum() for ch in ref):
            refs.append(ref)
    return refs


def _extract_numeric_references(text: str) -> List[int]:
    """Extract numeric document references like @1, @2, @3 from text."""
    # Match @ followed by digits
    pattern = r'@(\d+)'
    matches = re.findall(pattern, text)
    # Convert to integers (1-indexed, will be converted to 0-indexed when used)
    return [int(m) for m in matches]


def build_docs_context(
    conversation_id: str,
    user_query: Optional[str] = None,
    *,
    max_chars: int = 8000,
    max_docs: int = 5
) -> Optional[str]:
    """
    Return a single markdown string containing (truncated) docs for a conversation.

    If user_query is provided and contains references:
    - @1, @2, @3 etc. (numeric): Include documents by their numbered position
      (1-indexed, in the order returned by ``documents.list_documents``).
      Out-of-range numbers are ignored and a repeated number selects its
      document only once.
    - @filename (text): Include documents whose normalized filename contains,
      or is contained in, a normalized reference (fuzzy substring matching).
    - If both are present, numeric references take precedence.
    If no reference selects any document, all documents are candidates.

    Args:
        conversation_id: Conversation whose uploaded documents are loaded.
        user_query: Optional user text scanned for @ references.
        max_chars: Character budget for the output, counted over headers and
            bodies before the final ``strip()``.
        max_docs: Maximum number of documents to include.

    Returns:
        The concatenated ``DOC:`` sections, or None when there are no
        documents or nothing fits within the budget.
    """
    all_metas = documents.list_documents(conversation_id)
    if not all_metas:
        return None

    # Check for numeric references first (e.g., @1, @2, @3)
    if user_query:
        numeric_refs = _extract_numeric_references(user_query)
        if numeric_refs:
            filtered_metas = []
            seen_indices = set()
            for num in numeric_refs:
                idx = num - 1  # Convert to 0-indexed
                # Skip out-of-range positions and repeats so "@1 @1" does not
                # put the same document into the context twice.
                if 0 <= idx < len(all_metas) and idx not in seen_indices:
                    seen_indices.add(idx)
                    filtered_metas.append(all_metas[idx])
            if filtered_metas:
                all_metas = filtered_metas
        else:
            # If no numeric refs, check for filename references. Empty refs
            # are discarded: "" is a substring of every filename and would
            # make the filter match everything.
            refs = [ref for ref in _extract_filename_references(user_query) if ref]
            if refs:
                filtered_metas = []
                for meta in all_metas:
                    normalized = _normalize_filename_for_matching(meta.filename)
                    # An empty normalized filename would likewise be a
                    # substring of every reference, so it never matches.
                    if not normalized:
                        continue
                    if any(ref in normalized or normalized in ref for ref in refs):
                        filtered_metas.append(meta)
                if filtered_metas:
                    all_metas = filtered_metas

    # Limit to max_docs
    metas = all_metas[:max_docs]
    if not metas:
        return None

    ellipsis = "..."
    chunks = []
    remaining = max_chars
    for meta in metas:
        if remaining <= 0:
            break
        # Used below as a string (len, slicing, concatenation).
        text = documents.read_document_text(conversation_id, meta.id)
        header = f"\n\n---\nDOC: {meta.filename} ({meta.bytes} bytes)\n---\n"
        body = text
        # A header with no room left for any body text is not worth emitting.
        if len(header) >= remaining:
            break
        remaining -= len(header)
        if len(body) > remaining:
            if remaining >= len(ellipsis):
                body = body[: remaining - len(ellipsis)] + ellipsis
            else:
                # Fewer than three characters left: emit only as much of the
                # ellipsis as fits so the budget is never exceeded.
                body = ellipsis[:remaining]
        remaining -= len(body)
        chunks.append(header + body)

    return "".join(chunks).strip() if chunks else None