# llm_council/backend/documents.py

"""Markdown document storage for conversations.

Stores uploaded .md files on disk under data/docs/<conversation_id>/.
"""

from __future__ import annotations

import os
import re
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import List

from .config import DOCS_DIR, MAX_DOC_BYTES


_SAFE_NAME_RE = re.compile(r"[^a-zA-Z0-9._ -]+")

# Upper bound on a sanitized name. Files are stored as "<uuid4>__<name>"
# (38 extra characters) and a single path component is commonly limited to
# 255 bytes, so 200 leaves headroom. Sanitized names are pure ASCII, so
# characters and bytes coincide.
_MAX_SAFE_NAME_LEN = 200


def _safe_filename(name: str) -> str:
    """Return a filesystem-safe ``.md`` file name derived from *name*.

    Any directory part (``/`` or ``\\`` separated) is dropped, every run of
    characters outside ``[a-zA-Z0-9._ -]`` becomes a single ``_``, and
    leading/trailing spaces and dots are removed.  An empty result (or a
    missing/``None`` name) falls back to ``"document.md"``.  A ``.md``
    suffix is appended when absent (case-insensitive check), and the result
    is capped at ``_MAX_SAFE_NAME_LEN`` characters by trimming the stem,
    never the extension.
    """
    # Upload layers may hand over a missing filename; treat it as empty.
    name = (name or "").strip().replace("\\", "/").split("/")[-1]  # drop any path
    name = _SAFE_NAME_RE.sub("_", name)
    name = name.strip(" .")
    if not name:
        name = "document.md"
    if not name.lower().endswith(".md"):
        name = f"{name}.md"
    if len(name) > _MAX_SAFE_NAME_LEN:
        # Overlong names would make the later write fail with OSError;
        # shorten the stem and keep the original extension spelling.
        stem, ext = name[:-3], name[-3:]
        stem = stem[: _MAX_SAFE_NAME_LEN - len(ext)].rstrip(" .")
        name = f"{stem}{ext}"
    return name


def _conversation_dir(conversation_id: str) -> Path:
    """Return the directory that holds a conversation's documents.

    The directory is ``DOCS_DIR/<conversation_id>``; it is not created here.

    Raises:
        ValueError: if *conversation_id* is not a single plain path
            component (empty, ``.``/``..``, contains a path separator or a
            NUL byte, or is absolute), since joining it would resolve
            outside ``DOCS_DIR``.
    """
    # Validate before touching the filesystem: the id is used verbatim as a
    # directory name, so anything that is not exactly one path component
    # could escape the docs root.
    if (
        conversation_id in ("", ".", "..")
        or "\x00" in conversation_id
        or Path(conversation_id).name != conversation_id
    ):
        raise ValueError(f"Invalid conversation id: {conversation_id!r}")
    return Path(DOCS_DIR) / conversation_id


def ensure_docs_dir(conversation_id: str) -> Path:
    """Create the conversation's document directory if missing and return it.

    Missing parent directories are created as well; an already existing
    directory is left untouched.
    """
    docs_path = _conversation_dir(conversation_id)
    docs_path.mkdir(parents=True, exist_ok=True)
    return docs_path


@dataclass(frozen=True)
class DocumentMeta:
    """Immutable description of one stored document."""
    # Identifier of the document: the part of the on-disk file name that
    # precedes the "__" separator.
    id: str
    # Sanitized file name: the part of the on-disk name after "__".
    filename: str
    # Size of the document content in bytes.
    bytes: int

def save_markdown_document(conversation_id: str, filename: str, content: bytes) -> DocumentMeta:
    """Persist *content* as a new document of the given conversation.

    The file is written to the conversation's docs directory (created on
    demand) under the name ``<new uuid4>__<sanitized filename>``.

    Returns:
        Metadata of the stored document (generated id, sanitized file name,
        size in bytes).

    Raises:
        ValueError: if *content* is larger than ``MAX_DOC_BYTES``.
    """
    size = len(content)
    if size > MAX_DOC_BYTES:
        raise ValueError(f"Document too large. Max {MAX_DOC_BYTES} bytes.")

    stored_name = _safe_filename(filename)
    new_id = str(uuid.uuid4())

    # The size check above runs first so an oversized upload never creates
    # the conversation directory.
    target = ensure_docs_dir(conversation_id) / f"{new_id}__{stored_name}"
    target.write_bytes(content)
    return DocumentMeta(id=new_id, filename=stored_name, bytes=size)


def list_documents(conversation_id: str) -> List[DocumentMeta]:
    """Return metadata for every stored document of a conversation.

    Entries are visited in sorted path order.  Anything that is not a
    regular file, or whose name lacks the ``__`` separator, is ignored.  A
    conversation without a docs directory yields an empty list.
    """
    folder = _conversation_dir(conversation_id)
    if not folder.exists():
        return []

    found: List[DocumentMeta] = []
    for entry in sorted(folder.iterdir()):
        # Split on the first "__" only: the id comes first, the rest is the
        # file name (which may itself contain "__").
        ident, separator, label = entry.name.partition("__")
        if not entry.is_file() or not separator:
            continue
        found.append(
            DocumentMeta(id=ident, filename=label, bytes=entry.stat().st_size)
        )
    return found


def read_document_text(conversation_id: str, doc_id: str) -> str:
    """Return the text of one stored document.

    The document is the regular file in the conversation's docs directory
    whose name is ``<doc_id>__<filename>``, where the id is everything
    before the first ``__``.  Content is decoded as UTF-8 on a best-effort
    basis: invalid byte sequences are replaced rather than raising.

    Raises:
        FileNotFoundError: if the conversation has no docs directory or no
            document carries exactly this id.
    """
    d = _conversation_dir(conversation_id)
    # is_dir() rather than exists(): a stray non-directory at this path must
    # read as "not found" instead of failing inside iterdir().
    if not d.is_dir():
        raise FileNotFoundError("Conversation docs not found")

    # Compare the id exactly (text before the first "__"), the same way the
    # listing derives ids; a plain prefix test would also accept ids such as
    # "<real id>__<start of file name>".
    wanted = (doc_id, "__")
    match = next(
        (p for p in d.iterdir() if p.is_file() and p.name.partition("__")[:2] == wanted),
        None,
    )
    if match is None:
        raise FileNotFoundError("Document not found")

    # Best-effort UTF-8; replace invalid sequences
    return match.read_bytes().decode("utf-8", errors="replace")


def delete_document(conversation_id: str, doc_id: str) -> None:
    """Delete one stored document from disk.

    The document is the regular file in the conversation's docs directory
    whose name is ``<doc_id>__<filename>``, where the id is everything
    before the first ``__``.

    Raises:
        FileNotFoundError: if the conversation has no docs directory or no
            document carries exactly this id.
    """
    d = _conversation_dir(conversation_id)
    # is_dir() rather than exists(): a stray non-directory at this path must
    # read as "not found" instead of failing inside iterdir().
    if not d.is_dir():
        raise FileNotFoundError("Conversation docs not found")

    # Compare the id exactly (text before the first "__") so a crafted id
    # like "<real id>__<start of file name>" cannot select a file for
    # deletion through a mere prefix match.
    wanted = (doc_id, "__")
    match = next(
        (p for p in d.iterdir() if p.is_file() and p.name.partition("__")[:2] == wanted),
        None,
    )
    if match is None:
        raise FileNotFoundError("Document not found")
    match.unlink()