"""Markdown document storage for conversations. Stores uploaded .md files on disk under data/docs//. """ from __future__ import annotations import os import re import uuid from dataclasses import dataclass from pathlib import Path from typing import List from .config import DOCS_DIR, MAX_DOC_BYTES _SAFE_NAME_RE = re.compile(r"[^a-zA-Z0-9._ -]+") def _safe_filename(name: str) -> str: name = name.strip().replace("\\", "/").split("/")[-1] # drop any path name = _SAFE_NAME_RE.sub("_", name) name = name.strip(" .") if not name: name = "document.md" if not name.lower().endswith(".md"): name = f"{name}.md" return name def _conversation_dir(conversation_id: str) -> Path: base = Path(DOCS_DIR) return base / conversation_id def ensure_docs_dir(conversation_id: str) -> Path: d = _conversation_dir(conversation_id) d.mkdir(parents=True, exist_ok=True) return d @dataclass(frozen=True) class DocumentMeta: id: str filename: str bytes: int def save_markdown_document(conversation_id: str, filename: str, content: bytes) -> DocumentMeta: if len(content) > MAX_DOC_BYTES: raise ValueError(f"Document too large. Max {MAX_DOC_BYTES} bytes.") safe_name = _safe_filename(filename) doc_id = str(uuid.uuid4()) d = ensure_docs_dir(conversation_id) path = d / f"{doc_id}__{safe_name}" path.write_bytes(content) return DocumentMeta(id=doc_id, filename=safe_name, bytes=len(content)) def list_documents(conversation_id: str) -> List[DocumentMeta]: d = _conversation_dir(conversation_id) if not d.exists(): return [] out: List[DocumentMeta] = [] for p in sorted(d.iterdir()): if not p.is_file(): continue if "__" not in p.name: continue doc_id, fname = p.name.split("__", 1) out.append(DocumentMeta(id=doc_id, filename=fname, bytes=p.stat().st_size)) return out def read_document_text(conversation_id: str, doc_id: str) -> str: d = _conversation_dir(conversation_id) if not d.exists(): raise FileNotFoundError("Conversation docs not found") matches = [p for p in d.iterdir() if p.is_file() and p.name.startswith(f"{doc_id}__")] if not matches: raise FileNotFoundError("Document not found") raw = matches[0].read_bytes() # Best-effort UTF-8; replace invalid sequences return raw.decode("utf-8", errors="replace") def delete_document(conversation_id: str, doc_id: str) -> None: d = _conversation_dir(conversation_id) if not d.exists(): raise FileNotFoundError("Conversation docs not found") matches = [p for p in d.iterdir() if p.is_file() and p.name.startswith(f"{doc_id}__")] if not matches: raise FileNotFoundError("Document not found") matches[0].unlink()