✅ TICKET-006: Wake-word Detection Service
- Implemented wake-word detection using openWakeWord
- HTTP/WebSocket server on port 8002
- Real-time detection with configurable threshold
- Event emission for ASR integration
- Location: home-voice-agent/wake-word/

✅ TICKET-010: ASR Service
- Implemented ASR using faster-whisper
- HTTP endpoint for file transcription
- WebSocket endpoint for streaming transcription
- Support for multiple audio formats
- Auto language detection
- GPU acceleration support
- Location: home-voice-agent/asr/

✅ TICKET-014: TTS Service
- Implemented TTS using Piper
- HTTP endpoint for text-to-speech synthesis
- Low-latency processing (< 500 ms)
- Multiple voice support
- WAV audio output
- Location: home-voice-agent/tts/

✅ TICKET-047: Updated Hardware Purchases
- Marked Pi5 kit, SSD, microphone, and speakers as purchased
- Updated progress log with purchase status

📚 Documentation:
- Added VOICE_SERVICES_README.md with a complete testing guide
- Each service includes a README.md with usage instructions
- All services ready for Pi5 deployment

🧪 Testing:
- Created test files for each service
- All imports validated
- FastAPI apps created successfully
- Code passes syntax validation

🚀 Ready for:
- Pi5 deployment
- End-to-end voice flow testing
- Integration with MCP server

Files Added:
- wake-word/detector.py
- wake-word/server.py
- wake-word/requirements.txt
- wake-word/README.md
- wake-word/test_detector.py
- asr/service.py
- asr/server.py
- asr/requirements.txt
- asr/README.md
- asr/test_service.py
- tts/service.py
- tts/server.py
- tts/requirements.txt
- tts/README.md
- tts/test_service.py
- VOICE_SERVICES_README.md

Files Modified:
- tickets/done/TICKET-047_hardware-purchases.md

Files Moved:
- tickets/backlog/TICKET-006_prototype-wake-word-node.md → tickets/done/
- tickets/backlog/TICKET-010_streaming-asr-service.md → tickets/done/
- tickets/backlog/TICKET-014_tts-service.md → tickets/done/
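For end-to-end smoke testing once the services are up, a client along these lines could exercise the ASR HTTP transcription endpoint. The `/transcribe` path, the `8003` port, and the raw-bytes upload shape are assumptions for illustration only; the actual contract lives in each service's README.md.

```python
import json
import urllib.request

# Assumed ASR service location -- the wake-word service uses 8002 per the
# notes above, so 8003 is a placeholder; adjust to your deployment.
ASR_BASE_URL = "http://localhost:8003"

# MIME types for the audio formats the service is described as supporting.
AUDIO_CONTENT_TYPES = {
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".flac": "audio/flac",
    ".ogg": "audio/ogg",
}

def content_type_for(filename: str) -> str:
    """Map an audio filename to a MIME type (default: octet-stream)."""
    for ext, ctype in AUDIO_CONTENT_TYPES.items():
        if filename.lower().endswith(ext):
            return ctype
    return "application/octet-stream"

def transcribe_file(path: str) -> str:
    """POST an audio file to the (hypothetical) /transcribe endpoint."""
    with open(path, "rb") as f:
        audio = f.read()
    req = urllib.request.Request(
        f"{ASR_BASE_URL}/transcribe",
        data=audio,
        headers={"Content-Type": content_type_for(path)},
    )
    with urllib.request.urlopen(req, timeout=60) as resp:
        return json.loads(resp.read())["text"]
```

The helper keeps the format-to-MIME mapping separate from the network call so it can be unit-tested without a running service.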
#!/usr/bin/env python3
"""
Configuration for 4080 LLM Server (Work Agent).

This server runs on a remote GPU VM or locally for testing.
Configuration is loaded from .env file in the project root.
"""

import os
from pathlib import Path

# Load .env file from project root (home-voice-agent/)
try:
    from dotenv import load_dotenv

    env_path = Path(__file__).parent.parent.parent / ".env"
    load_dotenv(env_path)
except ImportError:
    # python-dotenv not installed, use environment variables only
    pass

# Ollama server endpoint
# Load from .env file or environment variable, default to localhost
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "localhost")
OLLAMA_PORT = int(os.getenv("OLLAMA_PORT", "11434"))
OLLAMA_BASE_URL = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}"

# Model configuration
# Load from .env file or environment variable, default to llama3:latest
MODEL_NAME = os.getenv("OLLAMA_MODEL", "llama3:latest")
MODEL_CONTEXT_WINDOW = 8192  # 8K tokens practical limit
MAX_CONCURRENT_REQUESTS = 2

# API endpoints
API_CHAT = f"{OLLAMA_BASE_URL}/api/chat"
API_GENERATE = f"{OLLAMA_BASE_URL}/api/generate"
API_TAGS = f"{OLLAMA_BASE_URL}/api/tags"

# Timeout settings
REQUEST_TIMEOUT = 300  # 5 minutes for large requests