---
# Docker Compose definition for the TexTeller inference server (GPU-backed,
# served via Ray Serve on port 8001).
#
# NOTE(review): the top-level `version` key is obsolete and ignored by
# Compose V2; kept for compatibility with older docker-compose binaries.
version: '3.8'

services:
  texteller:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: texteller-server
    # Legacy GPU selector (nvidia-container-runtime). Redundant with the
    # `deploy.resources` GPU reservation below, but harmless — keeping both
    # covers old and new Docker GPU plumbing.
    runtime: nvidia
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - CUDA_VISIBLE_DEVICES=0
      # Ray Serve configuration
      - RAY_NUM_REPLICAS=1
      - RAY_NCPU_PER_REPLICA=4
      - RAY_NGPU_PER_REPLICA=1
    ports:
      # Quoted to avoid YAML 1.1 sexagesimal parsing of `a:b` port pairs.
      - "8001:8001"
    volumes:
      # Mount the model cache directory to avoid downloading models
      - ~/.cache/huggingface/hub/models--OleehyO--TexTeller:/root/.cache/huggingface/hub/models--OleehyO--TexTeller:ro
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']  # Use first GPU (RTX 5080)
              capabilities: [gpu]
    restart: unless-stopped
    command: ["texteller", "launch", "server", "-p", "8001"]
    healthcheck:
      # NOTE(review): this probe requires the third-party `requests` package
      # inside the image — TODO confirm it is installed, or switch to stdlib
      # urllib.request (beware: urlopen raises on HTTP >= 400, requests.get
      # does not, so the two are not drop-in equivalent).
      test: ["CMD", "python3", "-c", "import requests; requests.get('http://localhost:8001/', timeout=5)"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s