---
# Docker Compose service for the TexTeller LaTeX-OCR server.
# Runs the container on GPU 0 via the NVIDIA runtime and serves on port 8001.
version: '3.8'

services:
  texteller:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: texteller-server
    # Legacy NVIDIA runtime hook; kept alongside the `deploy` GPU reservation
    # below so both older and newer Docker setups pick up the GPU.
    runtime: nvidia
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - CUDA_VISIBLE_DEVICES=0
      # Ray Serve configuration
      - RAY_NUM_REPLICAS=1
      - RAY_NCPU_PER_REPLICA=4
      - RAY_NGPU_PER_REPLICA=1
    ports:
      - "8001:8001"
    volumes:
      # Mount the model cache directory to avoid downloading models
      - "~/.cache/huggingface/hub/models--OleehyO--TexTeller:/root/.cache/huggingface/hub/models--OleehyO--TexTeller:ro"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']  # Use first GPU (RTX 5080)
              capabilities: [gpu]
    restart: unless-stopped
    command: ["texteller", "launch", "server", "-p", "8001"]
    healthcheck:
      test: ["CMD", "python3", "-c", "import requests; requests.get('http://localhost:8001/', timeout=5)"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Generous start period: model loading can take a while on first boot.
      start_period: 60s