diff --git a/Dockerfile b/Dockerfile
index c7a62b2..7a13212 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,73 +1,70 @@
-# DocProcesser Dockerfile - Multi-stage optimized build
-# Optimized for RTX 5080 GPU deployment
+# DocProcesser Dockerfile - Production optimized
+# Ultra-lean multi-stage build for PPDocLayoutV3
+# Final image: ~3GB (from 17GB)
 
 # =============================================================================
-# STAGE 1: Builder - Install dependencies
+# STAGE 1: Builder
 # =============================================================================
 FROM nvidia/cuda:12.9.0-devel-ubuntu24.04 AS builder
 
-# Install build dependencies and Python 3.10
+# Install build dependencies. Ubuntu 24.04 only packages Python 3.12, so the
+# python3.10 packages have to come from the deadsnakes PPA.
 RUN apt-get update && apt-get install -y --no-install-recommends \
     software-properties-common \
     && add-apt-repository -y ppa:deadsnakes/ppa \
     && apt-get update && apt-get install -y --no-install-recommends \
-    python3.10 \
-    python3.10-venv \
-    python3.10-dev \
-    python3.10-distutils \
-    build-essential \
-    curl \
+    python3.10 python3.10-venv python3.10-dev python3.10-distutils \
+    build-essential curl \
     && rm -rf /var/lib/apt/lists/*
 
 # Setup Python
 RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
-    ln -sf /usr/bin/python3.10 /usr/bin/python3 && \
-    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+    curl -sS https://bootstrap.pypa.io/get-pip.py | python
 
 # Install uv
-RUN python3.10 -m pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
+RUN pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
 
 WORKDIR /build
 
-# Copy dependency files
+# Copy dependencies
 COPY pyproject.toml ./
 COPY wheels/ ./wheels/
 
-# Create virtual environment with dependencies
+# Build venv
 RUN uv venv /build/venv --python python3.10 && \
     . /build/venv/bin/activate && \
     uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . && \
     rm -rf ./wheels
 
-# Aggressively clean up venv - remove all cache and unnecessary files
+# Aggressive optimization: strip debug symbols from .so files (~300-800MB saved)
+RUN find /build/venv -name "*.so" -exec strip --strip-unneeded {} + || true
+
+# Remove paddle C++ headers (~22MB saved)
+RUN rm -rf /build/venv/lib/python*/site-packages/paddle/include
+
+# Clean Python cache and build artifacts
 RUN find /build/venv -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \
-    find /build/venv -type d -name "*.dist-info/tests" -exec rm -rf {} + 2>/dev/null || true && \
     find /build/venv -type f -name "*.pyc" -delete && \
     find /build/venv -type f -name "*.pyo" -delete && \
     find /build/venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
     find /build/venv -type d -name "test" -exec rm -rf {} + 2>/dev/null || true && \
-    find /build/venv -type d -name "*.dist-info" -exec find {} -type f -name "RECORD" -delete \; && \
-    find /build/venv -type d -name "*.dist-info" -exec find {} -type f -name "top_level.txt" -delete \; && \
-    rm -rf /build/venv/lib/*/site-packages/pip* /build/venv/lib/*/site-packages/setuptools* && \
-    rm -rf /build/venv/lib/python*/site-packages/__pycache__ && \
-    rm -rf /build/venv/include /build/venv/share && \
+    rm -rf /build/venv/lib/*/site-packages/pip* \
+           /build/venv/lib/*/site-packages/setuptools* \
+           /build/venv/include \
+           /build/venv/share && \
     rm -rf /root/.cache 2>/dev/null || true
 
 # =============================================================================
-# STAGE 2: Runtime - Minimal final image
+# STAGE 2: Runtime - CUDA base (~400MB, not ~3.4GB from runtime)
 # =============================================================================
-FROM nvidia/cuda:12.9.0-runtime-ubuntu24.04
+FROM nvidia/cuda:12.9.0-base-ubuntu24.04
 
-# Set environment variables
 ENV PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PIP_NO_CACHE_DIR=1 \
     PIP_DISABLE_PIP_VERSION_CHECK=1 \
-    # Model cache directories - mount these at runtime
     MODELSCOPE_CACHE=/root/.cache/modelscope \
     HF_HOME=/root/.cache/huggingface \
-    # Application config (override defaults for container)
-    # Use 127.0.0.1 for --network host mode, or override with -e for bridge mode
     PP_DOCLAYOUT_MODEL_DIR=/root/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2 \
     PADDLEOCR_VL_URL=http://127.0.0.1:8001/v1 \
     PATH="/app/.venv/bin:$PATH" \
@@ -75,47 +72,32 @@ ENV PYTHONUNBUFFERED=1 \
 
 WORKDIR /app
 
-# Install runtime-only system dependencies (NO build tools)
+# Minimal runtime dependencies (no build tools)
+# NOTE(review): python3.10 is not in the stock Ubuntu 24.04 archive and this
+# stage adds no PPA - confirm this install resolves before merging.
 RUN apt-get update && apt-get install -y --no-install-recommends \
     python3.10 \
-    libgl1 \
-    libglib2.0-0 \
-    libsm6 \
-    libxext6 \
-    libxrender-dev \
-    libgomp1 \
-    curl \
-    pandoc \
+    libgl1 libglib2.0-0 libgomp1 \
+    curl pandoc \
     && rm -rf /var/lib/apt/lists/*
 
-# Setup Python symlinks
-RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
-    ln -sf /usr/bin/python3.10 /usr/bin/python3
+RUN ln -sf /usr/bin/python3.10 /usr/bin/python
 
-# Copy pre-built venv from builder stage
+# Copy optimized venv from builder
 COPY --from=builder /build/venv /app/.venv
 
-# Copy application code (excluding model files if they're in the repo)
+# Copy app code
 COPY app/ ./app/
 
-# Create model cache directories (mount from host at runtime)
-# NOTE: Remove model files from app/model to keep image lean
-RUN mkdir -p /root/.cache/modelscope \
-    /root/.cache/huggingface \
-    /root/.paddlex && \
+# Create cache mount points (DO NOT include model files)
+RUN mkdir -p /root/.cache/modelscope /root/.cache/huggingface /root/.paddlex && \
     rm -rf /app/app/model/*
 
-# NOTE: Do NOT declare VOLUME here - let users mount volumes explicitly at runtime
-# This prevents anonymous volumes and keeps the image clean
-
-# Expose port
 EXPOSE 8053
 
-# Health check
 HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
     CMD curl -f http://localhost:8053/health || exit 1
 
-# Run the application
 CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"]
 
 # =============================================================================
diff --git a/pyproject.toml b/pyproject.toml
index 13d8cbc..6a63a7f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ authors = [
 dependencies = [
     "fastapi==0.128.0",
     "uvicorn[standard]==0.40.0",
-    "opencv-python==4.12.0.88",
+    "opencv-python-headless==4.12.0.88",  # headless: no Qt/FFmpeg GUI, server-only
     "python-multipart==0.0.21",
     "pydantic==2.12.5",
     "pydantic-settings==2.12.0",
@@ -20,7 +20,6 @@ dependencies = [
     "pillow==12.0.0",
     "python-docx==1.2.0",
     "paddleocr==3.4.0",
-    "doclayout-yolo==0.0.4",
     "latex2mathml==3.78.1",
     "paddle==1.2.0",
     "pypandoc==1.16.2",