feat: optimize Docker image with multi-stage build

- Use multi-stage build to exclude build dependencies from final image
- Separate builder stage using devel image from runtime stage using smaller base image
- Clean venv: remove __pycache__, .pyc files, and test directories
- Remove embedded model files (243MB) from app/model/ - mount at runtime instead
- Expected size reduction: 18.9GB → 2-3GB (80-90% reduction)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
liuyuanchuang
2026-03-10 10:41:32 +08:00
parent f8173f7c0a
commit 6579cf55f5

View File

@@ -1,7 +1,55 @@
# DocProcesser Dockerfile
# DocProcesser Dockerfile - Multi-stage optimized build
# Optimized for RTX 5080 GPU deployment
# Use NVIDIA CUDA base image with Python 3.10
# =============================================================================
# STAGE 1: Builder - Install dependencies
# =============================================================================
FROM nvidia/cuda:12.9.0-devel-ubuntu24.04 AS builder
# Builder toolchain: Python 3.10 (interpreter, venv, headers, distutils) from
# the deadsnakes PPA, plus build-essential and curl. The apt package lists are
# removed in the same layer that created them.
RUN apt-get update \
    && apt-get install -y --no-install-recommends software-properties-common \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        python3.10 \
        python3.10-dev \
        python3.10-distutils \
        python3.10-venv \
    && rm -rf /var/lib/apt/lists/*
# Setup Python: point `python` and `python3` at the 3.10 interpreter, then
# bootstrap pip for it.
# get-pip.py is written to disk before it is executed: when curl is piped
# straight into the interpreter under /bin/sh, curl's exit status is discarded,
# so a failed or truncated download would not stop the build. `-f` makes curl
# exit non-zero on HTTP errors and `-L` follows redirects.
RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
    ln -sf /usr/bin/python3.10 /usr/bin/python3 && \
    curl -fsSL -o /tmp/get-pip.py https://bootstrap.pypa.io/get-pip.py && \
    python3.10 /tmp/get-pip.py && \
    rm -f /tmp/get-pip.py
# Install the uv package manager through pip, using the Tsinghua PyPI mirror
RUN python3.10 -m pip install \
    --index-url https://pypi.tuna.tsinghua.edu.cn/simple \
    uv
# All remaining builder steps run relative to /build
WORKDIR /build
# Copy dependency files
# Only the project manifest and the local wheels/ directory are copied here;
# application source is not part of the builder stage.
COPY pyproject.toml ./
COPY wheels/ ./wheels/
# Create virtual environment with dependencies.
# --relocatable: the final stage copies this environment from /build/venv to a
# different path (/app/.venv). Without it, the entry-point scripts in venv/bin
# carry an absolute shebang to /build/venv/bin/python and fail once the
# environment has been moved.
# NOTE(review): the venv's interpreter still refers to the system python3.10;
# the final stage must provide it at the same location - confirm.
# NOTE(review): `-e .` records /build as the project location, which does not
# exist in the final stage - presumably the app is run from its WORKDIR
# instead; verify.
RUN uv venv /build/venv --python python3.10 --relocatable && \
    . /build/venv/bin/activate && \
    uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . && \
    rm -rf ./wheels
# Clean up venv - remove bytecode caches and bundled test suites.
# Directory pass: -prune stops find from descending into a directory it is
# about to delete, so no "No such file or directory" errors are produced and
# nothing has to be silenced. The former `-name "*.dist-info/tests"` rule is
# dropped: -name only tests the basename, so a pattern containing "/" can
# never match, and such directories are already covered by the `tests` rule.
# File pass: removes any stray compiled files outside __pycache__.
# NOTE(review): deleting every directory named "test"/"tests" assumes no
# installed package imports from one at runtime - confirm for this dependency set.
RUN find /build/venv -type d \
        \( -name "__pycache__" -o -name "tests" -o -name "test" \) \
        -prune -exec rm -rf {} + && \
    find /build/venv -type f \( -name "*.pyc" -o -name "*.pyo" \) -delete
# =============================================================================
# STAGE 2: Runtime - Minimal final image
# =============================================================================
FROM nvidia/cuda:12.9.0-runtime-ubuntu24.04
# Set environment variables
@@ -15,20 +63,15 @@ ENV PYTHONUNBUFFERED=1 \
# Application config (override defaults for container)
# Use 127.0.0.1 for --network host mode, or override with -e for bridge mode
PP_DOCLAYOUT_MODEL_DIR=/root/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2 \
PADDLEOCR_VL_URL=http://127.0.0.1:8001/v1
PADDLEOCR_VL_URL=http://127.0.0.1:8001/v1 \
PATH="/app/.venv/bin:$PATH" \
VIRTUAL_ENV="/app/.venv"
# Set working directory
WORKDIR /app
# Install system dependencies and Python 3.10 from deadsnakes PPA
# Install runtime-only system dependencies (NO build tools)
RUN apt-get update && apt-get install -y --no-install-recommends \
software-properties-common \
&& add-apt-repository -y ppa:deadsnakes/ppa \
&& apt-get update && apt-get install -y --no-install-recommends \
python3.10 \
python3.10-venv \
python3.10-dev \
python3.10-distutils \
libgl1 \
libglib2.0-0 \
libsm6 \
@@ -37,34 +80,24 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libgomp1 \
curl \
pandoc \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python3.10 /usr/bin/python \
&& ln -sf /usr/bin/python3.10 /usr/bin/python3 \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
&& rm -rf /var/lib/apt/lists/*
# Install uv via pip (more reliable than install script)
RUN python3.10 -m pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
ENV PATH="/app/.venv/bin:$PATH"
ENV VIRTUAL_ENV="/app/.venv"
# Setup Python symlinks: make `python` and `python3` resolve to python3.10.
# NOTE(review): `ln -sf` succeeds even when the target is missing, so these
# links are only usable if /usr/bin/python3.10 is actually installed in this
# stage - confirm the package list of this stage still provides it.
RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
ln -sf /usr/bin/python3.10 /usr/bin/python3
# Copy dependency files first for better caching
COPY pyproject.toml ./
COPY wheels/ ./wheels/
# Copy pre-built venv from builder stage
# NOTE(review): the environment is created at /build/venv in the builder but
# lands at /app/.venv here; entry-point shebangs and the interpreter link
# inside it must still resolve at the new location - verify by running a
# console script from /app/.venv/bin in the final image.
COPY --from=builder /build/venv /app/.venv
# Create virtual environment and install dependencies
RUN uv venv /app/.venv --python python3.10 \
&& uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . \
&& rm -rf ./wheels
# Copy application code
# Copy application code (excluding model files if they're in the repo)
COPY app/ ./app/
# Create model cache directories (mount from host at runtime)
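# NOTE: Model files under app/model are deleted below, but they were already
# written by the COPY layer above, so deleting them here does not shrink the
# image. To keep the image lean, exclude app/model via .dockerignore.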
RUN mkdir -p /root/.cache/modelscope \
/root/.cache/huggingface \
/root/.paddlex \
/app/app/model/DocLayout \
/app/app/model/PP-DocLayout
/root/.paddlex && \
rm -rf /app/app/model/*
# Declare volumes for model cache (mount at runtime to avoid re-downloading)
VOLUME ["/root/.cache/modelscope", "/root/.cache/huggingface", "/root/.paddlex"]