Compare commits
7 Commits
bd1c118cb2
...
optimize/d
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5ba835ab44 | ||
|
|
7c7d4bf36a | ||
|
|
ef98f37525 | ||
|
|
95c497829f | ||
|
|
6579cf55f5 | ||
|
|
f8173f7c0a | ||
|
|
cff14904bf |
123
Dockerfile
123
Dockerfile
@@ -1,82 +1,103 @@
|
|||||||
# DocProcesser Dockerfile
|
# DocProcesser Dockerfile - Production optimized
|
||||||
# Optimized for RTX 5080 GPU deployment
|
# Ultra-lean multi-stage build for PPDocLayoutV3
|
||||||
|
# Final image: ~3GB (from 17GB)
|
||||||
|
|
||||||
# Use NVIDIA CUDA base image with Python 3.10
|
# =============================================================================
|
||||||
FROM nvidia/cuda:12.9.0-runtime-ubuntu24.04
|
# STAGE 1: Builder
|
||||||
|
# =============================================================================
|
||||||
|
FROM nvidia/cuda:12.9.0-devel-ubuntu24.04 AS builder
|
||||||
|
|
||||||
|
# Install build dependencies (deadsnakes PPA required for python3.10 on Ubuntu 24.04)
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
software-properties-common \
|
||||||
|
&& add-apt-repository -y ppa:deadsnakes/ppa \
|
||||||
|
&& apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3.10 python3.10-venv python3.10-dev python3.10-distutils \
|
||||||
|
build-essential curl \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Setup Python
|
||||||
|
RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
|
||||||
|
curl -sS https://bootstrap.pypa.io/get-pip.py | python
|
||||||
|
|
||||||
|
# Install uv
|
||||||
|
RUN pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
|
|
||||||
|
WORKDIR /build
|
||||||
|
|
||||||
|
# Copy dependencies
|
||||||
|
COPY pyproject.toml ./
|
||||||
|
COPY wheels/ ./wheels/
|
||||||
|
|
||||||
|
# Build venv
|
||||||
|
RUN uv venv /build/venv --python python3.10 && \
|
||||||
|
. /build/venv/bin/activate && \
|
||||||
|
uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . && \
|
||||||
|
rm -rf ./wheels
|
||||||
|
|
||||||
|
# Aggressive optimization: strip debug symbols from .so files (~300-800MB saved)
|
||||||
|
RUN find /build/venv -name "*.so" -exec strip --strip-unneeded {} + || true
|
||||||
|
|
||||||
|
# Remove paddle C++ headers (~22MB saved)
|
||||||
|
RUN rm -rf /build/venv/lib/python*/site-packages/paddle/include
|
||||||
|
|
||||||
|
# Clean Python cache and build artifacts
|
||||||
|
RUN find /build/venv -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \
|
||||||
|
find /build/venv -type f -name "*.pyc" -delete && \
|
||||||
|
find /build/venv -type f -name "*.pyo" -delete && \
|
||||||
|
find /build/venv -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true && \
|
||||||
|
find /build/venv -type d -name "test" -exec rm -rf {} + 2>/dev/null || true && \
|
||||||
|
rm -rf /build/venv/lib/*/site-packages/pip* \
|
||||||
|
/build/venv/lib/*/site-packages/setuptools* \
|
||||||
|
/build/venv/include \
|
||||||
|
/build/venv/share && \
|
||||||
|
rm -rf /root/.cache 2>/dev/null || true
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# STAGE 2: Runtime - built on the CUDA "base" image (~400MB) instead of the "runtime" image (~3.4GB)
|
||||||
|
# =============================================================================
|
||||||
|
FROM nvidia/cuda:12.9.0-base-ubuntu24.04
|
||||||
|
|
||||||
# Set environment variables
|
|
||||||
ENV PYTHONUNBUFFERED=1 \
|
ENV PYTHONUNBUFFERED=1 \
|
||||||
PYTHONDONTWRITEBYTECODE=1 \
|
PYTHONDONTWRITEBYTECODE=1 \
|
||||||
PIP_NO_CACHE_DIR=1 \
|
PIP_NO_CACHE_DIR=1 \
|
||||||
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
||||||
# Model cache directories - mount these at runtime
|
|
||||||
MODELSCOPE_CACHE=/root/.cache/modelscope \
|
MODELSCOPE_CACHE=/root/.cache/modelscope \
|
||||||
HF_HOME=/root/.cache/huggingface \
|
HF_HOME=/root/.cache/huggingface \
|
||||||
# Application config (override defaults for container)
|
|
||||||
# Use 127.0.0.1 for --network host mode, or override with -e for bridge mode
|
|
||||||
PP_DOCLAYOUT_MODEL_DIR=/root/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2 \
|
PP_DOCLAYOUT_MODEL_DIR=/root/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2 \
|
||||||
PADDLEOCR_VL_URL=http://127.0.0.1:8001/v1
|
PADDLEOCR_VL_URL=http://127.0.0.1:8001/v1 \
|
||||||
|
PATH="/app/.venv/bin:$PATH" \
|
||||||
|
VIRTUAL_ENV="/app/.venv"
|
||||||
|
|
||||||
# Set working directory
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Install system dependencies and Python 3.10 from deadsnakes PPA
|
# Minimal runtime dependencies (deadsnakes PPA required for python3.10 on Ubuntu 24.04)
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
software-properties-common \
|
software-properties-common \
|
||||||
&& add-apt-repository -y ppa:deadsnakes/ppa \
|
&& add-apt-repository -y ppa:deadsnakes/ppa \
|
||||||
&& apt-get update && apt-get install -y --no-install-recommends \
|
&& apt-get update && apt-get install -y --no-install-recommends \
|
||||||
python3.10 \
|
python3.10 \
|
||||||
python3.10-venv \
|
libgl1 libglib2.0-0 libgomp1 \
|
||||||
python3.10-dev \
|
curl pandoc \
|
||||||
python3.10-distutils \
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
libgl1 \
|
|
||||||
libglib2.0-0 \
|
|
||||||
libsm6 \
|
|
||||||
libxext6 \
|
|
||||||
libxrender-dev \
|
|
||||||
libgomp1 \
|
|
||||||
curl \
|
|
||||||
pandoc \
|
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
|
||||||
&& ln -sf /usr/bin/python3.10 /usr/bin/python \
|
|
||||||
&& ln -sf /usr/bin/python3.10 /usr/bin/python3 \
|
|
||||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
|
|
||||||
|
|
||||||
# Install uv via pip (more reliable than install script)
|
RUN ln -sf /usr/bin/python3.10 /usr/bin/python
|
||||||
RUN python3.10 -m pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
|
|
||||||
ENV PATH="/app/.venv/bin:$PATH"
|
|
||||||
ENV VIRTUAL_ENV="/app/.venv"
|
|
||||||
|
|
||||||
# Copy dependency files first for better caching
|
# Copy optimized venv from builder
|
||||||
COPY pyproject.toml ./
|
COPY --from=builder /build/venv /app/.venv
|
||||||
COPY wheels/ ./wheels/
|
|
||||||
|
|
||||||
# Create virtual environment and install dependencies
|
# Copy app code
|
||||||
RUN uv venv /app/.venv --python python3.10 \
|
|
||||||
&& uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . \
|
|
||||||
&& rm -rf ./wheels
|
|
||||||
|
|
||||||
# Copy application code
|
|
||||||
COPY app/ ./app/
|
COPY app/ ./app/
|
||||||
|
|
||||||
# Create model cache directories (mount from host at runtime)
|
# Create cache mount points (DO NOT include model files)
|
||||||
RUN mkdir -p /root/.cache/modelscope \
|
RUN mkdir -p /root/.cache/modelscope /root/.cache/huggingface /root/.paddlex && \
|
||||||
/root/.cache/huggingface \
|
rm -rf /app/app/model/*
|
||||||
/root/.paddlex \
|
|
||||||
/app/app/model/DocLayout \
|
|
||||||
/app/app/model/PP-DocLayout
|
|
||||||
|
|
||||||
# Declare volumes for model cache (mount at runtime to avoid re-downloading)
|
|
||||||
VOLUME ["/root/.cache/modelscope", "/root/.cache/huggingface", "/root/.paddlex"]
|
|
||||||
|
|
||||||
# Expose port
|
|
||||||
EXPOSE 8053
|
EXPOSE 8053
|
||||||
|
|
||||||
# Health check
|
|
||||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||||
CMD curl -f http://localhost:8053/health || exit 1
|
CMD curl -f http://localhost:8053/health || exit 1
|
||||||
|
|
||||||
# Run the application
|
|
||||||
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"]
|
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"]
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -13,8 +13,11 @@ Covers:
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
@@ -94,8 +97,18 @@ def clean_repeated_content(
|
|||||||
|
|
||||||
|
|
||||||
def clean_formula_number(number_content: str) -> str:
|
def clean_formula_number(number_content: str) -> str:
|
||||||
"""Strip parentheses from a formula number string, e.g. '(1)' → '1'."""
|
"""Strip delimiters from a formula number string, e.g. '(1)' → '1'.
|
||||||
|
|
||||||
|
Also strips math-mode delimiters ($$, $, \\[...\\], \\(...\\)) that vLLM may add
|
||||||
|
when the region is processed with a formula prompt.
|
||||||
|
"""
|
||||||
s = number_content.strip()
|
s = number_content.strip()
|
||||||
|
# Strip one outer pair of display or inline math delimiters
|
||||||
|
for start, end in [("$$", "$$"), (r"\[", r"\]"), ("$", "$"), (r"\(", r"\)")]:
|
||||||
|
if s.startswith(start) and s.endswith(end) and len(s) > len(start) + len(end):
|
||||||
|
s = s[len(start):-len(end)].strip()
|
||||||
|
break
|
||||||
|
# Strip CJK/ASCII parentheses
|
||||||
if s.startswith("(") and s.endswith(")"):
|
if s.startswith("(") and s.endswith(")"):
|
||||||
return s[1:-1]
|
return s[1:-1]
|
||||||
if s.startswith("(") and s.endswith(")"):
|
if s.startswith("(") and s.endswith(")"):
|
||||||
@@ -253,6 +266,9 @@ class GLMResultFormatter:
|
|||||||
if content.startswith(s) and content.endswith(e):
|
if content.startswith(s) and content.endswith(e):
|
||||||
content = content[len(s) : -len(e)].strip()
|
content = content[len(s) : -len(e)].strip()
|
||||||
break
|
break
|
||||||
|
if not content:
|
||||||
|
logger.warning("Skipping formula region with empty content after stripping delimiters")
|
||||||
|
return ""
|
||||||
content = "$$\n" + content + "\n$$"
|
content = "$$\n" + content + "\n$$"
|
||||||
|
|
||||||
# Text formatting
|
# Text formatting
|
||||||
|
|||||||
@@ -104,7 +104,8 @@ class ImageProcessor:
|
|||||||
"""Add whitespace padding around the image.
|
"""Add whitespace padding around the image.
|
||||||
|
|
||||||
Adds padding equal to padding_ratio * max(height, width) on each side.
|
Adds padding equal to padding_ratio * max(height, width) on each side.
|
||||||
This expands the image by approximately 30% total (15% on each side).
|
For small images (height < 80 or width < 500), a fixed padding_ratio of 0.2 is used instead of self.padding_ratio.
|
||||||
|
This expands the image by approximately 30% total (15% on each side) for normal images.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image: Input image as numpy array in BGR format.
|
image: Input image as numpy array in BGR format.
|
||||||
@@ -113,7 +114,9 @@ class ImageProcessor:
|
|||||||
Padded image as numpy array.
|
Padded image as numpy array.
|
||||||
"""
|
"""
|
||||||
height, width = image.shape[:2]
|
height, width = image.shape[:2]
|
||||||
padding = int(max(height, width) * self.padding_ratio)
|
# Small images (height < 80 or width < 500) use a fixed 0.2 padding ratio instead of self.padding_ratio, to preserve detail
|
||||||
|
padding_ratio = 0.2 if height < 80 or width < 500 else self.padding_ratio
|
||||||
|
padding = int(max(height, width) * padding_ratio)
|
||||||
|
|
||||||
# Add white padding on all sides
|
# Add white padding on all sides
|
||||||
padded_image = cv2.copyMakeBorder(
|
padded_image = cv2.copyMakeBorder(
|
||||||
|
|||||||
@@ -66,7 +66,9 @@ class LayoutDetector:
|
|||||||
# Formula types
|
# Formula types
|
||||||
"display_formula": "formula",
|
"display_formula": "formula",
|
||||||
"inline_formula": "formula",
|
"inline_formula": "formula",
|
||||||
"formula_number": "formula",
|
# formula_number is a plain text annotation "(2.9)" next to a formula,
|
||||||
|
# not a formula itself — use text prompt so vLLM returns plain text
|
||||||
|
"formula_number": "text",
|
||||||
# Table types
|
# Table types
|
||||||
"table": "table",
|
"table": "table",
|
||||||
# Figure types
|
# Figure types
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"""PaddleOCR-VL client service for text and formula recognition."""
|
"""PaddleOCR-VL client service for text and formula recognition."""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
@@ -20,6 +21,7 @@ from app.services.image_processor import ImageProcessor
|
|||||||
from app.services.layout_detector import LayoutDetector
|
from app.services.layout_detector import LayoutDetector
|
||||||
|
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_COMMANDS_NEED_SPACE = {
|
_COMMANDS_NEED_SPACE = {
|
||||||
# operators / calculus
|
# operators / calculus
|
||||||
@@ -883,10 +885,19 @@ class GLMOCREndToEndService(OCRServiceBase):
|
|||||||
# 2. Layout detection
|
# 2. Layout detection
|
||||||
layout_info = self.layout_detector.detect(padded)
|
layout_info = self.layout_detector.detect(padded)
|
||||||
|
|
||||||
|
# Sort regions in reading order: top-to-bottom, left-to-right
|
||||||
|
layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0]))
|
||||||
|
|
||||||
# 3. OCR: per-region (parallel) or full-image fallback
|
# 3. OCR: per-region (parallel) or full-image fallback
|
||||||
if not layout_info.regions:
|
if not layout_info.regions:
|
||||||
raw_content = self._call_vllm(padded, _DEFAULT_PROMPT)
|
# No layout detected → assume it's a formula, use formula recognition
|
||||||
markdown_content = self._formatter._clean_content(raw_content)
|
logger.info("No layout regions detected, treating image as formula")
|
||||||
|
raw_content = self._call_vllm(padded, _TASK_PROMPTS["formula"])
|
||||||
|
# Format as display formula markdown
|
||||||
|
formatted_content = raw_content.strip()
|
||||||
|
if not (formatted_content.startswith("$$") and formatted_content.endswith("$$")):
|
||||||
|
formatted_content = f"$$\n{formatted_content}\n$$"
|
||||||
|
markdown_content = formatted_content
|
||||||
else:
|
else:
|
||||||
# Build task list for non-figure regions
|
# Build task list for non-figure regions
|
||||||
tasks = []
|
tasks = []
|
||||||
@@ -895,7 +906,13 @@ class GLMOCREndToEndService(OCRServiceBase):
|
|||||||
continue
|
continue
|
||||||
x1, y1, x2, y2 = (int(c) for c in region.bbox)
|
x1, y1, x2, y2 = (int(c) for c in region.bbox)
|
||||||
cropped = padded[y1:y2, x1:x2]
|
cropped = padded[y1:y2, x1:x2]
|
||||||
if cropped.size == 0:
|
if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10:
|
||||||
|
logger.warning(
|
||||||
|
"Skipping region idx=%d (label=%s): crop too small %s",
|
||||||
|
idx,
|
||||||
|
region.native_label,
|
||||||
|
cropped.shape[:2],
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT)
|
prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT)
|
||||||
tasks.append((idx, region, cropped, prompt))
|
tasks.append((idx, region, cropped, prompt))
|
||||||
@@ -915,7 +932,8 @@ class GLMOCREndToEndService(OCRServiceBase):
|
|||||||
idx = future_map[future]
|
idx = future_map[future]
|
||||||
try:
|
try:
|
||||||
raw_results[idx] = future.result()
|
raw_results[idx] = future.result()
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
logger.warning("vLLM call failed for region idx=%d: %s", idx, e)
|
||||||
raw_results[idx] = ""
|
raw_results[idx] = ""
|
||||||
|
|
||||||
# Build structured region dicts for GLMResultFormatter
|
# Build structured region dicts for GLMResultFormatter
|
||||||
@@ -940,8 +958,11 @@ class GLMOCREndToEndService(OCRServiceBase):
|
|||||||
# 6. Format conversion
|
# 6. Format conversion
|
||||||
latex, mathml, mml = "", "", ""
|
latex, mathml, mml = "", "", ""
|
||||||
if markdown_content and self.converter:
|
if markdown_content and self.converter:
|
||||||
fmt = self.converter.convert_to_formats(markdown_content)
|
try:
|
||||||
latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml
|
fmt = self.converter.convert_to_formats(markdown_content)
|
||||||
|
latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml
|
||||||
|
except RuntimeError as e:
|
||||||
|
logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e)
|
||||||
|
|
||||||
return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}
|
return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ authors = [
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"fastapi==0.128.0",
|
"fastapi==0.128.0",
|
||||||
"uvicorn[standard]==0.40.0",
|
"uvicorn[standard]==0.40.0",
|
||||||
"opencv-python==4.12.0.88",
|
"opencv-python-headless==4.12.0.88", # headless: no Qt/FFmpeg GUI, server-only
|
||||||
"python-multipart==0.0.21",
|
"python-multipart==0.0.21",
|
||||||
"pydantic==2.12.5",
|
"pydantic==2.12.5",
|
||||||
"pydantic-settings==2.12.0",
|
"pydantic-settings==2.12.0",
|
||||||
@@ -20,7 +20,6 @@ dependencies = [
|
|||||||
"pillow==12.0.0",
|
"pillow==12.0.0",
|
||||||
"python-docx==1.2.0",
|
"python-docx==1.2.0",
|
||||||
"paddleocr==3.4.0",
|
"paddleocr==3.4.0",
|
||||||
"doclayout-yolo==0.0.4",
|
|
||||||
"latex2mathml==3.78.1",
|
"latex2mathml==3.78.1",
|
||||||
"paddle==1.2.0",
|
"paddle==1.2.0",
|
||||||
"pypandoc==1.16.2",
|
"pypandoc==1.16.2",
|
||||||
|
|||||||
Reference in New Issue
Block a user