diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..c01ecbc --- /dev/null +++ b/.dockerignore @@ -0,0 +1,55 @@ +# Git +.git +.gitignore + +# Python +.venv/ +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg-info/ +.eggs/ +dist/ +build/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +test/ +tests/ + +# Linting & IDE +.ruff_cache/ +.mypy_cache/ +.cursor/ +.vscode/ +.idea/ +*.swp +*.swo + +# Environment +.env +.env.* +!.env.example + +# Documentation (not needed in container) +*.md +!README.md +openspec/ + +# Models (mounted at runtime, not built into image) +app/model/doclayout/*.pdiparams +app/model/DocLayout/ +app/model/PP-DocLayout/ + +# Misc +*.log +*.tmp +.DS_Store +Thumbs.db + +test/ + diff --git a/.gitignore b/.gitignore index e49f677..d9d72c3 100644 --- a/.gitignore +++ b/.gitignore @@ -71,3 +71,5 @@ htmlcov/ uv.lock model/ + +test/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 3f3b60c..1586f2b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,54 +1,73 @@ # DocProcesser Dockerfile # Optimized for RTX 5080 GPU deployment -# Use NVIDIA CUDA base image with Python 3.11 +# Use NVIDIA CUDA base image with Python 3.10 FROM nvidia/cuda:12.8.0-runtime-ubuntu24.04 # Set environment variables ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 \ PIP_NO_CACHE_DIR=1 \ - PIP_DISABLE_PIP_VERSION_CHECK=1 + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + # Model cache directories - mount these at runtime + MODELSCOPE_CACHE=/root/.cache/modelscope \ + HF_HOME=/root/.cache/huggingface \ + # Application config (override defaults for container) + # Use 127.0.0.1 for --network host mode, or override with -e for bridge mode + PP_DOCLAYOUT_MODEL_DIR=/root/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2 \ + PADDLEOCR_VL_URL=http://127.0.0.1:8000/v1 # Set working directory WORKDIR /app -# Install system dependencies +# Install system dependencies and Python 3.10 from deadsnakes PPA RUN apt-get update && apt-get install -y --no-install-recommends \ - python3.11 \ - python3.11-venv \ - python3.11-dev \ - python3-pip \ - libgl1-mesa-glx \ + software-properties-common \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update && apt-get install -y --no-install-recommends \ + python3.10 \ + python3.10-venv \ + python3.10-dev \ + python3.10-distutils \ + libgl1 \ libglib2.0-0 \ libsm6 \ libxext6 \ libxrender-dev \ libgomp1 \ curl \ + pandoc \ && rm -rf /var/lib/apt/lists/* \ - && ln -sf /usr/bin/python3.11 /usr/bin/python \ - && ln -sf /usr/bin/python3.11 /usr/bin/python3 + && ln -sf /usr/bin/python3.10 /usr/bin/python \ + && ln -sf /usr/bin/python3.10 /usr/bin/python3 \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 -# Install uv for fast package management -RUN curl -LsSf https://astral.sh/uv/install.sh | sh -ENV PATH="/root/.local/bin:$PATH" - -# Copy dependency files first for better caching -COPY pyproject.toml ./ - -# Create virtual environment and install dependencies -RUN uv venv /app/.venv +# Install uv via pip (more reliable than install script) +RUN python3.10 -m pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple ENV PATH="/app/.venv/bin:$PATH" ENV VIRTUAL_ENV="/app/.venv" -RUN uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . +# Copy dependency files first for better caching +COPY pyproject.toml ./ +COPY wheels/ ./wheels/ + +# Create virtual environment and install dependencies +RUN uv venv /app/.venv --python python3.10 \ + && uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . 
\ + && rm -rf ./wheels # Copy application code COPY app/ ./app/ -# Create model directories (models should be mounted at runtime) -RUN mkdir -p /app/app/model/DocLayout /app/app/model/PP-DocLayout +# Create model cache directories (mount from host at runtime) +RUN mkdir -p /root/.cache/modelscope \ + /root/.cache/huggingface \ + /root/.paddlex \ + /app/app/model/DocLayout \ + /app/app/model/PP-DocLayout + +# Declare volumes for model cache (mount at runtime to avoid re-downloading) +VOLUME ["/root/.cache/modelscope", "/root/.cache/huggingface", "/root/.paddlex"] # Expose port EXPOSE 8053 @@ -60,3 +79,21 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ # Run the application CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"] +# ============================================================================= +# Usage: Mount local model cache to avoid downloading +# +# Option 1: Use host network (simplest, can access localhost services) +# docker run --gpus all --network host \ +# -v /home/yoge/.paddlex:/root/.paddlex:ro \ +# -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \ +# -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \ +# doc_processer:latest +# +# Option 2: Use bridge network with host.docker.internal (Linux needs --add-host) +# docker run --gpus all -p 8053:8053 \ +# --add-host=host.docker.internal:host-gateway \ +# -v /home/yoge/.paddlex:/root/.paddlex:ro \ +# -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \ +# -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \ +# doc_processer:latest +# ============================================================================= diff --git a/app/api/v1/endpoints/convert.py b/app/api/v1/endpoints/convert.py index 256c085..ea381fd 100644 --- a/app/api/v1/endpoints/convert.py +++ b/app/api/v1/endpoints/convert.py @@ -3,34 +3,28 @@ from fastapi import APIRouter, Depends, HTTPException from fastapi.responses import Response -from app.core.dependencies import get_docx_converter +from app.core.dependencies import get_converter from app.schemas.convert import MarkdownToDocxRequest -from app.services.docx_converter import DocxConverter +from app.services.converter import Converter router = APIRouter() -@router.post("/docx") +@router.post("/file") async def convert_markdown_to_docx( request: MarkdownToDocxRequest, - converter: DocxConverter = Depends(get_docx_converter), + converter: Converter = Depends(get_converter), ) -> Response: """Convert markdown content to DOCX file. - Returns the generated DOCX file as a binary download. + Returns the generated DOCX file as a binary response. 
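+
+    Example (a sketch using httpx; assumes the router is mounted under
+    /api/v1/convert and the service listens on port 8053 -- adjust to your
+    deployment):
+
+        import httpx
+
+        resp = httpx.post(
+            "http://localhost:8053/api/v1/convert/file",
+            json={"markdown": "# Title and $$E = mc^2$$", "filename": "demo"},
+        )
+        with open("demo.docx", "wb") as f:
+            f.write(resp.content)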
""" try: - docx_bytes = converter.convert(request.markdown) + docx_bytes = converter.export_to_file(request.markdown, export_type="docx") + return Response( + content=docx_bytes, + media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + headers={"Content-Disposition": f'attachment; filename="{request.filename}.docx"'}, + ) except Exception as e: raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") - - # Determine filename - filename = request.filename or "output" - if not filename.endswith(".docx"): - filename = f"{filename}.docx" - - return Response( - content=docx_bytes, - media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", - headers={"Content-Disposition": f'attachment; filename="{filename}"'}, - ) diff --git a/app/api/v1/endpoints/image.py b/app/api/v1/endpoints/image.py index c194213..635ebf7 100644 --- a/app/api/v1/endpoints/image.py +++ b/app/api/v1/endpoints/image.py @@ -28,24 +28,15 @@ async def process_image_ocr( - Otherwise: use PaddleOCR-VL with formula prompt 4. Convert output to LaTeX, Markdown, and MathML formats """ - try: - # 1. Load and preprocess image - image = image_processor.preprocess( - image_url=request.image_url, - image_base64=request.image_base64, - ) - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - try: - # 2. Detect layout - layout_info = layout_detector.detect(image) - except RuntimeError as e: - raise HTTPException(status_code=500, detail=f"Layout detection failed: {e}") + image = image_processor.preprocess( + image_url=request.image_url, + image_base64=request.image_base64, + ) try: # 3. Perform OCR based on layout - ocr_result = ocr_service.recognize(image, layout_info) + ocr_result = ocr_service.recognize(image) except RuntimeError as e: raise HTTPException(status_code=503, detail=str(e)) @@ -54,6 +45,4 @@ async def process_image_ocr( latex=ocr_result.get("latex", ""), markdown=ocr_result.get("markdown", ""), mathml=ocr_result.get("mathml", ""), - layout_info=layout_info, - recognition_mode=ocr_result.get("recognition_mode", ""), ) diff --git a/app/core/config.py b/app/core/config.py index af18a14..c3d81a7 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -5,6 +5,7 @@ from pathlib import Path from pydantic_settings import BaseSettings, SettingsConfigDict import torch +from typing import Optional class Settings(BaseSettings): @@ -21,11 +22,10 @@ class Settings(BaseSettings): debug: bool = False # PaddleOCR-VL Settings - paddleocr_vl_url: str = "http://localhost:8080/v1" + paddleocr_vl_url: str = "http://127.0.0.1:8000/v1" # Model Paths - doclayout_model_path: str = "app/model/DocLayout/best.pt" - pp_doclayout_model_dir: str = "app/model/PP-DocLayout/PP-DocLayoutV2" + pp_doclayout_model_dir: Optional[str] = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2" # Image Processing max_image_size_mb: int = 10 @@ -37,11 +37,6 @@ class Settings(BaseSettings): host: str = "0.0.0.0" port: int = 8053 - @property - def doclayout_model_file(self) -> Path: - """Get the DocLayout model file path.""" - return Path(self.doclayout_model_path) - @property def pp_doclayout_dir(self) -> Path: """Get the PP-DocLayout model directory path.""" diff --git a/app/core/dependencies.py b/app/core/dependencies.py index dcd04ae..ea19022 100644 --- a/app/core/dependencies.py +++ b/app/core/dependencies.py @@ -3,20 +3,20 @@ from app.services.image_processor import ImageProcessor from app.services.layout_detector import LayoutDetector from 
app.services.ocr_service import OCRService -from app.services.docx_converter import DocxConverter +from app.services.converter import Converter +from app.core.config import get_settings # Global instances (initialized on startup) _layout_detector: LayoutDetector | None = None -def init_layout_detector(model_path: str) -> None: +def init_layout_detector() -> None: """Initialize the global layout detector. Called during application startup. """ global _layout_detector - _layout_detector = LayoutDetector(model_path=model_path) - _layout_detector.load_model() + _layout_detector = LayoutDetector() def get_layout_detector() -> LayoutDetector: @@ -33,10 +33,15 @@ def get_image_processor() -> ImageProcessor: def get_ocr_service() -> OCRService: """Get an OCR service instance.""" - return OCRService() + return OCRService( + vl_server_url=get_settings().paddleocr_vl_url, + layout_detector=get_layout_detector(), + image_processor=get_image_processor(), + converter=get_converter(), + ) -def get_docx_converter() -> DocxConverter: +def get_converter() -> Converter: """Get a DOCX converter instance.""" - return DocxConverter() + return Converter() diff --git a/app/main.py b/app/main.py index 174b5ae..88d9fe2 100644 --- a/app/main.py +++ b/app/main.py @@ -15,7 +15,7 @@ settings = get_settings() async def lifespan(app: FastAPI): """Application lifespan handler for startup/shutdown.""" # Startup: Load models - init_layout_detector(model_path=settings.doclayout_model_path) + init_layout_detector() yield @@ -37,3 +37,9 @@ app.include_router(api_router, prefix=settings.api_prefix) async def health_check(): """Health check endpoint.""" return {"status": "healthy"} + + + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8053) \ No newline at end of file diff --git a/app/pkg/reference.docx b/app/pkg/reference.docx new file mode 100644 index 0000000..9f8bdb4 Binary files /dev/null and b/app/pkg/reference.docx differ diff --git a/app/schemas/convert.py b/app/schemas/convert.py index 09661e3..97f933e 100644 --- a/app/schemas/convert.py +++ b/app/schemas/convert.py @@ -7,7 +7,7 @@ class MarkdownToDocxRequest(BaseModel): """Request body for markdown to DOCX conversion endpoint.""" markdown: str = Field(..., description="Markdown content to convert") - filename: str | None = Field(None, description="Optional output filename (without extension)") + filename: str = Field("texpixel", description="Optional output filename (without extension)") @field_validator("markdown") @classmethod diff --git a/app/schemas/image.py b/app/schemas/image.py index ed81233..3378843 100644 --- a/app/schemas/image.py +++ b/app/schemas/image.py @@ -9,14 +9,15 @@ class LayoutRegion(BaseModel): type: str = Field(..., description="Region type: text, formula, table, figure") bbox: list[float] = Field(..., description="Bounding box [x1, y1, x2, y2]") confidence: float = Field(..., description="Detection confidence score") + score: float = Field(..., description="Detection score") class LayoutInfo(BaseModel): """Layout detection information.""" regions: list[LayoutRegion] = Field(default_factory=list) - has_plain_text: bool = Field(False, description="Whether plain text was detected") - has_formula: bool = Field(False, description="Whether formulas were detected") + MixedRecognition: bool = Field(False, description="Whether mixed recognition was used") + # FormulaRecognition: bool = Field(False, description="Whether formula recognition (with prompt) was used") class ImageOCRRequest(BaseModel): diff --git 
a/app/services/converter.py b/app/services/converter.py
new file mode 100644
index 0000000..4cf73a8
--- /dev/null
+++ b/app/services/converter.py
@@ -0,0 +1,312 @@
+"""Markdown conversion and export service using pypandoc."""
+
+import os
+import re
+import tempfile
+from dataclasses import dataclass
+from typing import Literal
+
+import pypandoc
+
+
+@dataclass
+class ConvertResult:
+    """Result of markdown conversion."""
+
+    latex: str
+    mathml: str
+
+
+@dataclass
+class ExportResult:
+    """Result of markdown export."""
+
+    file_path: str
+    content_type: str
+    download_name: str
+
+
+ExportType = Literal["docx", "pdf"]
+
+
+class Converter:
+    """Service for conversion and export operations."""
+
+    # Pandoc input format with LaTeX math extensions
+    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
+
+    def __init__(self):
+        """Initialize converter."""
+
+    def convert_to_formats(self, md_text: str) -> ConvertResult:
+        """Convert markdown to LaTeX and MathML formats.
+
+        Args:
+            md_text: Markdown text to convert.
+
+        Returns:
+            ConvertResult with latex and mathml fields; both are empty
+            strings when md_text is empty.
+
+        Raises:
+            RuntimeError: If conversion fails.
+        """
+        if md_text == "":
+            return ConvertResult(latex="", mathml="")
+
+        try:
+            # Convert to LaTeX
+            latex_output = pypandoc.convert_text(
+                md_text,
+                "latex",
+                format=self.INPUT_FORMAT,
+            ).rstrip("\n")
+
+            # Convert to HTML with MathML
+            mathml_output = pypandoc.convert_text(
+                md_text,
+                "html",
+                format=self.INPUT_FORMAT,
+                extra_args=["--mathml"],
+            ).rstrip("\n")
+
+            return ConvertResult(latex=latex_output, mathml=mathml_output)
+
+        except Exception as e:
+            raise RuntimeError(f"Conversion failed: {e}") from e
+
+    def preprocess_for_export(self, md_text: str) -> str:
+        """Preprocess markdown text for export to docx/pdf.
+
+        Handles LaTeX formula formatting, matrix environments, and
+        other transformations needed for proper Word/PDF rendering.
+
+        Args:
+            md_text: Raw markdown text.
+
+        Returns:
+            Preprocessed markdown text.
+        """
+        # Replace \[1mm] => \vspace{1mm}
+        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)
+
+        # Add blank lines around \[...\] block formulas
+        md_text = re.sub(
+            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
+            r"\1\n\n\\[\3\\]\n\n\4",
+            md_text,
+            flags=re.DOTALL,
+        )
+        md_text = re.sub(
+            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
+            r"\n\\[\2\\]\n",
+            md_text,
+            flags=re.MULTILINE | re.DOTALL,
+        )
+
+        # Remove arithmatex span wrappers
+        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)
+
+        # Convert inline formulas: \( \) => $ $
+        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
+        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
+
+        # Convert block formulas: \[ \] => $$ $$
+        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
+        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
+
+        # Remove spaces between $ and formula content
+        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
+        cleaned_md = re.sub(r"(?<!\$)\$\s+", r"$", cleaned_md)
+        cleaned_md = re.sub(r"\s+\$(?!\$)", r"$", cleaned_md)
+
+        # Normalize environments that render poorly in Word/OMML
+        cleaned_md = self._convert_matrix_environments(cleaned_md)
+        cleaned_md = self._fix_brace_spacing(cleaned_md)
+        cleaned_md = self._convert_special_environments(cleaned_md)
+
+        return cleaned_md
+
+    def _convert_matrix_environments(self, md_text: str) -> str:
+        """Convert vmatrix/Vmatrix to left/right delimited forms.
+
+        This fixes the vertical line height issues in Word.
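+
+        Illustrative example (standard LaTeX vmatrix input assumed):
+            \begin{vmatrix} a & b \\ c & d \end{vmatrix}
+        is rewritten to
+            \left| \begin{matrix} a & b \\ c & d \end{matrix} \right|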
+ """ + # vmatrix -> \left| \begin{matrix}...\end{matrix} \right| + md_text = re.sub( + r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", + r"\\left| \\begin{matrix}\1\\end{matrix} \\right|", + md_text, + flags=re.DOTALL, + ) + + # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\| + md_text = re.sub( + r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", + r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|", + md_text, + flags=re.DOTALL, + ) + + return md_text + + def _fix_brace_spacing(self, md_text: str) -> str: + """Fix spacing issues with braces in equation systems. + + Removes whitespace and adds negative space for proper alignment in Word/OMML. + """ + # Fix \left\{ spacing + md_text = re.sub( + r"\\left\\\{\s+", + r"\\left\\{\\!", + md_text, + ) + + # Fix \right\} spacing + md_text = re.sub( + r"\s+\\right\\\}", + r"\\!\\right\\}", + md_text, + ) + + return md_text + + def _convert_special_environments(self, md_text: str) -> str: + """Convert cases and aligned environments to array format. + + These environments have better rendering support in Word/OMML. + """ + + def convert_cases(match: re.Match) -> str: + content = match.group(1) + return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right." + + md_text = re.sub( + r"\\begin\{cases\}(.*?)\\end\{cases\}", + convert_cases, + md_text, + flags=re.DOTALL, + ) + + def convert_aligned_to_array(match: re.Match) -> str: + content = match.group(1) + # Remove leading & alignment markers (not needed in array{l}) + content = re.sub(r"(^|\\\\)\s*&", r"\1", content) + return r"\left\{\begin{array}{l}" + content + r"\end{array}\right." + + md_text = re.sub( + r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", + convert_aligned_to_array, + md_text, + flags=re.DOTALL, + ) + + def convert_standalone_aligned(match: re.Match) -> str: + content = match.group(1) + content = re.sub(r"(^|\\\\)\s*&", r"\1", content) + return r"\begin{array}{l}" + content + r"\end{array}" + + md_text = re.sub( + r"\\begin\{aligned\}(.*?)\\end\{aligned\}", + convert_standalone_aligned, + md_text, + flags=re.DOTALL, + ) + + return md_text + + def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes: + """Export markdown to docx or pdf file. + + Args: + md_text: Markdown text to export. + export_type: Export format, either 'docx' or 'pdf'. + + Returns: + bytes of the exported file. + + Raises: + ValueError: If export_type is not supported. + RuntimeError: If export fails. + + """ + + # Preprocess markdown + cleaned_md = self.preprocess_for_export(md_text) + + # Create temp file for input + with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in: + f_in.write(cleaned_md.encode("utf-8")) + md_path = f_in.name + + output_file = md_path + "." 
+ export_type + + try: + if export_type == "docx": + self._export_docx(md_path, output_file) + with open(output_file, "rb") as f: + return f.read() + else: # pdf + self._export_pdf(md_path, output_file) + with open(output_file, "rb") as f: + return f.read() + + except Exception as e: + # Cleanup on error + self._cleanup_files(md_path, output_file) + raise RuntimeError(f"Export failed: {e}") from e + finally: + # Always cleanup input file + if os.path.exists(md_path): + os.remove(md_path) + + def _export_docx(self, input_path: str, output_path: str) -> None: + """Export to DOCX format using pypandoc.""" + extra_args = [ + "--highlight-style=pygments", + f"--reference-doc=app/pkg/reference.docx", + ] + pypandoc.convert_file( + input_path, + "docx", + format=self.INPUT_FORMAT, + outputfile=output_path, + extra_args=extra_args, + ) + + def _export_pdf(self, input_path: str, output_path: str) -> None: + """Export to PDF format using pypandoc with XeLaTeX.""" + extra_args = [ + "--pdf-engine=xelatex", + "-V", + "mainfont=Noto Sans CJK SC", + "--highlight-style=pygments", + ] + pypandoc.convert_file( + input_path, + "pdf", + format=self.INPUT_FORMAT, + outputfile=output_path, + extra_args=extra_args, + ) + + def _cleanup_files(self, *paths: str) -> None: + """Remove files if they exist.""" + for path in paths: + if os.path.exists(path): + os.remove(path) + + def cleanup_export_file(self, file_path: str) -> None: + """Cleanup exported file after sending response. + + Call this after sending the file to the client. + + Args: + file_path: Path to the exported file. + """ + if os.path.exists(file_path): + os.remove(file_path) + diff --git a/app/services/docx_converter.py b/app/services/docx_converter.py deleted file mode 100644 index 6364507..0000000 --- a/app/services/docx_converter.py +++ /dev/null @@ -1,335 +0,0 @@ -"""Markdown to DOCX conversion service. - -Reference implementation based on https://github.com/YogeLiu/markdown_2_docx -""" - -import io -import re -from dataclasses import dataclass - -from docx import Document -from docx.enum.text import WD_ALIGN_PARAGRAPH -from docx.oxml import OxmlElement -from docx.oxml.ns import qn -from docx.shared import Inches, Pt - - -@dataclass -class MarkdownElement: - """Parsed markdown element.""" - - type: str # heading, paragraph, list_item, code_block, table, math - content: str - level: int = 0 # For headings and lists - language: str = "" # For code blocks - - -class DocxConverter: - """Converts markdown content to DOCX format.""" - - def __init__(self): - """Initialize the converter.""" - self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$") - self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$") - self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$") - self.code_block_pattern = re.compile(r"^```(\w*)$") - self.inline_code_pattern = re.compile(r"`([^`]+)`") - self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*") - self.italic_pattern = re.compile(r"\*([^*]+)\*") - self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL) - self.inline_math_pattern = re.compile(r"\$([^$]+)\$") - - def convert(self, markdown: str) -> bytes: - """Convert markdown content to DOCX. - - Args: - markdown: Markdown content to convert. - - Returns: - DOCX file as bytes. 
- """ - doc = Document() - elements = self._parse_markdown(markdown) - - for element in elements: - self._add_element_to_doc(doc, element) - - # Save to bytes - buffer = io.BytesIO() - doc.save(buffer) - buffer.seek(0) - return buffer.getvalue() - - def _parse_markdown(self, markdown: str) -> list[MarkdownElement]: - """Parse markdown into elements. - - Args: - markdown: Markdown content. - - Returns: - List of parsed elements. - """ - elements: list[MarkdownElement] = [] - lines = markdown.split("\n") - i = 0 - in_code_block = False - code_content = [] - code_language = "" - - while i < len(lines): - line = lines[i] - - # Code block handling - code_match = self.code_block_pattern.match(line) - if code_match: - if in_code_block: - elements.append( - MarkdownElement( - type="code_block", - content="\n".join(code_content), - language=code_language, - ) - ) - code_content = [] - in_code_block = False - else: - in_code_block = True - code_language = code_match.group(1) - i += 1 - continue - - if in_code_block: - code_content.append(line) - i += 1 - continue - - # Math block ($$...$$) - if line.strip().startswith("$$"): - math_content = [] - if line.strip() == "$$": - i += 1 - while i < len(lines) and lines[i].strip() != "$$": - math_content.append(lines[i]) - i += 1 - else: - # Single line $$...$$ or start - content = line.strip()[2:] - if content.endswith("$$"): - math_content.append(content[:-2]) - else: - math_content.append(content) - i += 1 - while i < len(lines): - if lines[i].strip().endswith("$$"): - math_content.append(lines[i].strip()[:-2]) - break - math_content.append(lines[i]) - i += 1 - - elements.append( - MarkdownElement(type="math", content="\n".join(math_content)) - ) - i += 1 - continue - - # Heading - heading_match = self.heading_pattern.match(line) - if heading_match: - level = len(heading_match.group(1)) - content = heading_match.group(2) - elements.append( - MarkdownElement(type="heading", content=content, level=level) - ) - i += 1 - continue - - # Unordered list - list_match = self.list_pattern.match(line) - if list_match: - indent = len(list_match.group(1)) - content = list_match.group(2) - elements.append( - MarkdownElement(type="list_item", content=content, level=indent // 2) - ) - i += 1 - continue - - # Ordered list - ordered_match = self.ordered_list_pattern.match(line) - if ordered_match: - indent = len(ordered_match.group(1)) - content = ordered_match.group(2) - elements.append( - MarkdownElement( - type="ordered_list_item", content=content, level=indent // 2 - ) - ) - i += 1 - continue - - # Table (simple detection) - if "|" in line and i + 1 < len(lines) and "---" in lines[i + 1]: - table_lines = [line] - i += 1 - while i < len(lines) and "|" in lines[i]: - table_lines.append(lines[i]) - i += 1 - elements.append( - MarkdownElement(type="table", content="\n".join(table_lines)) - ) - continue - - # Regular paragraph - if line.strip(): - elements.append(MarkdownElement(type="paragraph", content=line)) - - i += 1 - - return elements - - def _add_element_to_doc(self, doc: Document, element: MarkdownElement) -> None: - """Add a markdown element to the document. - - Args: - doc: Word document. - element: Parsed markdown element. 
- """ - if element.type == "heading": - self._add_heading(doc, element.content, element.level) - elif element.type == "paragraph": - self._add_paragraph(doc, element.content) - elif element.type == "list_item": - self._add_list_item(doc, element.content, element.level, ordered=False) - elif element.type == "ordered_list_item": - self._add_list_item(doc, element.content, element.level, ordered=True) - elif element.type == "code_block": - self._add_code_block(doc, element.content) - elif element.type == "table": - self._add_table(doc, element.content) - elif element.type == "math": - self._add_math(doc, element.content) - - def _add_heading(self, doc: Document, content: str, level: int) -> None: - """Add a heading to the document.""" - # Map markdown levels to Word heading styles - heading_level = min(level, 9) # Word supports up to Heading 9 - doc.add_heading(content, level=heading_level) - - def _add_paragraph(self, doc: Document, content: str) -> None: - """Add a paragraph with inline formatting.""" - para = doc.add_paragraph() - self._add_formatted_text(para, content) - - def _add_formatted_text(self, para, content: str) -> None: - """Add text with inline formatting (bold, italic, code).""" - # Simple approach: process inline patterns - remaining = content - - while remaining: - # Find next formatting marker - bold_match = self.bold_pattern.search(remaining) - italic_match = self.italic_pattern.search(remaining) - code_match = self.inline_code_pattern.search(remaining) - math_match = self.inline_math_pattern.search(remaining) - - matches = [ - (bold_match, "bold"), - (italic_match, "italic"), - (code_match, "code"), - (math_match, "math"), - ] - matches = [(m, t) for m, t in matches if m] - - if not matches: - para.add_run(remaining) - break - - # Find earliest match - earliest = min(matches, key=lambda x: x[0].start()) - match, match_type = earliest - - # Add text before match - if match.start() > 0: - para.add_run(remaining[: match.start()]) - - # Add formatted text - run = para.add_run(match.group(1)) - if match_type == "bold": - run.bold = True - elif match_type == "italic": - run.italic = True - elif match_type == "code": - run.font.name = "Courier New" - run.font.size = Pt(10) - elif match_type == "math": - run.italic = True - - remaining = remaining[match.end() :] - - def _add_list_item( - self, doc: Document, content: str, level: int, ordered: bool - ) -> None: - """Add a list item.""" - para = doc.add_paragraph(style="List Bullet" if not ordered else "List Number") - para.paragraph_format.left_indent = Inches(0.25 * level) - self._add_formatted_text(para, content) - - def _add_code_block(self, doc: Document, content: str) -> None: - """Add a code block.""" - para = doc.add_paragraph() - para.paragraph_format.left_indent = Inches(0.5) - - run = para.add_run(content) - run.font.name = "Courier New" - run.font.size = Pt(9) - - # Add shading - shading = OxmlElement("w:shd") - shading.set(qn("w:val"), "clear") - shading.set(qn("w:fill"), "F0F0F0") - para._p.get_or_add_pPr().append(shading) - - def _add_table(self, doc: Document, content: str) -> None: - """Add a table from markdown table format.""" - lines = [l.strip() for l in content.split("\n") if l.strip()] - if len(lines) < 2: - return - - # Parse header - header = [c.strip() for c in lines[0].split("|") if c.strip()] - - # Skip separator line - data_lines = lines[2:] if len(lines) > 2 else [] - - # Create table - table = doc.add_table(rows=1, cols=len(header)) - table.style = "Table Grid" - - # Add header - header_cells = 
table.rows[0].cells - for i, text in enumerate(header): - header_cells[i].text = text - header_cells[i].paragraphs[0].runs[0].bold = True - - # Add data rows - for line in data_lines: - cells = [c.strip() for c in line.split("|") if c.strip()] - row_cells = table.add_row().cells - for i, text in enumerate(cells): - if i < len(row_cells): - row_cells[i].text = text - - def _add_math(self, doc: Document, content: str) -> None: - """Add a math block. - - For proper OMML rendering, this would need more complex conversion. - Currently renders as italic text with the LaTeX source. - """ - para = doc.add_paragraph() - para.alignment = WD_ALIGN_PARAGRAPH.CENTER - - run = para.add_run(content) - run.italic = True - run.font.name = "Cambria Math" - run.font.size = Pt(12) - diff --git a/app/services/image_processor.py b/app/services/image_processor.py index 34a6419..d7abed1 100644 --- a/app/services/image_processor.py +++ b/app/services/image_processor.py @@ -116,7 +116,7 @@ class ImageProcessor: else: raise ValueError("Either image_url or image_base64 must be provided") - return self.add_padding(image) + return image def image_to_base64(self, image: np.ndarray, format: str = "PNG") -> str: """Convert numpy image to base64 string. diff --git a/app/services/layout_detector.py b/app/services/layout_detector.py index b7ed407..3cd8446 100644 --- a/app/services/layout_detector.py +++ b/app/services/layout_detector.py @@ -1,122 +1,157 @@ -"""DocLayout-YOLO wrapper for document layout detection.""" +"""PP-DocLayoutV2 wrapper for document layout detection.""" import numpy as np from app.schemas.image import LayoutInfo, LayoutRegion from app.core.config import get_settings +from paddleocr import LayoutDetection +from typing import Optional settings = get_settings() class LayoutDetector: - """Wrapper for DocLayout-YOLO model.""" + """Layout detector for PP-DocLayoutV2.""" - # Class names from DocLayout-YOLO - CLASS_NAMES = { - 0: "title", - 1: "plain_text", - 2: "abandon", - 3: "figure", - 4: "figure_caption", - 5: "table", - 6: "table_caption", - 7: "table_footnote", - 8: "isolate_formula", - 9: "formula_caption", + _layout_detector: Optional[LayoutDetection] = None + + # PP-DocLayoutV2 class ID to label mapping + CLS_ID_TO_LABEL: dict[int, str] = { + 0: "abstract", + 1: "algorithm", + 2: "aside_text", + 3: "chart", + 4: "content", + 5: "display_formula", + 6: "doc_title", + 7: "figure_title", + 8: "footer", + 9: "footer_image", + 10: "footnote", + 11: "formula_number", + 12: "header", + 13: "header_image", + 14: "image", + 15: "inline_formula", + 16: "number", + 17: "paragraph_title", + 18: "reference", + 19: "reference_content", + 20: "seal", + 21: "table", + 22: "text", + 23: "vertical_text", + 24: "vision_footnote", } - # Classes considered as plain text - PLAIN_TEXT_CLASSES = {"title", "plain_text", "figure_caption", "table_caption", "table_footnote"} + # Mapping from raw labels to normalized region types + LABEL_TO_TYPE: dict[str, str] = { + # Text types + "abstract": "text", + "algorithm": "text", + "aside_text": "text", + "content": "text", + "doc_title": "text", + "footer": "text", + "footnote": "text", + "header": "text", + "number": "text", + "paragraph_title": "text", + "reference": "text", + "reference_content": "text", + "text": "text", + "vertical_text": "text", + "vision_footnote": "text", + # Formula types + "display_formula": "formula", + "inline_formula": "formula", + "formula_number": "formula", + # Table types + "table": "table", + # Figure types + "chart": "figure", + "figure_title": 
"figure", + "footer_image": "figure", + "header_image": "figure", + "image": "figure", + "seal": "figure", + } - # Classes considered as formula - FORMULA_CLASSES = {"isolate_formula", "formula_caption"} - - def __init__(self, model_path: str, confidence_threshold: float = 0.2): - """Initialize the layout detector. + def __init__(self): + """Initialize layout detector. Args: - model_path: Path to the DocLayout-YOLO model weights. - confidence_threshold: Minimum confidence for detections. """ - self.model_path = model_path - self.confidence_threshold = confidence_threshold - self.model = None + _ = self._get_layout_detector() - def load_model(self) -> None: - """Load the DocLayout-YOLO model. + def _get_layout_detector(self): + """Get or create LayoutDetection instance.""" + if LayoutDetector._layout_detector is None: + LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2") + return LayoutDetector._layout_detector - Raises: - RuntimeError: If model cannot be loaded. - """ - try: - from doclayout_yolo import YOLOv10 - - self.model = YOLOv10(self.model_path) - except Exception as e: - raise RuntimeError(f"Failed to load DocLayout-YOLO model: {e}") from e - - def detect(self, image: np.ndarray, image_size: int = 1024) -> LayoutInfo: - """Detect document layout regions. + def detect(self, image: np.ndarray) -> LayoutInfo: + """Detect layout of the image using PP-DocLayoutV2. Args: - image: Input image as numpy array in BGR format. - image_size: Image size for prediction. + image: Input image as numpy array. Returns: - LayoutInfo with detected regions. - - Raises: - RuntimeError: If model not loaded. + LayoutInfo with detected regions and flags. """ - if self.model is None: - raise RuntimeError("Model not loaded. Call load_model() first.") - - # Run prediction - results = self.model.predict( - image, - imgsz=image_size, - conf=self.confidence_threshold, - device=settings.device, - ) + layout_detector = self._get_layout_detector() + result = layout_detector.predict(image) + # Parse the result regions: list[LayoutRegion] = [] - has_plain_text = False - has_formula = False + mixed_recognition = False - if results and len(results) > 0: - result = results[0] - if result.boxes is not None: - for box in result.boxes: - cls_id = int(box.cls[0].item()) - confidence = float(box.conf[0].item()) - bbox = box.xyxy[0].tolist() + # Handle result format: [{'input_path': ..., 'page_index': None, 'boxes': [...]}] + if isinstance(result, list) and len(result) > 0: + first_result = result[0] + if isinstance(first_result, dict) and "boxes" in first_result: + boxes = first_result.get("boxes", []) + else: + boxes = [] + else: + boxes = [] - class_name = self.CLASS_NAMES.get(cls_id, f"unknown_{cls_id}") + for box in boxes: + cls_id = box.get("cls_id") + label = box.get("label") or self.CLS_ID_TO_LABEL.get(cls_id, "other") + score = box.get("score", 0.0) + coordinate = box.get("coordinate", [0, 0, 0, 0]) - # Map to simplified type - if class_name in self.PLAIN_TEXT_CLASSES: - region_type = "text" - has_plain_text = True - elif class_name in self.FORMULA_CLASSES: - region_type = "formula" - has_formula = True - elif class_name in {"figure"}: - region_type = "figure" - elif class_name in {"table"}: - region_type = "table" - else: - region_type = class_name + # Normalize label to region type + region_type = self.LABEL_TO_TYPE.get(label, "text") - regions.append( - LayoutRegion( - type=region_type, - bbox=bbox, - confidence=confidence, - ) - ) + regions.append(LayoutRegion( + type=region_type, + 
bbox=coordinate, + confidence=score, + score=score, + )) - return LayoutInfo( - regions=regions, - has_plain_text=has_plain_text, - has_formula=has_formula, - ) + + mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions) + + return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition) + + +if __name__ == "__main__": + import cv2 + from app.services.image_processor import ImageProcessor + + layout_detector = LayoutDetector() + image_path = "test/timeout.png" + + image = cv2.imread(image_path) + image_processor = ImageProcessor(padding_ratio=0.15) + image = image_processor.add_padding(image) + + # Save the padded image for debugging + cv2.imwrite("debug_padded_image.png", image) + + + layout_info = layout_detector.detect(image) + print(layout_info) \ No newline at end of file diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 8c7fe41..5b65798 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -1,14 +1,12 @@ """PaddleOCR-VL client service for text and formula recognition.""" -import io -import tempfile -from pathlib import Path - -import cv2 import numpy as np - from app.core.config import get_settings -from app.schemas.image import LayoutInfo +from paddleocr import PaddleOCRVL +from typing import Optional +from app.services.layout_detector import LayoutDetector +from app.services.image_processor import ImageProcessor +from app.services.converter import Converter settings = get_settings() @@ -16,52 +14,40 @@ settings = get_settings() class OCRService: """Service for OCR using PaddleOCR-VL.""" - FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format." + _pipeline: Optional[PaddleOCRVL] = None + _layout_detector: Optional[LayoutDetector] = None def __init__( self, - vl_server_url: str | None = None, - pp_doclayout_model_dir: str | None = None, + vl_server_url: str, + layout_detector: LayoutDetector, + image_processor: ImageProcessor, + converter: Converter, ): """Initialize OCR service. Args: vl_server_url: URL of the vLLM server for PaddleOCR-VL. - pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory. + layout_detector: Layout detector instance. + image_processor: Image processor instance. """ self.vl_server_url = vl_server_url or settings.paddleocr_vl_url - self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir - self._pipeline = None - - def _get_pipeline(self): + self.layout_detector = layout_detector + self.image_processor = image_processor + self.converter = converter + def _get_pipeline(self): """Get or create PaddleOCR-VL pipeline. Returns: PaddleOCRVL pipeline instance. """ - if self._pipeline is None: - from paddleocr import PaddleOCRVL - - self._pipeline = PaddleOCRVL( + if OCRService._pipeline is None: + OCRService._pipeline = PaddleOCRVL( vl_rec_backend="vllm-server", vl_rec_server_url=self.vl_server_url, layout_detection_model_name="PP-DocLayoutV2", - layout_detection_model_dir=self.pp_doclayout_model_dir, ) - return self._pipeline - - def _save_temp_image(self, image: np.ndarray) -> str: - """Save image to a temporary file. - - Args: - image: Image as numpy array in BGR format. - - Returns: - Path to temporary file. - """ - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - cv2.imwrite(f.name, image) - return f.name + return OCRService._pipeline def recognize_mixed(self, image: np.ndarray) -> dict: """Recognize mixed content (text + formulas) using PP-DocLayoutV2. 
@@ -77,30 +63,21 @@ class OCRService: """ try: pipeline = self._get_pipeline() - temp_path = self._save_temp_image(image) - try: - results = list(pipeline.predict(temp_path)) + output = pipeline.predict(image, use_layout_detection=True) - markdown_content = "" - for result in results: - # PaddleOCR-VL results can be saved to markdown - md_buffer = io.StringIO() - result.save_to_markdown(save_path=md_buffer) - markdown_content += md_buffer.getvalue() + markdown_content = "" - # Convert markdown to other formats - latex = self._markdown_to_latex(markdown_content) - mathml = self._extract_mathml(markdown_content) + for res in output: + markdown_content += res.markdown.get("markdown_texts", "") - return { - "markdown": markdown_content, - "latex": latex, - "mathml": mathml, - } - finally: - Path(temp_path).unlink(missing_ok=True) + convert_result = self.converter.convert_to_formats(markdown_content) + return { + "markdown": markdown_content, + "latex": convert_result.latex, + "mathml": convert_result.mathml, + } except Exception as e: raise RuntimeError(f"Mixed recognition failed: {e}") from e @@ -116,188 +93,49 @@ class OCRService: Dict with 'latex', 'markdown', 'mathml' keys. """ try: - import httpx + pipeline = self._get_pipeline() - temp_path = self._save_temp_image(image) + output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula") - try: - # Use vLLM API directly for formula recognition - import base64 + markdown_content = "" - with open(temp_path, "rb") as f: - image_base64 = base64.b64encode(f.read()).decode("utf-8") + for res in output: + markdown_content += res.markdown.get("markdown_texts", "") - # Call vLLM server with formula prompt - response = httpx.post( - f"{self.vl_server_url}/chat/completions", - json={ - "model": "paddleocr-vl", - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": self.FORMULA_PROMPT}, - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{image_base64}"}, - }, - ], - } - ], - "max_tokens": 1024, - }, - timeout=60.0, - ) - response.raise_for_status() - result = response.json() + convert_result = self.converter.convert_to_formats(markdown_content) - latex = result["choices"][0]["message"]["content"].strip() - - # Convert latex to other formats - markdown = self._latex_to_markdown(latex) - mathml = self._latex_to_mathml(latex) - - return { - "latex": latex, - "markdown": markdown, - "mathml": mathml, - } - finally: - Path(temp_path).unlink(missing_ok=True) - - except httpx.HTTPStatusError as e: - raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e + return { + "latex": convert_result.latex, + "mathml": convert_result.mathml, + "markdown": markdown_content, + } except Exception as e: raise RuntimeError(f"Formula recognition failed: {e}") from e - def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict: - """Recognize content based on layout detection results. + def recognize(self, image: np.ndarray) -> dict: + """Recognize content using PaddleOCR-VL. Args: image: Input image as numpy array in BGR format. - layout_info: Layout detection results. Returns: - Dict with recognition results including mode used. + Dict with 'latex', 'markdown', 'mathml' keys. 
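+
+        Mode selection: the image is padded and passed through layout
+        detection first; when a text region is detected with high confidence
+        (MixedRecognition is True), the full PaddleOCR-VL pipeline handles
+        mixed text and formulas, otherwise the image is treated as a
+        standalone formula crop and recognized with the formula prompt.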
""" - # Decision logic: - # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2) - # - Otherwise -> use formula_recognition (VL with prompt) - if layout_info.has_plain_text: - result = self.recognize_mixed(image) - result["recognition_mode"] = "mixed_recognition" + padded_image = self.image_processor.add_padding(image) + layout_info = self.layout_detector.detect(padded_image) + if layout_info.MixedRecognition: + return self.recognize_mixed(image) else: - result = self.recognize_formula(image) - result["recognition_mode"] = "formula_recognition" + return self.recognize_formula(image) - return result - def _markdown_to_latex(self, markdown: str) -> str: - """Convert markdown to LaTeX. - - Simple conversion - wraps content in LaTeX document structure. - - Args: - markdown: Markdown content. - - Returns: - LaTeX representation. - """ - # Basic conversion: preserve math blocks, convert structure - lines = [] - in_code_block = False - - for line in markdown.split("\n"): - if line.startswith("```"): - in_code_block = not in_code_block - if in_code_block: - lines.append("\\begin{verbatim}") - else: - lines.append("\\end{verbatim}") - elif in_code_block: - lines.append(line) - elif line.startswith("# "): - lines.append(f"\\section{{{line[2:]}}}") - elif line.startswith("## "): - lines.append(f"\\subsection{{{line[3:]}}}") - elif line.startswith("### "): - lines.append(f"\\subsubsection{{{line[4:]}}}") - elif line.startswith("- "): - lines.append(f"\\item {line[2:]}") - elif line.startswith("$$"): - lines.append(line.replace("$$", "\\[").replace("$$", "\\]")) - elif "$" in line: - # Keep inline math as-is - lines.append(line) - else: - lines.append(line) - - return "\n".join(lines) - - def _latex_to_markdown(self, latex: str) -> str: - """Convert LaTeX to markdown. - - Args: - latex: LaTeX content. - - Returns: - Markdown representation. - """ - # Wrap LaTeX in markdown math block - if latex.strip(): - return f"$$\n{latex}\n$$" - return "" - - def _latex_to_mathml(self, latex: str) -> str: - """Convert LaTeX to MathML. - - Args: - latex: LaTeX content. - - Returns: - MathML representation. - """ - # Basic LaTeX to MathML conversion - # For production, consider using latex2mathml library - if not latex.strip(): - return "" - - try: - # Try to use latex2mathml if available - from latex2mathml.converter import convert - - return convert(latex) - except ImportError: - # Fallback: wrap in basic MathML structure - return f'{latex}' - except Exception: - return f'{latex}' - - def _extract_mathml(self, markdown: str) -> str: - """Extract and convert math from markdown to MathML. - - Args: - markdown: Markdown content. - - Returns: - MathML for any math content found. 
- """ - import re - - # Find all math blocks - math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL) - inline_math = re.findall(r"\$([^$]+)\$", markdown) - - all_math = math_blocks + inline_math - - if not all_math: - return "" - - # Convert each to MathML and combine - mathml_parts = [] - for latex in all_math: - mathml = self._latex_to_mathml(latex.strip()) - if mathml: - mathml_parts.append(mathml) - - return "\n".join(mathml_parts) +if __name__ == "__main__": + import cv2 + from app.services.image_processor import ImageProcessor + from app.services.layout_detector import LayoutDetector + image_processor = ImageProcessor(padding_ratio=0.15) + layout_detector = LayoutDetector() + ocr_service = OCRService(image_processor=image_processor, layout_detector=layout_detector) + image = cv2.imread("test/image.png") + ocr_result = ocr_service.recognize(image) + print(ocr_result) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 92c9177..50a6860 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,30 +2,36 @@ name = "doc-processer" version = "0.1.0" description = "Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX" -readme = "README.md" -requires-python = ">=3.11" +requires-python = ">=3.10" license = { text = "MIT" } authors = [ { name = "YogeLiu" } ] dependencies = [ - "fastapi>=0.115.0", - "uvicorn[standard]>=0.32.0", - "opencv-python>=4.10.0", - "python-multipart>=0.0.12", - "pydantic>=2.10.0", - "pydantic-settings>=2.6.0", - "httpx>=0.28.0", - "numpy>=1.26.0", - "pillow>=10.4.0", - "python-docx>=1.1.0", - "paddleocr>=2.9.0", - "doclayout-yolo>=0.0.2", - "latex2mathml>=3.77.0", - "paddle>=1.2.0", + "fastapi==0.128.0", + "uvicorn[standard]==0.40.0", + "opencv-python==4.12.0.88", + "python-multipart==0.0.21", + "pydantic==2.12.5", + "pydantic-settings==2.12.0", + "httpx==0.28.1", + "numpy==2.2.6", + "pillow==12.0.0", + "python-docx==1.2.0", + "paddleocr==3.3.2", + "doclayout-yolo==0.0.4", + "latex2mathml==3.78.1", + "paddle==1.2.0", + "pypandoc==1.16.2", + "paddlepaddle", + "paddleocr[doc-parser]", + "safetensors" ] +[tool.uv.sources] +paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" } + [project.optional-dependencies] dev = [ "pytest>=8.0.0",