fix: refactor logic

2025-12-31 17:38:32 +08:00
parent 6ac50f7d2f
commit 35928c2484
17 changed files with 678 additions and 738 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,55 @@
 # Git
 .git
 .gitignore
 # Python
 .venv/
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 *.egg-info/
 .eggs/
 dist/
 build/
 # Testing
 .pytest_cache/
 .coverage
 htmlcov/
 test/
 tests/
 # Linting & IDE
 .ruff_cache/
 .mypy_cache/
 .cursor/
 .vscode/
 .idea/
 *.swp
 *.swo
 # Environment
 .env
 .env.*
 !.env.example
 # Documentation (not needed in container)
 *.md
 !README.md
 openspec/
 # Models (mounted at runtime, not built into image)
 app/model/doclayout/*.pdiparams
 app/model/DocLayout/
 app/model/PP-DocLayout/
 # Misc
 *.log
 *.tmp
 .DS_Store
 Thumbs.db
 test/
--- a/.gitignore
+++ b/.gitignore
@@ -71,3 +71,5 @@ htmlcov/
 uv.lock
 model/
 test/
--- a/81
+++ b/81
@@ -1,54 +1,73 @@
 # DocProcesser Dockerfile
 # Optimized for RTX 5080 GPU deployment
-# Use NVIDIA CUDA base image with Python 3.11
+# Use NVIDIA CUDA base image with Python 3.10
 FROM nvidia/cuda:12.8.0-runtime-ubuntu24.04
 # Set environment variables
 ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
-    PIP_DISABLE_PIP_VERSION_CHECK=1
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    # Model cache directories - mount these at runtime
    MODELSCOPE_CACHE=/root/.cache/modelscope \
    HF_HOME=/root/.cache/huggingface \
    # Application config (override defaults for container)
    # Use 127.0.0.1 for --network host mode, or override with -e for bridge mode
    PP_DOCLAYOUT_MODEL_DIR=/root/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2 \
    PADDLEOCR_VL_URL=http://127.0.0.1:8000/v1
 # Set working directory
 WORKDIR /app
-# Install system dependencies
+# Install system dependencies and Python 3.10 from deadsnakes PPA
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3.11 \
+    software-properties-common \
-    python3.11-venv \
+    && add-apt-repository -y ppa:deadsnakes/ppa \
-    python3.11-dev \
+    && apt-get update && apt-get install -y --no-install-recommends \
-    python3-pip \
+    python3.10 \
-    libgl1-mesa-glx \
+    python3.10-venv \
    python3.10-dev \
    python3.10-distutils \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    curl \
    pandoc \
    && rm -rf /var/lib/apt/lists/* \
-    && ln -sf /usr/bin/python3.11 /usr/bin/python \
+    && ln -sf /usr/bin/python3.10 /usr/bin/python \
-    && ln -sf /usr/bin/python3.11 /usr/bin/python3
+    && ln -sf /usr/bin/python3.10 /usr/bin/python3 \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
-# Install uv for fast package management
+# Install uv via pip (more reliable than install script)
-RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+RUN python3.10 -m pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
 ENV PATH="/root/.local/bin:$PATH"
 # Copy dependency files first for better caching
 COPY pyproject.toml ./
 # Create virtual environment and install dependencies
 RUN uv venv /app/.venv
 ENV PATH="/app/.venv/bin:$PATH"
 ENV VIRTUAL_ENV="/app/.venv"
-RUN uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e .
+# Copy dependency files first for better caching
 COPY pyproject.toml ./
 COPY wheels/ ./wheels/
 # Create virtual environment and install dependencies
 RUN uv venv /app/.venv --python python3.10 \
    && uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . \
    && rm -rf ./wheels
 # Copy application code
 COPY app/ ./app/
-# Create model directories (models should be mounted at runtime)
+# Create model cache directories (mount from host at runtime)
-RUN mkdir -p /app/app/model/DocLayout /app/app/model/PP-DocLayout
+RUN mkdir -p /root/.cache/modelscope \
    /root/.cache/huggingface \
    /root/.paddlex \
    /app/app/model/DocLayout \
    /app/app/model/PP-DocLayout
 # Declare volumes for model cache (mount at runtime to avoid re-downloading)
 VOLUME ["/root/.cache/modelscope", "/root/.cache/huggingface", "/root/.paddlex"]
 # Expose port
 EXPOSE 8053
@@ -60,3 +79,21 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
 # Run the application
 CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"]
 # =============================================================================
 # Usage: Mount local model cache to avoid downloading
 #
 # Option 1: Use host network (simplest, can access localhost services)
 # docker run --gpus all --network host \
 #   -v /home/yoge/.paddlex:/root/.paddlex:ro \
 #   -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \
 #   -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \
 #   doc_processer:latest
 #
 # Option 2: Use bridge network with host.docker.internal (Linux needs --add-host)
 # docker run --gpus all -p 8053:8053 \
 #   --add-host=host.docker.internal:host-gateway \
 #   -v /home/yoge/.paddlex:/root/.paddlex:ro \
 #   -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \
 #   -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \
 #   doc_processer:latest
 # =============================================================================
--- a/app/api/v1/endpoints/convert.py
+++ b/app/api/v1/endpoints/convert.py
@@ -3,34 +3,28 @@
 from fastapi import APIRouter, Depends, HTTPException
 from fastapi.responses import Response
-from app.core.dependencies import get_docx_converter
+from app.core.dependencies import get_converter
 from app.schemas.convert import MarkdownToDocxRequest
-from app.services.docx_converter import DocxConverter
+from app.services.converter import Converter
 router = APIRouter()
-@router.post("/docx")
+@router.post("/file")
 async def convert_markdown_to_docx(
    request: MarkdownToDocxRequest,
-    converter: DocxConverter = Depends(get_docx_converter),
+    converter: Converter = Depends(get_converter),
 ) -> Response:
    """Convert markdown content to DOCX file.
-    Returns the generated DOCX file as a binary download.
+    Returns the generated DOCX file as a binary response.
    """
    try:
-        docx_bytes = converter.convert(request.markdown)
+        docx_bytes = converter.export_to_file(request.markdown, export_type="docx")
        return Response(
            content=docx_bytes,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            headers={"Content-Disposition": f'attachment; filename="{request.filename}.docx"'},
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
    # Determine filename
    filename = request.filename or "output"
    if not filename.endswith(".docx"):
        filename = f"{filename}.docx"
    return Response(
        content=docx_bytes,
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
    )
--- a/app/api/v1/endpoints/image.py
+++ b/app/api/v1/endpoints/image.py
@@ -28,24 +28,15 @@ async def process_image_ocr(
       - Otherwise: use PaddleOCR-VL with formula prompt
    4. Convert output to LaTeX, Markdown, and MathML formats
    """
    try:
        # 1. Load and preprocess image
        image = image_processor.preprocess(
            image_url=request.image_url,
            image_base64=request.image_base64,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
-    try:
+    image = image_processor.preprocess(
-        # 2. Detect layout
+        image_url=request.image_url,
-        layout_info = layout_detector.detect(image)
+        image_base64=request.image_base64,
-    except RuntimeError as e:
+    )
        raise HTTPException(status_code=500, detail=f"Layout detection failed: {e}")
    try:
        # 3. Perform OCR based on layout
-        ocr_result = ocr_service.recognize(image, layout_info)
+        ocr_result = ocr_service.recognize(image)
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e))
@@ -54,6 +45,4 @@ async def process_image_ocr(
        latex=ocr_result.get("latex", ""),
        markdown=ocr_result.get("markdown", ""),
        mathml=ocr_result.get("mathml", ""),
        layout_info=layout_info,
        recognition_mode=ocr_result.get("recognition_mode", ""),
    )
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -5,6 +5,7 @@ from pathlib import Path
 from pydantic_settings import BaseSettings, SettingsConfigDict
 import torch
 from typing import Optional
 class Settings(BaseSettings):
@@ -21,11 +22,10 @@ class Settings(BaseSettings):
    debug: bool = False
    # PaddleOCR-VL Settings
-    paddleocr_vl_url: str = "http://localhost:8080/v1"
+    paddleocr_vl_url: str = "http://127.0.0.1:8000/v1"
    # Model Paths
-    doclayout_model_path: str = "app/model/DocLayout/best.pt"
+    pp_doclayout_model_dir: Optional[str] = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2"
    pp_doclayout_model_dir: str = "app/model/PP-DocLayout/PP-DocLayoutV2"
    # Image Processing
    max_image_size_mb: int = 10
@@ -37,11 +37,6 @@ class Settings(BaseSettings):
    host: str = "0.0.0.0"
    port: int = 8053
    @property
    def doclayout_model_file(self) -> Path:
        """Get the DocLayout model file path."""
        return Path(self.doclayout_model_path)
    @property
    def pp_doclayout_dir(self) -> Path:
        """Get the PP-DocLayout model directory path."""
--- a/app/core/dependencies.py
+++ b/app/core/dependencies.py
@@ -3,20 +3,20 @@
 from app.services.image_processor import ImageProcessor
 from app.services.layout_detector import LayoutDetector
 from app.services.ocr_service import OCRService
-from app.services.docx_converter import DocxConverter
+from app.services.converter import Converter
 from app.core.config import get_settings
 # Global instances (initialized on startup)
 _layout_detector: LayoutDetector | None = None
-def init_layout_detector(model_path: str) -> None:
+def init_layout_detector() -> None:
    """Initialize the global layout detector.
    Called during application startup.
    """
    global _layout_detector
-    _layout_detector = LayoutDetector(model_path=model_path)
+    _layout_detector = LayoutDetector()
    _layout_detector.load_model()
 def get_layout_detector() -> LayoutDetector:
@@ -33,10 +33,15 @@ def get_image_processor() -> ImageProcessor:
 def get_ocr_service() -> OCRService:
    """Get an OCR service instance."""
-    return OCRService()
+    return OCRService(
        vl_server_url=get_settings().paddleocr_vl_url,
        layout_detector=get_layout_detector(),
        image_processor=get_image_processor(),
        converter=get_converter(),
    )
-def get_docx_converter() -> DocxConverter:
+def get_converter() -> Converter:
    """Get a DOCX converter instance."""
-    return DocxConverter()
+    return Converter()
--- a/app/main.py
+++ b/app/main.py
@@ -15,7 +15,7 @@ settings = get_settings()
 async def lifespan(app: FastAPI):
    """Application lifespan handler for startup/shutdown."""
    # Startup: Load models
-    init_layout_detector(model_path=settings.doclayout_model_path)
+    init_layout_detector()
    yield
@@ -37,3 +37,9 @@ app.include_router(api_router, prefix=settings.api_prefix)
 async def health_check():
    """Health check endpoint."""
    return {"status": "healthy"}
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8053)
--- a/app/pkg/reference.docx
+++ b/app/pkg/reference.docx
--- a/app/schemas/convert.py
+++ b/app/schemas/convert.py
@@ -7,7 +7,7 @@ class MarkdownToDocxRequest(BaseModel):
    """Request body for markdown to DOCX conversion endpoint."""
    markdown: str = Field(..., description="Markdown content to convert")
-    filename: str | None = Field(None, description="Optional output filename (without extension)")
+    filename: str = Field("texpixel", description="Optional output filename (without extension)")
    @field_validator("markdown")
    @classmethod
--- a/app/schemas/image.py
+++ b/app/schemas/image.py
@@ -9,14 +9,15 @@ class LayoutRegion(BaseModel):
    type: str = Field(..., description="Region type: text, formula, table, figure")
    bbox: list[float] = Field(..., description="Bounding box [x1, y1, x2, y2]")
    confidence: float = Field(..., description="Detection confidence score")
    score: float = Field(..., description="Detection score")
 class LayoutInfo(BaseModel):
    """Layout detection information."""
    regions: list[LayoutRegion] = Field(default_factory=list)
-    has_plain_text: bool = Field(False, description="Whether plain text was detected")
+    MixedRecognition: bool = Field(False, description="Whether mixed recognition was used")
-    has_formula: bool = Field(False, description="Whether formulas were detected")
+    # FormulaRecognition: bool = Field(False, description="Whether formula recognition (with prompt) was used")
 class ImageOCRRequest(BaseModel):
--- a/app/services/converter.py
+++ b/app/services/converter.py
@@ -0,0 +1,312 @@
 """Markdown conversion and export service using pypandoc."""
 import os
 import re
 import tempfile
 from dataclasses import dataclass
 from typing import Literal
 import pypandoc
@dataclass
 class ConvertResult:
    """Result of markdown conversion.

    Holds the two text renderings produced from one markdown input.
    """
    # Output of pandoc's "latex" writer for the input markdown.
    latex: str
    # Output of pandoc's "html" writer run with the ``--mathml`` flag.
    mathml: str
@dataclass
 class ExportResult:
    """Result of markdown export.

    NOTE(review): nothing in the visible part of this file constructs or
    returns this type -- confirm whether it is still needed.
    """
    # Presumably the on-disk location of the exported file -- TODO confirm.
    file_path: str
    # Presumably the MIME type to send with the file -- TODO confirm.
    content_type: str
    # Presumably the filename suggested to the client -- TODO confirm.
    download_name: str
 ExportType = Literal["docx", "pdf"]
 class Converter:
    """Service for conversion and export operations."""
    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
    def __init__(self) -> None:
        """Initialize converter.

        The body is intentionally empty: no attributes are set here.
        """
    def convert_to_formats(self, md_text: str) -> ConvertResult:
        """Convert markdown to LaTeX and MathML formats.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex and mathml fields. Empty input yields a
            result whose two fields are empty strings; pandoc is not invoked
            in that case.

        Raises:
            RuntimeError: If either pandoc conversion fails.
        """
        # Empty input is a valid request, not an error: answer it directly
        # instead of spawning pandoc (or raising).
        if not md_text:
            return ConvertResult(latex="", mathml="")
        try:
            # Convert to LaTeX
            latex_output = pypandoc.convert_text(
                md_text,
                "latex",
                format=self.INPUT_FORMAT,
            ).rstrip("\n")
            # Convert to HTML with MathML (math is emitted as <math> elements)
            mathml_output = pypandoc.convert_text(
                md_text,
                "html",
                format=self.INPUT_FORMAT,
                extra_args=["--mathml"],
            ).rstrip("\n")
            return ConvertResult(latex=latex_output, mathml=mathml_output)
        except Exception as e:
            # Service boundary: expose one exception type, keep the cause chained.
            raise RuntimeError(f"Conversion failed: {e}") from e
    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.
        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.
        Args:
            md_text: Raw markdown text.
        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}
        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)
        # Add blank lines around \[...\] block formulas
        md_text = re.sub(
            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
            r"\1\n\n\\[\3\\]\n\n\4",
            md_text,
            flags=re.DOTALL,
        )
        md_text = re.sub(
            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
            r"\n\\[\2\\]\n",
            md_text,
            flags=re.MULTILINE | re.DOTALL,
        )
        # Remove arithmatex span wrappers
        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)
        # Convert inline formulas: \( \) => $ $
        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
        # Remove spaces between $ and formula content
        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)
        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)
        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)
        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)
        return cleaned_md
    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.
        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = re.sub(
            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
            flags=re.DOTALL,
        )
        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = re.sub(
            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
            flags=re.DOTALL,
        )
        return md_text
    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.
        Removes whitespace and adds negative space for proper alignment in Word/OMML.
        """
        # Fix \left\{ spacing
        md_text = re.sub(
            r"\\left\\\{\s+",
            r"\\left\\{\\!",
            md_text,
        )
        # Fix \right\} spacing
        md_text = re.sub(
            r"\s+\\right\\\}",
            r"\\!\\right\\}",
            md_text,
        )
        return md_text
    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.
        These environments have better rendering support in Word/OMML.
        """
        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."
        md_text = re.sub(
            r"\\begin\{cases\}(.*?)\\end\{cases\}",
            convert_cases,
            md_text,
            flags=re.DOTALL,
        )
        def convert_aligned_to_array(match: re.Match) -> str:
            content = match.group(1)
            # Remove leading & alignment markers (not needed in array{l})
            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."
        md_text = re.sub(
            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
            convert_aligned_to_array,
            md_text,
            flags=re.DOTALL,
        )
        def convert_standalone_aligned(match: re.Match) -> str:
            content = match.group(1)
            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
            return r"\begin{array}{l}" + content + r"\end{array}"
        md_text = re.sub(
            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
            convert_standalone_aligned,
            md_text,
            flags=re.DOTALL,
        )
        return md_text
    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.
        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.
        Returns:
            bytes of the exported file.
        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.
        """
        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)
        # Create temp file for input
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
            f_in.write(cleaned_md.encode("utf-8"))
            md_path = f_in.name
        output_file = md_path + "." + export_type
        try:
            if export_type == "docx":
                self._export_docx(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()
            else:  # pdf
                self._export_pdf(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()
        except Exception as e:
            # Cleanup on error
            self._cleanup_files(md_path, output_file)
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # Always cleanup input file
            if os.path.exists(md_path):
                os.remove(md_path)
    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Convert the markdown file at input_path into a DOCX at output_path.

        Pandoc is run through pypandoc with pygments highlighting and the
        bundled reference document for styling. The reference path is
        relative, so it is resolved against the process working directory.
        """
        pandoc_flags = [
            "--highlight-style=pygments",
            "--reference-doc=app/pkg/reference.docx",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=pandoc_flags,
        )
    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Convert the markdown file at input_path into a PDF at output_path.

        Pandoc is run through pypandoc with XeLaTeX as the PDF engine, the
        "Noto Sans CJK SC" main font and pygments highlighting.
        """
        pandoc_flags = ["--pdf-engine=xelatex"]
        pandoc_flags += ["-V", "mainfont=Noto Sans CJK SC"]
        pandoc_flags.append("--highlight-style=pygments")
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=pandoc_flags,
        )
    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist."""
        for path in paths:
            if os.path.exists(path):
                os.remove(path)
    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.
        Call this after sending the file to the client.
        Args:
            file_path: Path to the exported file.
        """
        if os.path.exists(file_path):
            os.remove(file_path)
--- a/app/services/docx_converter.py
+++ b/app/services/docx_converter.py
@@ -1,335 +0,0 @@
 """Markdown to DOCX conversion service.
 Reference implementation based on https://github.com/YogeLiu/markdown_2_docx
 """
 import io
 import re
 from dataclasses import dataclass
 from docx import Document
 from docx.enum.text import WD_ALIGN_PARAGRAPH
 from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
 from docx.shared import Inches, Pt
@dataclass
 class MarkdownElement:
    """Parsed markdown element.

    One block-level unit produced by the markdown parser and consumed when
    building the Word document.
    """
    # One of: heading, paragraph, list_item, ordered_list_item, code_block,
    # table, math.
    type: str
    # Text of the element with the block-level markers already removed
    # (tables keep their raw lines joined by newlines).
    content: str
    level: int = 0  # Heading depth (number of '#') or list nesting level
    language: str = ""  # Language tag of a fenced code block, if any
 class DocxConverter:
    """Converts markdown content to DOCX format."""
    def __init__(self):
        """Initialize the converter."""
        self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
        self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$")
        self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$")
        self.code_block_pattern = re.compile(r"^```(\w*)$")
        self.inline_code_pattern = re.compile(r"`([^`]+)`")
        self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*")
        self.italic_pattern = re.compile(r"\*([^*]+)\*")
        self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL)
        self.inline_math_pattern = re.compile(r"\$([^$]+)\$")
    def convert(self, markdown: str) -> bytes:
        """Render markdown content as a DOCX document.

        Args:
            markdown: Markdown content to convert.

        Returns:
            DOCX file as bytes.
        """
        document = Document()
        for parsed in self._parse_markdown(markdown):
            self._add_element_to_doc(document, parsed)
        # Serialize into memory and hand back the raw bytes.
        with io.BytesIO() as sink:
            document.save(sink)
            return sink.getvalue()
    def _parse_markdown(self, markdown: str) -> list[MarkdownElement]:
        """Parse markdown into a flat list of block-level elements.

        The text is scanned line by line with an explicit index. Each line is
        tried, in order, as: code fence, code-block content, math block,
        heading, unordered list item, ordered list item, table, paragraph.
        Blank lines outside code blocks produce no element.

        Args:
            markdown: Markdown content.

        Returns:
            List of parsed elements.
        """
        elements: list[MarkdownElement] = []
        lines = markdown.split("\n")
        i = 0
        # Fenced code block state: lines are collected until the next fence.
        in_code_block = False
        code_content = []
        code_language = ""
        while i < len(lines):
            line = lines[i]
            # Code block handling: any line matching the fence pattern toggles
            # the state, so the same pattern opens and closes a block.
            code_match = self.code_block_pattern.match(line)
            if code_match:
                if in_code_block:
                    elements.append(
                        MarkdownElement(
                            type="code_block",
                            content="\n".join(code_content),
                            language=code_language,
                        )
                    )
                    code_content = []
                    in_code_block = False
                else:
                    in_code_block = True
                    code_language = code_match.group(1)
                i += 1
                continue
            if in_code_block:
                # NOTE(review): if the input ends before a closing fence, the
                # collected lines are never emitted (nothing flushes them
                # after the loop).
                code_content.append(line)
                i += 1
                continue
            # Math block ($$...$$)
            if line.strip().startswith("$$"):
                math_content = []
                if line.strip() == "$$":
                    # Delimiter on its own line: collect until the next "$$" line.
                    i += 1
                    while i < len(lines) and lines[i].strip() != "$$":
                        math_content.append(lines[i])
                        i += 1
                else:
                    # Single line $$...$$ or start
                    content = line.strip()[2:]
                    if content.endswith("$$"):
                        math_content.append(content[:-2])
                    else:
                        math_content.append(content)
                        i += 1
                        # Multi-line form: stop at the first line ending in "$$".
                        while i < len(lines):
                            if lines[i].strip().endswith("$$"):
                                math_content.append(lines[i].strip()[:-2])
                                break
                            math_content.append(lines[i])
                            i += 1
                elements.append(
                    MarkdownElement(type="math", content="\n".join(math_content))
                )
                # Step past the line holding the closing delimiter.
                i += 1
                continue
            # Heading
            heading_match = self.heading_pattern.match(line)
            if heading_match:
                level = len(heading_match.group(1))
                content = heading_match.group(2)
                elements.append(
                    MarkdownElement(type="heading", content=content, level=level)
                )
                i += 1
                continue
            # Unordered list
            list_match = self.list_pattern.match(line)
            if list_match:
                # Nesting level: one level per two characters of leading whitespace.
                indent = len(list_match.group(1))
                content = list_match.group(2)
                elements.append(
                    MarkdownElement(type="list_item", content=content, level=indent // 2)
                )
                i += 1
                continue
            # Ordered list
            ordered_match = self.ordered_list_pattern.match(line)
            if ordered_match:
                indent = len(ordered_match.group(1))
                content = ordered_match.group(2)
                elements.append(
                    MarkdownElement(
                        type="ordered_list_item", content=content, level=indent // 2
                    )
                )
                i += 1
                continue
            # Table (simple detection): a line containing "|" followed by a
            # line containing "---".
            if "|" in line and i + 1 < len(lines) and "---" in lines[i + 1]:
                table_lines = [line]
                i += 1
                # Consume every following line that contains "|"; i already
                # points past the table afterwards, hence no extra increment.
                while i < len(lines) and "|" in lines[i]:
                    table_lines.append(lines[i])
                    i += 1
                elements.append(
                    MarkdownElement(type="table", content="\n".join(table_lines))
                )
                continue
            # Regular paragraph (each non-blank line becomes its own paragraph)
            if line.strip():
                elements.append(MarkdownElement(type="paragraph", content=line))
            i += 1
        return elements
    def _add_element_to_doc(self, doc: Document, element: MarkdownElement) -> None:
        """Add a markdown element to the document.
        Args:
            doc: Word document.
            element: Parsed markdown element.
        """
        if element.type == "heading":
            self._add_heading(doc, element.content, element.level)
        elif element.type == "paragraph":
            self._add_paragraph(doc, element.content)
        elif element.type == "list_item":
            self._add_list_item(doc, element.content, element.level, ordered=False)
        elif element.type == "ordered_list_item":
            self._add_list_item(doc, element.content, element.level, ordered=True)
        elif element.type == "code_block":
            self._add_code_block(doc, element.content)
        elif element.type == "table":
            self._add_table(doc, element.content)
        elif element.type == "math":
            self._add_math(doc, element.content)
    def _add_heading(self, doc: Document, content: str, level: int) -> None:
        """Add a heading to the document."""
        # Map markdown levels to Word heading styles
        heading_level = min(level, 9)  # Word supports up to Heading 9
        doc.add_heading(content, level=heading_level)
    def _add_paragraph(self, doc: Document, content: str) -> None:
        """Add a paragraph with inline formatting."""
        para = doc.add_paragraph()
        self._add_formatted_text(para, content)
    def _add_formatted_text(self, para, content: str) -> None:
        """Add text with inline formatting (bold, italic, code)."""
        # Simple approach: process inline patterns
        remaining = content
        while remaining:
            # Find next formatting marker
            bold_match = self.bold_pattern.search(remaining)
            italic_match = self.italic_pattern.search(remaining)
            code_match = self.inline_code_pattern.search(remaining)
            math_match = self.inline_math_pattern.search(remaining)
            matches = [
                (bold_match, "bold"),
                (italic_match, "italic"),
                (code_match, "code"),
                (math_match, "math"),
            ]
            matches = [(m, t) for m, t in matches if m]
            if not matches:
                para.add_run(remaining)
                break
            # Find earliest match
            earliest = min(matches, key=lambda x: x[0].start())
            match, match_type = earliest
            # Add text before match
            if match.start() > 0:
                para.add_run(remaining[: match.start()])
            # Add formatted text
            run = para.add_run(match.group(1))
            if match_type == "bold":
                run.bold = True
            elif match_type == "italic":
                run.italic = True
            elif match_type == "code":
                run.font.name = "Courier New"
                run.font.size = Pt(10)
            elif match_type == "math":
                run.italic = True
            remaining = remaining[match.end() :]
    def _add_list_item(
        self, doc: Document, content: str, level: int, ordered: bool
    ) -> None:
        """Append one list item paragraph.

        The paragraph uses the "List Number" style when ordered, otherwise
        "List Bullet", and is indented by a quarter inch per nesting level.
        """
        style_name = "List Number" if ordered else "List Bullet"
        item = doc.add_paragraph(style=style_name)
        item.paragraph_format.left_indent = Inches(0.25 * level)
        self._add_formatted_text(item, content)
    def _add_code_block(self, doc: Document, content: str) -> None:
        """Append a code block as one indented, monospaced, shaded paragraph."""
        block = doc.add_paragraph()
        block.paragraph_format.left_indent = Inches(0.5)

        code_run = block.add_run(content)
        code_font = code_run.font
        code_font.name = "Courier New"
        code_font.size = Pt(9)

        # Light grey background, set through a raw <w:shd> element appended
        # to the paragraph properties.
        shade = OxmlElement("w:shd")
        for attribute, value in (("w:val", "clear"), ("w:fill", "F0F0F0")):
            shade.set(qn(attribute), value)
        block._p.get_or_add_pPr().append(shade)
    def _add_table(self, doc: Document, content: str) -> None:
        """Append a table built from markdown table text.

        The first non-blank line is the header, the second is treated as the
        separator and skipped, and the rest are data rows. Input with fewer
        than two non-blank lines adds nothing. Cells that are empty after
        stripping are dropped, and data cells beyond the header width are
        ignored.
        """

        def split_row(row: str) -> list[str]:
            # Pipe-separated cells; blanks (including the outer ones) vanish.
            return [piece.strip() for piece in row.split("|") if piece.strip()]

        rows = [raw.strip() for raw in content.split("\n") if raw.strip()]
        if len(rows) < 2:
            return

        header = split_row(rows[0])
        # rows[1] is the separator line; slicing gives [] when nothing follows.
        body_rows = rows[2:]

        table = doc.add_table(rows=1, cols=len(header))
        table.style = "Table Grid"

        # Header row: set the text and make it bold.
        header_cells = table.rows[0].cells
        for column, title in enumerate(header):
            cell = header_cells[column]
            cell.text = title
            cell.paragraphs[0].runs[0].bold = True

        # Data rows: zip stops at the shorter side, so surplus cells are dropped.
        for row in body_rows:
            values = split_row(row)
            for cell, value in zip(table.add_row().cells, values):
                cell.text = value
    def _add_math(self, doc: Document, content: str) -> None:
        """Append a math block as a centered paragraph.

        The LaTeX source is shown as italic 12pt "Cambria Math" text; it is
        not converted to OMML, which would need a more complex conversion.
        """
        block = doc.add_paragraph()
        block.alignment = WD_ALIGN_PARAGRAPH.CENTER

        formula = block.add_run(content)
        formula.italic = True
        formula_font = formula.font
        formula_font.name = "Cambria Math"
        formula_font.size = Pt(12)
--- a/app/services/image_processor.py
+++ b/app/services/image_processor.py
@@ -116,7 +116,7 @@ class ImageProcessor:
        else:
            raise ValueError("Either image_url or image_base64 must be provided")
-        return self.add_padding(image)
+        return image
    def image_to_base64(self, image: np.ndarray, format: str = "PNG") -> str:
        """Convert numpy image to base64 string.
--- a/app/services/layout_detector.py
+++ b/app/services/layout_detector.py
@@ -1,122 +1,157 @@
-"""DocLayout-YOLO wrapper for document layout detection."""
+"""PP-DocLayoutV2 wrapper for document layout detection."""
 import numpy as np
 from app.schemas.image import LayoutInfo, LayoutRegion
 from app.core.config import get_settings
 from paddleocr import LayoutDetection
 from typing import Optional
 settings = get_settings()
 class LayoutDetector:
-    """Wrapper for DocLayout-YOLO model."""
+    """Layout detector for PP-DocLayoutV2."""
-    # Class names from DocLayout-YOLO
+    _layout_detector: Optional[LayoutDetection] = None
-    CLASS_NAMES = {
+
-        0: "title",
+    # PP-DocLayoutV2 class ID to label mapping
-        1: "plain_text",
+    CLS_ID_TO_LABEL: dict[int, str] = {
-        2: "abandon",
+        0: "abstract",
-        3: "figure",
+        1: "algorithm",
-        4: "figure_caption",
+        2: "aside_text",
-        5: "table",
+        3: "chart",
-        6: "table_caption",
+        4: "content",
-        7: "table_footnote",
+        5: "display_formula",
-        8: "isolate_formula",
+        6: "doc_title",
-        9: "formula_caption",
+        7: "figure_title",
        8: "footer",
        9: "footer_image",
        10: "footnote",
        11: "formula_number",
        12: "header",
        13: "header_image",
        14: "image",
        15: "inline_formula",
        16: "number",
        17: "paragraph_title",
        18: "reference",
        19: "reference_content",
        20: "seal",
        21: "table",
        22: "text",
        23: "vertical_text",
        24: "vision_footnote",
    }
-    # Classes considered as plain text
+    # Mapping from raw labels to normalized region types
-    PLAIN_TEXT_CLASSES = {"title", "plain_text", "figure_caption", "table_caption", "table_footnote"}
+    LABEL_TO_TYPE: dict[str, str] = {
        # Text types
        "abstract": "text",
        "algorithm": "text",
        "aside_text": "text",
        "content": "text",
        "doc_title": "text",
        "footer": "text",
        "footnote": "text",
        "header": "text",
        "number": "text",
        "paragraph_title": "text",
        "reference": "text",
        "reference_content": "text",
        "text": "text",
        "vertical_text": "text",
        "vision_footnote": "text",
        # Formula types
        "display_formula": "formula",
        "inline_formula": "formula",
        "formula_number": "formula",
        # Table types
        "table": "table",
        # Figure types
        "chart": "figure",
        "figure_title": "figure",
        "footer_image": "figure",
        "header_image": "figure",
        "image": "figure",
        "seal": "figure",
    }
-    # Classes considered as formula
+    def __init__(self):
-    FORMULA_CLASSES = {"isolate_formula", "formula_caption"}
+        """Initialize layout detector.
    def __init__(self, model_path: str, confidence_threshold: float = 0.2):
        """Initialize the layout detector.
        Args:
            model_path: Path to the DocLayout-YOLO model weights.
            confidence_threshold: Minimum confidence for detections.
        """
-        self.model_path = model_path
+        _ = self._get_layout_detector()
        self.confidence_threshold = confidence_threshold
        self.model = None
-    def load_model(self) -> None:
+    def _get_layout_detector(self):
-        """Load the DocLayout-YOLO model.
+        """Get or create LayoutDetection instance."""
        if LayoutDetector._layout_detector is None:
            LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")
        return LayoutDetector._layout_detector
-        Raises:
+    def detect(self, image: np.ndarray) -> LayoutInfo:
-            RuntimeError: If model cannot be loaded.
+        """Detect layout of the image using PP-DocLayoutV2.
        """
        try:
            from doclayout_yolo import YOLOv10
            self.model = YOLOv10(self.model_path)
        except Exception as e:
            raise RuntimeError(f"Failed to load DocLayout-YOLO model: {e}") from e
    def detect(self, image: np.ndarray, image_size: int = 1024) -> LayoutInfo:
        """Detect document layout regions.
        Args:
-            image: Input image as numpy array in BGR format.
+            image: Input image as numpy array.
            image_size: Image size for prediction.
        Returns:
-            LayoutInfo with detected regions.
+            LayoutInfo with detected regions and flags.
        Raises:
            RuntimeError: If model not loaded.
        """
-        if self.model is None:
+        layout_detector = self._get_layout_detector()
-            raise RuntimeError("Model not loaded. Call load_model() first.")
+        result = layout_detector.predict(image)
        # Run prediction
        results = self.model.predict(
            image,
            imgsz=image_size,
            conf=self.confidence_threshold,
            device=settings.device,
        )
        # Parse the result
        regions: list[LayoutRegion] = []
-        has_plain_text = False
+        mixed_recognition = False
        has_formula = False
-        if results and len(results) > 0:
+        # Handle result format: [{'input_path': ..., 'page_index': None, 'boxes': [...]}]
-            result = results[0]
+        if isinstance(result, list) and len(result) > 0:
-            if result.boxes is not None:
+            first_result = result[0]
-                for box in result.boxes:
+            if isinstance(first_result, dict) and "boxes" in first_result:
-                    cls_id = int(box.cls[0].item())
+                boxes = first_result.get("boxes", [])
-                    confidence = float(box.conf[0].item())
+            else:
-                    bbox = box.xyxy[0].tolist()
+                boxes = []
        else:
            boxes = []
-                    class_name = self.CLASS_NAMES.get(cls_id, f"unknown_{cls_id}")
+        for box in boxes:
            cls_id = box.get("cls_id")
            label = box.get("label") or self.CLS_ID_TO_LABEL.get(cls_id, "other")
            score = box.get("score", 0.0)
            coordinate = box.get("coordinate", [0, 0, 0, 0])
-                    # Map to simplified type
+            # Normalize label to region type
-                    if class_name in self.PLAIN_TEXT_CLASSES:
+            region_type = self.LABEL_TO_TYPE.get(label, "text")
                        region_type = "text"
                        has_plain_text = True
                    elif class_name in self.FORMULA_CLASSES:
                        region_type = "formula"
                        has_formula = True
                    elif class_name in {"figure"}:
                        region_type = "figure"
                    elif class_name in {"table"}:
                        region_type = "table"
                    else:
                        region_type = class_name
-                    regions.append(
+            regions.append(LayoutRegion(
-                        LayoutRegion(
+                type=region_type,
-                            type=region_type,
+                bbox=coordinate,
-                            bbox=bbox,
+                confidence=score,
-                            confidence=confidence,
+                score=score,
-                        )
+            ))
                    )
-        return LayoutInfo(
+
-            regions=regions,
+        mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions)
-            has_plain_text=has_plain_text,
+
-            has_formula=has_formula,
+        return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition)
-        )
+
 if __name__ == "__main__":
    import cv2
    from app.services.image_processor import ImageProcessor
    layout_detector = LayoutDetector()
    image_path = "test/timeout.png"
    image = cv2.imread(image_path)
    image_processor = ImageProcessor(padding_ratio=0.15)
    image = image_processor.add_padding(image)
    # Save the padded image for debugging
    cv2.imwrite("debug_padded_image.png", image)
    layout_info = layout_detector.detect(image)
    print(layout_info)
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -1,14 +1,12 @@
 """PaddleOCR-VL client service for text and formula recognition."""
 import io
 import tempfile
 from pathlib import Path
 import cv2
 import numpy as np
 from app.core.config import get_settings
-from app.schemas.image import LayoutInfo
+from paddleocr import PaddleOCRVL
 from typing import Optional
 from app.services.layout_detector import LayoutDetector
 from app.services.image_processor import ImageProcessor
 from app.services.converter import Converter
 settings = get_settings()
@@ -16,52 +14,40 @@ settings = get_settings()
 class OCRService:
    """Service for OCR using PaddleOCR-VL."""
-    FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format."
+    _pipeline: Optional[PaddleOCRVL] = None
    _layout_detector: Optional[LayoutDetector] = None
    def __init__(
        self,
-        vl_server_url: str | None = None,
+        vl_server_url: str,
-        pp_doclayout_model_dir: str | None = None,
+        layout_detector: LayoutDetector,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Initialize OCR service.
        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL.
-            pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory.
+            layout_detector: Layout detector instance.
            image_processor: Image processor instance.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
-        self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir
+        self.layout_detector = layout_detector 
-        self._pipeline = None
+        self.image_processor = image_processor
-
+        self.converter = converter
-    def _get_pipeline(self):
+    def _get_pipeline(self):    
        """Get or create PaddleOCR-VL pipeline.
        Returns:
            PaddleOCRVL pipeline instance.
        """
-        if self._pipeline is None:
+        if OCRService._pipeline is None:
-            from paddleocr import PaddleOCRVL
+            OCRService._pipeline = PaddleOCRVL(
            self._pipeline = PaddleOCRVL(
                vl_rec_backend="vllm-server",
                vl_rec_server_url=self.vl_server_url,
                layout_detection_model_name="PP-DocLayoutV2",
                layout_detection_model_dir=self.pp_doclayout_model_dir,
            )
-        return self._pipeline
+        return OCRService._pipeline
    def _save_temp_image(self, image: np.ndarray) -> str:
        """Save image to a temporary file.
        Args:
            image: Image as numpy array in BGR format.
        Returns:
            Path to temporary file.
        """
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
            cv2.imwrite(f.name, image)
            return f.name
    def recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.
@@ -77,30 +63,21 @@ class OCRService:
        """
        try:
            pipeline = self._get_pipeline()
            temp_path = self._save_temp_image(image)
-            try:
+            output = pipeline.predict(image, use_layout_detection=True)
                results = list(pipeline.predict(temp_path))
-                markdown_content = ""
+            markdown_content = ""
                for result in results:
                    # PaddleOCR-VL results can be saved to markdown
                    md_buffer = io.StringIO()
                    result.save_to_markdown(save_path=md_buffer)
                    markdown_content += md_buffer.getvalue()
-                # Convert markdown to other formats
+            for res in output:
-                latex = self._markdown_to_latex(markdown_content)
+                markdown_content += res.markdown.get("markdown_texts", "")
                mathml = self._extract_mathml(markdown_content)
-                return {
+            convert_result  = self.converter.convert_to_formats(markdown_content)
                    "markdown": markdown_content,
                    "latex": latex,
                    "mathml": mathml,
                }
            finally:
                Path(temp_path).unlink(missing_ok=True)
            return {
                "markdown": markdown_content,
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e
@@ -116,188 +93,49 @@ class OCRService:
            Dict with 'latex', 'markdown', 'mathml' keys.
        """
        try:
-            import httpx
+            pipeline = self._get_pipeline()
-            temp_path = self._save_temp_image(image)
+            output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula")
-            try:
+            markdown_content = ""
                # Use vLLM API directly for formula recognition
                import base64
-                with open(temp_path, "rb") as f:
+            for res in output:
-                    image_base64 = base64.b64encode(f.read()).decode("utf-8")
+                markdown_content += res.markdown.get("markdown_texts", "")
-                # Call vLLM server with formula prompt
+            convert_result = self.converter.convert_to_formats(markdown_content)
                response = httpx.post(
                    f"{self.vl_server_url}/chat/completions",
                    json={
                        "model": "paddleocr-vl",
                        "messages": [
                            {
                                "role": "user",
                                "content": [
                                    {"type": "text", "text": self.FORMULA_PROMPT},
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                                    },
                                ],
                            }
                        ],
                        "max_tokens": 1024,
                    },
                    timeout=60.0,
                )
                response.raise_for_status()
                result = response.json()
-                latex = result["choices"][0]["message"]["content"].strip()
+            return {
-
+                "latex": convert_result.latex,
-                # Convert latex to other formats
+                "mathml": convert_result.mathml,
-                markdown = self._latex_to_markdown(latex)
+                "markdown": markdown_content,
-                mathml = self._latex_to_mathml(latex)
+            }
                return {
                    "latex": latex,
                    "markdown": markdown,
                    "mathml": mathml,
                }
            finally:
                Path(temp_path).unlink(missing_ok=True)
        except httpx.HTTPStatusError as e:
            raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e
-    def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict:
+    def recognize(self, image: np.ndarray) -> dict:
-        """Recognize content based on layout detection results.
+        """Recognize content using PaddleOCR-VL.
        Args:
            image: Input image as numpy array in BGR format.
            layout_info: Layout detection results.
        Returns:
-            Dict with recognition results including mode used.
+            Dict with 'latex', 'markdown', 'mathml' keys.
        """
-        # Decision logic:
+        padded_image = self.image_processor.add_padding(image)
-        # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2)
+        layout_info = self.layout_detector.detect(padded_image)
-        # - Otherwise -> use formula_recognition (VL with prompt)
+        if layout_info.MixedRecognition:
-        if layout_info.has_plain_text:
+            return self.recognize_mixed(image)
            result = self.recognize_mixed(image)
            result["recognition_mode"] = "mixed_recognition"
        else:
-            result = self.recognize_formula(image)
+            return self.recognize_formula(image)
            result["recognition_mode"] = "formula_recognition"
        return result
-    def _markdown_to_latex(self, markdown: str) -> str:
+if __name__ == "__main__":
-        """Convert markdown to LaTeX.
+    import cv2
-
+    from app.services.image_processor import ImageProcessor
-        Simple conversion - wraps content in LaTeX document structure.
+    from app.services.layout_detector import LayoutDetector
-
+    image_processor = ImageProcessor(padding_ratio=0.15)
-        Args:
+    layout_detector = LayoutDetector()
-            markdown: Markdown content.
+    ocr_service = OCRService(image_processor=image_processor, layout_detector=layout_detector)
-
+    image = cv2.imread("test/image.png")
-        Returns:
+    ocr_result = ocr_service.recognize(image)
-            LaTeX representation.
+    print(ocr_result)
        """
        # Basic conversion: preserve math blocks, convert structure
        lines = []
        in_code_block = False
        for line in markdown.split("\n"):
            if line.startswith("```"):
                in_code_block = not in_code_block
                if in_code_block:
                    lines.append("\\begin{verbatim}")
                else:
                    lines.append("\\end{verbatim}")
            elif in_code_block:
                lines.append(line)
            elif line.startswith("# "):
                lines.append(f"\\section{{{line[2:]}}}")
            elif line.startswith("## "):
                lines.append(f"\\subsection{{{line[3:]}}}")
            elif line.startswith("### "):
                lines.append(f"\\subsubsection{{{line[4:]}}}")
            elif line.startswith("- "):
                lines.append(f"\\item {line[2:]}")
            elif line.startswith("$$"):
                lines.append(line.replace("$$", "\\[").replace("$$", "\\]"))
            elif "$" in line:
                # Keep inline math as-is
                lines.append(line)
            else:
                lines.append(line)
        return "\n".join(lines)
    def _latex_to_markdown(self, latex: str) -> str:
        """Convert LaTeX to markdown.
        Args:
            latex: LaTeX content.
        Returns:
            Markdown representation.
        """
        # Wrap LaTeX in markdown math block
        if latex.strip():
            return f"$$\n{latex}\n$$"
        return ""
    def _latex_to_mathml(self, latex: str) -> str:
        """Convert LaTeX to MathML.
        Args:
            latex: LaTeX content.
        Returns:
            MathML representation.
        """
        # Basic LaTeX to MathML conversion
        # For production, consider using latex2mathml library
        if not latex.strip():
            return ""
        try:
            # Try to use latex2mathml if available
            from latex2mathml.converter import convert
            return convert(latex)
        except ImportError:
            # Fallback: wrap in basic MathML structure
            return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
        except Exception:
            return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
    def _extract_mathml(self, markdown: str) -> str:
        """Extract and convert math from markdown to MathML.
        Args:
            markdown: Markdown content.
        Returns:
            MathML for any math content found.
        """
        import re
        # Find all math blocks
        math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL)
        inline_math = re.findall(r"\$([^$]+)\$", markdown)
        all_math = math_blocks + inline_math
        if not all_math:
            return ""
        # Convert each to MathML and combine
        mathml_parts = []
        for latex in all_math:
            mathml = self._latex_to_mathml(latex.strip())
            if mathml:
                mathml_parts.append(mathml)
        return "\n".join(mathml_parts)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,30 +2,36 @@
 name = "doc-processer"
 version = "0.1.0"
 description = "Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX"
-readme = "README.md"
+requires-python = ">=3.10"
 requires-python = ">=3.11"
 license = { text = "MIT" }
 authors = [
    { name = "YogeLiu" }
 ]
 dependencies = [
-    "fastapi>=0.115.0",
+    "fastapi==0.128.0",
-    "uvicorn[standard]>=0.32.0",
+    "uvicorn[standard]==0.40.0",
-    "opencv-python>=4.10.0",
+    "opencv-python==4.12.0.88",
-    "python-multipart>=0.0.12",
+    "python-multipart==0.0.21",
-    "pydantic>=2.10.0",
+    "pydantic==2.12.5",
-    "pydantic-settings>=2.6.0",
+    "pydantic-settings==2.12.0",
-    "httpx>=0.28.0",
+    "httpx==0.28.1",
-    "numpy>=1.26.0",
+    "numpy==2.2.6",
-    "pillow>=10.4.0",
+    "pillow==12.0.0",
-    "python-docx>=1.1.0",
+    "python-docx==1.2.0",
-    "paddleocr>=2.9.0",
+    "paddleocr==3.3.2",
-    "doclayout-yolo>=0.0.2",
+    "doclayout-yolo==0.0.4",
-    "latex2mathml>=3.77.0",
+    "latex2mathml==3.78.1",
-    "paddle>=1.2.0",
+    "paddle==1.2.0",
    "pypandoc==1.16.2",
    "paddlepaddle",
    "paddleocr[doc-parser]",
    "safetensors"
 ]
 [tool.uv.sources]
 paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" }
 [project.optional-dependencies]
 dev = [
    "pytest>=8.0.0",