fix: refactor logic

2025-12-31 17:38:32 +08:00
parent 6ac50f7d2f
commit 35928c2484
17 changed files with 678 additions and 738 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,55 @@
+# Git
+.git
+.gitignore
+
+# Python
+.venv/
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.egg-info/
+.eggs/
+dist/
+build/
+
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+test/
+tests/
+
+# Linting & IDE
+.ruff_cache/
+.mypy_cache/
+.cursor/
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Environment
+.env
+.env.*
+!.env.example
+
+# Documentation (not needed in container)
+*.md
+!README.md
+openspec/
+
+# Models (mounted at runtime, not built into image)
+app/model/doclayout/*.pdiparams
+app/model/DocLayout/
+app/model/PP-DocLayout/
+
+# Misc
+*.log
+*.tmp
+.DS_Store
+Thumbs.db
+
+test/
+
--- a/.gitignore
+++ b/.gitignore
@@ -71,3 +71,5 @@ htmlcov/
 uv.lock

 model/
+
+test/
--- a/81
+++ b/81
@@ -1,54 +1,73 @@
 # DocProcesser Dockerfile
 # Optimized for RTX 5080 GPU deployment

-# Use NVIDIA CUDA base image with Python 3.11
+# Use NVIDIA CUDA base image (Python 3.10 is installed below, not bundled)
 FROM nvidia/cuda:12.8.0-runtime-ubuntu24.04

 # Set environment variables
 ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
-    PIP_DISABLE_PIP_VERSION_CHECK=1
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    # Model cache directories - mount these at runtime
+    MODELSCOPE_CACHE=/root/.cache/modelscope \
+    HF_HOME=/root/.cache/huggingface \
+    # Application config (override defaults for container)
+    # Use 127.0.0.1 for --network host mode, or override with -e for bridge mode
+    PP_DOCLAYOUT_MODEL_DIR=/root/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2 \
+    PADDLEOCR_VL_URL=http://127.0.0.1:8000/v1

 # Set working directory
 WORKDIR /app

-# Install system dependencies
+# Install system dependencies and Python 3.10 from deadsnakes PPA
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3.11 \
-    python3.11-venv \
-    python3.11-dev \
-    python3-pip \
-    libgl1-mesa-glx \
+    software-properties-common \
+    && add-apt-repository -y ppa:deadsnakes/ppa \
+    && apt-get update && apt-get install -y --no-install-recommends \
+    python3.10 \
+    python3.10-venv \
+    python3.10-dev \
+    python3.10-distutils \
+    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    curl \
+    pandoc \
    && rm -rf /var/lib/apt/lists/* \
-    && ln -sf /usr/bin/python3.11 /usr/bin/python \
-    && ln -sf /usr/bin/python3.11 /usr/bin/python3
+    && ln -sf /usr/bin/python3.10 /usr/bin/python \
+    && ln -sf /usr/bin/python3.10 /usr/bin/python3 \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10

-# Install uv for fast package management
-RUN curl -LsSf https://astral.sh/uv/install.sh | sh
-ENV PATH="/root/.local/bin:$PATH"
-
-# Copy dependency files first for better caching
-COPY pyproject.toml ./
-
-# Create virtual environment and install dependencies
-RUN uv venv /app/.venv
+# Install uv via pip (more reliable than install script)
+RUN python3.10 -m pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
 ENV PATH="/app/.venv/bin:$PATH"
 ENV VIRTUAL_ENV="/app/.venv"

-RUN uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e .
+# Copy dependency files first for better caching
+COPY pyproject.toml ./
+COPY wheels/ ./wheels/
+
+# Create virtual environment and install dependencies
+RUN uv venv /app/.venv --python python3.10 \
+    && uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . \
+    && rm -rf ./wheels

 # Copy application code
 COPY app/ ./app/

-# Create model directories (models should be mounted at runtime)
-RUN mkdir -p /app/app/model/DocLayout /app/app/model/PP-DocLayout
+# Create model cache directories (mount from host at runtime)
+RUN mkdir -p /root/.cache/modelscope \
+    /root/.cache/huggingface \
+    /root/.paddlex \
+    /app/app/model/DocLayout \
+    /app/app/model/PP-DocLayout
+
+# Declare volumes for model cache (mount at runtime to avoid re-downloading)
+VOLUME ["/root/.cache/modelscope", "/root/.cache/huggingface", "/root/.paddlex"]

 # Expose port
 EXPOSE 8053
@@ -60,3 +79,21 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
 # Run the application
 CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"]

+# =============================================================================
+# Usage: Mount local model cache to avoid downloading
+#
+# Option 1: Use host network (simplest, can access localhost services)
+# docker run --gpus all --network host \
+#   -v /home/yoge/.paddlex:/root/.paddlex:ro \
+#   -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \
+#   -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \
+#   doc_processer:latest
+#
+# Option 2: Use bridge network with host.docker.internal (Linux needs --add-host)
+# docker run --gpus all -p 8053:8053 --add-host=host.docker.internal:host-gateway \
+#   -e PADDLEOCR_VL_URL=http://host.docker.internal:8000/v1 \
+#   -v /home/yoge/.paddlex:/root/.paddlex:ro \
+#   -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \
+#   -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \
+#   doc_processer:latest
+# =============================================================================
--- a/app/api/v1/endpoints/convert.py
+++ b/app/api/v1/endpoints/convert.py
@@ -3,34 +3,28 @@
 from fastapi import APIRouter, Depends, HTTPException
 from fastapi.responses import Response

-from app.core.dependencies import get_docx_converter
+from app.core.dependencies import get_converter
 from app.schemas.convert import MarkdownToDocxRequest
-from app.services.docx_converter import DocxConverter
+from app.services.converter import Converter

 router = APIRouter()


-@router.post("/docx")
+@router.post("/file")
 async def convert_markdown_to_docx(
    request: MarkdownToDocxRequest,
-    converter: DocxConverter = Depends(get_docx_converter),
+    converter: Converter = Depends(get_converter),
 ) -> Response:
    """Convert markdown content to DOCX file.

-    Returns the generated DOCX file as a binary download.
+    Returns the generated DOCX file as a binary response.
    """
    try:
-        docx_bytes = converter.convert(request.markdown)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
-
-    # Determine filename
-    filename = request.filename or "output"
-    if not filename.endswith(".docx"):
-        filename = f"{filename}.docx"
-
+        docx_bytes = converter.export_to_file(request.markdown, export_type="docx")
        return Response(
            content=docx_bytes,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
+            headers={"Content-Disposition": f'attachment; filename="{request.filename}.docx"'},
        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
--- a/app/api/v1/endpoints/image.py
+++ b/app/api/v1/endpoints/image.py
@@ -28,24 +28,15 @@ async def process_image_ocr(
       - Otherwise: use PaddleOCR-VL with formula prompt
    4. Convert output to LaTeX, Markdown, and MathML formats
    """
-    try:
-        # 1. Load and preprocess image
+
    image = image_processor.preprocess(
        image_url=request.image_url,
        image_base64=request.image_base64,
    )
-    except ValueError as e:
-        raise HTTPException(status_code=400, detail=str(e))
-
-    try:
-        # 2. Detect layout
-        layout_info = layout_detector.detect(image)
-    except RuntimeError as e:
-        raise HTTPException(status_code=500, detail=f"Layout detection failed: {e}")

    try:
        # 3. Perform OCR based on layout
-        ocr_result = ocr_service.recognize(image, layout_info)
+        ocr_result = ocr_service.recognize(image)
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e))

@@ -54,6 +45,4 @@ async def process_image_ocr(
        latex=ocr_result.get("latex", ""),
        markdown=ocr_result.get("markdown", ""),
        mathml=ocr_result.get("mathml", ""),
-        layout_info=layout_info,
-        recognition_mode=ocr_result.get("recognition_mode", ""),
    )
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -5,6 +5,7 @@ from pathlib import Path

 from pydantic_settings import BaseSettings, SettingsConfigDict
 import torch
+from typing import Optional


 class Settings(BaseSettings):
@@ -21,11 +22,10 @@ class Settings(BaseSettings):
    debug: bool = False

    # PaddleOCR-VL Settings
-    paddleocr_vl_url: str = "http://localhost:8080/v1"
+    paddleocr_vl_url: str = "http://127.0.0.1:8000/v1"

    # Model Paths
-    doclayout_model_path: str = "app/model/DocLayout/best.pt"
-    pp_doclayout_model_dir: str = "app/model/PP-DocLayout/PP-DocLayoutV2"
+    pp_doclayout_model_dir: Optional[str] = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2"

    # Image Processing
    max_image_size_mb: int = 10
@@ -37,11 +37,6 @@ class Settings(BaseSettings):
    host: str = "0.0.0.0"
    port: int = 8053

-    @property
-    def doclayout_model_file(self) -> Path:
-        """Get the DocLayout model file path."""
-        return Path(self.doclayout_model_path)
-
    @property
    def pp_doclayout_dir(self) -> Path:
        """Get the PP-DocLayout model directory path."""
--- a/app/core/dependencies.py
+++ b/app/core/dependencies.py
@@ -3,20 +3,20 @@
 from app.services.image_processor import ImageProcessor
 from app.services.layout_detector import LayoutDetector
 from app.services.ocr_service import OCRService
-from app.services.docx_converter import DocxConverter
+from app.services.converter import Converter
+from app.core.config import get_settings

 # Global instances (initialized on startup)
 _layout_detector: LayoutDetector | None = None


-def init_layout_detector(model_path: str) -> None:
+def init_layout_detector() -> None:
    """Initialize the global layout detector.

    Called during application startup.
    """
    global _layout_detector
-    _layout_detector = LayoutDetector(model_path=model_path)
-    _layout_detector.load_model()
+    _layout_detector = LayoutDetector()


 def get_layout_detector() -> LayoutDetector:
@@ -33,10 +33,15 @@ def get_image_processor() -> ImageProcessor:

 def get_ocr_service() -> OCRService:
    """Get an OCR service instance."""
-    return OCRService()
+    return OCRService(
+        vl_server_url=get_settings().paddleocr_vl_url,
+        layout_detector=get_layout_detector(),
+        image_processor=get_image_processor(),
+        converter=get_converter(),
+    )


-def get_docx_converter() -> DocxConverter:
+def get_converter() -> Converter:
    """Get a DOCX converter instance."""
-    return DocxConverter()
+    return Converter()

--- a/app/main.py
+++ b/app/main.py
@@ -15,7 +15,7 @@ settings = get_settings()
 async def lifespan(app: FastAPI):
    """Application lifespan handler for startup/shutdown."""
    # Startup: Load models
-    init_layout_detector(model_path=settings.doclayout_model_path)
+    init_layout_detector()

    yield

@@ -37,3 +37,9 @@ app.include_router(api_router, prefix=settings.api_prefix)
 async def health_check():
    """Health check endpoint."""
    return {"status": "healthy"}
+    
+
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8053)
--- a/app/pkg/reference.docx
+++ b/app/pkg/reference.docx
--- a/app/schemas/convert.py
+++ b/app/schemas/convert.py
@@ -7,7 +7,7 @@ class MarkdownToDocxRequest(BaseModel):
    """Request body for markdown to DOCX conversion endpoint."""

    markdown: str = Field(..., description="Markdown content to convert")
-    filename: str | None = Field(None, description="Optional output filename (without extension)")
+    filename: str = Field("texpixel", description="Optional output filename (without extension)")

    @field_validator("markdown")
    @classmethod
--- a/app/schemas/image.py
+++ b/app/schemas/image.py
@@ -9,14 +9,15 @@ class LayoutRegion(BaseModel):
    type: str = Field(..., description="Region type: text, formula, table, figure")
    bbox: list[float] = Field(..., description="Bounding box [x1, y1, x2, y2]")
    confidence: float = Field(..., description="Detection confidence score")
+    score: float = Field(..., description="Detection score")


 class LayoutInfo(BaseModel):
    """Layout detection information."""

    regions: list[LayoutRegion] = Field(default_factory=list)
-    has_plain_text: bool = Field(False, description="Whether plain text was detected")
-    has_formula: bool = Field(False, description="Whether formulas were detected")
+    MixedRecognition: bool = Field(False, description="Whether mixed recognition was used")
+    # FormulaRecognition: bool = Field(False, description="Whether formula recognition (with prompt) was used")


 class ImageOCRRequest(BaseModel):
--- a/app/services/converter.py
+++ b/app/services/converter.py
@@ -0,0 +1,312 @@
+"""Markdown conversion and export service using pypandoc."""
+
+import os
+import re
+import tempfile
+from dataclasses import dataclass
+from typing import Literal
+
+import pypandoc
+
+
+@dataclass
+class ConvertResult:
+    """Result of markdown conversion."""
+
+    latex: str
+    mathml: str
+
+
+@dataclass
+class ExportResult:
+    """Result of markdown export."""
+
+    file_path: str
+    content_type: str
+    download_name: str
+
+
+ExportType = Literal["docx", "pdf"]
+
+
+class Converter:
+    """Service for conversion and export operations."""
+
+    # Pandoc input format with LaTeX math extensions
+    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
+
+    def __init__(self):
+        """Initialize converter."""
+
+    def convert_to_formats(self, md_text: str) -> ConvertResult:
+        """Convert markdown to LaTeX and MathML formats.
+
+        Args:
+            md_text: Markdown text to convert.
+
+        Returns:
+            ConvertResult with latex and mathml fields.
+
+        Raises:
+            RuntimeError: If conversion fails. An empty md_text does not
+                raise; it returns a ConvertResult with empty fields.
+        """
+        if md_text == "":
+            return ConvertResult(latex="", mathml="")
+
+        try:
+            # Convert to LaTeX
+            latex_output = pypandoc.convert_text(
+                md_text,
+                "latex",
+                format=self.INPUT_FORMAT,
+            ).rstrip("\n")
+
+            # Convert to HTML with MathML
+            mathml_output = pypandoc.convert_text(
+                md_text,
+                "html",
+                format=self.INPUT_FORMAT,
+                extra_args=["--mathml"],
+            ).rstrip("\n")
+
+            return ConvertResult(latex=latex_output, mathml=mathml_output)
+
+        except Exception as e:
+            raise RuntimeError(f"Conversion failed: {e}") from e
+
+    def preprocess_for_export(self, md_text: str) -> str:
+        """Preprocess markdown text for export to docx/pdf.
+
+        Handles LaTeX formula formatting, matrix environments, and
+        other transformations needed for proper Word/PDF rendering.
+
+        Args:
+            md_text: Raw markdown text.
+
+        Returns:
+            Preprocessed markdown text.
+        """
+        # Replace \[1mm] => \vspace{1mm}
+        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)
+
+        # Add blank lines around \[...\] block formulas
+        md_text = re.sub(
+            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
+            r"\1\n\n\\[\3\\]\n\n\4",
+            md_text,
+            flags=re.DOTALL,
+        )
+        md_text = re.sub(
+            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
+            r"\n\\[\2\\]\n",
+            md_text,
+            flags=re.MULTILINE | re.DOTALL,
+        )
+
+        # Remove arithmatex span wrappers
+        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)
+
+        # Convert inline formulas: \( \) => $ $
+        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
+        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
+
+        # Convert block formulas: \[ \] => $$ $$
+        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
+        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
+
+        # Remove spaces between $ and formula content
+        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
+        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)
+
+        # Convert matrix environments for better Word rendering
+        cleaned_md = self._convert_matrix_environments(cleaned_md)
+
+        # Fix brace spacing for equation systems
+        cleaned_md = self._fix_brace_spacing(cleaned_md)
+
+        # Convert cases and aligned environments
+        cleaned_md = self._convert_special_environments(cleaned_md)
+
+        return cleaned_md
+
+    def _convert_matrix_environments(self, md_text: str) -> str:
+        """Convert vmatrix/Vmatrix to left/right delimited forms.
+
+        This fixes the vertical line height issues in Word.
+        """
+        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
+        md_text = re.sub(
+            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
+            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
+            md_text,
+            flags=re.DOTALL,
+        )
+
+        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
+        md_text = re.sub(
+            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
+            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
+            md_text,
+            flags=re.DOTALL,
+        )
+
+        return md_text
+
+    def _fix_brace_spacing(self, md_text: str) -> str:
+        """Fix spacing issues with braces in equation systems.
+
+        Removes whitespace and adds negative space for proper alignment in Word/OMML.
+        """
+        # Fix \left\{ spacing
+        md_text = re.sub(
+            r"\\left\\\{\s+",
+            r"\\left\\{\\!",
+            md_text,
+        )
+
+        # Fix \right\} spacing
+        md_text = re.sub(
+            r"\s+\\right\\\}",
+            r"\\!\\right\\}",
+            md_text,
+        )
+
+        return md_text
+
+    def _convert_special_environments(self, md_text: str) -> str:
+        """Convert cases and aligned environments to array format.
+
+        These environments have better rendering support in Word/OMML.
+        """
+
+        def convert_cases(match: re.Match) -> str:
+            content = match.group(1)
+            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."
+
+        md_text = re.sub(
+            r"\\begin\{cases\}(.*?)\\end\{cases\}",
+            convert_cases,
+            md_text,
+            flags=re.DOTALL,
+        )
+
+        def convert_aligned_to_array(match: re.Match) -> str:
+            content = match.group(1)
+            # Remove leading & alignment markers (not needed in array{l})
+            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
+            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."
+
+        md_text = re.sub(
+            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
+            convert_aligned_to_array,
+            md_text,
+            flags=re.DOTALL,
+        )
+
+        def convert_standalone_aligned(match: re.Match) -> str:
+            content = match.group(1)
+            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
+            return r"\begin{array}{l}" + content + r"\end{array}"
+
+        md_text = re.sub(
+            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
+            convert_standalone_aligned,
+            md_text,
+            flags=re.DOTALL,
+        )
+
+        return md_text
+
+    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
+        """Export markdown to docx or pdf file.
+
+        Args:
+            md_text: Markdown text to export.
+            export_type: Export format, either 'docx' or 'pdf'.
+
+        Returns:
+            bytes of the exported file.
+
+        Raises:
+            RuntimeError: If export fails. Any export_type other than
+                'docx' is handled as 'pdf'; no ValueError is raised.
+
+        """
+
+        # Preprocess markdown
+        cleaned_md = self.preprocess_for_export(md_text)
+
+        # Create temp file for input
+        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
+            f_in.write(cleaned_md.encode("utf-8"))
+            md_path = f_in.name
+
+        output_file = md_path + "." + export_type
+
+        try:
+            if export_type == "docx":
+                self._export_docx(md_path, output_file)
+                with open(output_file, "rb") as f:
+                    return f.read()
+            else:  # pdf
+                self._export_pdf(md_path, output_file)
+                with open(output_file, "rb") as f:
+                    return f.read()
+
+        except Exception as e:
+            # Cleanup on error
+            self._cleanup_files(md_path, output_file)
+            raise RuntimeError(f"Export failed: {e}") from e
+        finally:
+            # Always cleanup input file
+            if os.path.exists(md_path):
+                os.remove(md_path)
+
+    def _export_docx(self, input_path: str, output_path: str) -> None:
+        """Export to DOCX format using pypandoc."""
+        extra_args = [
+            "--highlight-style=pygments",
+            f"--reference-doc=app/pkg/reference.docx",
+        ]
+        pypandoc.convert_file(
+            input_path,
+            "docx",
+            format=self.INPUT_FORMAT,
+            outputfile=output_path,
+            extra_args=extra_args,
+        )
+
+    def _export_pdf(self, input_path: str, output_path: str) -> None:
+        """Export to PDF format using pypandoc with XeLaTeX."""
+        extra_args = [
+            "--pdf-engine=xelatex",
+            "-V",
+            "mainfont=Noto Sans CJK SC",
+            "--highlight-style=pygments",
+        ]
+        pypandoc.convert_file(
+            input_path,
+            "pdf",
+            format=self.INPUT_FORMAT,
+            outputfile=output_path,
+            extra_args=extra_args,
+        )
+
+    def _cleanup_files(self, *paths: str) -> None:
+        """Remove files if they exist."""
+        for path in paths:
+            if os.path.exists(path):
+                os.remove(path)
+
+    def cleanup_export_file(self, file_path: str) -> None:
+        """Cleanup exported file after sending response.
+
+        Call this after sending the file to the client.
+
+        Args:
+            file_path: Path to the exported file.
+        """
+        if os.path.exists(file_path):
+            os.remove(file_path)
+
--- a/app/services/docx_converter.py
+++ b/app/services/docx_converter.py
@@ -1,335 +0,0 @@
-"""Markdown to DOCX conversion service.
-
-Reference implementation based on https://github.com/YogeLiu/markdown_2_docx
-"""
-
-import io
-import re
-from dataclasses import dataclass
-
-from docx import Document
-from docx.enum.text import WD_ALIGN_PARAGRAPH
-from docx.oxml import OxmlElement
-from docx.oxml.ns import qn
-from docx.shared import Inches, Pt
-
-
-@dataclass
-class MarkdownElement:
-    """Parsed markdown element."""
-
-    type: str  # heading, paragraph, list_item, code_block, table, math
-    content: str
-    level: int = 0  # For headings and lists
-    language: str = ""  # For code blocks
-
-
-class DocxConverter:
-    """Converts markdown content to DOCX format."""
-
-    def __init__(self):
-        """Initialize the converter."""
-        self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
-        self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$")
-        self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$")
-        self.code_block_pattern = re.compile(r"^```(\w*)$")
-        self.inline_code_pattern = re.compile(r"`([^`]+)`")
-        self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*")
-        self.italic_pattern = re.compile(r"\*([^*]+)\*")
-        self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL)
-        self.inline_math_pattern = re.compile(r"\$([^$]+)\$")
-
-    def convert(self, markdown: str) -> bytes:
-        """Convert markdown content to DOCX.
-
-        Args:
-            markdown: Markdown content to convert.
-
-        Returns:
-            DOCX file as bytes.
-        """
-        doc = Document()
-        elements = self._parse_markdown(markdown)
-
-        for element in elements:
-            self._add_element_to_doc(doc, element)
-
-        # Save to bytes
-        buffer = io.BytesIO()
-        doc.save(buffer)
-        buffer.seek(0)
-        return buffer.getvalue()
-
-    def _parse_markdown(self, markdown: str) -> list[MarkdownElement]:
-        """Parse markdown into elements.
-
-        Args:
-            markdown: Markdown content.
-
-        Returns:
-            List of parsed elements.
-        """
-        elements: list[MarkdownElement] = []
-        lines = markdown.split("\n")
-        i = 0
-        in_code_block = False
-        code_content = []
-        code_language = ""
-
-        while i < len(lines):
-            line = lines[i]
-
-            # Code block handling
-            code_match = self.code_block_pattern.match(line)
-            if code_match:
-                if in_code_block:
-                    elements.append(
-                        MarkdownElement(
-                            type="code_block",
-                            content="\n".join(code_content),
-                            language=code_language,
-                        )
-                    )
-                    code_content = []
-                    in_code_block = False
-                else:
-                    in_code_block = True
-                    code_language = code_match.group(1)
-                i += 1
-                continue
-
-            if in_code_block:
-                code_content.append(line)
-                i += 1
-                continue
-
-            # Math block ($$...$$)
-            if line.strip().startswith("$$"):
-                math_content = []
-                if line.strip() == "$$":
-                    i += 1
-                    while i < len(lines) and lines[i].strip() != "$$":
-                        math_content.append(lines[i])
-                        i += 1
-                else:
-                    # Single line $$...$$ or start
-                    content = line.strip()[2:]
-                    if content.endswith("$$"):
-                        math_content.append(content[:-2])
-                    else:
-                        math_content.append(content)
-                        i += 1
-                        while i < len(lines):
-                            if lines[i].strip().endswith("$$"):
-                                math_content.append(lines[i].strip()[:-2])
-                                break
-                            math_content.append(lines[i])
-                            i += 1
-
-                elements.append(
-                    MarkdownElement(type="math", content="\n".join(math_content))
-                )
-                i += 1
-                continue
-
-            # Heading
-            heading_match = self.heading_pattern.match(line)
-            if heading_match:
-                level = len(heading_match.group(1))
-                content = heading_match.group(2)
-                elements.append(
-                    MarkdownElement(type="heading", content=content, level=level)
-                )
-                i += 1
-                continue
-
-            # Unordered list
-            list_match = self.list_pattern.match(line)
-            if list_match:
-                indent = len(list_match.group(1))
-                content = list_match.group(2)
-                elements.append(
-                    MarkdownElement(type="list_item", content=content, level=indent // 2)
-                )
-                i += 1
-                continue
-
-            # Ordered list
-            ordered_match = self.ordered_list_pattern.match(line)
-            if ordered_match:
-                indent = len(ordered_match.group(1))
-                content = ordered_match.group(2)
-                elements.append(
-                    MarkdownElement(
-                        type="ordered_list_item", content=content, level=indent // 2
-                    )
-                )
-                i += 1
-                continue
-
-            # Table (simple detection)
-            if "|" in line and i + 1 < len(lines) and "---" in lines[i + 1]:
-                table_lines = [line]
-                i += 1
-                while i < len(lines) and "|" in lines[i]:
-                    table_lines.append(lines[i])
-                    i += 1
-                elements.append(
-                    MarkdownElement(type="table", content="\n".join(table_lines))
-                )
-                continue
-
-            # Regular paragraph
-            if line.strip():
-                elements.append(MarkdownElement(type="paragraph", content=line))
-
-            i += 1
-
-        return elements
-
-    def _add_element_to_doc(self, doc: Document, element: MarkdownElement) -> None:
-        """Add a markdown element to the document.
-
-        Args:
-            doc: Word document.
-            element: Parsed markdown element.
-        """
-        if element.type == "heading":
-            self._add_heading(doc, element.content, element.level)
-        elif element.type == "paragraph":
-            self._add_paragraph(doc, element.content)
-        elif element.type == "list_item":
-            self._add_list_item(doc, element.content, element.level, ordered=False)
-        elif element.type == "ordered_list_item":
-            self._add_list_item(doc, element.content, element.level, ordered=True)
-        elif element.type == "code_block":
-            self._add_code_block(doc, element.content)
-        elif element.type == "table":
-            self._add_table(doc, element.content)
-        elif element.type == "math":
-            self._add_math(doc, element.content)
-
-    def _add_heading(self, doc: Document, content: str, level: int) -> None:
-        """Add a heading to the document."""
-        # Map markdown levels to Word heading styles
-        heading_level = min(level, 9)  # Word supports up to Heading 9
-        doc.add_heading(content, level=heading_level)
-
-    def _add_paragraph(self, doc: Document, content: str) -> None:
-        """Add a paragraph with inline formatting."""
-        para = doc.add_paragraph()
-        self._add_formatted_text(para, content)
-
-    def _add_formatted_text(self, para, content: str) -> None:
-        """Add text with inline formatting (bold, italic, code)."""
-        # Simple approach: process inline patterns
-        remaining = content
-
-        while remaining:
-            # Find next formatting marker
-            bold_match = self.bold_pattern.search(remaining)
-            italic_match = self.italic_pattern.search(remaining)
-            code_match = self.inline_code_pattern.search(remaining)
-            math_match = self.inline_math_pattern.search(remaining)
-
-            matches = [
-                (bold_match, "bold"),
-                (italic_match, "italic"),
-                (code_match, "code"),
-                (math_match, "math"),
-            ]
-            matches = [(m, t) for m, t in matches if m]
-
-            if not matches:
-                para.add_run(remaining)
-                break
-
-            # Find earliest match
-            earliest = min(matches, key=lambda x: x[0].start())
-            match, match_type = earliest
-
-            # Add text before match
-            if match.start() > 0:
-                para.add_run(remaining[: match.start()])
-
-            # Add formatted text
-            run = para.add_run(match.group(1))
-            if match_type == "bold":
-                run.bold = True
-            elif match_type == "italic":
-                run.italic = True
-            elif match_type == "code":
-                run.font.name = "Courier New"
-                run.font.size = Pt(10)
-            elif match_type == "math":
-                run.italic = True
-
-            remaining = remaining[match.end() :]
-
-    def _add_list_item(
-        self, doc: Document, content: str, level: int, ordered: bool
-    ) -> None:
-        """Add a list item."""
-        para = doc.add_paragraph(style="List Bullet" if not ordered else "List Number")
-        para.paragraph_format.left_indent = Inches(0.25 * level)
-        self._add_formatted_text(para, content)
-
-    def _add_code_block(self, doc: Document, content: str) -> None:
-        """Add a code block."""
-        para = doc.add_paragraph()
-        para.paragraph_format.left_indent = Inches(0.5)
-
-        run = para.add_run(content)
-        run.font.name = "Courier New"
-        run.font.size = Pt(9)
-
-        # Add shading
-        shading = OxmlElement("w:shd")
-        shading.set(qn("w:val"), "clear")
-        shading.set(qn("w:fill"), "F0F0F0")
-        para._p.get_or_add_pPr().append(shading)
-
-    def _add_table(self, doc: Document, content: str) -> None:
-        """Add a table from markdown table format."""
-        lines = [l.strip() for l in content.split("\n") if l.strip()]
-        if len(lines) < 2:
-            return
-
-        # Parse header
-        header = [c.strip() for c in lines[0].split("|") if c.strip()]
-
-        # Skip separator line
-        data_lines = lines[2:] if len(lines) > 2 else []
-
-        # Create table
-        table = doc.add_table(rows=1, cols=len(header))
-        table.style = "Table Grid"
-
-        # Add header
-        header_cells = table.rows[0].cells
-        for i, text in enumerate(header):
-            header_cells[i].text = text
-            header_cells[i].paragraphs[0].runs[0].bold = True
-
-        # Add data rows
-        for line in data_lines:
-            cells = [c.strip() for c in line.split("|") if c.strip()]
-            row_cells = table.add_row().cells
-            for i, text in enumerate(cells):
-                if i < len(row_cells):
-                    row_cells[i].text = text
-
-    def _add_math(self, doc: Document, content: str) -> None:
-        """Add a math block.
-
-        For proper OMML rendering, this would need more complex conversion.
-        Currently renders as italic text with the LaTeX source.
-        """
-        para = doc.add_paragraph()
-        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
-
-        run = para.add_run(content)
-        run.italic = True
-        run.font.name = "Cambria Math"
-        run.font.size = Pt(12)
-
--- a/app/services/image_processor.py
+++ b/app/services/image_processor.py
@@ -116,7 +116,7 @@ class ImageProcessor:
        else:
            raise ValueError("Either image_url or image_base64 must be provided")

-        return self.add_padding(image)
+        return image

    def image_to_base64(self, image: np.ndarray, format: str = "PNG") -> str:
        """Convert numpy image to base64 string.
--- a/app/services/layout_detector.py
+++ b/app/services/layout_detector.py
@@ -1,122 +1,157 @@
-"""DocLayout-YOLO wrapper for document layout detection."""
+"""PP-DocLayoutV2 wrapper for document layout detection."""

 import numpy as np

 from app.schemas.image import LayoutInfo, LayoutRegion
 from app.core.config import get_settings
+from paddleocr import LayoutDetection
+from typing import Optional

 settings = get_settings()


 class LayoutDetector:
-    """Wrapper for DocLayout-YOLO model."""
+    """Layout detector for PP-DocLayoutV2."""

-    # Class names from DocLayout-YOLO
-    CLASS_NAMES = {
-        0: "title",
-        1: "plain_text",
-        2: "abandon",
-        3: "figure",
-        4: "figure_caption",
-        5: "table",
-        6: "table_caption",
-        7: "table_footnote",
-        8: "isolate_formula",
-        9: "formula_caption",
+    _layout_detector: Optional[LayoutDetection] = None
+
+    # PP-DocLayoutV2 class ID to label mapping
+    CLS_ID_TO_LABEL: dict[int, str] = {
+        0: "abstract",
+        1: "algorithm",
+        2: "aside_text",
+        3: "chart",
+        4: "content",
+        5: "display_formula",
+        6: "doc_title",
+        7: "figure_title",
+        8: "footer",
+        9: "footer_image",
+        10: "footnote",
+        11: "formula_number",
+        12: "header",
+        13: "header_image",
+        14: "image",
+        15: "inline_formula",
+        16: "number",
+        17: "paragraph_title",
+        18: "reference",
+        19: "reference_content",
+        20: "seal",
+        21: "table",
+        22: "text",
+        23: "vertical_text",
+        24: "vision_footnote",
    }

-    # Classes considered as plain text
-    PLAIN_TEXT_CLASSES = {"title", "plain_text", "figure_caption", "table_caption", "table_footnote"}
+    # Mapping from raw labels to normalized region types
+    LABEL_TO_TYPE: dict[str, str] = {
+        # Text types
+        "abstract": "text",
+        "algorithm": "text",
+        "aside_text": "text",
+        "content": "text",
+        "doc_title": "text",
+        "footer": "text",
+        "footnote": "text",
+        "header": "text",
+        "number": "text",
+        "paragraph_title": "text",
+        "reference": "text",
+        "reference_content": "text",
+        "text": "text",
+        "vertical_text": "text",
+        "vision_footnote": "text",
+        # Formula types
+        "display_formula": "formula",
+        "inline_formula": "formula",
+        "formula_number": "formula",
+        # Table types
+        "table": "table",
+        # Figure types
+        "chart": "figure",
+        "figure_title": "figure",
+        "footer_image": "figure",
+        "header_image": "figure",
+        "image": "figure",
+        "seal": "figure",
+    }

-    # Classes considered as formula
-    FORMULA_CLASSES = {"isolate_formula", "formula_caption"}
-
-    def __init__(self, model_path: str, confidence_threshold: float = 0.2):
-        """Initialize the layout detector.
+    def __init__(self):
+        """Initialize layout detector.

        Args:
-            model_path: Path to the DocLayout-YOLO model weights.
-            confidence_threshold: Minimum confidence for detections.
        """
-        self.model_path = model_path
-        self.confidence_threshold = confidence_threshold
-        self.model = None
+        _ = self._get_layout_detector()

-    def load_model(self) -> None:
-        """Load the DocLayout-YOLO model.
+    def _get_layout_detector(self):
+        """Get or create LayoutDetection instance."""
+        if LayoutDetector._layout_detector is None:
+            LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")
+        return LayoutDetector._layout_detector

-        Raises:
-            RuntimeError: If model cannot be loaded.
-        """
-        try:
-            from doclayout_yolo import YOLOv10
-
-            self.model = YOLOv10(self.model_path)
-        except Exception as e:
-            raise RuntimeError(f"Failed to load DocLayout-YOLO model: {e}") from e
-
-    def detect(self, image: np.ndarray, image_size: int = 1024) -> LayoutInfo:
-        """Detect document layout regions.
+    def detect(self, image: np.ndarray) -> LayoutInfo:
+        """Detect layout of the image using PP-DocLayoutV2.

        Args:
-            image: Input image as numpy array in BGR format.
-            image_size: Image size for prediction.
+            image: Input image as numpy array.

        Returns:
-            LayoutInfo with detected regions.
-
-        Raises:
-            RuntimeError: If model not loaded.
+            LayoutInfo with detected regions and flags.
        """
-        if self.model is None:
-            raise RuntimeError("Model not loaded. Call load_model() first.")
-
-        # Run prediction
-        results = self.model.predict(
-            image,
-            imgsz=image_size,
-            conf=self.confidence_threshold,
-            device=settings.device,
-        )
+        layout_detector = self._get_layout_detector()
+        result = layout_detector.predict(image)

+        # Parse the result
        regions: list[LayoutRegion] = []
-        has_plain_text = False
-        has_formula = False
+        mixed_recognition = False

-        if results and len(results) > 0:
-            result = results[0]
-            if result.boxes is not None:
-                for box in result.boxes:
-                    cls_id = int(box.cls[0].item())
-                    confidence = float(box.conf[0].item())
-                    bbox = box.xyxy[0].tolist()
-
-                    class_name = self.CLASS_NAMES.get(cls_id, f"unknown_{cls_id}")
-
-                    # Map to simplified type
-                    if class_name in self.PLAIN_TEXT_CLASSES:
-                        region_type = "text"
-                        has_plain_text = True
-                    elif class_name in self.FORMULA_CLASSES:
-                        region_type = "formula"
-                        has_formula = True
-                    elif class_name in {"figure"}:
-                        region_type = "figure"
-                    elif class_name in {"table"}:
-                        region_type = "table"
+        # Handle result format: [{'input_path': ..., 'page_index': None, 'boxes': [...]}]
+        if isinstance(result, list) and len(result) > 0:
+            first_result = result[0]
+            if isinstance(first_result, dict) and "boxes" in first_result:
+                boxes = first_result.get("boxes", [])
            else:
-                        region_type = class_name
+                boxes = []
+        else:
+            boxes = []

-                    regions.append(
-                        LayoutRegion(
+        for box in boxes:
+            cls_id = box.get("cls_id")
+            label = box.get("label") or self.CLS_ID_TO_LABEL.get(cls_id, "other")
+            score = box.get("score", 0.0)
+            coordinate = box.get("coordinate", [0, 0, 0, 0])
+
+            # Normalize label to region type
+            region_type = self.LABEL_TO_TYPE.get(label, "text")
+
+            regions.append(LayoutRegion(
                type=region_type,
-                            bbox=bbox,
-                            confidence=confidence,
-                        )
-                    )
+                bbox=coordinate,
+                confidence=score,
+                score=score,
+            ))

-        return LayoutInfo(
-            regions=regions,
-            has_plain_text=has_plain_text,
-            has_formula=has_formula,
-        )
+
+        mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions)
+
+        return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition)
+
+
+if __name__ == "__main__":
+    import cv2
+    from app.services.image_processor import ImageProcessor
+
+    layout_detector = LayoutDetector()
+    image_path = "test/timeout.png"
+
+    image = cv2.imread(image_path)
+    image_processor = ImageProcessor(padding_ratio=0.15)
+    image = image_processor.add_padding(image)
+
+    # Save the padded image for debugging
+    cv2.imwrite("debug_padded_image.png", image)
+
+
+    layout_info = layout_detector.detect(image)
+    print(layout_info)
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -1,14 +1,12 @@
 """PaddleOCR-VL client service for text and formula recognition."""

-import io
-import tempfile
-from pathlib import Path
-
-import cv2
 import numpy as np
-
 from app.core.config import get_settings
-from app.schemas.image import LayoutInfo
+from paddleocr import PaddleOCRVL
+from typing import Optional
+from app.services.layout_detector import LayoutDetector
+from app.services.image_processor import ImageProcessor
+from app.services.converter import Converter

 settings = get_settings()

@@ -16,52 +14,40 @@ settings = get_settings()
 class OCRService:
    """Service for OCR using PaddleOCR-VL."""

-    FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format."
+    _pipeline: Optional[PaddleOCRVL] = None
+    _layout_detector: Optional[LayoutDetector] = None

    def __init__(
        self,
-        vl_server_url: str | None = None,
-        pp_doclayout_model_dir: str | None = None,
+        vl_server_url: str,
+        layout_detector: LayoutDetector,
+        image_processor: ImageProcessor,
+        converter: Converter,
    ):
        """Initialize OCR service.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL.
-            pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory.
+            layout_detector: Layout detector instance.
+            image_processor: Image processor instance.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
-        self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir
-        self._pipeline = None
-
+        self.layout_detector = layout_detector 
+        self.image_processor = image_processor
+        self.converter = converter
    def _get_pipeline(self):    
        """Get or create PaddleOCR-VL pipeline.

        Returns:
            PaddleOCRVL pipeline instance.
        """
-        if self._pipeline is None:
-            from paddleocr import PaddleOCRVL
-
-            self._pipeline = PaddleOCRVL(
+        if OCRService._pipeline is None:
+            OCRService._pipeline = PaddleOCRVL(
                vl_rec_backend="vllm-server",
                vl_rec_server_url=self.vl_server_url,
                layout_detection_model_name="PP-DocLayoutV2",
-                layout_detection_model_dir=self.pp_doclayout_model_dir,
            )
-        return self._pipeline
-
-    def _save_temp_image(self, image: np.ndarray) -> str:
-        """Save image to a temporary file.
-
-        Args:
-            image: Image as numpy array in BGR format.
-
-        Returns:
-            Path to temporary file.
-        """
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
-            cv2.imwrite(f.name, image)
-            return f.name
+        return OCRService._pipeline

    def recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.
@@ -77,30 +63,21 @@ class OCRService:
        """
        try:
            pipeline = self._get_pipeline()
-            temp_path = self._save_temp_image(image)

-            try:
-                results = list(pipeline.predict(temp_path))
+            output = pipeline.predict(image, use_layout_detection=True)

            markdown_content = ""
-                for result in results:
-                    # PaddleOCR-VL results can be saved to markdown
-                    md_buffer = io.StringIO()
-                    result.save_to_markdown(save_path=md_buffer)
-                    markdown_content += md_buffer.getvalue()

-                # Convert markdown to other formats
-                latex = self._markdown_to_latex(markdown_content)
-                mathml = self._extract_mathml(markdown_content)
+            for res in output:
+                markdown_content += res.markdown.get("markdown_texts", "")
+
+            convert_result  = self.converter.convert_to_formats(markdown_content)

            return {
                "markdown": markdown_content,
-                    "latex": latex,
-                    "mathml": mathml,
+                "latex": convert_result.latex,
+                "mathml": convert_result.mathml,
            }
-            finally:
-                Path(temp_path).unlink(missing_ok=True)
-
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

@@ -116,188 +93,49 @@ class OCRService:
            Dict with 'latex', 'markdown', 'mathml' keys.
        """
        try:
-            import httpx
+            pipeline = self._get_pipeline()

-            temp_path = self._save_temp_image(image)
+            output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula")

-            try:
-                # Use vLLM API directly for formula recognition
-                import base64
+            markdown_content = ""

-                with open(temp_path, "rb") as f:
-                    image_base64 = base64.b64encode(f.read()).decode("utf-8")
+            for res in output:
+                markdown_content += res.markdown.get("markdown_texts", "")

-                # Call vLLM server with formula prompt
-                response = httpx.post(
-                    f"{self.vl_server_url}/chat/completions",
-                    json={
-                        "model": "paddleocr-vl",
-                        "messages": [
-                            {
-                                "role": "user",
-                                "content": [
-                                    {"type": "text", "text": self.FORMULA_PROMPT},
-                                    {
-                                        "type": "image_url",
-                                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                                    },
-                                ],
-                            }
-                        ],
-                        "max_tokens": 1024,
-                    },
-                    timeout=60.0,
-                )
-                response.raise_for_status()
-                result = response.json()
-
-                latex = result["choices"][0]["message"]["content"].strip()
-
-                # Convert latex to other formats
-                markdown = self._latex_to_markdown(latex)
-                mathml = self._latex_to_mathml(latex)
+            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
-                    "latex": latex,
-                    "markdown": markdown,
-                    "mathml": mathml,
+                "latex": convert_result.latex,
+                "mathml": convert_result.mathml,
+                "markdown": markdown_content,
            }
-            finally:
-                Path(temp_path).unlink(missing_ok=True)
-
-        except httpx.HTTPStatusError as e:
-            raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

-    def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict:
-        """Recognize content based on layout detection results.
+    def recognize(self, image: np.ndarray) -> dict:
+        """Recognize content using PaddleOCR-VL.

        Args:
            image: Input image as numpy array in BGR format.
-            layout_info: Layout detection results.

        Returns:
-            Dict with recognition results including mode used.
+            Dict with 'latex', 'markdown', 'mathml' keys.
        """
-        # Decision logic:
-        # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2)
-        # - Otherwise -> use formula_recognition (VL with prompt)
-        if layout_info.has_plain_text:
-            result = self.recognize_mixed(image)
-            result["recognition_mode"] = "mixed_recognition"
+        padded_image = self.image_processor.add_padding(image)
+        layout_info = self.layout_detector.detect(padded_image)
+        if layout_info.MixedRecognition:
+            return self.recognize_mixed(image)
        else:
-            result = self.recognize_formula(image)
-            result["recognition_mode"] = "formula_recognition"
+            return self.recognize_formula(image)

-        return result

-    def _markdown_to_latex(self, markdown: str) -> str:
-        """Convert markdown to LaTeX.
-
-        Simple conversion - wraps content in LaTeX document structure.
-
-        Args:
-            markdown: Markdown content.
-
-        Returns:
-            LaTeX representation.
-        """
-        # Basic conversion: preserve math blocks, convert structure
-        lines = []
-        in_code_block = False
-
-        for line in markdown.split("\n"):
-            if line.startswith("```"):
-                in_code_block = not in_code_block
-                if in_code_block:
-                    lines.append("\\begin{verbatim}")
-                else:
-                    lines.append("\\end{verbatim}")
-            elif in_code_block:
-                lines.append(line)
-            elif line.startswith("# "):
-                lines.append(f"\\section{{{line[2:]}}}")
-            elif line.startswith("## "):
-                lines.append(f"\\subsection{{{line[3:]}}}")
-            elif line.startswith("### "):
-                lines.append(f"\\subsubsection{{{line[4:]}}}")
-            elif line.startswith("- "):
-                lines.append(f"\\item {line[2:]}")
-            elif line.startswith("$$"):
-                lines.append(line.replace("$$", "\\[").replace("$$", "\\]"))
-            elif "$" in line:
-                # Keep inline math as-is
-                lines.append(line)
-            else:
-                lines.append(line)
-
-        return "\n".join(lines)
-
-    def _latex_to_markdown(self, latex: str) -> str:
-        """Convert LaTeX to markdown.
-
-        Args:
-            latex: LaTeX content.
-
-        Returns:
-            Markdown representation.
-        """
-        # Wrap LaTeX in markdown math block
-        if latex.strip():
-            return f"$$\n{latex}\n$$"
-        return ""
-
-    def _latex_to_mathml(self, latex: str) -> str:
-        """Convert LaTeX to MathML.
-
-        Args:
-            latex: LaTeX content.
-
-        Returns:
-            MathML representation.
-        """
-        # Basic LaTeX to MathML conversion
-        # For production, consider using latex2mathml library
-        if not latex.strip():
-            return ""
-
-        try:
-            # Try to use latex2mathml if available
-            from latex2mathml.converter import convert
-
-            return convert(latex)
-        except ImportError:
-            # Fallback: wrap in basic MathML structure
-            return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
-        except Exception:
-            return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
-
-    def _extract_mathml(self, markdown: str) -> str:
-        """Extract and convert math from markdown to MathML.
-
-        Args:
-            markdown: Markdown content.
-
-        Returns:
-            MathML for any math content found.
-        """
-        import re
-
-        # Find all math blocks
-        math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL)
-        inline_math = re.findall(r"\$([^$]+)\$", markdown)
-
-        all_math = math_blocks + inline_math
-
-        if not all_math:
-            return ""
-
-        # Convert each to MathML and combine
-        mathml_parts = []
-        for latex in all_math:
-            mathml = self._latex_to_mathml(latex.strip())
-            if mathml:
-                mathml_parts.append(mathml)
-
-        return "\n".join(mathml_parts)
+if __name__ == "__main__":
+    import cv2
+    # Smoke test. NOTE(review): assumes Converter() takes no arguments -- confirm.
+    ocr_service = OCRService(
+        vl_server_url=settings.paddleocr_vl_url,
+        layout_detector=LayoutDetector(),
+        image_processor=ImageProcessor(padding_ratio=0.15),
+        converter=Converter(),
+    )
+    print(ocr_service.recognize(cv2.imread("test/image.png")))
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,30 +2,36 @@
 name = "doc-processer"
 version = "0.1.0"
 description = "Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX"
-readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.10"
 license = { text = "MIT" }
 authors = [
    { name = "YogeLiu" }
 ]

 dependencies = [
-    "fastapi>=0.115.0",
-    "uvicorn[standard]>=0.32.0",
-    "opencv-python>=4.10.0",
-    "python-multipart>=0.0.12",
-    "pydantic>=2.10.0",
-    "pydantic-settings>=2.6.0",
-    "httpx>=0.28.0",
-    "numpy>=1.26.0",
-    "pillow>=10.4.0",
-    "python-docx>=1.1.0",
-    "paddleocr>=2.9.0",
-    "doclayout-yolo>=0.0.2",
-    "latex2mathml>=3.77.0",
-    "paddle>=1.2.0",
+    "fastapi==0.128.0",
+    "uvicorn[standard]==0.40.0",
+    "opencv-python==4.12.0.88",
+    "python-multipart==0.0.21",
+    "pydantic==2.12.5",
+    "pydantic-settings==2.12.0",
+    "httpx==0.28.1",
+    "numpy==2.2.6",
+    "pillow==12.0.0",
+    "python-docx==1.2.0",
+    "paddleocr==3.3.2",
+    "doclayout-yolo==0.0.4",
+    "latex2mathml==3.78.1",
+    "paddle==1.2.0",
+    "pypandoc==1.16.2",
+    "paddlepaddle",
+    "paddleocr[doc-parser]",
+    "safetensors"
 ]

+[tool.uv.sources]
+paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" }
+
 [project.optional-dependencies]
 dev = [
    "pytest>=8.0.0",