fix: add preprocess

feat: add omml api
feat: update port config
2026-02-04 12:45:34 +08:00 · 2026-02-04 12:35:14 +08:00 · 2026-02-04 12:06:17 +08:00 · 2026-02-04 12:00:06 +08:00 · 2026-01-14 14:18:00 +08:00 · 2026-01-05 21:37:51 +08:00
16 changed files with 1330 additions and 135 deletions
--- a/app/api/v1/endpoints/convert.py
+++ b/app/api/v1/endpoints/convert.py
@@ -1,10 +1,10 @@
-"""Markdown to DOCX conversion endpoint."""
+"""Format conversion endpoints."""

 from fastapi import APIRouter, Depends, HTTPException
 from fastapi.responses import Response

 from app.core.dependencies import get_converter
-from app.schemas.convert import MarkdownToDocxRequest
+from app.schemas.convert import MarkdownToDocxRequest, LatexToOmmlRequest, LatexToOmmlResponse
 from app.services.converter import Converter

 router = APIRouter()
@@ -28,3 +28,39 @@ async def convert_markdown_to_docx(
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
+
+
+@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)
+async def convert_latex_to_omml(
+    request: LatexToOmmlRequest,
+    converter: Converter = Depends(get_converter),
+) -> LatexToOmmlResponse:
+    """Convert LaTeX formula to OMML (Office Math Markup Language).
+
+    OMML is the math format used by Microsoft Word and other Office applications.
+    It is served separately from the OCR endpoint because conversion creates a
+    temporary DOCX file. Responds 400 for an empty/invalid formula, 503 on failure.
+
+    Args:
+        request: Contains the LaTeX formula to convert (without $ or $$ delimiters).
+
+    Returns:
+        OMML representation of the formula.
+
+    Example:
+        ```bash
+        curl -X POST "http://localhost:8000/api/v1/convert/latex-to-omml" \\
+          -H "Content-Type: application/json" \\
+          -d '{"latex": "\\\\frac{a}{b} + \\\\sqrt{c}"}'
+        ```
+    """
+    if not request.latex or not request.latex.strip():
+        raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")
+
+    try:
+        omml = converter.convert_to_omml(request.latex)
+        return LatexToOmmlResponse(omml=omml)
+    except ValueError as e:  # rejected input -> client error, keep the cause chained
+        raise HTTPException(status_code=400, detail=str(e)) from e
+    except RuntimeError as e:  # conversion backend failed -> service unavailable
+        raise HTTPException(status_code=503, detail=str(e)) from e
--- a/app/api/v1/endpoints/image.py
+++ b/app/api/v1/endpoints/image.py
@@ -2,11 +2,11 @@

 from fastapi import APIRouter, Depends, HTTPException

-from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service
+from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service
 from app.schemas.image import ImageOCRRequest, ImageOCRResponse
 from app.services.image_processor import ImageProcessor
 from app.services.layout_detector import LayoutDetector
-from app.services.ocr_service import OCRService
+from app.services.ocr_service import OCRService, MineruOCRService

 router = APIRouter()

@@ -16,7 +16,8 @@ async def process_image_ocr(
    request: ImageOCRRequest,
    image_processor: ImageProcessor = Depends(get_image_processor),
    layout_detector: LayoutDetector = Depends(get_layout_detector),
-    ocr_service: OCRService = Depends(get_ocr_service),
+    mineru_service: MineruOCRService = Depends(get_mineru_ocr_service),
+    paddle_service: OCRService = Depends(get_ocr_service),
 ) -> ImageOCRResponse:
    """Process an image and extract content as LaTeX, Markdown, and MathML.

@@ -27,6 +28,9 @@ async def process_image_ocr(
       - If plain text exists: use PP-DocLayoutV2 for mixed recognition
       - Otherwise: use PaddleOCR-VL with formula prompt
    4. Convert output to LaTeX, Markdown, and MathML formats
+
+    Note: OMML conversion is not included due to performance overhead.
+    Use the /convert/latex-to-omml endpoint to convert LaTeX to OMML separately.
    """

    image = image_processor.preprocess(
@@ -35,14 +39,18 @@ async def process_image_ocr(
    )

    try:
-        # 3. Perform OCR based on layout
-        ocr_result = ocr_service.recognize(image)
+        if request.model_name == "mineru":
+            ocr_result = mineru_service.recognize(image)
+        elif request.model_name == "paddle":
+            ocr_result = paddle_service.recognize(image)
+        else:
+            raise HTTPException(status_code=400, detail="Invalid model name")
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e))

-    # 4. Return response
    return ImageOCRResponse(
        latex=ocr_result.get("latex", ""),
        markdown=ocr_result.get("markdown", ""),
        mathml=ocr_result.get("mathml", ""),
+        mml=ocr_result.get("mml", ""),
    )
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -24,6 +24,9 @@ class Settings(BaseSettings):
    # PaddleOCR-VL Settings
    paddleocr_vl_url: str = "http://127.0.0.1:8000/v1"

+    # MinerOCR Settings
+    miner_ocr_api_url: str = "http://127.0.0.1:8000/file_parse"
+
    # Model Paths
    pp_doclayout_model_dir: Optional[str] = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2"

--- a/app/core/dependencies.py
+++ b/app/core/dependencies.py
@@ -2,7 +2,7 @@

 from app.services.image_processor import ImageProcessor
 from app.services.layout_detector import LayoutDetector
-from app.services.ocr_service import OCRService
+from app.services.ocr_service import OCRService, MineruOCRService
 from app.services.converter import Converter
 from app.core.config import get_settings

@@ -45,3 +45,14 @@ def get_converter() -> Converter:
    """Get a DOCX converter instance."""
    return Converter()

+
+def get_mineru_ocr_service() -> MineruOCRService:
+    """Get a MinerOCR service instance."""
+    settings = get_settings()
+    api_url = getattr(settings, 'miner_ocr_api_url', 'http://127.0.0.1:8000/file_parse')
+    return MineruOCRService(
+        api_url=api_url,
+        converter=get_converter(),
+        image_processor=get_image_processor(),
+    )
+
--- a/app/main.py
+++ b/app/main.py
@@ -37,9 +37,9 @@ app.include_router(api_router, prefix=settings.api_prefix)
 async def health_check():
    """Health check endpoint."""
    return {"status": "healthy"}
-    


 if __name__ == "__main__":
    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8053)
+
+    uvicorn.run(app, host="0.0.0.0", port=settings.port)
--- a/app/schemas/convert.py
+++ b/app/schemas/convert.py
@@ -1,4 +1,4 @@
-"""Request and response schemas for markdown to DOCX conversion endpoint."""
+"""Request and response schemas for format conversion endpoints."""

 from pydantic import BaseModel, Field, field_validator

@@ -17,3 +17,23 @@ class MarkdownToDocxRequest(BaseModel):
            raise ValueError("Markdown content cannot be empty")
        return v

+
+class LatexToOmmlRequest(BaseModel):
+    """Request body for LaTeX to OMML conversion endpoint."""
+
+    latex: str = Field(..., description="Pure LaTeX formula (without $ or $$ delimiters)")
+
+    @field_validator("latex")
+    @classmethod
+    def validate_latex_not_empty(cls, v: str) -> str:
+        """Validate that LaTeX formula is not empty."""
+        if not v or not v.strip():
+            raise ValueError("LaTeX formula cannot be empty")
+        return v
+
+
+class LatexToOmmlResponse(BaseModel):
+    """Response body for LaTeX to OMML conversion endpoint."""
+
+    omml: str = Field("", description="OMML (Office Math Markup Language) representation")
+
--- a/app/schemas/image.py
+++ b/app/schemas/image.py
@@ -25,6 +25,7 @@ class ImageOCRRequest(BaseModel):

    image_url: str | None = Field(None, description="URL to fetch the image from")
    image_base64: str | None = Field(None, description="Base64-encoded image data")
+    model_name: str = Field("mineru", description="Name of the model to use for OCR")

    @model_validator(mode="after")
    def validate_input(self):
@@ -39,11 +40,10 @@ class ImageOCRRequest(BaseModel):
 class ImageOCRResponse(BaseModel):
    """Response body for image OCR endpoint."""

-    latex: str = Field("", description="LaTeX representation of the content")
+    latex: str = Field("", description="LaTeX representation of the content (empty if mixed content)")
    markdown: str = Field("", description="Markdown representation of the content")
-    mathml: str = Field("", description="MathML representation (empty if no math detected)")
+    mathml: str = Field("", description="Standard MathML representation (empty if mixed content)")
+    mml: str = Field("", description="XML MathML with mml: namespace prefix (empty if mixed content)")
    layout_info: LayoutInfo = Field(default_factory=LayoutInfo)
-    recognition_mode: str = Field(
-        "", description="Recognition mode used: mixed_recognition or formula_recognition"
-    )
+    recognition_mode: str = Field("", description="Recognition mode used: mixed_recognition or formula_recognition")

--- a/app/services/converter.py
+++ b/app/services/converter.py
@@ -4,17 +4,29 @@ import os
 import re
 import tempfile
 from dataclasses import dataclass
+from functools import lru_cache
 from typing import Literal

 import pypandoc
+from latex2mathml.converter import convert as latex_to_mathml


@dataclass
 class ConvertResult:
-    """Result of markdown conversion."""
+    """Result of markdown conversion.
+
+    Only populated when input contains pure LaTeX formula.
+    All fields are empty strings when input contains mixed content (text + formula).
+
+    Attributes:
+        latex: Pure LaTeX formula code (without delimiters).
+        mathml: Standard MathML format.
+        mml: XML MathML with mml: namespace prefix (mml:math).
+    """

    latex: str
    mathml: str
+    mml: str


@dataclass
@@ -28,59 +40,430 @@ class ExportResult:

 ExportType = Literal["docx", "pdf"]

+# MathML namespace
+MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
+OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
+
+# XSLT for MathML to mml: namespace conversion
+MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:mml="http://www.w3.org/1998/Math/MathML"
+    xmlns:m="http://www.w3.org/1998/Math/MathML"
+    exclude-result-prefixes="m">
+
+    <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
+
+    <!-- Match root math element -->
+    <xsl:template match="m:math|math">
+        <mml:math>
+            <xsl:apply-templates select="@*|node()"/>
+        </mml:math>
+    </xsl:template>
+
+    <!-- Match all other MathML elements -->
+    <xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
+        <xsl:element name="mml:{local-name()}">
+            <xsl:apply-templates select="@*|node()"/>
+        </xsl:element>
+    </xsl:template>
+
+    <!-- Copy attributes -->
+    <xsl:template match="@*">
+        <xsl:if test="local-name() != 'xmlns'">
+            <xsl:copy/>
+        </xsl:if>
+    </xsl:template>
+
+    <!-- Copy text nodes -->
+    <xsl:template match="text()">
+        <xsl:value-of select="."/>
+    </xsl:template>
+
+</xsl:stylesheet>
+"""
+

 class Converter:
-    """Service for conversion and export operations."""
+    """Service for conversion and export operations.
+
+    Conversion rules:
+    - Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
+    - Mixed content (text + formula) returns empty results for all formats.
+    - OMML conversion is provided as a separate method due to performance overhead.
+
+    Performance optimizations:
+    - Pre-compiled regex patterns
+    - XSLT-based MML conversion
+    - Cached XSLT transforms
+    - OMML read directly from word/document.xml of the Pandoc-generated DOCX
+    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

+    # Pre-compiled regex patterns for formula detection
+    _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
+    _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
+    _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
+    _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
+    _RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")
+
+    # Pre-compiled regex patterns for preprocessing
+    _RE_VSPACE = re.compile(r"\\\[1mm\]")
+    _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
+    _RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
+    _RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
+    _RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
+    _RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
+    _RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
+    _RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
+    _RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
+    _RE_ALIGNED_BRACE = re.compile(r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL)
+    _RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
+    _RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
+    _RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
+    _RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)
+
+    # Cached XSLT transform
+    _mml_xslt_transform = None
+
    def __init__(self):
        """Initialize converter."""

+    @classmethod
+    def _get_mml_xslt_transform(cls):
+        """Get cached XSLT transform for MathML to mml: conversion."""
+        if cls._mml_xslt_transform is None:
+            from lxml import etree
+            xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
+            cls._mml_xslt_transform = etree.XSLT(xslt_doc)
+        return cls._mml_xslt_transform
+
+    def _is_formula_only(self, text: str) -> bool:
+        """Check if text contains only a LaTeX formula (no mixed content).
+
+        A text is considered formula-only if it is exactly one of these:
+        - Display math: $$...$$ or \\[...\\]
+        - Inline math: $...$ or \\(...\\)
+
+        Args:
+            text: Input text to check.
+
+        Returns:
+            True if the text is a single delimited formula, False otherwise.
+        """
+        text = text.strip()
+        if not text:
+            return False
+
+        # $...$ cannot span two formulas: its body pattern excludes "$".
+        if self._RE_INLINE_DOLLAR.fullmatch(text):
+            return True
+
+        # The other bodies are [\s\S]+, so fullmatch also accepts mixed text such
+        # as "$$a$$ and $$b$$"; reject bodies that contain a closing delimiter.
+        if self._RE_DISPLAY_DOLLAR.fullmatch(text):
+            return "$$" not in text[2:-2]
+        if self._RE_DISPLAY_BRACKET.fullmatch(text):
+            return "\\]" not in text[2:-2]
+        if self._RE_INLINE_PAREN.fullmatch(text):
+            return "\\)" not in text[2:-2]
+        return False
+
    def convert_to_formats(self, md_text: str) -> ConvertResult:
-        """Convert markdown to LaTeX and MathML formats.
+        """Convert markdown to LaTeX, MathML, and MML formats.
+
+        Only converts when input contains a pure LaTeX formula.
+        Mixed content (text + formula) returns empty strings for all fields.

        Args:
            md_text: Markdown text to convert.

        Returns:
-            ConvertResult with latex and mathml fields.
+            ConvertResult with latex, mathml, and mml fields.
+            All fields are empty if input is not a pure formula.

        Raises:
-            ValueError: If md_text is empty.
-            RuntimeError: If conversion fails.
+            RuntimeError: If conversion fails for a valid formula.
        """
-        if md_text == "":
-            return ConvertResult(latex="", mathml="")
+        # Empty input returns empty result
+        if not md_text or not md_text.strip():
+            return ConvertResult(latex="", mathml="", mml="")
+
+        # Check if input is formula-only
+        if not self._is_formula_only(md_text):
+            # Mixed content: cannot convert to formula formats
+            return ConvertResult(latex="", mathml="", mml="")

        try:
-            # Convert to LaTeX
-            latex_output = pypandoc.convert_text(
-                md_text,
-                "latex",
-                format=self.INPUT_FORMAT,
-            ).rstrip("\n")
+            # Extract the LaTeX formula content (remove delimiters)
+            latex_formula = self._extract_latex_formula(md_text)

-            # Convert to HTML with MathML
-            mathml_output = pypandoc.convert_text(
-                md_text,
-                "html",
-                format=self.INPUT_FORMAT,
-                extra_args=["--mathml"],
-            ).rstrip("\n")
+            # Convert to MathML
+            mathml = self._latex_to_mathml(latex_formula)

-            return ConvertResult(latex=latex_output, mathml=mathml_output)
+            # Convert MathML to mml:math format (with namespace prefix)
+            mml = self._mathml_to_mml(mathml)
+
+            return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)

        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

+    def convert_to_omml(self, latex_formula: str) -> str:
+        """Convert LaTeX formula to OMML (Office Math Markup Language).
+
+        This is a separate method due to the performance overhead of OMML conversion,
+        which requires creating a temporary DOCX file.
+
+        The formula is preprocessed using the same logic as export_to_file to ensure
+        proper conversion.
+
+        Args:
+            latex_formula: Pure LaTeX formula (without delimiters like $ or $$).
+
+        Returns:
+            OMML representation as XML string.
+
+        Raises:
+            ValueError: If latex_formula is empty.
+            RuntimeError: If conversion fails.
+        """
+        if not latex_formula or not latex_formula.strip():
+            raise ValueError("LaTeX formula cannot be empty")
+
+        # Preprocess formula using the same preprocessing as export
+        preprocessed = self._preprocess_formula_for_omml(latex_formula.strip())
+        
+        return self._latex_to_omml(preprocessed)
+
+    def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
+        """Preprocess LaTeX formula for OMML conversion.
+
+        Runs the formula-level subset of preprocess_for_export (matrix, array,
+        brace and cases/aligned fixes); delimiter and \\tag handling are skipped.
+
+        Args:
+            latex_formula: Pure LaTeX formula.
+
+        Returns:
+            Preprocessed LaTeX formula.
+        """
+        # Same helpers, in the same order, as the tail of preprocess_for_export
+        # 1. vmatrix/Vmatrix -> \left| ... \right| around a plain matrix
+        latex_formula = self._convert_matrix_environments(latex_formula)
+        
+        # 2. Array column specifiers: {c c c} -> {ccc}
+        latex_formula = self._fix_array_column_specifiers(latex_formula)
+        
+        # 3. Replace whitespace after \left\{ and before \right\} with \!
+        latex_formula = self._fix_brace_spacing(latex_formula)
+        
+        # 4. Rewrite cases / aligned environments as array environments
+        latex_formula = self._convert_special_environments(latex_formula)
+        
+        return latex_formula
+
+    def _extract_latex_formula(self, text: str) -> str:
+        """Extract LaTeX formula from text by removing delimiters.
+
+        Args:
+            text: Text containing LaTeX formula with delimiters.
+
+        Returns:
+            Pure LaTeX formula without delimiters.
+        """
+        text = text.strip()
+
+        # Remove display math delimiters: $$...$$ or \[...\]
+        if text.startswith("$$") and text.endswith("$$"):
+            return text[2:-2].strip()
+        if text.startswith("\\[") and text.endswith("\\]"):
+            return text[2:-2].strip()
+
+        # Remove inline math delimiters: $...$ or \(...\)
+        if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
+            return text[1:-1].strip()
+        if text.startswith("\\(") and text.endswith("\\)"):
+            return text[2:-2].strip()
+
+        # If no delimiters, return as-is
+        return text.strip()
+
+    @staticmethod
+    @lru_cache(maxsize=256)
+    def _latex_to_mathml_cached(latex_formula: str) -> str:
+        """Cached conversion of LaTeX formula to MathML.
+
+        Uses LRU cache to avoid recomputing for repeated formulas.
+        """
+        try:
+            # Use latex2mathml library for conversion (fast, pure Python)
+            return latex_to_mathml(latex_formula)
+        except Exception as e:
+            # Fallback: try with Pandoc (slower, but more robust)
+            try:
+                mathml_html = pypandoc.convert_text(
+                    f"${latex_formula}$",
+                    "html",
+                    format="markdown+tex_math_dollars",
+                    extra_args=["--mathml"],
+                )
+                # Extract just the <math> element from the HTML
+                match = Converter._RE_MATH_ELEMENT.search(mathml_html)
+                if match:
+                    return match.group(0)
+                return mathml_html.rstrip("\n")
+            except Exception as pandoc_error:
+                raise RuntimeError(
+                    f"MathML conversion failed: {e}. Pandoc fallback also failed: {pandoc_error}"
+                ) from e
+
+    def _latex_to_mathml(self, latex_formula: str) -> str:
+        """Convert LaTeX formula to standard MathML.
+
+        Args:
+            latex_formula: Pure LaTeX formula (without delimiters).
+
+        Returns:
+            Standard MathML representation.
+        """
+        return self._latex_to_mathml_cached(latex_formula)
+
+    def _mathml_to_mml(self, mathml: str) -> str:
+        """Convert standard MathML to mml:math format with namespace prefix.
+
+        Uses XSLT for efficient transformation. Transforms:
+        - <math ...> to <mml:math xmlns:mml="..." ...>
+        - All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>
+
+        Args:
+            mathml: Standard MathML string.
+
+        Returns:
+            MathML with mml: namespace prefix.
+        """
+        if not mathml:
+            return ""
+
+        try:
+            from lxml import etree
+
+            # Parse MathML
+            root = etree.fromstring(mathml.encode("utf-8"))
+
+            # Apply XSLT transformation (cached)
+            transform = self._get_mml_xslt_transform()
+            result_tree = transform(root)
+
+            # Serialize to string
+            return str(result_tree)
+
+        except Exception:
+            # Fallback: simple string replacement (less robust but no lxml dependency)
+            result = mathml
+            # Add namespace to root math element
+            result = re.sub(
+                r"<math\b",
+                f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
+                result,
+            )
+            result = re.sub(r"</math>", "</mml:math>", result)
+
+            # Add mml: prefix to all other elements using a single regex
+            # Match opening tags
+            result = re.sub(
+                r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
+                r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
+                r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
+                r"maction|semantics|annotation|annotation-xml)\b",
+                r"<mml:\1",
+                result,
+            )
+            # Match closing tags
+            result = re.sub(
+                r"</(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
+                r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
+                r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
+                r"maction|semantics|annotation|annotation-xml)>",
+                r"</mml:\1>",
+                result,
+            )
+
+            return result
+
+    def _latex_to_omml(self, latex_formula: str) -> str:
+        """Convert LaTeX formula to OMML (Office Math Markup Language).
+
+        Pandoc writes a temporary DOCX file; every m:oMath element found in its
+        word/document.xml is returned. Any failure is re-raised as RuntimeError.
+
+        Args:
+            latex_formula: Pure LaTeX formula (without delimiters).
+
+        Returns:
+            OMML representation as XML string.
+        """
+        import io
+        import zipfile
+
+        try:
+            from lxml import etree
+
+            # pypandoc is given an output file for DOCX, so stage the formula in a
+            # temp .md file (explicit UTF-8: do not depend on the locale encoding)
+            # and derive the .docx path by swapping only the extension.
+            with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False, encoding="utf-8") as f:
+                f.write(f"$${latex_formula}$$\n")
+                temp_md = f.name
+
+            temp_docx = os.path.splitext(temp_md)[0] + ".docx"
+
+            try:
+                pypandoc.convert_file(
+                    temp_md,
+                    "docx",
+                    format=self.INPUT_FORMAT,
+                    outputfile=temp_docx,
+                )
+
+                # Read DOCX into memory and process as ZIP
+                with open(temp_docx, "rb") as f:
+                    docx_bytes = f.read()
+
+                # Extract document.xml from DOCX (which is a ZIP file)
+                with zipfile.ZipFile(io.BytesIO(docx_bytes), "r") as zf:
+                    document_xml = zf.read("word/document.xml")
+
+                # Parse XML and extract OMML
+                root = etree.fromstring(document_xml)
+
+                # Find all oMath elements
+                omml_parts = []
+                for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath"):
+                    omml_parts.append(etree.tostring(math, encoding="unicode"))
+
+                return "\n".join(omml_parts)
+
+            finally:
+                # Cleanup temp files
+                if os.path.exists(temp_md):
+                    os.remove(temp_md)
+                if os.path.exists(temp_docx):
+                    os.remove(temp_docx)
+
+        except Exception as e:
+            raise RuntimeError(f"OMML conversion failed: {e}") from e
+
    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.

+        Uses pre-compiled regex patterns for better performance.
+
        Args:
            md_text: Raw markdown text.

@@ -88,46 +471,39 @@ class Converter:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}
-        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)
+        md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas
-        md_text = re.sub(
-            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
-            r"\1\n\n\\[\3\\]\n\n\4",
-            md_text,
-            flags=re.DOTALL,
-        )
-        md_text = re.sub(
-            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
-            r"\n\\[\2\\]\n",
-            md_text,
-            flags=re.MULTILINE | re.DOTALL,
-        )
+        md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text)
+        md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text)

        # Remove arithmatex span wrappers
-        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)
+        cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
-        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
-        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
+        cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$")

        # Convert block formulas: \[ \] => $$ $$
-        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
-        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
+        cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$")

        # Remove spaces between $ and formula content
-        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
-        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)
+        cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)

+        # Fix array environment column specifiers (remove spaces)
+        cleaned_md = self._fix_array_column_specifiers(cleaned_md)
+
        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)

        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)

+        # Handle LaTeX \tag{} commands for equation numbering
+        cleaned_md = self._convert_tag_commands(cleaned_md)
+
        return cleaned_md

    def _convert_matrix_environments(self, md_text: str) -> str:
@@ -136,42 +512,41 @@ class Converter:
        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
-        md_text = re.sub(
-            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
+        md_text = self._RE_VMATRIX.sub(
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
-            flags=re.DOTALL,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
-        md_text = re.sub(
-            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
+        md_text = self._RE_VMATRIX_DOUBLE.sub(
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
-            flags=re.DOTALL,
        )

        return md_text

+    def _fix_array_column_specifiers(self, md_text: str) -> str:
+        """Fix array environment column specifiers by removing spaces.
+
+        Pandoc's OMML converter doesn't accept spaces between column alignment
+        specifiers in array environments. This converts patterns like
+        {c c c c} to {cccc}.
+        """
+
+        def remove_spaces_in_specifier(match: re.Match) -> str:
+            """Remove spaces from column specifier."""
+            specifier = match.group(1)
+            return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"
+
+        return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)
+
    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in Word/OMML.
        """
-        # Fix \left\{ spacing
-        md_text = re.sub(
-            r"\\left\\\{\s+",
-            r"\\left\\{\\!",
-            md_text,
-        )
-
-        # Fix \right\} spacing
-        md_text = re.sub(
-            r"\s+\\right\\\}",
-            r"\\!\\right\\}",
-            md_text,
-        )
-
+        md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
+        md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
@@ -179,45 +554,45 @@ class Converter:

        These environments have better rendering support in Word/OMML.
        """
+        # Pre-compiled pattern for alignment marker removal
+        _re_align_marker = re.compile(r"(^|\\\\)\s*&")

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

-        md_text = re.sub(
-            r"\\begin\{cases\}(.*?)\\end\{cases\}",
-            convert_cases,
-            md_text,
-            flags=re.DOTALL,
-        )
+        md_text = self._RE_CASES.sub(convert_cases, md_text)

        def convert_aligned_to_array(match: re.Match) -> str:
            content = match.group(1)
-            # Remove leading & alignment markers (not needed in array{l})
-            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
+            content = _re_align_marker.sub(r"\1", content)
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

-        md_text = re.sub(
-            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
-            convert_aligned_to_array,
-            md_text,
-            flags=re.DOTALL,
-        )
+        md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)

        def convert_standalone_aligned(match: re.Match) -> str:
            content = match.group(1)
-            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
+            content = _re_align_marker.sub(r"\1", content)
            return r"\begin{array}{l}" + content + r"\end{array}"

-        md_text = re.sub(
-            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
-            convert_standalone_aligned,
-            md_text,
-            flags=re.DOTALL,
-        )
+        md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)

        return md_text

+    def _convert_tag_commands(self, md_text: str) -> str:
+        """Rewrite ``\\tag{...}`` equation labels into plain ``\\quad (...)`` text.
+
+        Word's OMML has no equivalent of ``\\tag{}``, so every match of ``_RE_TAG`` is
+        re-emitted as a ``$$...$$`` block with the label appended after a ``\\quad``.
+        """
+
+        def _inline_label(hit: re.Match) -> str:
+            # Group 1 carries the formula body, group 2 the text of the tag.
+            body, label = hit.group(1), hit.group(2)
+            return f"$${body} \\quad ({label})$$"
+
+        return self._RE_TAG.sub(_inline_label, md_text)
+
    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.

@@ -309,4 +684,3 @@ class Converter:
        """
        if os.path.exists(file_path):
            os.remove(file_path)
-
--- a/app/services/image_processor.py
+++ b/app/services/image_processor.py
@@ -25,6 +25,38 @@ class ImageProcessor:
        """
        self.padding_ratio = padding_ratio or settings.image_padding_ratio

+    def _convert_to_bgr(self, pil_image: Image.Image) -> np.ndarray:
+        """Turn a PIL image of any mode into an OpenCV-style BGR array.
+
+        Images carrying transparency (RGBA, LA, palette) are flattened onto a
+        white background first, so fully transparent pixels come out white.
+
+        Args:
+            pil_image: PIL Image object.
+
+        Returns:
+            Image as numpy array in BGR format.
+        """
+        white = (255, 255, 255)
+
+        def _flatten(layer: Image.Image, canvas_mode: str, fill) -> Image.Image:
+            # Paste the layer onto a solid canvas, using its last band (alpha) as mask.
+            canvas = Image.new(canvas_mode, layer.size, fill)
+            canvas.paste(layer, mask=layer.split()[-1])
+            return canvas
+
+        mode = pil_image.mode
+        if mode == "RGBA":
+            rgb_image = _flatten(pil_image, "RGB", white)
+        elif mode == "LA":
+            rgb_image = _flatten(pil_image, "L", 255).convert("RGB")
+        elif mode == "P":
+            # Palette images may hide transparency; expand to RGBA before flattening.
+            rgb_image = _flatten(pil_image.convert("RGBA"), "RGB", white)
+        else:
+            rgb_image = pil_image if mode == "RGB" else pil_image.convert("RGB")
+        return cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)
+
    def load_image_from_url(self, url: str) -> np.ndarray:
        """Load image from URL.

@@ -40,8 +72,8 @@ class ImageProcessor:
        try:
            with urlopen(url, timeout=30) as response:
                image_data = response.read()
-            image = Image.open(io.BytesIO(image_data))
-            return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+            pil_image = Image.open(io.BytesIO(image_data))
+            return self._convert_to_bgr(pil_image)
        except Exception as e:
            raise ValueError(f"Failed to load image from URL: {e}") from e

@@ -63,8 +95,8 @@ class ImageProcessor:
                base64_str = base64_str.split(",", 1)[1]

            image_data = base64.b64decode(base64_str)
-            image = Image.open(io.BytesIO(image_data))
-            return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+            pil_image = Image.open(io.BytesIO(image_data))
+            return self._convert_to_bgr(pil_image)
        except Exception as e:
            raise ValueError(f"Failed to decode base64 image: {e}") from e

--- a/app/services/layout_detector.py
+++ b/app/services/layout_detector.py
@@ -140,18 +140,39 @@ class LayoutDetector:

 if __name__ == "__main__":
    import cv2
+    from app.core.config import get_settings
    from app.services.image_processor import ImageProcessor
-
+    from app.services.converter import Converter
+    from app.services.ocr_service import OCRService
+    
+    settings = get_settings()
+    
+    # Initialize dependencies
    layout_detector = LayoutDetector()
-    image_path = "test/timeout.png"
-
+    image_processor = ImageProcessor(padding_ratio=settings.image_padding_ratio)
+    converter = Converter()
+    
+    # Initialize OCR service
+    ocr_service = OCRService(
+        vl_server_url=settings.paddleocr_vl_url,
+        layout_detector=layout_detector,
+        image_processor=image_processor,
+        converter=converter,
+    )
+    
+    # Load test image
+    image_path = "test/complex_formula.png"
    image = cv2.imread(image_path)
-    image_processor = ImageProcessor(padding_ratio=0.15)
-    image = image_processor.add_padding(image)
-
-    # Save the padded image for debugging
-    cv2.imwrite("debug_padded_image.png", image)
-
-
-    layout_info = layout_detector.detect(image)
-    print(layout_info)
+    
+    if image is None:
+        print(f"Failed to load image: {image_path}")
+    else:
+        print(f"Image loaded: {image.shape}")
+        
+        # Run OCR recognition
+        result = ocr_service.recognize(image)
+        
+        print("\n=== OCR Result ===")
+        print(f"Markdown:\n{result['markdown']}")
+        print(f"\nLaTeX:\n{result['latex']}")
+        print(f"\nMathML:\n{result['mathml']}")
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -1,17 +1,121 @@
 """PaddleOCR-VL client service for text and formula recognition."""

+import re
 import numpy as np
+import cv2
+import requests
+from io import BytesIO
 from app.core.config import get_settings
 from paddleocr import PaddleOCRVL
 from typing import Optional
 from app.services.layout_detector import LayoutDetector
 from app.services.image_processor import ImageProcessor
 from app.services.converter import Converter
+from abc import ABC, abstractmethod

 settings = get_settings()

+_COMMANDS_NEED_SPACE = {
+    # operators / calculus
+    "cdot",
+    "times",
+    "div",
+    "pm",
+    "mp",
+    "int",
+    "iint",
+    "iiint",
+    "oint",
+    "sum",
+    "prod",
+    "lim",
+    # common functions
+    "sin",
+    "cos",
+    "tan",
+    "cot",
+    "sec",
+    "csc",
+    "log",
+    "ln",
+    "exp",
+    # misc
+    "partial",
+    "nabla",
+}

-class OCRService:
+_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
+_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
+
+# stage2: differentials inside math segments
+_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")
+_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")
+
+
+def _split_glued_command_token(token: str) -> str:
+    """Split an OCR-glued LaTeX command token at its longest whitelisted prefix.
+
+    Genuine commands that merely begin with a whitelisted name (\\cdots, \\sinh,
+    \\limits, \\pmod, ...) are returned unchanged instead of being torn apart.
+
+    Examples:
+    - \\cdotdS -> \\cdot dS
+    - \\intdx  -> \\int dx
+    - \\cdots  -> \\cdots
+    """
+    # Real commands that extend a whitelisted name; splitting them would corrupt the formula.
+    protected = {
+        "cdots", "cdotp", "sinh", "cosh", "tanh", "coth", "limits", "liminf", "limsup",
+        "pmod", "pmatrix", "pmb", "lnot", "lneq", "lneqq", "lnsim", "lnapprox",
+        "intop", "intercal", "intertext", "divideontimes",
+    }
+    body = token[1:]
+    if not token.startswith("\\") or len(body) < 2 or body in protected:
+        return token
+    # Longest proper prefix of the command name that is on the whitelist, if any.
+    best = max(
+        (body[:i] for i in range(1, len(body)) if body[:i] in _COMMANDS_NEED_SPACE),
+        key=len,
+        default=None,
+    )
+    if best is None:
+        return token
+    return f"\\{best} {body[len(best):]}"
+
+
+def _postprocess_math(expr: str) -> str:
+    """Postprocess a *math* expression (already inside $...$ or $$...$$)."""
+    # stage1: split glued command tokens (e.g. \cdotdS)
+    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
+    # stage2: normalize differentials only in the text between command tokens, so names like \cdot or \ldots stay intact
+    pieces = re.split(f"({_COMMAND_TOKEN_PATTERN.pattern})", expr)  # odd indices hold the command tokens
+    pieces[::2] = [_DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", p)) for p in pieces[::2]]
+    return "".join(pieces)
+
+
+def _postprocess_markdown(markdown_content: str) -> str:
+    """Apply LaTeX postprocessing only within $...$ / $$...$$ segments; other text is untouched."""
+    if not markdown_content:
+        return markdown_content
+
+    def _fix_segment(m: re.Match) -> str:
+        seg = m.group(0)
+        # A bare/unclosed "$$" matches the inline branch of the pattern with empty content;
+        # requiring len >= 4 stops it being mistaken for display math and doubled to "$$$$".
+        is_display = len(seg) >= 4 and seg.startswith("$$") and seg.endswith("$$")
+        delim = "$$" if is_display else "$"
+        return f"{delim}{_postprocess_math(seg[len(delim):-len(delim)])}{delim}"
+
+    return _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
+
+
+class OCRServiceBase(ABC):
+    @abstractmethod
+    def recognize(self, image: np.ndarray) -> dict:
+        """Recognize the content of ``image`` and return the results as a dict."""
+
+
+class OCRService(OCRServiceBase):
    """Service for OCR using PaddleOCR-VL."""

    _pipeline: Optional[PaddleOCRVL] = None
@@ -32,10 +136,11 @@ class OCRService:
            image_processor: Image processor instance.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
-        self.layout_detector = layout_detector 
+        self.layout_detector = layout_detector
        self.image_processor = image_processor
        self.converter = converter
-    def _get_pipeline(self):    
+
+    def _get_pipeline(self):
        """Get or create PaddleOCR-VL pipeline.

        Returns:
@@ -49,7 +154,7 @@ class OCRService:
            )
        return OCRService._pipeline

-    def recognize_mixed(self, image: np.ndarray) -> dict:
+    def _recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.

        This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
@@ -71,17 +176,19 @@ class OCRService:
            for res in output:
                markdown_content += res.markdown.get("markdown_texts", "")

-            convert_result  = self.converter.convert_to_formats(markdown_content)
+            markdown_content = _postprocess_markdown(markdown_content)
+            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
                "markdown": markdown_content,
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
+                "mml": convert_result.mml,
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

-    def recognize_formula(self, image: np.ndarray) -> dict:
+    def _recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula/math content using PaddleOCR-VL with prompt.

        This mode uses PaddleOCR-VL directly with a formula recognition prompt.
@@ -102,11 +209,13 @@ class OCRService:
            for res in output:
                markdown_content += res.markdown.get("markdown_texts", "")

+            markdown_content = _postprocess_markdown(markdown_content)
            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
+                "mml": convert_result.mml,
                "markdown": markdown_content,
            }
        except Exception as e:
@@ -124,18 +233,109 @@ class OCRService:
        padded_image = self.image_processor.add_padding(image)
        layout_info = self.layout_detector.detect(padded_image)
        if layout_info.MixedRecognition:
-            return self.recognize_mixed(image)
+            return self._recognize_mixed(image)
        else:
-            return self.recognize_formula(image)
+            return self._recognize_formula(image)
+
+
+class MineruOCRService(OCRServiceBase):
+    """OCR service that uploads the image to a file_parse HTTP endpoint and reads back markdown."""
+
+    def __init__(
+        self,
+        api_url: str = "http://127.0.0.1:8000/file_parse",
+        image_processor: Optional[ImageProcessor] = None,
+        converter: Optional[Converter] = None,
+    ) -> None:
+        """Store the endpoint URL and the optional helper objects.
+
+        Args:
+            api_url: URL of the local file_parse API endpoint.
+            image_processor, converter: Optional helpers; padding / format conversion is skipped when the helper is None.
+        """
+        self.api_url = api_url
+        self.image_processor = image_processor
+        self.converter = converter
+
+    def recognize(self, image: np.ndarray) -> dict:
+        """Recognize content via the file_parse API; every failure is re-raised as RuntimeError.
+
+        Args:
+            image: Input image as numpy array in BGR format.
+
+        Returns:
+            Dict with 'markdown', 'latex', 'mathml' and 'mml' keys ('' when no converter is set or no markdown came back).
+        """
+        try:
+            if self.image_processor:
+                image = self.image_processor.add_padding(image)
+
+            # PNG-encode the array in memory so it can be uploaded as a file
+            success, encoded_image = cv2.imencode(".png", image)
+            if not success:
+                raise RuntimeError("Failed to encode image")
+
+            image_bytes = BytesIO(encoded_image.tobytes())
+
+            # Prepare multipart form data
+            files = {"files": ("image.png", image_bytes, "image/png")}
+
+            data = {
+                "return_middle_json": "false",
+                "return_model_output": "false",
+                "return_md": "true",
+                "return_images": "false",
+                "end_page_id": "99999",
+                "start_page_id": "0",
+                "lang_list": "en",
+                "server_url": "string",
+                "return_content_list": "false",
+                "backend": "hybrid-auto-engine",
+                "table_enable": "true",
+                "response_format_zip": "false",
+                "formula_enable": "true",
+                "parse_method": "ocr",
+            }
+
+            # POST the form; raise_for_status() turns HTTP error statuses into exceptions
+            response = requests.post(self.api_url, files=files, data=data, headers={"accept": "application/json"}, timeout=30)
+            response.raise_for_status()
+
+            result = response.json()
+
+            # Markdown is read from results["image"] (presumably keyed by the stem of "image.png" — verify against the API)
+            markdown_content = ""
+            if "results" in result and "image" in result["results"]:
+                markdown_content = result["results"]["image"].get("md_content", "")
+
+            # NOTE(review): _postprocess_markdown is not applied on this path — confirm that this is intentional.
+
+            # Format conversion runs only with a converter and non-empty markdown; otherwise the fields stay ""
+            latex = ""
+            mathml = ""
+            mml = ""
+            if self.converter and markdown_content:
+                convert_result = self.converter.convert_to_formats(markdown_content)
+                latex = convert_result.latex
+                mathml = convert_result.mathml
+                mml = convert_result.mml
+
+            return {
+                "markdown": markdown_content,
+                "latex": latex,
+                "mathml": mathml,
+                "mml": mml,
+            }
+
+        except requests.RequestException as e:
+            raise RuntimeError(f"Local API request failed: {e}") from e
+        except Exception as e:
+            raise RuntimeError(f"Recognition failed: {e}") from e


 if __name__ == "__main__":
-    import cv2
-    from app.services.image_processor import ImageProcessor
-    from app.services.layout_detector import LayoutDetector
-    image_processor = ImageProcessor(padding_ratio=0.15)
-    layout_detector = LayoutDetector()
-    ocr_service = OCRService(image_processor=image_processor, layout_detector=layout_detector)
-    image = cv2.imread("test/image.png")
-    ocr_result = ocr_service.recognize(image)
-    print(ocr_result)
+    mineru_service = MineruOCRService()
+    image = cv2.imread("test/complex_formula.png")
+    image_numpy = np.array(image)
+    ocr_result = mineru_service.recognize(image_numpy)
+    print(ocr_result)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,8 @@ dependencies = [
    "pypandoc==1.16.2",
    "paddlepaddle",
    "paddleocr[doc-parser]",
-    "safetensors"
+    "safetensors",
+    "lxml>=5.0.0"
 ]

 [tool.uv.sources]
--- a/test_array_fix.py
+++ b/test_array_fix.py
@@ -0,0 +1,102 @@
+"""Test script for array column specifier fix."""
+
+from app.services.converter import Converter
+
+
+def test_array_specifier_fix():
+    """Test that array column specifiers with spaces are fixed; returns True only if every step succeeds."""
+
+    converter = Converter()
+
+    # The problematic LaTeX from the error
+    latex_formula = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""
+
+    print("Testing array column specifier fix")
+    print("=" * 80)
+    print(f"\nOriginal LaTeX (first 200 chars):\n{latex_formula[:200]}...")
+
+    # Test preprocessing
+    print("\n" + "-" * 80)
+    print("Step 1: Preprocessing")
+    preprocessed = converter._preprocess_formula_for_omml(latex_formula)
+
+    preprocessing_ok = "{c c c c}" not in preprocessed and "{cccc}" in preprocessed  # step 1 verdict, reported at the end
+    if "{c c c c}" in preprocessed:
+        print("✗ FAILED: Spaces not removed from array specifiers")
+        print(f"Found: {preprocessed[preprocessed.find('{c c c c}'):preprocessed.find('{c c c c}')+10]}")
+    elif "{cccc}" in preprocessed:
+        print("✓ SUCCESS: Spaces removed from array specifiers")
+        print("Changed '{c c c c}' → '{cccc}'")  # plain string: backslashes inside an f-string field were a SyntaxError
+    else:
+        print("? Could not find array specifier in preprocessed output")
+
+    # Test OMML conversion
+    print("\n" + "-" * 80)
+    print("Step 2: OMML Conversion")
+    try:
+        omml = converter.convert_to_omml(latex_formula)
+        print("✓ SUCCESS: OMML conversion completed")
+        print(f"OMML length: {len(omml)} characters")
+        print(f"OMML preview (first 300 chars):\n{omml[:300]}...")
+
+        # Check if it contains oMath element
+        if "oMath" in omml:
+            print("\n✓ Valid OMML: Contains oMath element")
+        else:
+            print("\n✗ WARNING: OMML might be incomplete (no oMath element found)")
+
+    except Exception as e:
+        print("✗ FAILED: OMML conversion error")
+        print(f"Error: {e}")
+        return False
+
+    print("\n" + "=" * 80)
+    print("✓ All tests passed!" if preprocessing_ok else "✗ Preprocessing check failed")
+    return preprocessing_ok
+
+
+def test_simple_array():
+    """Convert a small array with a spaced column specifier; True on success, False if it raises."""
+
+    converter = Converter()
+    rule = "=" * 80
+    print("\nTesting simple array")
+    print(rule)
+
+    # Column specifier deliberately written with spaces ({c c c}).
+    source = r"\begin{array}{c c c} a & b & c \\ d & e & f \end{array}"
+    print(f"LaTeX: {source}")
+
+    succeeded = False
+    try:
+        omml = converter.convert_to_omml(source)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+        print(f"Preview: {omml[:200]}...")
+        succeeded = True
+    except Exception as exc:
+        print(f"✗ FAILED: {exc}")
+    return succeeded
+
+
+if __name__ == "__main__":
+    print("Array Column Specifier Fix Test Suite\n")
+    
+    try:
+        test1 = test_simple_array()
+        test2 = test_array_specifier_fix()
+        
+        if test1 and test2:
+            print("\n" + "=" * 80)
+            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
+            print("=" * 80)
+        else:
+            print("\n" + "=" * 80)
+            print("✗✗✗ SOME TESTS FAILED ✗✗✗")
+            print("=" * 80)
+            
+    except KeyboardInterrupt:
+        print("\n\nTests interrupted by user")
+    except Exception as e:
+        print(f"\n\nTest suite error: {e}")
+        import traceback
+        traceback.print_exc()
--- a/test_converter.py
+++ b/test_converter.py
@@ -0,0 +1,57 @@
+"""Test script for converter functionality."""
+
+from app.services.converter import Converter
+
+
+def test_latex_only_conversion():
+    """Print the conversion results for three LaTeX-only inputs and one markdown input."""
+    conv = Converter()
+
+    # Case 1 - display math delimited by $$...$$
+    display_src = "$$E = mc^2$$"
+    display_out = conv.convert_to_formats(display_src)
+
+    print("Test 1: Display math ($$...$$)")
+    print(f"Input: {display_src}")
+    print(f"LaTeX: {display_out.latex}")
+    print(f"MathML: {display_out.mathml[:100]}...")
+    print(f"MML: {display_out.mml[:100]}...")
+    omml_preview = display_out.omml[:100] if display_out.omml else "Empty"
+    print(f"OMML: {omml_preview}...\n")
+
+    # Case 2 - inline math delimited by $...$
+    inline_src = r"$\frac{a}{b}$"
+    inline_out = conv.convert_to_formats(inline_src)
+
+    print("Test 2: Inline math ($...$)")
+    print(f"Input: {inline_src}")
+    print(f"LaTeX: {inline_out.latex}")
+    print(f"MathML: {inline_out.mathml[:100]}...")
+    print()
+
+    # Case 3 - a longer display formula
+    integral_src = r"$$\int_{0}^{\infty} e^{-x^2} dx = \frac{\sqrt{\pi}}{2}$$"
+    integral_out = conv.convert_to_formats(integral_src)
+
+    print("Test 3: Complex formula")
+    print(f"Input: {integral_src}")
+    print(f"LaTeX: {integral_out.latex}")
+    print(f"MathML: {integral_out.mathml[:150]}...")
+    print(f"OMML length: {len(integral_out.omml)}")
+    print()
+
+    # Case 4 - ordinary markdown that merely contains a formula
+    mixed_src = "# Hello\n\nThis is a test with math: $x = 2$"
+    mixed_out = conv.convert_to_formats(mixed_src)
+
+    print("Test 4: Regular markdown")
+    print(f"Input: {mixed_src}")
+    print(f"LaTeX: {mixed_out.latex[:100]}...")
+    print(f"MathML: {mixed_out.mathml[:100]}...")
+    print(f"MML: {mixed_out.mml}")
+    print(f"OMML: {mixed_out.omml}")
+    print()
+
+
+if __name__ == "__main__":
+    test_latex_only_conversion()
--- a/test_omml_api.py
+++ b/test_omml_api.py
@@ -0,0 +1,112 @@
+"""Test script for OMML conversion API endpoint."""
+
+import requests
+import json
+
+
+def test_latex_to_omml():
+    """Send a handful of formulas to /convert/latex-to-omml and print each outcome.
+
+    Needs the API server on localhost:8000; request errors are printed, not raised.
+    """
+    endpoint = "http://localhost:8000/api/v1/convert/latex-to-omml"
+    divider = "=" * 80
+
+    # (label, LaTeX source) pairs sent one after another
+    samples = [
+        (
+            "Simple fraction",
+            r"\frac{a}{b}",
+        ),
+        (
+            "Quadratic formula",
+            r"x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}",
+        ),
+        (
+            "Integral",
+            r"\int_0^\infty e^{-x^2} dx = \frac{\sqrt{\pi}}{2}",
+        ),
+        (
+            "Matrix",
+            r"\begin{matrix} a & b \\ c & d \end{matrix}",
+        ),
+    ]
+
+    print("Testing OMML Conversion API")
+    print(divider)
+
+    for number, (label, formula) in enumerate(samples, start=1):
+        print(f"\nTest {number}: {label}")
+        print("-" * 80)
+        print(f"LaTeX: {formula}")
+
+        try:
+            reply = requests.post(
+                endpoint,
+                json={"latex": formula},
+                headers={"Content-Type": "application/json"},
+                timeout=10,
+            )
+            status = reply.status_code
+            if status != 200:
+                print(f"✗ Status: {status}")
+                print(f"Error: {reply.text}")
+                continue
+
+            omml = reply.json().get("omml", "")
+            print(f"✓ Status: {status}")
+            print(f"OMML length: {len(omml)} characters")
+            print(f"OMML preview: {omml[:150]}...")
+        except requests.exceptions.RequestException as err:
+            print(f"✗ Request failed: {err}")
+        except Exception as err:
+            print(f"✗ Error: {err}")
+
+    print("\n" + divider)
+
+
+def test_invalid_input():
+    """Print how the endpoint answers an empty formula and a body without the 'latex' field."""
+    print("\nTesting Error Handling")
+    print("=" * 80)
+    base_url = "http://localhost:8000/api/v1/convert/latex-to-omml"
+
+    # Empty LaTeX
+    print("\nTest: Empty LaTeX")
+    response = requests.post(
+        base_url,
+        json={"latex": ""},
+        headers={"Content-Type": "application/json"},
+        timeout=10,  # same bound as test_latex_to_omml; without it a stalled server hangs the script
+    )
+    print(f"Status: {response.status_code}")
+    print(f"Response: {response.json()}")
+
+    # Missing LaTeX field
+    print("\nTest: Missing LaTeX field")
+    response = requests.post(
+        base_url,
+        json={},
+        headers={"Content-Type": "application/json"},
+        timeout=10,
+    )
+    print(f"Status: {response.status_code}")
+    print(f"Response: {response.json()}")
+
+    print("\n" + "=" * 80)
+
+
+if __name__ == "__main__":
+    print("OMML API Test Suite")
+    print("Make sure the API server is running on http://localhost:8000")
+    print()
+    
+    try:
+        test_latex_to_omml()
+        test_invalid_input()
+        print("\n✓ All tests completed!")
+        
+    except KeyboardInterrupt:
+        print("\n\n✗ Tests interrupted by user")
+    except Exception as e:
+        print(f"\n✗ Test suite failed: {e}")
--- a/test_omml_preprocessing.py
+++ b/test_omml_preprocessing.py
@@ -0,0 +1,218 @@
+"""Comprehensive test for OMML conversion with preprocessing."""
+
+from app.services.converter import Converter
+
+
+def test_case_1_array_with_spaces():
+    """Test: Array with spaces in column specifier (the original issue)."""
+    print("\n" + "=" * 80)
+    print("Test 1: Array with spaces in column specifier")
+    print("=" * 80)
+    
+    converter = Converter()
+    
+    # The problematic LaTeX from the error
+    latex = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""
+    
+    print(f"LaTeX length: {len(latex)} chars")
+    print(f"Preview: {latex[:100]}...")
+    
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"\n✓ SUCCESS: Converted to OMML")
+        print(f"OMML length: {len(omml)} chars")
+        
+        if "oMath" in omml:
+            print("✓ Valid OMML structure detected")
+        
+        # Check preprocessing worked
+        preprocessed = converter._preprocess_formula_for_omml(latex)
+        if "{c c c c}" not in preprocessed and "{cccc}" in preprocessed:
+            print("✓ Array column specifiers fixed: '{c c c c}' → '{cccc}'")
+        
+        return True
+        
+    except Exception as e:
+        print(f"\n✗ FAILED: {e}")
+        return False
+
+
+def test_case_2_vmatrix():
+    """Test: vmatrix environment conversion."""
+    print("\n" + "=" * 80)
+    print("Test 2: vmatrix environment")
+    print("=" * 80)
+    
+    converter = Converter()
+    
+    latex = r"\begin{vmatrix} a & b \\ c & d \end{vmatrix}"
+    print(f"LaTeX: {latex}")
+    
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+        
+        # Check if vmatrix was converted
+        preprocessed = converter._preprocess_formula_for_omml(latex)
+        if "vmatrix" not in preprocessed and r"\left|" in preprocessed:
+            print("✓ vmatrix converted to \\left| ... \\right|")
+        
+        return True
+        
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+def test_case_3_cases_environment():
+    """Test: cases environment conversion."""
+    print("\n" + "=" * 80)
+    print("Test 3: cases environment")
+    print("=" * 80)
+    
+    converter = Converter()
+    
+    latex = r"f(x) = \begin{cases} x^2 & x \geq 0 \\ -x & x < 0 \end{cases}"
+    print(f"LaTeX: {latex}")
+    
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+        
+        # Check if cases was converted to array
+        preprocessed = converter._preprocess_formula_for_omml(latex)
+        if "cases" not in preprocessed and "array" in preprocessed:
+            print("✓ cases converted to array environment")
+        
+        return True
+        
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+def test_case_4_aligned_environment():
+    """Test: aligned environment conversion."""
+    print("\n" + "=" * 80)
+    print("Test 4: aligned environment")
+    print("=" * 80)
+    
+    converter = Converter()
+    
+    latex = r"\begin{aligned} x + y &= 5 \\ 2x - y &= 1 \end{aligned}"
+    print(f"LaTeX: {latex}")
+    
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+        
+        # Check if aligned was converted
+        preprocessed = converter._preprocess_formula_for_omml(latex)
+        if "aligned" not in preprocessed and "array" in preprocessed:
+            print("✓ aligned converted to array environment")
+        if "&" not in preprocessed or preprocessed.count("&") < latex.count("&"):
+            print("✓ Alignment markers removed")
+        
+        return True
+        
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+def test_case_5_simple_formula():
+    """Test: Simple formula (should work without preprocessing)."""
+    print("\n" + "=" * 80)
+    print("Test 5: Simple formula")
+    print("=" * 80)
+    
+    converter = Converter()
+    
+    latex = r"x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}"
+    print(f"LaTeX: {latex}")
+    
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+        return True
+        
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+def test_case_6_nested_structures():
+    """Test: Nested structures with multiple issues."""
+    print("\n" + "=" * 80)
+    print("Test 6: Nested structures")
+    print("=" * 80)
+    
+    converter = Converter()
+    
+    latex = r"\left\{ \begin{array}{l c} \begin{vmatrix} a & b \\ c & d \end{vmatrix} & = ad - bc \\ f(x) = \begin{cases} 1 & x > 0 \\ 0 & x \leq 0 \end{cases} & \text{step function} \end{array} \right."
+    print(f"LaTeX: {latex}")
+    
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+        
+        preprocessed = converter._preprocess_formula_for_omml(latex)
+        print("\nPreprocessing applied:")
+        if "vmatrix" not in preprocessed:
+            print("  ✓ vmatrix converted")
+        if "cases" not in preprocessed:
+            print("  ✓ cases converted")
+        if "{l c}" not in preprocessed and "{lc}" in preprocessed:
+            print("  ✓ Array specifiers fixed")
+        
+        return True
+        
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+if __name__ == "__main__":
+    print("=" * 80)
+    print("OMML CONVERSION TEST SUITE")
+    print("Testing preprocessing and conversion")
+    print("=" * 80)
+    
+    results = []
+    
+    try:
+        results.append(("Simple formula", test_case_5_simple_formula()))
+        results.append(("Array with spaces", test_case_1_array_with_spaces()))
+        results.append(("vmatrix", test_case_2_vmatrix()))
+        results.append(("cases", test_case_3_cases_environment()))
+        results.append(("aligned", test_case_4_aligned_environment()))
+        results.append(("Nested structures", test_case_6_nested_structures()))
+        
+        # Summary
+        print("\n" + "=" * 80)
+        print("TEST SUMMARY")
+        print("=" * 80)
+        
+        passed = sum(1 for _, result in results if result)
+        total = len(results)
+        
+        for name, result in results:
+            status = "✓ PASS" if result else "✗ FAIL"
+            print(f"{status}: {name}")
+        
+        print("\n" + "-" * 80)
+        print(f"Total: {passed}/{total} tests passed")
+        
+        if passed == total:
+            print("\n✓✓✓ ALL TESTS PASSED ✓✓✓")
+        else:
+            print(f"\n✗✗✗ {total - passed} TESTS FAILED ✗✗✗")
+        
+        print("=" * 80)
+        
+    except KeyboardInterrupt:
+        print("\n\nTests interrupted by user")
+    except Exception as e:
+        print(f"\n\nTest suite error: {e}")
+        import traceback
+        traceback.print_exc()
Author	SHA1	Message	Date
liuyuanchuang	e31017cfe7	fix: add preprocess	2026-02-04 12:45:34 +08:00
liuyuanchuang	69f9a70ae5	feat: add omml api	2026-02-04 12:35:14 +08:00
liuyuanchuang	27f25d9f4d	feat: update port config	2026-02-04 12:06:17 +08:00
liuyuanchuang	526c1f3a0d	feat: optimize the format convert	2026-02-04 12:00:06 +08:00
yogeliu	10dbd59161	fix: matrix not rendor in docx	2026-01-14 14:18:00 +08:00
yogeliu	df2b664af4	fix: add image padding for mineru	2026-01-05 21:37:51 +08:00
yogeliu	6ea37c9380	feat: add mineru model	2026-01-05 17:30:54 +08:00
yogeliu	3870c108b2	fix: image alpha error	2026-01-01 23:38:52 +08:00