diff --git a/app/api/v1/endpoints/image.py b/app/api/v1/endpoints/image.py
index e2e0c92..3c18f92 100644
--- a/app/api/v1/endpoints/image.py
+++ b/app/api/v1/endpoints/image.py
@@ -2,11 +2,12 @@
from fastapi import APIRouter, Depends, HTTPException
-from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service
-from app.schemas.image import ImageOCRRequest, ImageOCRResponse
+from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service, get_converter
+from app.schemas.image import ImageOCRRequest, ImageOCRResponse, LatexToOmmlRequest, LatexToOmmlResponse
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector
from app.services.ocr_service import OCRService, MineruOCRService
+from app.services.converter import Converter
router = APIRouter()
@@ -28,6 +29,9 @@ async def process_image_ocr(
- If plain text exists: use PP-DocLayoutV2 for mixed recognition
- Otherwise: use PaddleOCR-VL with formula prompt
4. Convert output to LaTeX, Markdown, and MathML formats
+
+ Note: OMML conversion is not included due to performance overhead.
+ Use the /latex-to-omml endpoint to convert LaTeX to OMML separately.
"""
image = image_processor.preprocess(
@@ -49,4 +53,34 @@ async def process_image_ocr(
latex=ocr_result.get("latex", ""),
markdown=ocr_result.get("markdown", ""),
mathml=ocr_result.get("mathml", ""),
+ mml=ocr_result.get("mml", ""),
)
+
+
+@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)
+async def convert_latex_to_omml(
+    request: LatexToOmmlRequest,
+    converter: Converter = Depends(get_converter),
+) -> LatexToOmmlResponse:
+    """Convert a LaTeX formula to OMML (Office Math Markup Language).
+
+    OMML is the math format used by Microsoft Word and other Office applications.
+    This endpoint is separate from the main OCR endpoint because OMML conversion
+    carries extra performance overhead.
+
+    Args:
+        request: Contains the LaTeX formula to convert (without $ or $$ delimiters).
+        converter: Conversion service injected via ``get_converter``.
+
+    Returns:
+        OMML representation of the formula.
+
+    Raises:
+        HTTPException: 400 if the formula is empty/whitespace-only or the converter
+            raises ValueError; 503 if the converter raises RuntimeError.
+    """
+    if not request.latex or not request.latex.strip():
+        raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")
+
+    # NOTE(review): this is a synchronous call inside an async handler; if the
+    # conversion is slow it may stall the event loop -- confirm and offload if so.
+    # Only the converter call is guarded, so a failure while building the response
+    # model is not misreported as a client error.
+    try:
+        omml = converter.convert_to_omml(request.latex)
+    except ValueError as e:
+        # Chain the cause so the original traceback is kept for logging.
+        raise HTTPException(status_code=400, detail=str(e)) from e
+    except RuntimeError as e:
+        raise HTTPException(status_code=503, detail=str(e)) from e
+
+    return LatexToOmmlResponse(omml=omml)
diff --git a/app/core/config.py b/app/core/config.py
index 6b33e14..ab3e21e 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -23,7 +23,7 @@ class Settings(BaseSettings):
# PaddleOCR-VL Settings
paddleocr_vl_url: str = "http://127.0.0.1:8000/v1"
-
+
# MinerOCR Settings
miner_ocr_api_url: str = "http://127.0.0.1:8000/file_parse"
diff --git a/app/schemas/image.py b/app/schemas/image.py
index 23be6d0..fb8946f 100644
--- a/app/schemas/image.py
+++ b/app/schemas/image.py
@@ -40,11 +40,21 @@ class ImageOCRRequest(BaseModel):
class ImageOCRResponse(BaseModel):
"""Response body for image OCR endpoint."""
- latex: str = Field("", description="LaTeX representation of the content")
+ latex: str = Field("", description="LaTeX representation of the content (empty if mixed content)")
markdown: str = Field("", description="Markdown representation of the content")
- mathml: str = Field("", description="MathML representation (empty if no math detected)")
+ mathml: str = Field("", description="Standard MathML representation (empty if mixed content)")
+ # Added alongside mathml; the OCR endpoint fills it from ocr_result.get("mml", "").
+ mml: str = Field("", description="XML MathML with mml: namespace prefix (empty if mixed content)")
layout_info: LayoutInfo = Field(default_factory=LayoutInfo)
- recognition_mode: str = Field(
- "", description="Recognition mode used: mixed_recognition or formula_recognition"
- )
+ recognition_mode: str = Field("", description="Recognition mode used: mixed_recognition or formula_recognition")
+
+class LatexToOmmlRequest(BaseModel):
+ """Request body for LaTeX to OMML conversion endpoint.
+
+ The /latex-to-omml endpoint rejects an empty or whitespace-only ``latex``
+ value with HTTP 400.
+ """
+
+ latex: str = Field(..., description="Pure LaTeX formula (without $ or $$ delimiters)")
+
+
+class LatexToOmmlResponse(BaseModel):
+ """Response body for LaTeX to OMML conversion endpoint.
+
+ ``omml`` carries the string returned by ``Converter.convert_to_omml`` for the
+ requested formula.
+ """
+
+ omml: str = Field("", description="OMML (Office Math Markup Language) representation")
diff --git a/app/services/converter.py b/app/services/converter.py
index e18abd3..b5ff2ba 100644
--- a/app/services/converter.py
+++ b/app/services/converter.py
@@ -4,17 +4,29 @@ import os
import re
import tempfile
from dataclasses import dataclass
+from functools import lru_cache
from typing import Literal
import pypandoc
+from latex2mathml.converter import convert as latex_to_mathml
@dataclass
class ConvertResult:
- """Result of markdown conversion."""
+ """Result of markdown conversion.
+
+ Only populated when the input contains a pure LaTeX formula.
+ All fields are empty strings when the input contains mixed content
+ (text + formula) or is empty/whitespace-only.
+
+ Attributes:
+ latex: Pure LaTeX formula code (without delimiters).
+ mathml: Standard MathML format.
+ mml: XML MathML with mml: namespace prefix (mml:math), derived from ``mathml``.
+ """
latex: str
mathml: str
+ mml: str
@dataclass
@@ -28,59 +40,397 @@ class ExportResult:
ExportType = Literal["docx", "pdf"]
+# XML namespace URIs for MathML and for Office Math (OMML)
+MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
+OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
+
+# XSLT for MathML to mml: namespace conversion
+MML_XSLT = """
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+
class Converter:
- """Service for conversion and export operations."""
+ """Service for conversion and export operations.
+
+ Conversion rules:
+ - Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
+ - Mixed content (text + formula) returns empty results for all formats.
+ - OMML conversion is provided as a separate method due to performance overhead.
+
+ Performance optimizations:
+ - Pre-compiled regex patterns
+ - XSLT-based MML conversion
+ - Cached XSLT transforms
+ - Direct Pandoc OMML output (avoids DOCX parsing)
+ """
# Pandoc input format with LaTeX math extensions
INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
+    # Pre-compiled regex patterns for formula detection.
+    # These are applied with fullmatch(), so each body is tempered to stop at the
+    # first closing delimiter. A greedy [\s\S]+ would let "$$a$$ text $$b$$"
+    # (two formulas with prose in between) pass as a single formula.
+    _RE_DISPLAY_DOLLAR = re.compile(r"\$\$(?:(?!\$\$)[\s\S])+\$\$")
+    _RE_DISPLAY_BRACKET = re.compile(r"\\\[(?:(?!\\\])[\s\S])+\\\]")
+    _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
+    _RE_INLINE_PAREN = re.compile(r"\\\((?:(?!\\\))[\s\S])+\\\)")
+ _RE_MATH_ELEMENT = re.compile(r"")
+
+ # Pre-compiled regex patterns for preprocessing
+ _RE_VSPACE = re.compile(r"\\\[1mm\]")
+ _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
+ _RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
+ _RE_ARITHMATEX = re.compile(r'(.*?)')
+ _RE_INLINE_SPACE = re.compile(r"(? bool:
+ """Check if text contains only a LaTeX formula (no mixed content).
+
+ A text is considered formula-only if it matches one of these patterns:
+ - Display math: $$...$$ or \\[...\\]
+ - Inline math: $...$ or \\(...\\)
+
+ Args:
+ text: Input text to check.
+
+ Returns:
+ True if the text contains only a LaTeX formula, False otherwise.
+ """
+ text = text.strip()
+
+ if not text:
+ return False
+
+ # Strict patterns: entire text must be a single formula with delimiters
+ # Using pre-compiled patterns with fullmatch semantics
+ if self._RE_DISPLAY_DOLLAR.fullmatch(text):
+ return True
+ if self._RE_DISPLAY_BRACKET.fullmatch(text):
+ return True
+ if self._RE_INLINE_DOLLAR.fullmatch(text):
+ return True
+ if self._RE_INLINE_PAREN.fullmatch(text):
+ return True
+
+ return False
+
def convert_to_formats(self, md_text: str) -> ConvertResult:
- """Convert markdown to LaTeX and MathML formats.
+ """Convert markdown to LaTeX, MathML, and MML formats.
+
+ Only converts when input contains a pure LaTeX formula.
+ Mixed content (text + formula) returns empty strings for all fields.
+ Empty or whitespace-only input also returns empty strings for all fields.
Args:
md_text: Markdown text to convert.
Returns:
- ConvertResult with latex and mathml fields.
+ ConvertResult with latex, mathml, and mml fields.
+ All fields are empty if input is not a pure formula.
Raises:
- ValueError: If md_text is empty.
- RuntimeError: If conversion fails.
+ RuntimeError: If conversion fails for a valid formula.
"""
- if md_text == "":
- return ConvertResult(latex="", mathml="")
+ # Empty or whitespace-only input returns an empty result
+ if not md_text or not md_text.strip():
+ return ConvertResult(latex="", mathml="", mml="")
+
+ # Check if input is formula-only
+ if not self._is_formula_only(md_text):
+ # Mixed content: cannot convert to formula formats
+ return ConvertResult(latex="", mathml="", mml="")
try:
- # Convert to LaTeX
- latex_output = pypandoc.convert_text(
- md_text,
- "latex",
- format=self.INPUT_FORMAT,
- ).rstrip("\n")
+ # Extract the LaTeX formula content (remove delimiters)
+ latex_formula = self._extract_latex_formula(md_text)
- # Convert to HTML with MathML
- mathml_output = pypandoc.convert_text(
- md_text,
- "html",
- format=self.INPUT_FORMAT,
- extra_args=["--mathml"],
- ).rstrip("\n")
+ # Convert to MathML
+ mathml = self._latex_to_mathml(latex_formula)
- return ConvertResult(latex=latex_output, mathml=mathml_output)
+ # Convert MathML to mml:math format (with namespace prefix)
+ mml = self._mathml_to_mml(mathml)
+
+ # The returned latex is the delimiter-stripped input formula, not a Pandoc rendering of md_text.
+ return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)
except Exception as e:
raise RuntimeError(f"Conversion failed: {e}") from e
+    def convert_to_omml(self, latex_formula: str) -> str:
+        """Convert a LaTeX formula to OMML (Office Math Markup Language).
+
+        Offered as its own method, apart from the latex/mathml/mml conversion,
+        because OMML conversion carries additional performance overhead.
+
+        Args:
+            latex_formula: Pure LaTeX formula (without delimiters like $ or $$).
+                Surrounding whitespace is stripped before conversion.
+
+        Returns:
+            OMML representation as XML string.
+
+        Raises:
+            ValueError: If latex_formula is empty or whitespace-only.
+            RuntimeError: If conversion fails.
+        """
+        # Normalise once: a falsy input and a whitespace-only input both end up empty.
+        formula = latex_formula.strip() if latex_formula else ""
+        if not formula:
+            raise ValueError("LaTeX formula cannot be empty")
+
+        return self._latex_to_omml(formula)
+
+    def _extract_latex_formula(self, text: str) -> str:
+        """Strip the surrounding math delimiters from a formula string.
+
+        Recognised wrappers are ``$$...$$``, ``\\[...\\]``, ``$...$`` and
+        ``\\(...\\)``. Text without a recognised wrapper is returned stripped
+        of surrounding whitespace and otherwise unchanged.
+
+        Args:
+            text: Text containing LaTeX formula with delimiters.
+
+        Returns:
+            Pure LaTeX formula without delimiters.
+        """
+        stripped = text.strip()
+
+        # Display math: both wrappers are two characters wide on each side.
+        for opener, closer in (("$$", "$$"), ("\\[", "\\]")):
+            if stripped.startswith(opener) and stripped.endswith(closer):
+                return stripped[2:-2].strip()
+
+        # Inline dollar math: a lone leading "$" (a leading "$$" does not qualify).
+        single_dollar = stripped.startswith("$") and not stripped.startswith("$$")
+        if single_dollar and stripped.endswith("$"):
+            return stripped[1:-1].strip()
+
+        # Inline parenthesis math.
+        if stripped.startswith("\\(") and stripped.endswith("\\)"):
+            return stripped[2:-2].strip()
+
+        # No recognised delimiters.
+        return stripped
+
+ @staticmethod
+ @lru_cache(maxsize=256)
+ def _latex_to_mathml_cached(latex_formula: str) -> str:
+ """Cached conversion of LaTeX formula to MathML.
+
+ Uses LRU cache to avoid recomputing for repeated formulas.
+ """
+ try:
+ # Use latex2mathml library for conversion (fast, pure Python)
+ return latex_to_mathml(latex_formula)
+ except Exception as e:
+ # Fallback: try with Pandoc (slower, but more robust)
+ try:
+ mathml_html = pypandoc.convert_text(
+ f"${latex_formula}$",
+ "html",
+ format="markdown+tex_math_dollars",
+ extra_args=["--mathml"],
+ )
+ # Extract just the