feature/converter #1

Merged
YogeLiu merged 13 commits from feature/converter into main 2026-02-05 13:48:22 +08:00
7 changed files with 571 additions and 187 deletions
Showing only changes of commit 526c1f3a0d - Show all commits

View File

@@ -2,11 +2,12 @@
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service, get_converter
from app.schemas.image import ImageOCRRequest, ImageOCRResponse from app.schemas.image import ImageOCRRequest, ImageOCRResponse, LatexToOmmlRequest, LatexToOmmlResponse
from app.services.image_processor import ImageProcessor from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector from app.services.layout_detector import LayoutDetector
from app.services.ocr_service import OCRService, MineruOCRService from app.services.ocr_service import OCRService, MineruOCRService
from app.services.converter import Converter
router = APIRouter() router = APIRouter()
@@ -28,6 +29,9 @@ async def process_image_ocr(
- If plain text exists: use PP-DocLayoutV2 for mixed recognition - If plain text exists: use PP-DocLayoutV2 for mixed recognition
- Otherwise: use PaddleOCR-VL with formula prompt - Otherwise: use PaddleOCR-VL with formula prompt
4. Convert output to LaTeX, Markdown, and MathML formats 4. Convert output to LaTeX, Markdown, and MathML formats
Note: OMML conversion is not included due to performance overhead.
Use the /latex-to-omml endpoint to convert LaTeX to OMML separately.
""" """
image = image_processor.preprocess( image = image_processor.preprocess(
@@ -49,4 +53,34 @@ async def process_image_ocr(
latex=ocr_result.get("latex", ""), latex=ocr_result.get("latex", ""),
markdown=ocr_result.get("markdown", ""), markdown=ocr_result.get("markdown", ""),
mathml=ocr_result.get("mathml", ""), mathml=ocr_result.get("mathml", ""),
mml=ocr_result.get("mml", ""),
) )
@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)
async def convert_latex_to_omml(
    request: LatexToOmmlRequest,
    converter: Converter = Depends(get_converter),
) -> LatexToOmmlResponse:
    """Convert a LaTeX formula to OMML (Office Math Markup Language).

    OMML is the math format used by Microsoft Word and other Office applications.
    This endpoint is separate from the main OCR endpoint due to the performance
    overhead of OMML conversion (requires creating a temporary DOCX file).

    Args:
        request: Contains the LaTeX formula to convert (without $ or $$ delimiters).
        converter: Converter service injected by FastAPI.

    Returns:
        OMML representation of the formula.

    Raises:
        HTTPException: 400 when the formula is empty or rejected by the
            converter (ValueError); 503 when the conversion itself fails
            (RuntimeError).
    """
    # Imported locally so this handler does not depend on a module-level import.
    from fastapi.concurrency import run_in_threadpool

    if not request.latex or not request.latex.strip():
        raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")

    try:
        # The conversion is synchronous and slow; running it in a worker
        # thread keeps this coroutine from blocking the event loop.
        omml = await run_in_threadpool(converter.convert_to_omml, request.latex)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e)) from e

    return LatexToOmmlResponse(omml=omml)

View File

@@ -40,11 +40,21 @@ class ImageOCRRequest(BaseModel):
class ImageOCRResponse(BaseModel): class ImageOCRResponse(BaseModel):
"""Response body for image OCR endpoint.""" """Response body for image OCR endpoint."""
latex: str = Field("", description="LaTeX representation of the content") latex: str = Field("", description="LaTeX representation of the content (empty if mixed content)")
markdown: str = Field("", description="Markdown representation of the content") markdown: str = Field("", description="Markdown representation of the content")
mathml: str = Field("", description="MathML representation (empty if no math detected)") mathml: str = Field("", description="Standard MathML representation (empty if mixed content)")
mml: str = Field("", description="XML MathML with mml: namespace prefix (empty if mixed content)")
layout_info: LayoutInfo = Field(default_factory=LayoutInfo) layout_info: LayoutInfo = Field(default_factory=LayoutInfo)
recognition_mode: str = Field( recognition_mode: str = Field("", description="Recognition mode used: mixed_recognition or formula_recognition")
"", description="Recognition mode used: mixed_recognition or formula_recognition"
)
class LatexToOmmlRequest(BaseModel):
    """Payload accepted by the LaTeX-to-OMML conversion endpoint."""

    # Required field: the bare formula, with no surrounding math delimiters.
    latex: str = Field(
        default=...,
        description="Pure LaTeX formula (without $ or $$ delimiters)",
    )
class LatexToOmmlResponse(BaseModel):
    """Payload returned by the LaTeX-to-OMML conversion endpoint."""

    # Defaults to an empty string when no OMML is supplied.
    omml: str = Field(
        default="",
        description="OMML (Office Math Markup Language) representation",
    )

View File

@@ -4,17 +4,29 @@ import os
import re import re
import tempfile import tempfile
from dataclasses import dataclass from dataclasses import dataclass
from functools import lru_cache
from typing import Literal from typing import Literal
import pypandoc import pypandoc
from latex2mathml.converter import convert as latex_to_mathml
@dataclass @dataclass
class ConvertResult: class ConvertResult:
"""Result of markdown conversion.""" """Result of markdown conversion.
Only populated when input contains pure LaTeX formula.
All fields are empty strings when input contains mixed content (text + formula).
Attributes:
latex: Pure LaTeX formula code (without delimiters).
mathml: Standard MathML format.
mml: XML MathML with mml: namespace prefix (mml:math).
"""
latex: str latex: str
mathml: str mathml: str
mml: str
@dataclass @dataclass
@@ -28,59 +40,397 @@ class ExportResult:
ExportType = Literal["docx", "pdf"] ExportType = Literal["docx", "pdf"]
# MathML namespace
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
# XSLT for MathML to mml: namespace conversion
MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:mml="http://www.w3.org/1998/Math/MathML"
xmlns:m="http://www.w3.org/1998/Math/MathML"
exclude-result-prefixes="m">
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- Match root math element -->
<xsl:template match="m:math|math">
<mml:math>
<xsl:apply-templates select="@*|node()"/>
</mml:math>
</xsl:template>
<!-- Match all other MathML elements -->
<xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
<xsl:element name="mml:{local-name()}">
<xsl:apply-templates select="@*|node()"/>
</xsl:element>
</xsl:template>
<!-- Copy attributes -->
<xsl:template match="@*">
<xsl:if test="local-name() != 'xmlns'">
<xsl:copy/>
</xsl:if>
</xsl:template>
<!-- Copy text nodes -->
<xsl:template match="text()">
<xsl:value-of select="."/>
</xsl:template>
</xsl:stylesheet>
"""
class Converter: class Converter:
"""Service for conversion and export operations.""" """Service for conversion and export operations.
Conversion rules:
- Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
- Mixed content (text + formula) returns empty results for all formats.
- OMML conversion is provided as a separate method due to performance overhead.
Performance optimizations:
- Pre-compiled regex patterns
- XSLT-based MML conversion
- Cached XSLT transforms
- OMML extracted from Pandoc's DOCX output (word/document.xml is read straight from the ZIP archive)
"""
# Pandoc input format with LaTeX math extensions # Pandoc input format with LaTeX math extensions
INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash" INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
# Pre-compiled regex patterns for formula detection
_RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
_RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
_RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
_RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
_RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")
# Pre-compiled regex patterns for preprocessing
_RE_VSPACE = re.compile(r"\\\[1mm\]")
_RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
_RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
_RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
_RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
_RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
_RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
_RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
_RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
_RE_ALIGNED_BRACE = re.compile(r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL)
_RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
_RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
_RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
_RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)
# Cached XSLT transform
_mml_xslt_transform = None
def __init__(self): def __init__(self):
"""Initialize converter.""" """Initialize converter."""
@classmethod
def _get_mml_xslt_transform(cls):
    """Return the XSLT transform that rewrites MathML with the mml: prefix.

    The transform is compiled from ``MML_XSLT`` on first use and stored on
    the class, so later calls reuse the compiled object.
    """
    cached = cls._mml_xslt_transform
    if cached is not None:
        return cached

    # lxml is imported lazily, only when the transform is first needed.
    from lxml import etree

    stylesheet = etree.fromstring(MML_XSLT.encode("utf-8"))
    cls._mml_xslt_transform = etree.XSLT(stylesheet)
    return cls._mml_xslt_transform
def _is_formula_only(self, text: str) -> bool:
"""Check if text contains only a LaTeX formula (no mixed content).
A text is considered formula-only if it matches one of these patterns:
- Display math: $$...$$ or \\[...\\]
- Inline math: $...$ or \\(...\\)
Args:
text: Input text to check.
Returns:
True if the text contains only a LaTeX formula, False otherwise.
"""
text = text.strip()
if not text:
return False
# Strict patterns: entire text must be a single formula with delimiters
# Using pre-compiled patterns with fullmatch semantics
if self._RE_DISPLAY_DOLLAR.fullmatch(text):
return True
if self._RE_DISPLAY_BRACKET.fullmatch(text):
return True
if self._RE_INLINE_DOLLAR.fullmatch(text):
return True
if self._RE_INLINE_PAREN.fullmatch(text):
return True
return False
def convert_to_formats(self, md_text: str) -> ConvertResult: def convert_to_formats(self, md_text: str) -> ConvertResult:
"""Convert markdown to LaTeX and MathML formats. """Convert markdown to LaTeX, MathML, and MML formats.
Only converts when input contains a pure LaTeX formula.
Mixed content (text + formula) returns empty strings for all fields.
Args: Args:
md_text: Markdown text to convert. md_text: Markdown text to convert.
Returns: Returns:
ConvertResult with latex and mathml fields. ConvertResult with latex, mathml, and mml fields.
All fields are empty if input is not a pure formula.
Raises: Raises:
ValueError: If md_text is empty. RuntimeError: If conversion fails for a valid formula.
RuntimeError: If conversion fails.
""" """
if md_text == "": # Empty input returns empty result
return ConvertResult(latex="", mathml="") if not md_text or not md_text.strip():
return ConvertResult(latex="", mathml="", mml="")
# Check if input is formula-only
if not self._is_formula_only(md_text):
# Mixed content: cannot convert to formula formats
return ConvertResult(latex="", mathml="", mml="")
try: try:
# Convert to LaTeX # Extract the LaTeX formula content (remove delimiters)
latex_output = pypandoc.convert_text( latex_formula = self._extract_latex_formula(md_text)
md_text,
"latex",
format=self.INPUT_FORMAT,
).rstrip("\n")
# Convert to HTML with MathML # Convert to MathML
mathml_output = pypandoc.convert_text( mathml = self._latex_to_mathml(latex_formula)
md_text,
"html",
format=self.INPUT_FORMAT,
extra_args=["--mathml"],
).rstrip("\n")
return ConvertResult(latex=latex_output, mathml=mathml_output) # Convert MathML to mml:math format (with namespace prefix)
mml = self._mathml_to_mml(mathml)
return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)
except Exception as e: except Exception as e:
raise RuntimeError(f"Conversion failed: {e}") from e raise RuntimeError(f"Conversion failed: {e}") from e
def convert_to_omml(self, latex_formula: str) -> str:
    """Convert a LaTeX formula to OMML (Office Math Markup Language).

    Kept apart from the other conversions because producing OMML is
    comparatively expensive (a temporary DOCX file is generated).

    Args:
        latex_formula: Pure LaTeX formula (without delimiters like $ or $$).

    Returns:
        OMML representation as XML string.

    Raises:
        ValueError: If latex_formula is empty or whitespace only.
        RuntimeError: If conversion fails.
    """
    formula = latex_formula.strip() if latex_formula else ""
    if not formula:
        raise ValueError("LaTeX formula cannot be empty")
    return self._latex_to_omml(formula)
def _extract_latex_formula(self, text: str) -> str:
"""Extract LaTeX formula from text by removing delimiters.
Args:
text: Text containing LaTeX formula with delimiters.
Returns:
Pure LaTeX formula without delimiters.
"""
text = text.strip()
# Remove display math delimiters: $$...$$ or \[...\]
if text.startswith("$$") and text.endswith("$$"):
return text[2:-2].strip()
if text.startswith("\\[") and text.endswith("\\]"):
return text[2:-2].strip()
# Remove inline math delimiters: $...$ or \(...\)
if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
return text[1:-1].strip()
if text.startswith("\\(") and text.endswith("\\)"):
return text[2:-2].strip()
# If no delimiters, return as-is
return text.strip()
@staticmethod
@lru_cache(maxsize=256)
def _latex_to_mathml_cached(latex_formula: str) -> str:
    """Convert a LaTeX formula to MathML, memoising results per formula.

    latex2mathml is tried first; if it raises, Pandoc is used as a fallback
    and the ``<math>`` element is pulled out of the HTML it produces.

    Raises:
        RuntimeError: If both the primary conversion and the Pandoc
            fallback fail.
    """
    try:
        return latex_to_mathml(latex_formula)
    except Exception as e:
        try:
            pandoc_html = pypandoc.convert_text(
                f"${latex_formula}$",
                "html",
                format="markdown+tex_math_dollars",
                extra_args=["--mathml"],
            )
            # Prefer the bare <math> element; otherwise return Pandoc's
            # output with trailing newlines removed.
            found = Converter._RE_MATH_ELEMENT.search(pandoc_html)
            return found.group(0) if found else pandoc_html.rstrip("\n")
        except Exception as pandoc_error:
            raise RuntimeError(
                f"MathML conversion failed: {e}. Pandoc fallback also failed: {pandoc_error}"
            ) from e
def _latex_to_mathml(self, latex_formula: str) -> str:
    """Produce standard MathML for a LaTeX formula.

    Delegates to the cached static converter.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).

    Returns:
        Standard MathML representation.
    """
    mathml = self._latex_to_mathml_cached(latex_formula)
    return mathml
def _mathml_to_mml(self, mathml: str) -> str:
"""Convert standard MathML to mml:math format with namespace prefix.
Uses XSLT for efficient transformation. Transforms:
- <math ...> to <mml:math xmlns:mml="..." ...>
- All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>
Args:
mathml: Standard MathML string.
Returns:
MathML with mml: namespace prefix.
"""
if not mathml:
return ""
try:
from lxml import etree
# Parse MathML
root = etree.fromstring(mathml.encode("utf-8"))
# Apply XSLT transformation (cached)
transform = self._get_mml_xslt_transform()
result_tree = transform(root)
# Serialize to string
return str(result_tree)
except Exception:
# Fallback: simple string replacement (less robust but no lxml dependency)
result = mathml
# Add namespace to root math element
result = re.sub(
r"<math\b",
f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
result,
)
result = re.sub(r"</math>", "</mml:math>", result)
# Add mml: prefix to all other elements using a single regex
# Match opening tags
result = re.sub(
r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
r"maction|semantics|annotation|annotation-xml)\b",
r"<mml:\1",
result,
)
# Match closing tags
result = re.sub(
r"</(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
r"maction|semantics|annotation|annotation-xml)>",
r"</mml:\1>",
result,
)
return result
def _latex_to_omml(self, latex_formula: str) -> str:
    """Convert a LaTeX formula to OMML (Office Math Markup Language).

    The formula is wrapped in ``$$...$$`` and written to a Markdown file,
    Pandoc converts that file to DOCX, and ``word/document.xml`` is read
    from the resulting archive. Every ``oMath`` element found in it is
    serialized.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).

    Returns:
        The serialized ``oMath`` elements joined by newlines (an empty
        string when the document contains none).

    Raises:
        RuntimeError: If any step of the conversion fails; the original
            exception is chained as the cause.
    """
    import zipfile

    try:
        from lxml import etree

        # Both files live in a private temporary directory that is removed
        # on exit, success or failure. Fixed file names avoid deriving the
        # DOCX path by string replacement, which would also rewrite any
        # ".md" occurring elsewhere in the path.
        with tempfile.TemporaryDirectory() as workdir:
            md_path = os.path.join(workdir, "formula.md")
            docx_path = os.path.join(workdir, "formula.docx")

            # Pandoc reads its input as UTF-8, so do not rely on the
            # platform's locale encoding when writing the formula.
            with open(md_path, "w", encoding="utf-8") as md_file:
                md_file.write(f"$${latex_formula}$$\n")

            pypandoc.convert_file(
                md_path,
                "docx",
                format=self.INPUT_FORMAT,
                outputfile=docx_path,
            )

            # A DOCX file is a ZIP archive; only the main document part is needed.
            with zipfile.ZipFile(docx_path, "r") as archive:
                document_xml = archive.read("word/document.xml")

        root = etree.fromstring(document_xml)
        omml_parts = [
            etree.tostring(math, encoding="unicode")
            for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath")
        ]
        return "\n".join(omml_parts)

    except Exception as e:
        raise RuntimeError(f"OMML conversion failed: {e}") from e
def preprocess_for_export(self, md_text: str) -> str: def preprocess_for_export(self, md_text: str) -> str:
"""Preprocess markdown text for export to docx/pdf. """Preprocess markdown text for export to docx/pdf.
Handles LaTeX formula formatting, matrix environments, and Handles LaTeX formula formatting, matrix environments, and
other transformations needed for proper Word/PDF rendering. other transformations needed for proper Word/PDF rendering.
Uses pre-compiled regex patterns for better performance.
Args: Args:
md_text: Raw markdown text. md_text: Raw markdown text.
@@ -88,36 +438,23 @@ class Converter:
Preprocessed markdown text. Preprocessed markdown text.
""" """
# Replace \[1mm] => \vspace{1mm} # Replace \[1mm] => \vspace{1mm}
md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text) md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)
# Add blank lines around \[...\] block formulas # Add blank lines around \[...\] block formulas
md_text = re.sub( md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text)
r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text)
r"\1\n\n\\[\3\\]\n\n\4",
md_text,
flags=re.DOTALL,
)
md_text = re.sub(
r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
r"\n\\[\2\\]\n",
md_text,
flags=re.MULTILINE | re.DOTALL,
)
# Remove arithmatex span wrappers # Remove arithmatex span wrappers
cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text) cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text)
# Convert inline formulas: \( \) => $ $ # Convert inline formulas: \( \) => $ $
cleaned_md = re.sub(r"\\\(", r"$", cleaned_md) cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$")
cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
# Convert block formulas: \[ \] => $$ $$ # Convert block formulas: \[ \] => $$ $$
cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md) cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$")
cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
# Remove spaces between $ and formula content # Remove spaces between $ and formula content
# Use negative lookahead/lookbehind to avoid matching $$ block formulas cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md)
cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)
# Convert matrix environments for better Word rendering # Convert matrix environments for better Word rendering
cleaned_md = self._convert_matrix_environments(cleaned_md) cleaned_md = self._convert_matrix_environments(cleaned_md)
@@ -142,19 +479,15 @@ class Converter:
This fixes the vertical line height issues in Word. This fixes the vertical line height issues in Word.
""" """
# vmatrix -> \left| \begin{matrix}...\end{matrix} \right| # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
md_text = re.sub( md_text = self._RE_VMATRIX.sub(
r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
r"\\left| \\begin{matrix}\1\\end{matrix} \\right|", r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
md_text, md_text,
flags=re.DOTALL,
) )
# Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\| # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
md_text = re.sub( md_text = self._RE_VMATRIX_DOUBLE.sub(
r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|", r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
md_text, md_text,
flags=re.DOTALL,
) )
return md_text return md_text
@@ -165,50 +498,22 @@ class Converter:
Pandoc's OMML converter doesn't accept spaces between column alignment Pandoc's OMML converter doesn't accept spaces between column alignment
specifiers in array environments. This converts patterns like specifiers in array environments. This converts patterns like
{c c c c} to {cccc}. {c c c c} to {cccc}.
Args:
md_text: Markdown text with LaTeX formulas.
Returns:
Markdown text with fixed array column specifiers.
""" """
def remove_spaces_in_specifier(match: re.Match) -> str: def remove_spaces_in_specifier(match: re.Match) -> str:
"""Remove spaces from column specifier.""" """Remove spaces from column specifier."""
specifier = match.group(1) specifier = match.group(1)
# Remove all spaces from the specifier return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"
specifier_no_spaces = re.sub(r"\s+", "", specifier)
return f"\\begin{{array}}{{{specifier_no_spaces}}}"
# Match \begin{array}{...} and remove spaces in the column specifier return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)
# Pattern: \begin{array}{c c c ...} -> \begin{array}{ccc...}
md_text = re.sub(
r"\\begin\{array\}\{([^}]+)\}",
remove_spaces_in_specifier,
md_text,
)
return md_text
def _fix_brace_spacing(self, md_text: str) -> str: def _fix_brace_spacing(self, md_text: str) -> str:
"""Fix spacing issues with braces in equation systems. """Fix spacing issues with braces in equation systems.
Removes whitespace and adds negative space for proper alignment in Word/OMML. Removes whitespace and adds negative space for proper alignment in Word/OMML.
""" """
# Fix \left\{ spacing md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
md_text = re.sub( md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
r"\\left\\\{\s+",
r"\\left\\{\\!",
md_text,
)
# Fix \right\} spacing
md_text = re.sub(
r"\s+\\right\\\}",
r"\\!\\right\\}",
md_text,
)
return md_text return md_text
def _convert_special_environments(self, md_text: str) -> str: def _convert_special_environments(self, md_text: str) -> str:
@@ -216,42 +521,28 @@ class Converter:
These environments have better rendering support in Word/OMML. These environments have better rendering support in Word/OMML.
""" """
# Pre-compiled pattern for alignment marker removal
_re_align_marker = re.compile(r"(^|\\\\)\s*&")
def convert_cases(match: re.Match) -> str: def convert_cases(match: re.Match) -> str:
content = match.group(1) content = match.group(1)
return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right." return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."
md_text = re.sub( md_text = self._RE_CASES.sub(convert_cases, md_text)
r"\\begin\{cases\}(.*?)\\end\{cases\}",
convert_cases,
md_text,
flags=re.DOTALL,
)
def convert_aligned_to_array(match: re.Match) -> str: def convert_aligned_to_array(match: re.Match) -> str:
content = match.group(1) content = match.group(1)
# Remove leading & alignment markers (not needed in array{l}) content = _re_align_marker.sub(r"\1", content)
content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
return r"\left\{\begin{array}{l}" + content + r"\end{array}\right." return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."
md_text = re.sub( md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)
r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
convert_aligned_to_array,
md_text,
flags=re.DOTALL,
)
def convert_standalone_aligned(match: re.Match) -> str: def convert_standalone_aligned(match: re.Match) -> str:
content = match.group(1) content = match.group(1)
content = re.sub(r"(^|\\\\)\s*&", r"\1", content) content = _re_align_marker.sub(r"\1", content)
return r"\begin{array}{l}" + content + r"\end{array}" return r"\begin{array}{l}" + content + r"\end{array}"
md_text = re.sub( md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)
r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
convert_standalone_aligned,
md_text,
flags=re.DOTALL,
)
return md_text return md_text
@@ -259,36 +550,15 @@ class Converter:
"""Convert LaTeX \\tag{} commands to Word-compatible format. """Convert LaTeX \\tag{} commands to Word-compatible format.
The \\tag{} command is not supported in Word OMML format, so we convert it to The \\tag{} command is not supported in Word OMML format, so we convert it to
use simple spacing (\quad) to push the equation number to the right side. use simple spacing (\\quad) to push the equation number to the right side.
The tag remains inside the formula for better compatibility.
Args:
md_text: Markdown text containing LaTeX formulas with \\tag{}.
Returns:
Markdown text with \\tag{} commands converted to spacing format.
""" """
def convert_tag(match: re.Match) -> str: def convert_tag(match: re.Match) -> str:
"""Convert a single \\tag{} command within a formula."""
formula_content = match.group(1) formula_content = match.group(1)
tag_content = match.group(2) tag_content = match.group(2)
# Replace \tag{...} with \quad (...) to push the number to the right
# Keep it inside the formula for better Word compatibility
return f"$${formula_content} \\quad ({tag_content})$$" return f"$${formula_content} \\quad ({tag_content})$$"
# Match display formulas ($$...$$) containing \\tag{...} return self._RE_TAG.sub(convert_tag, md_text)
# Pattern: $$...content...\\tag {?...}...$$
# Allow optional space between \tag and {
md_text = re.sub(
r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$",
convert_tag,
md_text,
flags=re.DOTALL,
)
return md_text
def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes: def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
"""Export markdown to docx or pdf file. """Export markdown to docx or pdf file.
@@ -381,4 +651,3 @@ class Converter:
""" """
if os.path.exists(file_path): if os.path.exists(file_path):
os.remove(file_path) os.remove(file_path)

View File

@@ -17,13 +17,31 @@ settings = get_settings()
_COMMANDS_NEED_SPACE = { _COMMANDS_NEED_SPACE = {
# operators / calculus # operators / calculus
"cdot", "times", "div", "pm", "mp", "cdot",
"int", "iint", "iiint", "oint", "sum", "prod", "lim", "times",
"div",
"pm",
"mp",
"int",
"iint",
"iiint",
"oint",
"sum",
"prod",
"lim",
# common functions # common functions
"sin", "cos", "tan", "cot", "sec", "csc", "sin",
"log", "ln", "exp", "cos",
"tan",
"cot",
"sec",
"csc",
"log",
"ln",
"exp",
# misc # misc
"partial", "nabla", "partial",
"nabla",
} }
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL) _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
@@ -58,7 +76,7 @@ def _split_glued_command_token(token: str) -> str:
if not best: if not best:
return token return token
suffix = body[len(best):] suffix = body[len(best) :]
if not suffix: if not suffix:
return token return token
@@ -165,6 +183,7 @@ class OCRService(OCRServiceBase):
"markdown": markdown_content, "markdown": markdown_content,
"latex": convert_result.latex, "latex": convert_result.latex,
"mathml": convert_result.mathml, "mathml": convert_result.mathml,
"mml": convert_result.mml,
} }
except Exception as e: except Exception as e:
raise RuntimeError(f"Mixed recognition failed: {e}") from e raise RuntimeError(f"Mixed recognition failed: {e}") from e
@@ -196,6 +215,7 @@ class OCRService(OCRServiceBase):
return { return {
"latex": convert_result.latex, "latex": convert_result.latex,
"mathml": convert_result.mathml, "mathml": convert_result.mathml,
"mml": convert_result.mml,
"markdown": markdown_content, "markdown": markdown_content,
} }
except Exception as e: except Exception as e:
@@ -251,65 +271,60 @@ class MineruOCRService(OCRServiceBase):
image = self.image_processor.add_padding(image) image = self.image_processor.add_padding(image)
# Convert numpy array to image bytes # Convert numpy array to image bytes
success, encoded_image = cv2.imencode('.png', image) success, encoded_image = cv2.imencode(".png", image)
if not success: if not success:
raise RuntimeError("Failed to encode image") raise RuntimeError("Failed to encode image")
image_bytes = BytesIO(encoded_image.tobytes()) image_bytes = BytesIO(encoded_image.tobytes())
# Prepare multipart form data # Prepare multipart form data
files = { files = {"files": ("image.png", image_bytes, "image/png")}
'files': ('image.png', image_bytes, 'image/png')
}
data = { data = {
'return_middle_json': 'false', "return_middle_json": "false",
'return_model_output': 'false', "return_model_output": "false",
'return_md': 'true', "return_md": "true",
'return_images': 'false', "return_images": "false",
'end_page_id': '99999', "end_page_id": "99999",
'start_page_id': '0', "start_page_id": "0",
'lang_list': 'en', "lang_list": "en",
'server_url': 'string', "server_url": "string",
'return_content_list': 'false', "return_content_list": "false",
'backend': 'hybrid-auto-engine', "backend": "hybrid-auto-engine",
'table_enable': 'true', "table_enable": "true",
'response_format_zip': 'false', "response_format_zip": "false",
'formula_enable': 'true', "formula_enable": "true",
'parse_method': 'ocr' "parse_method": "ocr",
} }
# Make API request # Make API request
response = requests.post( response = requests.post(self.api_url, files=files, data=data, headers={"accept": "application/json"}, timeout=30)
self.api_url,
files=files,
data=data,
headers={'accept': 'application/json'},
timeout=30
)
response.raise_for_status() response.raise_for_status()
result = response.json() result = response.json()
# Extract markdown content from response # Extract markdown content from response
markdown_content = "" markdown_content = ""
if 'results' in result and 'image' in result['results']: if "results" in result and "image" in result["results"]:
markdown_content = result['results']['image'].get('md_content', '') markdown_content = result["results"]["image"].get("md_content", "")
# markdown_content = _postprocess_markdown(markdown_content) # markdown_content = _postprocess_markdown(markdown_content)
# Convert to other formats if converter is available # Convert to other formats if converter is available
latex = "" latex = ""
mathml = "" mathml = ""
mml = ""
if self.converter and markdown_content: if self.converter and markdown_content:
convert_result = self.converter.convert_to_formats(markdown_content) convert_result = self.converter.convert_to_formats(markdown_content)
latex = convert_result.latex latex = convert_result.latex
mathml = convert_result.mathml mathml = convert_result.mathml
mml = convert_result.mml
return { return {
"markdown": markdown_content, "markdown": markdown_content,
"latex": latex, "latex": latex,
"mathml": mathml, "mathml": mathml,
"mml": mml,
} }
except requests.RequestException as e: except requests.RequestException as e:
@@ -318,8 +333,6 @@ class MineruOCRService(OCRServiceBase):
raise RuntimeError(f"Recognition failed: {e}") from e raise RuntimeError(f"Recognition failed: {e}") from e
if __name__ == "__main__": if __name__ == "__main__":
mineru_service = MineruOCRService() mineru_service = MineruOCRService()
image = cv2.imread("test/complex_formula.png") image = cv2.imread("test/complex_formula.png")

View File

@@ -26,7 +26,8 @@ dependencies = [
"pypandoc==1.16.2", "pypandoc==1.16.2",
"paddlepaddle", "paddlepaddle",
"paddleocr[doc-parser]", "paddleocr[doc-parser]",
"safetensors" "safetensors",
"lxml>=5.0.0"
] ]
[tool.uv.sources] [tool.uv.sources]

57
test_converter.py Normal file
View File

@@ -0,0 +1,57 @@
"""Test script for converter functionality."""
from app.services.converter import Converter
def test_latex_only_conversion():
    """Exercise formula-only and mixed-content conversion.

    ``convert_to_formats`` returns only ``latex``, ``mathml`` and ``mml``.
    OMML is not part of that result and is obtained separately through
    ``Converter.convert_to_omml``.
    """
    converter = Converter()

    # Test case 1: Display math with $$...$$
    latex_input = "$$E = mc^2$$"
    result = converter.convert_to_formats(latex_input)
    omml = converter.convert_to_omml(result.latex)
    print("Test 1: Display math ($$...$$)")
    print(f"Input: {latex_input}")
    print(f"LaTeX: {result.latex}")
    print(f"MathML: {result.mathml[:100]}...")
    print(f"MML: {result.mml[:100]}...")
    print(f"OMML: {omml[:100] if omml else 'Empty'}...")
    print()
    assert result.latex == "E = mc^2"
    assert result.mathml
    assert result.mml

    # Test case 2: Inline math with $...$
    latex_input2 = "$\\frac{a}{b}$"
    result2 = converter.convert_to_formats(latex_input2)
    print("Test 2: Inline math ($...$)")
    print(f"Input: {latex_input2}")
    print(f"LaTeX: {result2.latex}")
    print(f"MathML: {result2.mathml[:100]}...")
    print()
    assert result2.latex == "\\frac{a}{b}"
    assert result2.mathml

    # Test case 3: Complex formula
    latex_input3 = "$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$"
    result3 = converter.convert_to_formats(latex_input3)
    omml3 = converter.convert_to_omml(result3.latex)
    print("Test 3: Complex formula")
    print(f"Input: {latex_input3}")
    print(f"LaTeX: {result3.latex}")
    print(f"MathML: {result3.mathml[:150]}...")
    print(f"OMML length: {len(omml3)}")
    print()
    assert result3.latex == "\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}"
    assert result3.mathml

    # Test case 4: Regular markdown (not LaTeX-only). Mixed content yields
    # empty fields, so there is no LaTeX to pass on for OMML conversion.
    markdown_input = "# Hello\n\nThis is a test with math: $x = 2$"
    result4 = converter.convert_to_formats(markdown_input)
    print("Test 4: Regular markdown")
    print(f"Input: {markdown_input}")
    print(f"LaTeX: {result4.latex[:100]}...")
    print(f"MathML: {result4.mathml[:100]}...")
    print(f"MML: {result4.mml}")
    print("OMML: (skipped - mixed content has no LaTeX to convert)")
    print()
    assert result4.latex == ""
    assert result4.mathml == ""
    assert result4.mml == ""
if __name__ == "__main__":
test_latex_only_conversion()