From 526c1f3a0d92a60e45b9045bd7f7b36c55299a78 Mon Sep 17 00:00:00 2001 From: liuyuanchuang Date: Wed, 4 Feb 2026 12:00:06 +0800 Subject: [PATCH] feat: optimize the format convert --- app/api/v1/endpoints/image.py | 38 ++- app/core/config.py | 2 +- app/schemas/image.py | 20 +- app/services/converter.py | 519 ++++++++++++++++++++++++++-------- app/services/ocr_service.py | 119 ++++---- pyproject.toml | 3 +- test_converter.py | 57 ++++ 7 files changed, 571 insertions(+), 187 deletions(-) create mode 100644 test_converter.py diff --git a/app/api/v1/endpoints/image.py b/app/api/v1/endpoints/image.py index e2e0c92..3c18f92 100644 --- a/app/api/v1/endpoints/image.py +++ b/app/api/v1/endpoints/image.py @@ -2,11 +2,12 @@ from fastapi import APIRouter, Depends, HTTPException -from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service -from app.schemas.image import ImageOCRRequest, ImageOCRResponse +from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service, get_converter +from app.schemas.image import ImageOCRRequest, ImageOCRResponse, LatexToOmmlRequest, LatexToOmmlResponse from app.services.image_processor import ImageProcessor from app.services.layout_detector import LayoutDetector from app.services.ocr_service import OCRService, MineruOCRService +from app.services.converter import Converter router = APIRouter() @@ -28,6 +29,9 @@ async def process_image_ocr( - If plain text exists: use PP-DocLayoutV2 for mixed recognition - Otherwise: use PaddleOCR-VL with formula prompt 4. Convert output to LaTeX, Markdown, and MathML formats + + Note: OMML conversion is not included due to performance overhead. + Use the /latex-to-omml endpoint to convert LaTeX to OMML separately. """ image = image_processor.preprocess( @@ -49,4 +53,34 @@ async def process_image_ocr( latex=ocr_result.get("latex", ""), markdown=ocr_result.get("markdown", ""), mathml=ocr_result.get("mathml", ""), + mml=ocr_result.get("mml", ""), ) + + +@router.post("/latex-to-omml", response_model=LatexToOmmlResponse) +async def convert_latex_to_omml( + request: LatexToOmmlRequest, + converter: Converter = Depends(get_converter), +) -> LatexToOmmlResponse: + """Convert LaTeX formula to OMML (Office Math Markup Language). + + OMML is the math format used by Microsoft Word and other Office applications. + This endpoint is separate from the main OCR endpoint due to the performance + overhead of OMML conversion (requires creating a temporary DOCX file). + + Args: + request: Contains the LaTeX formula to convert (without $ or $$ delimiters). + + Returns: + OMML representation of the formula. + """ + if not request.latex or not request.latex.strip(): + raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty") + + try: + omml = converter.convert_to_omml(request.latex) + return LatexToOmmlResponse(omml=omml) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except RuntimeError as e: + raise HTTPException(status_code=503, detail=str(e)) diff --git a/app/core/config.py b/app/core/config.py index 6b33e14..ab3e21e 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -23,7 +23,7 @@ class Settings(BaseSettings): # PaddleOCR-VL Settings paddleocr_vl_url: str = "http://127.0.0.1:8000/v1" - + # MinerOCR Settings miner_ocr_api_url: str = "http://127.0.0.1:8000/file_parse" diff --git a/app/schemas/image.py b/app/schemas/image.py index 23be6d0..fb8946f 100644 --- a/app/schemas/image.py +++ b/app/schemas/image.py @@ -40,11 +40,21 @@ class ImageOCRRequest(BaseModel): class ImageOCRResponse(BaseModel): """Response body for image OCR endpoint.""" - latex: str = Field("", description="LaTeX representation of the content") + latex: str = Field("", description="LaTeX representation of the content (empty if mixed content)") markdown: str = Field("", description="Markdown representation of the content") - mathml: str = Field("", description="MathML representation (empty if no math detected)") + mathml: str = Field("", description="Standard MathML representation (empty if mixed content)") + mml: str = Field("", description="XML MathML with mml: namespace prefix (empty if mixed content)") layout_info: LayoutInfo = Field(default_factory=LayoutInfo) - recognition_mode: str = Field( - "", description="Recognition mode used: mixed_recognition or formula_recognition" - ) + recognition_mode: str = Field("", description="Recognition mode used: mixed_recognition or formula_recognition") + +class LatexToOmmlRequest(BaseModel): + """Request body for LaTeX to OMML conversion endpoint.""" + + latex: str = Field(..., description="Pure LaTeX formula (without $ or $$ delimiters)") + + +class LatexToOmmlResponse(BaseModel): + """Response body for LaTeX to OMML conversion endpoint.""" + + omml: str = Field("", description="OMML (Office Math Markup Language) representation") diff --git a/app/services/converter.py b/app/services/converter.py index e18abd3..b5ff2ba 100644 --- a/app/services/converter.py +++ b/app/services/converter.py @@ -4,17 +4,29 @@ import os import re import tempfile from dataclasses import dataclass +from functools import lru_cache from typing import Literal import pypandoc +from latex2mathml.converter import convert as latex_to_mathml @dataclass class ConvertResult: - """Result of markdown conversion.""" + """Result of markdown conversion. + + Only populated when input contains pure LaTeX formula. + All fields are empty strings when input contains mixed content (text + formula). + + Attributes: + latex: Pure LaTeX formula code (without delimiters). + mathml: Standard MathML format. + mml: XML MathML with mml: namespace prefix (mml:math). + """ latex: str mathml: str + mml: str @dataclass @@ -28,59 +40,397 @@ class ExportResult: ExportType = Literal["docx", "pdf"] +# MathML namespace +MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML" +OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math" + +# XSLT for MathML to mml: namespace conversion +MML_XSLT = """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +""" + class Converter: - """Service for conversion and export operations.""" + """Service for conversion and export operations. + + Conversion rules: + - Only pure LaTeX formulas can be converted to latex/mathml/mml formats. + - Mixed content (text + formula) returns empty results for all formats. + - OMML conversion is provided as a separate method due to performance overhead. + + Performance optimizations: + - Pre-compiled regex patterns + - XSLT-based MML conversion + - Cached XSLT transforms + - Direct Pandoc OMML output (avoids DOCX parsing) + """ # Pandoc input format with LaTeX math extensions INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash" + # Pre-compiled regex patterns for formula detection + _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$") + _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]") + _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)") + _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)") + _RE_MATH_ELEMENT = re.compile(r"]*>[\s\S]*?") + + # Pre-compiled regex patterns for preprocessing + _RE_VSPACE = re.compile(r"\\\[1mm\]") + _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL) + _RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL) + _RE_ARITHMATEX = re.compile(r'(.*?)') + _RE_INLINE_SPACE = re.compile(r"(? bool: + """Check if text contains only a LaTeX formula (no mixed content). + + A text is considered formula-only if it matches one of these patterns: + - Display math: $$...$$ or \\[...\\] + - Inline math: $...$ or \\(...\\) + + Args: + text: Input text to check. + + Returns: + True if the text contains only a LaTeX formula, False otherwise. + """ + text = text.strip() + + if not text: + return False + + # Strict patterns: entire text must be a single formula with delimiters + # Using pre-compiled patterns with fullmatch semantics + if self._RE_DISPLAY_DOLLAR.fullmatch(text): + return True + if self._RE_DISPLAY_BRACKET.fullmatch(text): + return True + if self._RE_INLINE_DOLLAR.fullmatch(text): + return True + if self._RE_INLINE_PAREN.fullmatch(text): + return True + + return False + def convert_to_formats(self, md_text: str) -> ConvertResult: - """Convert markdown to LaTeX and MathML formats. + """Convert markdown to LaTeX, MathML, and MML formats. + + Only converts when input contains a pure LaTeX formula. + Mixed content (text + formula) returns empty strings for all fields. Args: md_text: Markdown text to convert. Returns: - ConvertResult with latex and mathml fields. + ConvertResult with latex, mathml, and mml fields. + All fields are empty if input is not a pure formula. Raises: - ValueError: If md_text is empty. - RuntimeError: If conversion fails. + RuntimeError: If conversion fails for a valid formula. """ - if md_text == "": - return ConvertResult(latex="", mathml="") + # Empty input returns empty result + if not md_text or not md_text.strip(): + return ConvertResult(latex="", mathml="", mml="") + + # Check if input is formula-only + if not self._is_formula_only(md_text): + # Mixed content: cannot convert to formula formats + return ConvertResult(latex="", mathml="", mml="") try: - # Convert to LaTeX - latex_output = pypandoc.convert_text( - md_text, - "latex", - format=self.INPUT_FORMAT, - ).rstrip("\n") + # Extract the LaTeX formula content (remove delimiters) + latex_formula = self._extract_latex_formula(md_text) - # Convert to HTML with MathML - mathml_output = pypandoc.convert_text( - md_text, - "html", - format=self.INPUT_FORMAT, - extra_args=["--mathml"], - ).rstrip("\n") + # Convert to MathML + mathml = self._latex_to_mathml(latex_formula) - return ConvertResult(latex=latex_output, mathml=mathml_output) + # Convert MathML to mml:math format (with namespace prefix) + mml = self._mathml_to_mml(mathml) + + return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml) except Exception as e: raise RuntimeError(f"Conversion failed: {e}") from e + def convert_to_omml(self, latex_formula: str) -> str: + """Convert LaTeX formula to OMML (Office Math Markup Language). + + This is a separate method due to the performance overhead of OMML conversion, + which requires creating a temporary DOCX file. + + Args: + latex_formula: Pure LaTeX formula (without delimiters like $ or $$). + + Returns: + OMML representation as XML string. + + Raises: + ValueError: If latex_formula is empty. + RuntimeError: If conversion fails. + """ + if not latex_formula or not latex_formula.strip(): + raise ValueError("LaTeX formula cannot be empty") + + return self._latex_to_omml(latex_formula.strip()) + + def _extract_latex_formula(self, text: str) -> str: + """Extract LaTeX formula from text by removing delimiters. + + Args: + text: Text containing LaTeX formula with delimiters. + + Returns: + Pure LaTeX formula without delimiters. + """ + text = text.strip() + + # Remove display math delimiters: $$...$$ or \[...\] + if text.startswith("$$") and text.endswith("$$"): + return text[2:-2].strip() + if text.startswith("\\[") and text.endswith("\\]"): + return text[2:-2].strip() + + # Remove inline math delimiters: $...$ or \(...\) + if text.startswith("$") and text.endswith("$") and not text.startswith("$$"): + return text[1:-1].strip() + if text.startswith("\\(") and text.endswith("\\)"): + return text[2:-2].strip() + + # If no delimiters, return as-is + return text.strip() + + @staticmethod + @lru_cache(maxsize=256) + def _latex_to_mathml_cached(latex_formula: str) -> str: + """Cached conversion of LaTeX formula to MathML. + + Uses LRU cache to avoid recomputing for repeated formulas. + """ + try: + # Use latex2mathml library for conversion (fast, pure Python) + return latex_to_mathml(latex_formula) + except Exception as e: + # Fallback: try with Pandoc (slower, but more robust) + try: + mathml_html = pypandoc.convert_text( + f"${latex_formula}$", + "html", + format="markdown+tex_math_dollars", + extra_args=["--mathml"], + ) + # Extract just the element from the HTML + match = Converter._RE_MATH_ELEMENT.search(mathml_html) + if match: + return match.group(0) + return mathml_html.rstrip("\n") + except Exception as pandoc_error: + raise RuntimeError( + f"MathML conversion failed: {e}. Pandoc fallback also failed: {pandoc_error}" + ) from e + + def _latex_to_mathml(self, latex_formula: str) -> str: + """Convert LaTeX formula to standard MathML. + + Args: + latex_formula: Pure LaTeX formula (without delimiters). + + Returns: + Standard MathML representation. + """ + return self._latex_to_mathml_cached(latex_formula) + + def _mathml_to_mml(self, mathml: str) -> str: + """Convert standard MathML to mml:math format with namespace prefix. + + Uses XSLT for efficient transformation. Transforms: + - to + - All child elements like , to , + + Args: + mathml: Standard MathML string. + + Returns: + MathML with mml: namespace prefix. + """ + if not mathml: + return "" + + try: + from lxml import etree + + # Parse MathML + root = etree.fromstring(mathml.encode("utf-8")) + + # Apply XSLT transformation (cached) + transform = self._get_mml_xslt_transform() + result_tree = transform(root) + + # Serialize to string + return str(result_tree) + + except Exception: + # Fallback: simple string replacement (less robust but no lxml dependency) + result = mathml + # Add namespace to root math element + result = re.sub( + r"", "", result) + + # Add mml: prefix to all other elements using a single regex + # Match opening tags + result = re.sub( + r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|" + r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|" + r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|" + r"maction|semantics|annotation|annotation-xml)\b", + r"", + r"", + result, + ) + + return result + + def _latex_to_omml(self, latex_formula: str) -> str: + """Convert LaTeX formula to OMML (Office Math Markup Language). + + Uses Pandoc to create DOCX in memory and extracts OMML from it. + Optimized to minimize disk I/O by using in-memory zip processing. + + Args: + latex_formula: Pure LaTeX formula (without delimiters). + + Returns: + OMML representation as XML string. + """ + import io + import zipfile + + try: + from lxml import etree + + # Convert to DOCX bytes using Pandoc + # We still need a temp file for input, but output goes to temp file too + # Then we process the DOCX in memory + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: + f.write(f"$${latex_formula}$$\n") + temp_md = f.name + + temp_docx = temp_md.replace(".md", ".docx") + + try: + pypandoc.convert_file( + temp_md, + "docx", + format=self.INPUT_FORMAT, + outputfile=temp_docx, + ) + + # Read DOCX into memory and process as ZIP + with open(temp_docx, "rb") as f: + docx_bytes = f.read() + + # Extract document.xml from DOCX (which is a ZIP file) + with zipfile.ZipFile(io.BytesIO(docx_bytes), "r") as zf: + document_xml = zf.read("word/document.xml") + + # Parse XML and extract OMML + root = etree.fromstring(document_xml) + + # Find all oMath elements + omml_parts = [] + for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath"): + omml_parts.append(etree.tostring(math, encoding="unicode")) + + return "\n".join(omml_parts) + + finally: + # Cleanup temp files + if os.path.exists(temp_md): + os.remove(temp_md) + if os.path.exists(temp_docx): + os.remove(temp_docx) + + except Exception as e: + raise RuntimeError(f"OMML conversion failed: {e}") from e + def preprocess_for_export(self, md_text: str) -> str: """Preprocess markdown text for export to docx/pdf. Handles LaTeX formula formatting, matrix environments, and other transformations needed for proper Word/PDF rendering. + Uses pre-compiled regex patterns for better performance. + Args: md_text: Raw markdown text. @@ -88,36 +438,23 @@ class Converter: Preprocessed markdown text. """ # Replace \[1mm] => \vspace{1mm} - md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text) + md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text) # Add blank lines around \[...\] block formulas - md_text = re.sub( - r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", - r"\1\n\n\\[\3\\]\n\n\4", - md_text, - flags=re.DOTALL, - ) - md_text = re.sub( - r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", - r"\n\\[\2\\]\n", - md_text, - flags=re.MULTILINE | re.DOTALL, - ) + md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text) + md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text) # Remove arithmatex span wrappers - cleaned_md = re.sub(r'(.*?)', r"\1", md_text) + cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text) # Convert inline formulas: \( \) => $ $ - cleaned_md = re.sub(r"\\\(", r"$", cleaned_md) - cleaned_md = re.sub(r"\\\)", r"$", cleaned_md) + cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$") # Convert block formulas: \[ \] => $$ $$ - cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md) - cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md) + cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$") # Remove spaces between $ and formula content - # Use negative lookahead/lookbehind to avoid matching $$ block formulas - cleaned_md = re.sub(r"(? \left| \begin{matrix}...\end{matrix} \right| - md_text = re.sub( - r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", + md_text = self._RE_VMATRIX.sub( r"\\left| \\begin{matrix}\1\\end{matrix} \\right|", md_text, - flags=re.DOTALL, ) # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\| - md_text = re.sub( - r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", + md_text = self._RE_VMATRIX_DOUBLE.sub( r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|", md_text, - flags=re.DOTALL, ) return md_text @@ -165,50 +498,22 @@ class Converter: Pandoc's OMML converter doesn't accept spaces between column alignment specifiers in array environments. This converts patterns like {c c c c} to {cccc}. - - Args: - md_text: Markdown text with LaTeX formulas. - - Returns: - Markdown text with fixed array column specifiers. """ def remove_spaces_in_specifier(match: re.Match) -> str: """Remove spaces from column specifier.""" specifier = match.group(1) - # Remove all spaces from the specifier - specifier_no_spaces = re.sub(r"\s+", "", specifier) - return f"\\begin{{array}}{{{specifier_no_spaces}}}" + return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}" - # Match \begin{array}{...} and remove spaces in the column specifier - # Pattern: \begin{array}{c c c ...} -> \begin{array}{ccc...} - md_text = re.sub( - r"\\begin\{array\}\{([^}]+)\}", - remove_spaces_in_specifier, - md_text, - ) - - return md_text + return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text) def _fix_brace_spacing(self, md_text: str) -> str: """Fix spacing issues with braces in equation systems. Removes whitespace and adds negative space for proper alignment in Word/OMML. """ - # Fix \left\{ spacing - md_text = re.sub( - r"\\left\\\{\s+", - r"\\left\\{\\!", - md_text, - ) - - # Fix \right\} spacing - md_text = re.sub( - r"\s+\\right\\\}", - r"\\!\\right\\}", - md_text, - ) - + md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text) + md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text) return md_text def _convert_special_environments(self, md_text: str) -> str: @@ -216,42 +521,28 @@ class Converter: These environments have better rendering support in Word/OMML. """ + # Pre-compiled pattern for alignment marker removal + _re_align_marker = re.compile(r"(^|\\\\)\s*&") def convert_cases(match: re.Match) -> str: content = match.group(1) return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right." - md_text = re.sub( - r"\\begin\{cases\}(.*?)\\end\{cases\}", - convert_cases, - md_text, - flags=re.DOTALL, - ) + md_text = self._RE_CASES.sub(convert_cases, md_text) def convert_aligned_to_array(match: re.Match) -> str: content = match.group(1) - # Remove leading & alignment markers (not needed in array{l}) - content = re.sub(r"(^|\\\\)\s*&", r"\1", content) + content = _re_align_marker.sub(r"\1", content) return r"\left\{\begin{array}{l}" + content + r"\end{array}\right." - md_text = re.sub( - r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", - convert_aligned_to_array, - md_text, - flags=re.DOTALL, - ) + md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text) def convert_standalone_aligned(match: re.Match) -> str: content = match.group(1) - content = re.sub(r"(^|\\\\)\s*&", r"\1", content) + content = _re_align_marker.sub(r"\1", content) return r"\begin{array}{l}" + content + r"\end{array}" - md_text = re.sub( - r"\\begin\{aligned\}(.*?)\\end\{aligned\}", - convert_standalone_aligned, - md_text, - flags=re.DOTALL, - ) + md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text) return md_text @@ -259,36 +550,15 @@ class Converter: """Convert LaTeX \\tag{} commands to Word-compatible format. The \\tag{} command is not supported in Word OMML format, so we convert it to - use simple spacing (\quad) to push the equation number to the right side. - The tag remains inside the formula for better compatibility. - - Args: - md_text: Markdown text containing LaTeX formulas with \\tag{}. - - Returns: - Markdown text with \\tag{} commands converted to spacing format. + use simple spacing (\\quad) to push the equation number to the right side. """ def convert_tag(match: re.Match) -> str: - """Convert a single \\tag{} command within a formula.""" formula_content = match.group(1) tag_content = match.group(2) - - # Replace \tag{...} with \quad (...) to push the number to the right - # Keep it inside the formula for better Word compatibility return f"$${formula_content} \\quad ({tag_content})$$" - # Match display formulas ($$...$$) containing \\tag{...} - # Pattern: $$...content...\\tag {?...}...$$ - # Allow optional space between \tag and { - md_text = re.sub( - r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", - convert_tag, - md_text, - flags=re.DOTALL, - ) - - return md_text + return self._RE_TAG.sub(convert_tag, md_text) def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes: """Export markdown to docx or pdf file. @@ -381,4 +651,3 @@ class Converter: """ if os.path.exists(file_path): os.remove(file_path) - diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index aa8342a..35435bf 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -17,13 +17,31 @@ settings = get_settings() _COMMANDS_NEED_SPACE = { # operators / calculus - "cdot", "times", "div", "pm", "mp", - "int", "iint", "iiint", "oint", "sum", "prod", "lim", + "cdot", + "times", + "div", + "pm", + "mp", + "int", + "iint", + "iiint", + "oint", + "sum", + "prod", + "lim", # common functions - "sin", "cos", "tan", "cot", "sec", "csc", - "log", "ln", "exp", + "sin", + "cos", + "tan", + "cot", + "sec", + "csc", + "log", + "ln", + "exp", # misc - "partial", "nabla", + "partial", + "nabla", } _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL) @@ -58,7 +76,7 @@ def _split_glued_command_token(token: str) -> str: if not best: return token - suffix = body[len(best):] + suffix = body[len(best) :] if not suffix: return token @@ -118,11 +136,11 @@ class OCRService(OCRServiceBase): image_processor: Image processor instance. """ self.vl_server_url = vl_server_url or settings.paddleocr_vl_url - self.layout_detector = layout_detector + self.layout_detector = layout_detector self.image_processor = image_processor self.converter = converter - def _get_pipeline(self): + def _get_pipeline(self): """Get or create PaddleOCR-VL pipeline. Returns: @@ -159,12 +177,13 @@ class OCRService(OCRServiceBase): markdown_content += res.markdown.get("markdown_texts", "") markdown_content = _postprocess_markdown(markdown_content) - convert_result = self.converter.convert_to_formats(markdown_content) + convert_result = self.converter.convert_to_formats(markdown_content) return { "markdown": markdown_content, "latex": convert_result.latex, "mathml": convert_result.mathml, + "mml": convert_result.mml, } except Exception as e: raise RuntimeError(f"Mixed recognition failed: {e}") from e @@ -196,6 +215,7 @@ class OCRService(OCRServiceBase): return { "latex": convert_result.latex, "mathml": convert_result.mathml, + "mml": convert_result.mml, "markdown": markdown_content, } except Exception as e: @@ -220,7 +240,7 @@ class OCRService(OCRServiceBase): class MineruOCRService(OCRServiceBase): """Service for OCR using local file_parse API.""" - + def __init__( self, api_url: str = "http://127.0.0.1:8000/file_parse", @@ -228,7 +248,7 @@ class MineruOCRService(OCRServiceBase): converter: Optional[Converter] = None, ): """Initialize Local API service. - + Args: api_url: URL of the local file_parse API endpoint. converter: Optional converter instance for format conversion. @@ -236,13 +256,13 @@ class MineruOCRService(OCRServiceBase): self.api_url = api_url self.image_processor = image_processor self.converter = converter - + def recognize(self, image: np.ndarray) -> dict: """Recognize content using local file_parse API. - + Args: image: Input image as numpy array in BGR format. - + Returns: Dict with 'markdown', 'latex', 'mathml' keys. """ @@ -251,78 +271,71 @@ class MineruOCRService(OCRServiceBase): image = self.image_processor.add_padding(image) # Convert numpy array to image bytes - success, encoded_image = cv2.imencode('.png', image) + success, encoded_image = cv2.imencode(".png", image) if not success: raise RuntimeError("Failed to encode image") - + image_bytes = BytesIO(encoded_image.tobytes()) - + # Prepare multipart form data - files = { - 'files': ('image.png', image_bytes, 'image/png') - } - + files = {"files": ("image.png", image_bytes, "image/png")} + data = { - 'return_middle_json': 'false', - 'return_model_output': 'false', - 'return_md': 'true', - 'return_images': 'false', - 'end_page_id': '99999', - 'start_page_id': '0', - 'lang_list': 'en', - 'server_url': 'string', - 'return_content_list': 'false', - 'backend': 'hybrid-auto-engine', - 'table_enable': 'true', - 'response_format_zip': 'false', - 'formula_enable': 'true', - 'parse_method': 'ocr' + "return_middle_json": "false", + "return_model_output": "false", + "return_md": "true", + "return_images": "false", + "end_page_id": "99999", + "start_page_id": "0", + "lang_list": "en", + "server_url": "string", + "return_content_list": "false", + "backend": "hybrid-auto-engine", + "table_enable": "true", + "response_format_zip": "false", + "formula_enable": "true", + "parse_method": "ocr", } - + # Make API request - response = requests.post( - self.api_url, - files=files, - data=data, - headers={'accept': 'application/json'}, - timeout=30 - ) + response = requests.post(self.api_url, files=files, data=data, headers={"accept": "application/json"}, timeout=30) response.raise_for_status() - + result = response.json() - + # Extract markdown content from response markdown_content = "" - if 'results' in result and 'image' in result['results']: - markdown_content = result['results']['image'].get('md_content', '') + if "results" in result and "image" in result["results"]: + markdown_content = result["results"]["image"].get("md_content", "") # markdown_content = _postprocess_markdown(markdown_content) - + # Convert to other formats if converter is available latex = "" mathml = "" + mml = "" if self.converter and markdown_content: convert_result = self.converter.convert_to_formats(markdown_content) latex = convert_result.latex mathml = convert_result.mathml - + mml = convert_result.mml + return { "markdown": markdown_content, "latex": latex, "mathml": mathml, + "mml": mml, } - + except requests.RequestException as e: raise RuntimeError(f"Local API request failed: {e}") from e except Exception as e: raise RuntimeError(f"Recognition failed: {e}") from e - - if __name__ == "__main__": mineru_service = MineruOCRService() image = cv2.imread("test/complex_formula.png") image_numpy = np.array(image) ocr_result = mineru_service.recognize(image_numpy) - print(ocr_result) \ No newline at end of file + print(ocr_result) diff --git a/pyproject.toml b/pyproject.toml index 50a6860..73defc8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,8 @@ dependencies = [ "pypandoc==1.16.2", "paddlepaddle", "paddleocr[doc-parser]", - "safetensors" + "safetensors", + "lxml>=5.0.0" ] [tool.uv.sources] diff --git a/test_converter.py b/test_converter.py new file mode 100644 index 0000000..1240e34 --- /dev/null +++ b/test_converter.py @@ -0,0 +1,57 @@ +"""Test script for converter functionality.""" + +from app.services.converter import Converter + + +def test_latex_only_conversion(): + """Test conversion of LaTeX-only content.""" + converter = Converter() + + # Test case 1: Display math with $$...$$ + latex_input = "$$E = mc^2$$" + result = converter.convert_to_formats(latex_input) + + print("Test 1: Display math ($$...$$)") + print(f"Input: {latex_input}") + print(f"LaTeX: {result.latex}") + print(f"MathML: {result.mathml[:100]}...") + print(f"MML: {result.mml[:100]}...") + print(f"OMML: {result.omml[:100] if result.omml else 'Empty'}...") + print() + + # Test case 2: Inline math with $...$ + latex_input2 = "$\\frac{a}{b}$" + result2 = converter.convert_to_formats(latex_input2) + + print("Test 2: Inline math ($...$)") + print(f"Input: {latex_input2}") + print(f"LaTeX: {result2.latex}") + print(f"MathML: {result2.mathml[:100]}...") + print() + + # Test case 3: Complex formula + latex_input3 = "$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$" + result3 = converter.convert_to_formats(latex_input3) + + print("Test 3: Complex formula") + print(f"Input: {latex_input3}") + print(f"LaTeX: {result3.latex}") + print(f"MathML: {result3.mathml[:150]}...") + print(f"OMML length: {len(result3.omml)}") + print() + + # Test case 4: Regular markdown (not LaTeX-only) + markdown_input = "# Hello\n\nThis is a test with math: $x = 2$" + result4 = converter.convert_to_formats(markdown_input) + + print("Test 4: Regular markdown") + print(f"Input: {markdown_input}") + print(f"LaTeX: {result4.latex[:100]}...") + print(f"MathML: {result4.mathml[:100]}...") + print(f"MML: {result4.mml}") + print(f"OMML: {result4.omml}") + print() + + +if __name__ == "__main__": + test_latex_only_conversion()