From 1a4d54ce344fc5cd63e2df2d263e8d420dc22ce8 Mon Sep 17 00:00:00 2001 From: liuyuanchuang Date: Sat, 7 Feb 2026 21:28:46 +0800 Subject: [PATCH] fix: post hanlde for ocr --- app/services/converter.py | 472 +++++++++++++++++++----------------- app/services/ocr_service.py | 36 ++- 2 files changed, 281 insertions(+), 227 deletions(-) diff --git a/app/services/converter.py b/app/services/converter.py index b2b02a3..792fac4 100644 --- a/app/services/converter.py +++ b/app/services/converter.py @@ -136,6 +136,7 @@ class Converter: """Get cached XSLT transform for MathML to mml: conversion.""" if cls._mml_xslt_transform is None: from lxml import etree + xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8")) cls._mml_xslt_transform = etree.XSLT(xslt_doc) return cls._mml_xslt_transform @@ -197,14 +198,17 @@ class Converter: return ConvertResult(latex="", mathml="", mml="") try: + # Detect if formula is display (block) or inline + is_display = self._is_display_formula(md_text) + # Extract the LaTeX formula content (remove delimiters) latex_formula = self._extract_latex_formula(md_text) # Preprocess formula for better conversion (fix array specifiers, etc.) preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula) - # Convert to MathML - mathml = self._latex_to_mathml(preprocessed_formula) + # Convert to MathML (pass display flag to use correct delimiters) + mathml = self._latex_to_mathml(preprocessed_formula, is_display=is_display) # Convert MathML to mml:math format (with namespace prefix) mml = self._mathml_to_mml(mathml) @@ -238,18 +242,18 @@ class Converter: # Preprocess formula using the same preprocessing as export preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip()) - + return self._latex_to_omml(preprocessed) def _preprocess_formula_for_conversion(self, latex_formula: str) -> str: """Preprocess LaTeX formula for any conversion (MathML, OMML, etc.). Applies the same preprocessing steps as preprocess_for_export to ensure - consistency across all conversion paths. This fixes common issues that + consistency across all conversion paths. This fixes common issues that cause Pandoc conversion to fail. - Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py), - so we don't need to handle them here. + Note: OCR errors (number errors, command spacing) are fixed earlier in the + pipeline (in ocr_service.py), so we don't need to handle them here. Args: latex_formula: Pure LaTeX formula. @@ -259,18 +263,38 @@ class Converter: """ # 1. Convert matrix environments latex_formula = self._convert_matrix_environments(latex_formula) - + # 2. Fix array column specifiers (remove spaces) latex_formula = self._fix_array_column_specifiers(latex_formula) - + # 3. Fix brace spacing latex_formula = self._fix_brace_spacing(latex_formula) - + # 4. Convert special environments (cases, aligned) latex_formula = self._convert_special_environments(latex_formula) - + return latex_formula + def _is_display_formula(self, text: str) -> bool: + """Check if the formula is a display (block) formula. + + Args: + text: Text containing LaTeX formula with delimiters. + + Returns: + True if display formula ($$...$$ or \\[...\\]), False if inline. + """ + text = text.strip() + + # Display math delimiters: $$...$$ or \[...\] + if text.startswith("$$") and text.endswith("$$"): + return True + if text.startswith("\\[") and text.endswith("\\]"): + return True + + # Inline math delimiters: $...$ or \(...\) + return False + def _extract_latex_formula(self, text: str) -> str: """Extract LaTeX formula from text by removing delimiters. @@ -299,18 +323,30 @@ class Converter: @staticmethod @lru_cache(maxsize=256) - def _latex_to_mathml_cached(latex_formula: str) -> str: + def _latex_to_mathml_cached(latex_formula: str, is_display: bool = False) -> str: """Cached conversion of LaTeX formula to MathML. Uses Pandoc for conversion to ensure Word compatibility. Pandoc generates standard MathML that Word can properly import. - Uses LRU cache to avoid recomputing for repeated formulas. + Args: + latex_formula: Pure LaTeX formula (without delimiters). + is_display: True if display (block) formula, False if inline. + + Returns: + Standard MathML representation. """ + # Use appropriate delimiters based on formula type + # Display formulas use $$...$$, inline formulas use $...$ + if is_display: + pandoc_input = f"$${latex_formula}$$" + else: + pandoc_input = f"${latex_formula}$" + try: # Use Pandoc for Word-compatible MathML (primary method) mathml_html = pypandoc.convert_text( - f"${latex_formula}$", + pandoc_input, "html", format="markdown+tex_math_dollars", extra_args=["--mathml"], @@ -321,24 +357,23 @@ class Converter: mathml = match.group(0) # Post-process for Word compatibility return Converter._postprocess_mathml_for_word(mathml) - - # If no match, return as-is - return mathml_html.rstrip("\n") - + + # If Pandoc didn't generate MathML (returned HTML instead), use fallback + # This happens when Pandoc's mathml output format is not available or fails + raise ValueError("Pandoc did not generate MathML, got HTML instead") + except Exception as pandoc_error: # Fallback: try latex2mathml (less Word-compatible) try: mathml = latex_to_mathml(latex_formula) return Converter._postprocess_mathml_for_word(mathml) except Exception as e: - raise RuntimeError( - f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}" - ) from e - + raise RuntimeError(f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}") from e + @staticmethod def _postprocess_mathml_for_word(mathml: str) -> str: """Post-process MathML to improve Word compatibility. - + Applies transformations to make MathML more compatible and concise: - Remove and wrappers (Word doesn't need them) - Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.) @@ -346,32 +381,32 @@ class Converter: - Change display="inline" to display="block" for better rendering - Decode Unicode entities to actual characters (Word prefers this) - Ensure proper namespace - + Args: mathml: MathML string. - + Returns: Simplified, Word-compatible MathML string. """ import re - + # Step 1: Remove and wrappers # These often cause Word import issues - if '' in mathml: + if "" in mathml: # Extract content between and - match = re.search(r'(.*?)(.*?)]*)>', mathml) + math_match = re.search(r"]*)>", mathml) if math_match: math_attrs = math_match.group(1) - + # Rebuild without semantics - mathml = f'{content}' - + mathml = f"{content}" + # Step 2: Remove unnecessary attributes that don't affect rendering # These are verbose and Word doesn't need them unnecessary_attrs = [ @@ -390,234 +425,231 @@ class Converter: r'\s+class="[^"]*"', r'\s+style="[^"]*"', ] - + for attr_pattern in unnecessary_attrs: - mathml = re.sub(attr_pattern, '', mathml) - + mathml = re.sub(attr_pattern, "", mathml) + # Step 3: Remove redundant single wrapper at the top level # Pattern: content # Simplify to: content - mrow_pattern = r'(]*>)\s*(.*?)\s*()' + mrow_pattern = r"(]*>)\s*(.*?)\s*()" match = re.search(mrow_pattern, mathml, re.DOTALL) if match: # Check if there's only one mrow at the top level content = match.group(2) # Only remove if the content doesn't have other top-level elements - if not re.search(r']+>\s*<[^/]', content): - mathml = f'{match.group(1)}{content}{match.group(3)}' - + if not re.search(r"]+>\s*<[^/]", content): + mathml = f"{match.group(1)}{content}{match.group(3)}" + # Step 4: Change display to block for better Word rendering mathml = mathml.replace('display="inline"', 'display="block"') - + # Step 5: If no display attribute, add it - if 'display=' not in mathml and '', - '(': '(', - ')': ')', - ',': ',', - '.': '.', - '|': '|', - '°': '°', - '×': '×', # times - '÷': '÷', # div - '±': '±', # pm - '∓': '∓', # mp - + "+": "+", + "-": "-", + "*": "*", + "/": "/", + "=": "=", + "<": "<", + ">": ">", + "(": "(", + ")": ")", + ",": ",", + ".": ".", + "|": "|", + "°": "°", + "×": "×", # times + "÷": "÷", # div + "±": "±", # pm + "∓": "∓", # mp # Ellipsis symbols - '…': '…', # ldots (horizontal) - '⋮': '⋮', # vdots (vertical) - '⋯': '⋯', # cdots (centered) - '⋰': '⋰', # iddots (diagonal up) - '⋱': '⋱', # ddots (diagonal down) - + "…": "…", # ldots (horizontal) + "⋮": "⋮", # vdots (vertical) + "⋯": "⋯", # cdots (centered) + "⋰": "⋰", # iddots (diagonal up) + "⋱": "⋱", # ddots (diagonal down) # Greek letters (lowercase) - 'α': 'α', # alpha - 'β': 'β', # beta - 'γ': 'γ', # gamma - 'δ': 'δ', # delta - 'ε': 'ε', # epsilon - 'ζ': 'ζ', # zeta - 'η': 'η', # eta - 'θ': 'θ', # theta - 'ι': 'ι', # iota - 'κ': 'κ', # kappa - 'λ': 'λ', # lambda - 'μ': 'μ', # mu - 'ν': 'ν', # nu - 'ξ': 'ξ', # xi - 'ο': 'ο', # omicron - 'π': 'π', # pi - 'ρ': 'ρ', # rho - 'ς': 'ς', # final sigma - 'σ': 'σ', # sigma - 'τ': 'τ', # tau - 'υ': 'υ', # upsilon - 'φ': 'φ', # phi - 'χ': 'χ', # chi - 'ψ': 'ψ', # psi - 'ω': 'ω', # omega - 'ϕ': 'ϕ', # phi variant - + "α": "α", # alpha + "β": "β", # beta + "γ": "γ", # gamma + "δ": "δ", # delta + "ε": "ε", # epsilon + "ζ": "ζ", # zeta + "η": "η", # eta + "θ": "θ", # theta + "ι": "ι", # iota + "κ": "κ", # kappa + "λ": "λ", # lambda + "μ": "μ", # mu + "ν": "ν", # nu + "ξ": "ξ", # xi + "ο": "ο", # omicron + "π": "π", # pi + "ρ": "ρ", # rho + "ς": "ς", # final sigma + "σ": "σ", # sigma + "τ": "τ", # tau + "υ": "υ", # upsilon + "φ": "φ", # phi + "χ": "χ", # chi + "ψ": "ψ", # psi + "ω": "ω", # omega + "ϕ": "ϕ", # phi variant # Greek letters (uppercase) - 'Α': 'Α', # Alpha - 'Β': 'Β', # Beta - 'Γ': 'Γ', # Gamma - 'Δ': 'Δ', # Delta - 'Ε': 'Ε', # Epsilon - 'Ζ': 'Ζ', # Zeta - 'Η': 'Η', # Eta - 'Θ': 'Θ', # Theta - 'Ι': 'Ι', # Iota - 'Κ': 'Κ', # Kappa - 'Λ': 'Λ', # Lambda - 'Μ': 'Μ', # Mu - 'Ν': 'Ν', # Nu - 'Ξ': 'Ξ', # Xi - 'Ο': 'Ο', # Omicron - 'Π': 'Π', # Pi - 'Ρ': 'Ρ', # Rho - 'Σ': 'Σ', # Sigma - 'Τ': 'Τ', # Tau - 'Υ': 'Υ', # Upsilon - 'Φ': 'Φ', # Phi - 'Χ': 'Χ', # Chi - 'Ψ': 'Ψ', # Psi - 'Ω': 'Ω', # Omega - + "Α": "Α", # Alpha + "Β": "Β", # Beta + "Γ": "Γ", # Gamma + "Δ": "Δ", # Delta + "Ε": "Ε", # Epsilon + "Ζ": "Ζ", # Zeta + "Η": "Η", # Eta + "Θ": "Θ", # Theta + "Ι": "Ι", # Iota + "Κ": "Κ", # Kappa + "Λ": "Λ", # Lambda + "Μ": "Μ", # Mu + "Ν": "Ν", # Nu + "Ξ": "Ξ", # Xi + "Ο": "Ο", # Omicron + "Π": "Π", # Pi + "Ρ": "Ρ", # Rho + "Σ": "Σ", # Sigma + "Τ": "Τ", # Tau + "Υ": "Υ", # Upsilon + "Φ": "Φ", # Phi + "Χ": "Χ", # Chi + "Ψ": "Ψ", # Psi + "Ω": "Ω", # Omega # Math symbols - '∅': '∅', # emptyset - '∈': '∈', # in - '∉': '∉', # notin - '∋': '∋', # ni - '∌': '∌', # nni - '∑': '∑', # sum - '∏': '∏', # prod - '√': '√', # sqrt - '∛': '∛', # cbrt - '∜': '∜', # fourthroot - '∞': '∞', # infty - '∩': '∩', # cap - '∪': '∪', # cup - '∫': '∫', # int - '∬': '∬', # iint - '∭': '∭', # iiint - '∮': '∮', # oint - '⊂': '⊂', # subset - '⊃': '⊃', # supset - '⊄': '⊄', # nsubset - '⊅': '⊅', # nsupset - '⊆': '⊆', # subseteq - '⊇': '⊇', # supseteq - '⊈': '⊈', # nsubseteq - '⊉': '⊉', # nsupseteq - '≤': '≤', # leq - '≥': '≥', # geq - '≠': '≠', # neq - '≡': '≡', # equiv - '≈': '≈', # approx - '≃': '≃', # simeq - '≅': '≅', # cong - '∂': '∂', # partial - '∇': '∇', # nabla - '∀': '∀', # forall - '∃': '∃', # exists - '∄': '∄', # nexists - '¬': '¬', # neg/lnot - '∧': '∧', # wedge/land - '∨': '∨', # vee/lor - '→': '→', # to/rightarrow - '←': '←', # leftarrow - '↔': '↔', # leftrightarrow - '⇒': '⇒', # Rightarrow - '⇐': '⇐', # Leftarrow - '⇔': '⇔', # Leftrightarrow - '↑': '↑', # uparrow - '↓': '↓', # downarrow - '⇑': '⇑', # Uparrow - '⇓': '⇓', # Downarrow - '↕': '↕', # updownarrow - '⇕': '⇕', # Updownarrow - '≠': '≠', # ne - '≪': '≪', # ll - '≫': '≫', # gg - '⩽': '⩽', # leqslant - '⩾': '⩾', # geqslant - '⊥': '⊥', # perp - '∥': '∥', # parallel - '∠': '∠', # angle - '△': '△', # triangle - '□': '□', # square - '◊': '◊', # diamond - '♠': '♠', # spadesuit - '♡': '♡', # heartsuit - '♢': '♢', # diamondsuit - '♣': '♣', # clubsuit - 'ℓ': 'ℓ', # ell - '℘': '℘', # wp (Weierstrass p) - 'ℜ': 'ℜ', # Re (real part) - 'ℑ': 'ℑ', # Im (imaginary part) - 'ℵ': 'ℵ', # aleph - 'ℶ': 'ℶ', # beth + "∅": "∅", # emptyset + "∈": "∈", # in + "∉": "∉", # notin + "∋": "∋", # ni + "∌": "∌", # nni + "∑": "∑", # sum + "∏": "∏", # prod + "√": "√", # sqrt + "∛": "∛", # cbrt + "∜": "∜", # fourthroot + "∞": "∞", # infty + "∩": "∩", # cap + "∪": "∪", # cup + "∫": "∫", # int + "∬": "∬", # iint + "∭": "∭", # iiint + "∮": "∮", # oint + "⊂": "⊂", # subset + "⊃": "⊃", # supset + "⊄": "⊄", # nsubset + "⊅": "⊅", # nsupset + "⊆": "⊆", # subseteq + "⊇": "⊇", # supseteq + "⊈": "⊈", # nsubseteq + "⊉": "⊉", # nsupseteq + "≤": "≤", # leq + "≥": "≥", # geq + "≠": "≠", # neq + "≡": "≡", # equiv + "≈": "≈", # approx + "≃": "≃", # simeq + "≅": "≅", # cong + "∂": "∂", # partial + "∇": "∇", # nabla + "∀": "∀", # forall + "∃": "∃", # exists + "∄": "∄", # nexists + "¬": "¬", # neg/lnot + "∧": "∧", # wedge/land + "∨": "∨", # vee/lor + "→": "→", # to/rightarrow + "←": "←", # leftarrow + "↔": "↔", # leftrightarrow + "⇒": "⇒", # Rightarrow + "⇐": "⇐", # Leftarrow + "⇔": "⇔", # Leftrightarrow + "↑": "↑", # uparrow + "↓": "↓", # downarrow + "⇑": "⇑", # Uparrow + "⇓": "⇓", # Downarrow + "↕": "↕", # updownarrow + "⇕": "⇕", # Updownarrow + "≠": "≠", # ne + "≪": "≪", # ll + "≫": "≫", # gg + "⩽": "⩽", # leqslant + "⩾": "⩾", # geqslant + "⊥": "⊥", # perp + "∥": "∥", # parallel + "∠": "∠", # angle + "△": "△", # triangle + "□": "□", # square + "◊": "◊", # diamond + "♠": "♠", # spadesuit + "♡": "♡", # heartsuit + "♢": "♢", # diamondsuit + "♣": "♣", # clubsuit + "ℓ": "ℓ", # ell + "℘": "℘", # wp (Weierstrass p) + "ℜ": "ℜ", # Re (real part) + "ℑ": "ℑ", # Im (imaginary part) + "ℵ": "ℵ", # aleph + "ℶ": "ℶ", # beth } - + for entity, char in unicode_map.items(): mathml = mathml.replace(entity, char) - + # Also handle decimal entity format (&#NNNN;) for common characters # Convert decimal to hex-based lookup decimal_patterns = [ - (r'λ', 'λ'), # lambda (decimal 955 = hex 03BB) - (r'⋮', '⋮'), # vdots (decimal 8942 = hex 22EE) - (r'⋯', '⋯'), # cdots (decimal 8943 = hex 22EF) - (r'…', '…'), # ldots (decimal 8230 = hex 2026) - (r'∞', '∞'), # infty (decimal 8734 = hex 221E) - (r'∑', '∑'), # sum (decimal 8721 = hex 2211) - (r'∏', '∏'), # prod (decimal 8719 = hex 220F) - (r'√', '√'), # sqrt (decimal 8730 = hex 221A) - (r'∈', '∈'), # in (decimal 8712 = hex 2208) - (r'∉', '∉'), # notin (decimal 8713 = hex 2209) - (r'∩', '∩'), # cap (decimal 8745 = hex 2229) - (r'∪', '∪'), # cup (decimal 8746 = hex 222A) - (r'≤', '≤'), # leq (decimal 8804 = hex 2264) - (r'≥', '≥'), # geq (decimal 8805 = hex 2265) - (r'≠', '≠'), # neq (decimal 8800 = hex 2260) - (r'≈', '≈'), # approx (decimal 8776 = hex 2248) - (r'≡', '≡'), # equiv (decimal 8801 = hex 2261) + (r"λ", "λ"), # lambda (decimal 955 = hex 03BB) + (r"⋮", "⋮"), # vdots (decimal 8942 = hex 22EE) + (r"⋯", "⋯"), # cdots (decimal 8943 = hex 22EF) + (r"…", "…"), # ldots (decimal 8230 = hex 2026) + (r"∞", "∞"), # infty (decimal 8734 = hex 221E) + (r"∑", "∑"), # sum (decimal 8721 = hex 2211) + (r"∏", "∏"), # prod (decimal 8719 = hex 220F) + (r"√", "√"), # sqrt (decimal 8730 = hex 221A) + (r"∈", "∈"), # in (decimal 8712 = hex 2208) + (r"∉", "∉"), # notin (decimal 8713 = hex 2209) + (r"∩", "∩"), # cap (decimal 8745 = hex 2229) + (r"∪", "∪"), # cup (decimal 8746 = hex 222A) + (r"≤", "≤"), # leq (decimal 8804 = hex 2264) + (r"≥", "≥"), # geq (decimal 8805 = hex 2265) + (r"≠", "≠"), # neq (decimal 8800 = hex 2260) + (r"≈", "≈"), # approx (decimal 8776 = hex 2248) + (r"≡", "≡"), # equiv (decimal 8801 = hex 2261) ] - + for pattern, char in decimal_patterns: mathml = mathml.replace(pattern, char) - + # Step 8: Clean up extra whitespace - mathml = re.sub(r'>\s+<', '><', mathml) - + mathml = re.sub(r">\s+<", "><", mathml) + return mathml - def _latex_to_mathml(self, latex_formula: str) -> str: + def _latex_to_mathml(self, latex_formula: str, is_display: bool = False) -> str: """Convert LaTeX formula to standard MathML. Args: latex_formula: Pure LaTeX formula (without delimiters). + is_display: True if display (block) formula, False if inline. Returns: Standard MathML representation. """ - return self._latex_to_mathml_cached(latex_formula) + return self._latex_to_mathml_cached(latex_formula, is_display=is_display) def _mathml_to_mml(self, mathml: str) -> str: """Convert standard MathML to mml:math format with namespace prefix. diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 2981052..18f5b85 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -41,12 +41,23 @@ _COMMANDS_NEED_SPACE = { "log", "ln", "exp", + # set relations (often glued by OCR) + "in", + "notin", + "subset", + "supset", + "subseteq", + "supseteq", + "cap", + "cup", # misc "partial", "nabla", } _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL) +# Match LaTeX commands: \command (greedy match all letters) +# The splitting logic in _split_glued_command_token will handle \inX -> \in X _COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+") # stage2: differentials inside math segments @@ -65,6 +76,7 @@ def _split_glued_command_token(token: str) -> str: Examples: - \\cdotdS -> \\cdot dS - \\intdx -> \\int dx + - \\inX -> \\in X (stop at uppercase letter) """ if not token.startswith("\\"): return token @@ -74,8 +86,8 @@ def _split_glued_command_token(token: str) -> str: return token best = None - # longest prefix that is in whitelist - for i in range(1, len(body)): + # Find longest prefix that is in whitelist + for i in range(1, len(body) + 1): prefix = body[:i] if prefix in _COMMANDS_NEED_SPACE: best = prefix @@ -117,12 +129,22 @@ def _clean_latex_syntax_spaces(expr: str) -> str: # Pattern 2: Spaces inside braces that follow _ or ^ # _{i 1} -> _{i1}, ^{2 3} -> ^{23} # This is safe because spaces inside subscript/superscript braces are usually OCR errors + # BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them + # must be preserved as they serve as command terminators (\in X != \inX) def clean_subscript_superscript_braces(match): operator = match.group(1) # _ or ^ content = match.group(2) # content inside braces - # Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta) - # Only remove spaces between non-backslash characters - cleaned = re.sub(r"(? x\in is fine) + # Strategy: remove spaces before \ and between non-command chars, + # but preserve the space after \command when followed by a non-\ char + cleaned = re.sub(r"\s+(?=\\)", "", content) # remove space before \cmd + cleaned = re.sub(r"(? str: Processing stages: 0. Fix OCR number errors (spaces in numbers) - 1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS) + 1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS, \\inX -> \\in X) 2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1}) 3. Normalize differentials (DISABLED by default to avoid breaking variables) @@ -169,7 +191,7 @@ def _postprocess_math(expr: str) -> str: # stage0: fix OCR number errors (digits with spaces) expr = _fix_ocr_number_errors(expr) - # stage1: split glued command tokens (e.g. \cdotdS) + # stage1: split glued command tokens (e.g. \cdotdS, \inX) expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr) # stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)