fix: post-handling for OCR

This commit is contained in:
liuyuanchuang
2026-02-07 21:28:46 +08:00
parent f514f98142
commit 1a4d54ce34
2 changed files with 281 additions and 227 deletions

View File

@@ -136,6 +136,7 @@ class Converter:
"""Get cached XSLT transform for MathML to mml: conversion.""" """Get cached XSLT transform for MathML to mml: conversion."""
if cls._mml_xslt_transform is None: if cls._mml_xslt_transform is None:
from lxml import etree from lxml import etree
xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8")) xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
cls._mml_xslt_transform = etree.XSLT(xslt_doc) cls._mml_xslt_transform = etree.XSLT(xslt_doc)
return cls._mml_xslt_transform return cls._mml_xslt_transform
@@ -197,14 +198,17 @@ class Converter:
return ConvertResult(latex="", mathml="", mml="") return ConvertResult(latex="", mathml="", mml="")
try: try:
# Detect if formula is display (block) or inline
is_display = self._is_display_formula(md_text)
# Extract the LaTeX formula content (remove delimiters) # Extract the LaTeX formula content (remove delimiters)
latex_formula = self._extract_latex_formula(md_text) latex_formula = self._extract_latex_formula(md_text)
# Preprocess formula for better conversion (fix array specifiers, etc.) # Preprocess formula for better conversion (fix array specifiers, etc.)
preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula) preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula)
# Convert to MathML # Convert to MathML (pass display flag to use correct delimiters)
mathml = self._latex_to_mathml(preprocessed_formula) mathml = self._latex_to_mathml(preprocessed_formula, is_display=is_display)
# Convert MathML to mml:math format (with namespace prefix) # Convert MathML to mml:math format (with namespace prefix)
mml = self._mathml_to_mml(mathml) mml = self._mathml_to_mml(mathml)
@@ -238,18 +242,18 @@ class Converter:
# Preprocess formula using the same preprocessing as export # Preprocess formula using the same preprocessing as export
preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip()) preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip())
return self._latex_to_omml(preprocessed) return self._latex_to_omml(preprocessed)
def _preprocess_formula_for_conversion(self, latex_formula: str) -> str: def _preprocess_formula_for_conversion(self, latex_formula: str) -> str:
"""Preprocess LaTeX formula for any conversion (MathML, OMML, etc.). """Preprocess LaTeX formula for any conversion (MathML, OMML, etc.).
Applies the same preprocessing steps as preprocess_for_export to ensure Applies the same preprocessing steps as preprocess_for_export to ensure
consistency across all conversion paths. This fixes common issues that consistency across all conversion paths. This fixes common issues that
cause Pandoc conversion to fail. cause Pandoc conversion to fail.
Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py), Note: OCR errors (number errors, command spacing) are fixed earlier in the
so we don't need to handle them here. pipeline (in ocr_service.py), so we don't need to handle them here.
Args: Args:
latex_formula: Pure LaTeX formula. latex_formula: Pure LaTeX formula.
@@ -259,18 +263,38 @@ class Converter:
""" """
# 1. Convert matrix environments # 1. Convert matrix environments
latex_formula = self._convert_matrix_environments(latex_formula) latex_formula = self._convert_matrix_environments(latex_formula)
# 2. Fix array column specifiers (remove spaces) # 2. Fix array column specifiers (remove spaces)
latex_formula = self._fix_array_column_specifiers(latex_formula) latex_formula = self._fix_array_column_specifiers(latex_formula)
# 3. Fix brace spacing # 3. Fix brace spacing
latex_formula = self._fix_brace_spacing(latex_formula) latex_formula = self._fix_brace_spacing(latex_formula)
# 4. Convert special environments (cases, aligned) # 4. Convert special environments (cases, aligned)
latex_formula = self._convert_special_environments(latex_formula) latex_formula = self._convert_special_environments(latex_formula)
return latex_formula return latex_formula
def _is_display_formula(self, text: str) -> bool:
"""Check if the formula is a display (block) formula.
Args:
text: Text containing LaTeX formula with delimiters.
Returns:
True if display formula ($$...$$ or \\[...\\]), False if inline.
"""
text = text.strip()
# Display math delimiters: $$...$$ or \[...\]
if text.startswith("$$") and text.endswith("$$"):
return True
if text.startswith("\\[") and text.endswith("\\]"):
return True
# Inline math delimiters: $...$ or \(...\)
return False
def _extract_latex_formula(self, text: str) -> str: def _extract_latex_formula(self, text: str) -> str:
"""Extract LaTeX formula from text by removing delimiters. """Extract LaTeX formula from text by removing delimiters.
@@ -299,18 +323,30 @@ class Converter:
@staticmethod @staticmethod
@lru_cache(maxsize=256) @lru_cache(maxsize=256)
def _latex_to_mathml_cached(latex_formula: str) -> str: def _latex_to_mathml_cached(latex_formula: str, is_display: bool = False) -> str:
"""Cached conversion of LaTeX formula to MathML. """Cached conversion of LaTeX formula to MathML.
Uses Pandoc for conversion to ensure Word compatibility. Uses Pandoc for conversion to ensure Word compatibility.
Pandoc generates standard MathML that Word can properly import. Pandoc generates standard MathML that Word can properly import.
Uses LRU cache to avoid recomputing for repeated formulas. Args:
latex_formula: Pure LaTeX formula (without delimiters).
is_display: True if display (block) formula, False if inline.
Returns:
Standard MathML representation.
""" """
# Use appropriate delimiters based on formula type
# Display formulas use $$...$$, inline formulas use $...$
if is_display:
pandoc_input = f"$${latex_formula}$$"
else:
pandoc_input = f"${latex_formula}$"
try: try:
# Use Pandoc for Word-compatible MathML (primary method) # Use Pandoc for Word-compatible MathML (primary method)
mathml_html = pypandoc.convert_text( mathml_html = pypandoc.convert_text(
f"${latex_formula}$", pandoc_input,
"html", "html",
format="markdown+tex_math_dollars", format="markdown+tex_math_dollars",
extra_args=["--mathml"], extra_args=["--mathml"],
@@ -321,24 +357,23 @@ class Converter:
mathml = match.group(0) mathml = match.group(0)
# Post-process for Word compatibility # Post-process for Word compatibility
return Converter._postprocess_mathml_for_word(mathml) return Converter._postprocess_mathml_for_word(mathml)
# If no match, return as-is # If Pandoc didn't generate MathML (returned HTML instead), use fallback
return mathml_html.rstrip("\n") # This happens when Pandoc's mathml output format is not available or fails
raise ValueError("Pandoc did not generate MathML, got HTML instead")
except Exception as pandoc_error: except Exception as pandoc_error:
# Fallback: try latex2mathml (less Word-compatible) # Fallback: try latex2mathml (less Word-compatible)
try: try:
mathml = latex_to_mathml(latex_formula) mathml = latex_to_mathml(latex_formula)
return Converter._postprocess_mathml_for_word(mathml) return Converter._postprocess_mathml_for_word(mathml)
except Exception as e: except Exception as e:
raise RuntimeError( raise RuntimeError(f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}") from e
f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
) from e
@staticmethod @staticmethod
def _postprocess_mathml_for_word(mathml: str) -> str: def _postprocess_mathml_for_word(mathml: str) -> str:
"""Post-process MathML to improve Word compatibility. """Post-process MathML to improve Word compatibility.
Applies transformations to make MathML more compatible and concise: Applies transformations to make MathML more compatible and concise:
- Remove <semantics> and <annotation> wrappers (Word doesn't need them) - Remove <semantics> and <annotation> wrappers (Word doesn't need them)
- Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.) - Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.)
@@ -346,32 +381,32 @@ class Converter:
- Change display="inline" to display="block" for better rendering - Change display="inline" to display="block" for better rendering
- Decode Unicode entities to actual characters (Word prefers this) - Decode Unicode entities to actual characters (Word prefers this)
- Ensure proper namespace - Ensure proper namespace
Args: Args:
mathml: MathML string. mathml: MathML string.
Returns: Returns:
Simplified, Word-compatible MathML string. Simplified, Word-compatible MathML string.
""" """
import re import re
# Step 1: Remove <semantics> and <annotation> wrappers # Step 1: Remove <semantics> and <annotation> wrappers
# These often cause Word import issues # These often cause Word import issues
if '<semantics>' in mathml: if "<semantics>" in mathml:
# Extract content between <semantics> and <annotation> # Extract content between <semantics> and <annotation>
match = re.search(r'<semantics>(.*?)<annotation', mathml, re.DOTALL) match = re.search(r"<semantics>(.*?)<annotation", mathml, re.DOTALL)
if match: if match:
content = match.group(1).strip() content = match.group(1).strip()
# Get the math element attributes # Get the math element attributes
math_attrs = "" math_attrs = ""
math_match = re.search(r'<math([^>]*)>', mathml) math_match = re.search(r"<math([^>]*)>", mathml)
if math_match: if math_match:
math_attrs = math_match.group(1) math_attrs = math_match.group(1)
# Rebuild without semantics # Rebuild without semantics
mathml = f'<math{math_attrs}>{content}</math>' mathml = f"<math{math_attrs}>{content}</math>"
# Step 2: Remove unnecessary attributes that don't affect rendering # Step 2: Remove unnecessary attributes that don't affect rendering
# These are verbose and Word doesn't need them # These are verbose and Word doesn't need them
unnecessary_attrs = [ unnecessary_attrs = [
@@ -390,234 +425,231 @@ class Converter:
r'\s+class="[^"]*"', r'\s+class="[^"]*"',
r'\s+style="[^"]*"', r'\s+style="[^"]*"',
] ]
for attr_pattern in unnecessary_attrs: for attr_pattern in unnecessary_attrs:
mathml = re.sub(attr_pattern, '', mathml) mathml = re.sub(attr_pattern, "", mathml)
# Step 3: Remove redundant single <mrow> wrapper at the top level # Step 3: Remove redundant single <mrow> wrapper at the top level
# Pattern: <math ...><mrow>content</mrow></math> # Pattern: <math ...><mrow>content</mrow></math>
# Simplify to: <math ...>content</math> # Simplify to: <math ...>content</math>
mrow_pattern = r'(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)' mrow_pattern = r"(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)"
match = re.search(mrow_pattern, mathml, re.DOTALL) match = re.search(mrow_pattern, mathml, re.DOTALL)
if match: if match:
# Check if there's only one mrow at the top level # Check if there's only one mrow at the top level
content = match.group(2) content = match.group(2)
# Only remove if the content doesn't have other top-level elements # Only remove if the content doesn't have other top-level elements
if not re.search(r'</[^>]+>\s*<[^/]', content): if not re.search(r"</[^>]+>\s*<[^/]", content):
mathml = f'{match.group(1)}{content}{match.group(3)}' mathml = f"{match.group(1)}{content}{match.group(3)}"
# Step 4: Change display to block for better Word rendering # Step 4: Change display to block for better Word rendering
mathml = mathml.replace('display="inline"', 'display="block"') mathml = mathml.replace('display="inline"', 'display="block"')
# Step 5: If no display attribute, add it # Step 5: If no display attribute, add it
if 'display=' not in mathml and '<math' in mathml: if "display=" not in mathml and "<math" in mathml:
mathml = mathml.replace('<math', '<math display="block"', 1) mathml = mathml.replace("<math", '<math display="block"', 1)
# Step 6: Ensure xmlns is present # Step 6: Ensure xmlns is present
if 'xmlns=' not in mathml and '<math' in mathml: if "xmlns=" not in mathml and "<math" in mathml:
mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1) mathml = mathml.replace("<math", '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
# Step 7: Decode common Unicode entities to actual characters (Word prefers this) # Step 7: Decode common Unicode entities to actual characters (Word prefers this)
unicode_map = { unicode_map = {
# Basic operators # Basic operators
'&#x0002B;': '+', "&#x0002B;": "+",
'&#x0002D;': '-', "&#x0002D;": "-",
'&#x0002A;': '*', "&#x0002A;": "*",
'&#x0002F;': '/', "&#x0002F;": "/",
'&#x0003D;': '=', "&#x0003D;": "=",
'&#x0003C;': '<', "&#x0003C;": "<",
'&#x0003E;': '>', "&#x0003E;": ">",
'&#x00028;': '(', "&#x00028;": "(",
'&#x00029;': ')', "&#x00029;": ")",
'&#x0002C;': ',', "&#x0002C;": ",",
'&#x0002E;': '.', "&#x0002E;": ".",
'&#x0007C;': '|', "&#x0007C;": "|",
'&#x00B0;': '°', "&#x00B0;": "°",
'&#x00D7;': '×', # times "&#x00D7;": "×", # times
'&#x00F7;': '÷', # div "&#x00F7;": "÷", # div
'&#x00B1;': '±', # pm "&#x00B1;": "±", # pm
'&#x2213;': '', # mp "&#x2213;": "", # mp
# Ellipsis symbols # Ellipsis symbols
'&#x02026;': '', # ldots (horizontal) "&#x02026;": "", # ldots (horizontal)
'&#x022EE;': '', # vdots (vertical) "&#x022EE;": "", # vdots (vertical)
'&#x022EF;': '', # cdots (centered) "&#x022EF;": "", # cdots (centered)
'&#x022F0;': '', # iddots (diagonal up) "&#x022F0;": "", # iddots (diagonal up)
'&#x022F1;': '', # ddots (diagonal down) "&#x022F1;": "", # ddots (diagonal down)
# Greek letters (lowercase) # Greek letters (lowercase)
'&#x03B1;': 'α', # alpha "&#x03B1;": "α", # alpha
'&#x03B2;': 'β', # beta "&#x03B2;": "β", # beta
'&#x03B3;': 'γ', # gamma "&#x03B3;": "γ", # gamma
'&#x03B4;': 'δ', # delta "&#x03B4;": "δ", # delta
'&#x03B5;': 'ε', # epsilon "&#x03B5;": "ε", # epsilon
'&#x03B6;': 'ζ', # zeta "&#x03B6;": "ζ", # zeta
'&#x03B7;': 'η', # eta "&#x03B7;": "η", # eta
'&#x03B8;': 'θ', # theta "&#x03B8;": "θ", # theta
'&#x03B9;': 'ι', # iota "&#x03B9;": "ι", # iota
'&#x03BA;': 'κ', # kappa "&#x03BA;": "κ", # kappa
'&#x03BB;': 'λ', # lambda "&#x03BB;": "λ", # lambda
'&#x03BC;': 'μ', # mu "&#x03BC;": "μ", # mu
'&#x03BD;': 'ν', # nu "&#x03BD;": "ν", # nu
'&#x03BE;': 'ξ', # xi "&#x03BE;": "ξ", # xi
'&#x03BF;': 'ο', # omicron "&#x03BF;": "ο", # omicron
'&#x03C0;': 'π', # pi "&#x03C0;": "π", # pi
'&#x03C1;': 'ρ', # rho "&#x03C1;": "ρ", # rho
'&#x03C2;': 'ς', # final sigma "&#x03C2;": "ς", # final sigma
'&#x03C3;': 'σ', # sigma "&#x03C3;": "σ", # sigma
'&#x03C4;': 'τ', # tau "&#x03C4;": "τ", # tau
'&#x03C5;': 'υ', # upsilon "&#x03C5;": "υ", # upsilon
'&#x03C6;': 'φ', # phi "&#x03C6;": "φ", # phi
'&#x03C7;': 'χ', # chi "&#x03C7;": "χ", # chi
'&#x03C8;': 'ψ', # psi "&#x03C8;": "ψ", # psi
'&#x03C9;': 'ω', # omega "&#x03C9;": "ω", # omega
'&#x03D5;': 'ϕ', # phi variant "&#x03D5;": "ϕ", # phi variant
# Greek letters (uppercase) # Greek letters (uppercase)
'&#x0391;': 'Α', # Alpha "&#x0391;": "Α", # Alpha
'&#x0392;': 'Β', # Beta "&#x0392;": "Β", # Beta
'&#x0393;': 'Γ', # Gamma "&#x0393;": "Γ", # Gamma
'&#x0394;': 'Δ', # Delta "&#x0394;": "Δ", # Delta
'&#x0395;': 'Ε', # Epsilon "&#x0395;": "Ε", # Epsilon
'&#x0396;': 'Ζ', # Zeta "&#x0396;": "Ζ", # Zeta
'&#x0397;': 'Η', # Eta "&#x0397;": "Η", # Eta
'&#x0398;': 'Θ', # Theta "&#x0398;": "Θ", # Theta
'&#x0399;': 'Ι', # Iota "&#x0399;": "Ι", # Iota
'&#x039A;': 'Κ', # Kappa "&#x039A;": "Κ", # Kappa
'&#x039B;': 'Λ', # Lambda "&#x039B;": "Λ", # Lambda
'&#x039C;': 'Μ', # Mu "&#x039C;": "Μ", # Mu
'&#x039D;': 'Ν', # Nu "&#x039D;": "Ν", # Nu
'&#x039E;': 'Ξ', # Xi "&#x039E;": "Ξ", # Xi
'&#x039F;': 'Ο', # Omicron "&#x039F;": "Ο", # Omicron
'&#x03A0;': 'Π', # Pi "&#x03A0;": "Π", # Pi
'&#x03A1;': 'Ρ', # Rho "&#x03A1;": "Ρ", # Rho
'&#x03A3;': 'Σ', # Sigma "&#x03A3;": "Σ", # Sigma
'&#x03A4;': 'Τ', # Tau "&#x03A4;": "Τ", # Tau
'&#x03A5;': 'Υ', # Upsilon "&#x03A5;": "Υ", # Upsilon
'&#x03A6;': 'Φ', # Phi "&#x03A6;": "Φ", # Phi
'&#x03A7;': 'Χ', # Chi "&#x03A7;": "Χ", # Chi
'&#x03A8;': 'Ψ', # Psi "&#x03A8;": "Ψ", # Psi
'&#x03A9;': 'Ω', # Omega "&#x03A9;": "Ω", # Omega
# Math symbols # Math symbols
'&#x2205;': '', # emptyset "&#x2205;": "", # emptyset
'&#x2208;': '', # in "&#x2208;": "", # in
'&#x2209;': '', # notin "&#x2209;": "", # notin
'&#x220B;': '', # ni "&#x220B;": "", # ni
'&#x220C;': '', # nni "&#x220C;": "", # nni
'&#x2211;': '', # sum "&#x2211;": "", # sum
'&#x220F;': '', # prod "&#x220F;": "", # prod
'&#x221A;': '', # sqrt "&#x221A;": "", # sqrt
'&#x221B;': '', # cbrt "&#x221B;": "", # cbrt
'&#x221C;': '', # fourthroot "&#x221C;": "", # fourthroot
'&#x221E;': '', # infty "&#x221E;": "", # infty
'&#x2229;': '', # cap "&#x2229;": "", # cap
'&#x222A;': '', # cup "&#x222A;": "", # cup
'&#x222B;': '', # int "&#x222B;": "", # int
'&#x222C;': '', # iint "&#x222C;": "", # iint
'&#x222D;': '', # iiint "&#x222D;": "", # iiint
'&#x222E;': '', # oint "&#x222E;": "", # oint
'&#x2282;': '', # subset "&#x2282;": "", # subset
'&#x2283;': '', # supset "&#x2283;": "", # supset
'&#x2284;': '', # nsubset "&#x2284;": "", # nsubset
'&#x2285;': '', # nsupset "&#x2285;": "", # nsupset
'&#x2286;': '', # subseteq "&#x2286;": "", # subseteq
'&#x2287;': '', # supseteq "&#x2287;": "", # supseteq
'&#x2288;': '', # nsubseteq "&#x2288;": "", # nsubseteq
'&#x2289;': '', # nsupseteq "&#x2289;": "", # nsupseteq
'&#x2264;': '', # leq "&#x2264;": "", # leq
'&#x2265;': '', # geq "&#x2265;": "", # geq
'&#x2260;': '', # neq "&#x2260;": "", # neq
'&#x2261;': '', # equiv "&#x2261;": "", # equiv
'&#x2248;': '', # approx "&#x2248;": "", # approx
'&#x2243;': '', # simeq "&#x2243;": "", # simeq
'&#x2245;': '', # cong "&#x2245;": "", # cong
'&#x2202;': '', # partial "&#x2202;": "", # partial
'&#x2207;': '', # nabla "&#x2207;": "", # nabla
'&#x2200;': '', # forall "&#x2200;": "", # forall
'&#x2203;': '', # exists "&#x2203;": "", # exists
'&#x2204;': '', # nexists "&#x2204;": "", # nexists
'&#x00AC;': '¬', # neg/lnot "&#x00AC;": "¬", # neg/lnot
'&#x2227;': '', # wedge/land "&#x2227;": "", # wedge/land
'&#x2228;': '', # vee/lor "&#x2228;": "", # vee/lor
'&#x2192;': '', # to/rightarrow "&#x2192;": "", # to/rightarrow
'&#x2190;': '', # leftarrow "&#x2190;": "", # leftarrow
'&#x2194;': '', # leftrightarrow "&#x2194;": "", # leftrightarrow
'&#x21D2;': '', # Rightarrow "&#x21D2;": "", # Rightarrow
'&#x21D0;': '', # Leftarrow "&#x21D0;": "", # Leftarrow
'&#x21D4;': '', # Leftrightarrow "&#x21D4;": "", # Leftrightarrow
'&#x2191;': '', # uparrow "&#x2191;": "", # uparrow
'&#x2193;': '', # downarrow "&#x2193;": "", # downarrow
'&#x21D1;': '', # Uparrow "&#x21D1;": "", # Uparrow
'&#x21D3;': '', # Downarrow "&#x21D3;": "", # Downarrow
'&#x2195;': '', # updownarrow "&#x2195;": "", # updownarrow
'&#x21D5;': '', # Updownarrow "&#x21D5;": "", # Updownarrow
'&#x2260;': '', # ne "&#x2260;": "", # ne
'&#x226A;': '', # ll "&#x226A;": "", # ll
'&#x226B;': '', # gg "&#x226B;": "", # gg
'&#x2A7D;': '', # leqslant "&#x2A7D;": "", # leqslant
'&#x2A7E;': '', # geqslant "&#x2A7E;": "", # geqslant
'&#x22A5;': '', # perp "&#x22A5;": "", # perp
'&#x2225;': '', # parallel "&#x2225;": "", # parallel
'&#x2220;': '', # angle "&#x2220;": "", # angle
'&#x25B3;': '', # triangle "&#x25B3;": "", # triangle
'&#x25A1;': '', # square "&#x25A1;": "", # square
'&#x25CA;': '', # diamond "&#x25CA;": "", # diamond
'&#x2660;': '', # spadesuit "&#x2660;": "", # spadesuit
'&#x2661;': '', # heartsuit "&#x2661;": "", # heartsuit
'&#x2662;': '', # diamondsuit "&#x2662;": "", # diamondsuit
'&#x2663;': '', # clubsuit "&#x2663;": "", # clubsuit
'&#x2113;': '', # ell "&#x2113;": "", # ell
'&#x2118;': '', # wp (Weierstrass p) "&#x2118;": "", # wp (Weierstrass p)
'&#x211C;': '', # Re (real part) "&#x211C;": "", # Re (real part)
'&#x2111;': '', # Im (imaginary part) "&#x2111;": "", # Im (imaginary part)
'&#x2135;': '', # aleph "&#x2135;": "", # aleph
'&#x2136;': '', # beth "&#x2136;": "", # beth
} }
for entity, char in unicode_map.items(): for entity, char in unicode_map.items():
mathml = mathml.replace(entity, char) mathml = mathml.replace(entity, char)
# Also handle decimal entity format (&#NNNN;) for common characters # Also handle decimal entity format (&#NNNN;) for common characters
# Convert decimal to hex-based lookup # Convert decimal to hex-based lookup
decimal_patterns = [ decimal_patterns = [
(r'&#955;', 'λ'), # lambda (decimal 955 = hex 03BB) (r"&#955;", "λ"), # lambda (decimal 955 = hex 03BB)
(r'&#8942;', ''), # vdots (decimal 8942 = hex 22EE) (r"&#8942;", ""), # vdots (decimal 8942 = hex 22EE)
(r'&#8943;', ''), # cdots (decimal 8943 = hex 22EF) (r"&#8943;", ""), # cdots (decimal 8943 = hex 22EF)
(r'&#8230;', ''), # ldots (decimal 8230 = hex 2026) (r"&#8230;", ""), # ldots (decimal 8230 = hex 2026)
(r'&#8734;', ''), # infty (decimal 8734 = hex 221E) (r"&#8734;", ""), # infty (decimal 8734 = hex 221E)
(r'&#8721;', ''), # sum (decimal 8721 = hex 2211) (r"&#8721;", ""), # sum (decimal 8721 = hex 2211)
(r'&#8719;', ''), # prod (decimal 8719 = hex 220F) (r"&#8719;", ""), # prod (decimal 8719 = hex 220F)
(r'&#8730;', ''), # sqrt (decimal 8730 = hex 221A) (r"&#8730;", ""), # sqrt (decimal 8730 = hex 221A)
(r'&#8712;', ''), # in (decimal 8712 = hex 2208) (r"&#8712;", ""), # in (decimal 8712 = hex 2208)
(r'&#8713;', ''), # notin (decimal 8713 = hex 2209) (r"&#8713;", ""), # notin (decimal 8713 = hex 2209)
(r'&#8745;', ''), # cap (decimal 8745 = hex 2229) (r"&#8745;", ""), # cap (decimal 8745 = hex 2229)
(r'&#8746;', ''), # cup (decimal 8746 = hex 222A) (r"&#8746;", ""), # cup (decimal 8746 = hex 222A)
(r'&#8804;', ''), # leq (decimal 8804 = hex 2264) (r"&#8804;", ""), # leq (decimal 8804 = hex 2264)
(r'&#8805;', ''), # geq (decimal 8805 = hex 2265) (r"&#8805;", ""), # geq (decimal 8805 = hex 2265)
(r'&#8800;', ''), # neq (decimal 8800 = hex 2260) (r"&#8800;", ""), # neq (decimal 8800 = hex 2260)
(r'&#8776;', ''), # approx (decimal 8776 = hex 2248) (r"&#8776;", ""), # approx (decimal 8776 = hex 2248)
(r'&#8801;', ''), # equiv (decimal 8801 = hex 2261) (r"&#8801;", ""), # equiv (decimal 8801 = hex 2261)
] ]
for pattern, char in decimal_patterns: for pattern, char in decimal_patterns:
mathml = mathml.replace(pattern, char) mathml = mathml.replace(pattern, char)
# Step 8: Clean up extra whitespace # Step 8: Clean up extra whitespace
mathml = re.sub(r'>\s+<', '><', mathml) mathml = re.sub(r">\s+<", "><", mathml)
return mathml return mathml
def _latex_to_mathml(self, latex_formula: str) -> str: def _latex_to_mathml(self, latex_formula: str, is_display: bool = False) -> str:
"""Convert LaTeX formula to standard MathML. """Convert LaTeX formula to standard MathML.
Args: Args:
latex_formula: Pure LaTeX formula (without delimiters). latex_formula: Pure LaTeX formula (without delimiters).
is_display: True if display (block) formula, False if inline.
Returns: Returns:
Standard MathML representation. Standard MathML representation.
""" """
return self._latex_to_mathml_cached(latex_formula) return self._latex_to_mathml_cached(latex_formula, is_display=is_display)
def _mathml_to_mml(self, mathml: str) -> str: def _mathml_to_mml(self, mathml: str) -> str:
"""Convert standard MathML to mml:math format with namespace prefix. """Convert standard MathML to mml:math format with namespace prefix.

View File

@@ -41,12 +41,23 @@ _COMMANDS_NEED_SPACE = {
"log", "log",
"ln", "ln",
"exp", "exp",
# set relations (often glued by OCR)
"in",
"notin",
"subset",
"supset",
"subseteq",
"supseteq",
"cap",
"cup",
# misc # misc
"partial", "partial",
"nabla", "nabla",
} }
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL) _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
# Match LaTeX commands: \command (greedy match all letters)
# The splitting logic in _split_glued_command_token will handle \inX -> \in X
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+") _COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
# stage2: differentials inside math segments # stage2: differentials inside math segments
@@ -65,6 +76,7 @@ def _split_glued_command_token(token: str) -> str:
Examples: Examples:
- \\cdotdS -> \\cdot dS - \\cdotdS -> \\cdot dS
- \\intdx -> \\int dx - \\intdx -> \\int dx
- \\inX -> \\in X (stop at uppercase letter)
""" """
if not token.startswith("\\"): if not token.startswith("\\"):
return token return token
@@ -74,8 +86,8 @@ def _split_glued_command_token(token: str) -> str:
return token return token
best = None best = None
# longest prefix that is in whitelist # Find longest prefix that is in whitelist
for i in range(1, len(body)): for i in range(1, len(body) + 1):
prefix = body[:i] prefix = body[:i]
if prefix in _COMMANDS_NEED_SPACE: if prefix in _COMMANDS_NEED_SPACE:
best = prefix best = prefix
@@ -117,12 +129,22 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
# Pattern 2: Spaces inside braces that follow _ or ^ # Pattern 2: Spaces inside braces that follow _ or ^
# _{i 1} -> _{i1}, ^{2 3} -> ^{23} # _{i 1} -> _{i1}, ^{2 3} -> ^{23}
# This is safe because spaces inside subscript/superscript braces are usually OCR errors # This is safe because spaces inside subscript/superscript braces are usually OCR errors
# BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them
# must be preserved as they serve as command terminators (\in X != \inX)
def clean_subscript_superscript_braces(match): def clean_subscript_superscript_braces(match):
operator = match.group(1) # _ or ^ operator = match.group(1) # _ or ^
content = match.group(2) # content inside braces content = match.group(2) # content inside braces
# Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta) if "\\" not in content:
# Only remove spaces between non-backslash characters # No LaTeX commands: safe to remove all spaces
cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content) cleaned = re.sub(r"\s+", "", content)
else:
# Contains LaTeX commands: remove spaces carefully
# Keep spaces that follow a LaTeX command (e.g., \in X must keep the space)
# Remove spaces everywhere else (e.g., x \in -> x\in is fine)
# Strategy: remove spaces before \ and between non-command chars,
# but preserve the space after \command when followed by a non-\ char
cleaned = re.sub(r"\s+(?=\\)", "", content) # remove space before \cmd
cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned) # remove space after non-letter non-\
return f"{operator}{{{cleaned}}}" return f"{operator}{{{cleaned}}}"
# Match _{ ... } or ^{ ... } # Match _{ ... } or ^{ ... }
@@ -156,7 +178,7 @@ def _postprocess_math(expr: str) -> str:
Processing stages: Processing stages:
0. Fix OCR number errors (spaces in numbers) 0. Fix OCR number errors (spaces in numbers)
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS) 1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS, \\inX -> \\in X)
2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1}) 2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
3. Normalize differentials (DISABLED by default to avoid breaking variables) 3. Normalize differentials (DISABLED by default to avoid breaking variables)
@@ -169,7 +191,7 @@ def _postprocess_math(expr: str) -> str:
# stage0: fix OCR number errors (digits with spaces) # stage0: fix OCR number errors (digits with spaces)
expr = _fix_ocr_number_errors(expr) expr = _fix_ocr_number_errors(expr)
# stage1: split glued command tokens (e.g. \cdotdS) # stage1: split glued command tokens (e.g. \cdotdS, \inX)
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr) expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
# stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces) # stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)