fix: post handle for ocr
This commit is contained in:
@@ -136,6 +136,7 @@ class Converter:
|
||||
"""Get cached XSLT transform for MathML to mml: conversion."""
|
||||
if cls._mml_xslt_transform is None:
|
||||
from lxml import etree
|
||||
|
||||
xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
|
||||
cls._mml_xslt_transform = etree.XSLT(xslt_doc)
|
||||
return cls._mml_xslt_transform
|
||||
@@ -197,14 +198,17 @@ class Converter:
|
||||
return ConvertResult(latex="", mathml="", mml="")
|
||||
|
||||
try:
|
||||
# Detect if formula is display (block) or inline
|
||||
is_display = self._is_display_formula(md_text)
|
||||
|
||||
# Extract the LaTeX formula content (remove delimiters)
|
||||
latex_formula = self._extract_latex_formula(md_text)
|
||||
|
||||
# Preprocess formula for better conversion (fix array specifiers, etc.)
|
||||
preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula)
|
||||
|
||||
# Convert to MathML
|
||||
mathml = self._latex_to_mathml(preprocessed_formula)
|
||||
# Convert to MathML (pass display flag to use correct delimiters)
|
||||
mathml = self._latex_to_mathml(preprocessed_formula, is_display=is_display)
|
||||
|
||||
# Convert MathML to mml:math format (with namespace prefix)
|
||||
mml = self._mathml_to_mml(mathml)
|
||||
@@ -248,8 +252,8 @@ class Converter:
|
||||
consistency across all conversion paths. This fixes common issues that
|
||||
cause Pandoc conversion to fail.
|
||||
|
||||
Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py),
|
||||
so we don't need to handle them here.
|
||||
Note: OCR errors (number errors, command spacing) are fixed earlier in the
|
||||
pipeline (in ocr_service.py), so we don't need to handle them here.
|
||||
|
||||
Args:
|
||||
latex_formula: Pure LaTeX formula.
|
||||
@@ -271,6 +275,26 @@ class Converter:
|
||||
|
||||
return latex_formula
|
||||
|
||||
def _is_display_formula(self, text: str) -> bool:
|
||||
"""Check if the formula is a display (block) formula.
|
||||
|
||||
Args:
|
||||
text: Text containing LaTeX formula with delimiters.
|
||||
|
||||
Returns:
|
||||
True if display formula ($$...$$ or \\[...\\]), False if inline.
|
||||
"""
|
||||
text = text.strip()
|
||||
|
||||
# Display math delimiters: $$...$$ or \[...\]
|
||||
if text.startswith("$$") and text.endswith("$$"):
|
||||
return True
|
||||
if text.startswith("\\[") and text.endswith("\\]"):
|
||||
return True
|
||||
|
||||
# Inline math delimiters: $...$ or \(...\)
|
||||
return False
|
||||
|
||||
def _extract_latex_formula(self, text: str) -> str:
|
||||
"""Extract LaTeX formula from text by removing delimiters.
|
||||
|
||||
@@ -299,18 +323,30 @@ class Converter:
|
||||
|
||||
@staticmethod
|
||||
@lru_cache(maxsize=256)
|
||||
def _latex_to_mathml_cached(latex_formula: str) -> str:
|
||||
def _latex_to_mathml_cached(latex_formula: str, is_display: bool = False) -> str:
|
||||
"""Cached conversion of LaTeX formula to MathML.
|
||||
|
||||
Uses Pandoc for conversion to ensure Word compatibility.
|
||||
Pandoc generates standard MathML that Word can properly import.
|
||||
|
||||
Uses LRU cache to avoid recomputing for repeated formulas.
|
||||
Args:
|
||||
latex_formula: Pure LaTeX formula (without delimiters).
|
||||
is_display: True if display (block) formula, False if inline.
|
||||
|
||||
Returns:
|
||||
Standard MathML representation.
|
||||
"""
|
||||
# Use appropriate delimiters based on formula type
|
||||
# Display formulas use $$...$$, inline formulas use $...$
|
||||
if is_display:
|
||||
pandoc_input = f"$${latex_formula}$$"
|
||||
else:
|
||||
pandoc_input = f"${latex_formula}$"
|
||||
|
||||
try:
|
||||
# Use Pandoc for Word-compatible MathML (primary method)
|
||||
mathml_html = pypandoc.convert_text(
|
||||
f"${latex_formula}$",
|
||||
pandoc_input,
|
||||
"html",
|
||||
format="markdown+tex_math_dollars",
|
||||
extra_args=["--mathml"],
|
||||
@@ -322,8 +358,9 @@ class Converter:
|
||||
# Post-process for Word compatibility
|
||||
return Converter._postprocess_mathml_for_word(mathml)
|
||||
|
||||
# If no match, return as-is
|
||||
return mathml_html.rstrip("\n")
|
||||
# If Pandoc didn't generate MathML (returned HTML instead), use fallback
|
||||
# This happens when Pandoc's mathml output format is not available or fails
|
||||
raise ValueError("Pandoc did not generate MathML, got HTML instead")
|
||||
|
||||
except Exception as pandoc_error:
|
||||
# Fallback: try latex2mathml (less Word-compatible)
|
||||
@@ -331,9 +368,7 @@ class Converter:
|
||||
mathml = latex_to_mathml(latex_formula)
|
||||
return Converter._postprocess_mathml_for_word(mathml)
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
|
||||
) from e
|
||||
raise RuntimeError(f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}") from e
|
||||
|
||||
@staticmethod
|
||||
def _postprocess_mathml_for_word(mathml: str) -> str:
|
||||
@@ -357,20 +392,20 @@ class Converter:
|
||||
|
||||
# Step 1: Remove <semantics> and <annotation> wrappers
|
||||
# These often cause Word import issues
|
||||
if '<semantics>' in mathml:
|
||||
if "<semantics>" in mathml:
|
||||
# Extract content between <semantics> and <annotation>
|
||||
match = re.search(r'<semantics>(.*?)<annotation', mathml, re.DOTALL)
|
||||
match = re.search(r"<semantics>(.*?)<annotation", mathml, re.DOTALL)
|
||||
if match:
|
||||
content = match.group(1).strip()
|
||||
|
||||
# Get the math element attributes
|
||||
math_attrs = ""
|
||||
math_match = re.search(r'<math([^>]*)>', mathml)
|
||||
math_match = re.search(r"<math([^>]*)>", mathml)
|
||||
if math_match:
|
||||
math_attrs = math_match.group(1)
|
||||
|
||||
# Rebuild without semantics
|
||||
mathml = f'<math{math_attrs}>{content}</math>'
|
||||
mathml = f"<math{math_attrs}>{content}</math>"
|
||||
|
||||
# Step 2: Remove unnecessary attributes that don't affect rendering
|
||||
# These are verbose and Word doesn't need them
|
||||
@@ -392,187 +427,183 @@ class Converter:
|
||||
]
|
||||
|
||||
for attr_pattern in unnecessary_attrs:
|
||||
mathml = re.sub(attr_pattern, '', mathml)
|
||||
mathml = re.sub(attr_pattern, "", mathml)
|
||||
|
||||
# Step 3: Remove redundant single <mrow> wrapper at the top level
|
||||
# Pattern: <math ...><mrow>content</mrow></math>
|
||||
# Simplify to: <math ...>content</math>
|
||||
mrow_pattern = r'(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)'
|
||||
mrow_pattern = r"(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)"
|
||||
match = re.search(mrow_pattern, mathml, re.DOTALL)
|
||||
if match:
|
||||
# Check if there's only one mrow at the top level
|
||||
content = match.group(2)
|
||||
# Only remove if the content doesn't have other top-level elements
|
||||
if not re.search(r'</[^>]+>\s*<[^/]', content):
|
||||
mathml = f'{match.group(1)}{content}{match.group(3)}'
|
||||
if not re.search(r"</[^>]+>\s*<[^/]", content):
|
||||
mathml = f"{match.group(1)}{content}{match.group(3)}"
|
||||
|
||||
# Step 4: Change display to block for better Word rendering
|
||||
mathml = mathml.replace('display="inline"', 'display="block"')
|
||||
|
||||
# Step 5: If no display attribute, add it
|
||||
if 'display=' not in mathml and '<math' in mathml:
|
||||
mathml = mathml.replace('<math', '<math display="block"', 1)
|
||||
if "display=" not in mathml and "<math" in mathml:
|
||||
mathml = mathml.replace("<math", '<math display="block"', 1)
|
||||
|
||||
# Step 6: Ensure xmlns is present
|
||||
if 'xmlns=' not in mathml and '<math' in mathml:
|
||||
mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
|
||||
if "xmlns=" not in mathml and "<math" in mathml:
|
||||
mathml = mathml.replace("<math", '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
|
||||
|
||||
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
|
||||
unicode_map = {
|
||||
# Basic operators
|
||||
'+': '+',
|
||||
'-': '-',
|
||||
'*': '*',
|
||||
'/': '/',
|
||||
'=': '=',
|
||||
'<': '<',
|
||||
'>': '>',
|
||||
'(': '(',
|
||||
')': ')',
|
||||
',': ',',
|
||||
'.': '.',
|
||||
'|': '|',
|
||||
'°': '°',
|
||||
'×': '×', # times
|
||||
'÷': '÷', # div
|
||||
'±': '±', # pm
|
||||
'∓': '∓', # mp
|
||||
|
||||
"+": "+",
|
||||
"-": "-",
|
||||
"*": "*",
|
||||
"/": "/",
|
||||
"=": "=",
|
||||
"<": "<",
|
||||
">": ">",
|
||||
"(": "(",
|
||||
")": ")",
|
||||
",": ",",
|
||||
".": ".",
|
||||
"|": "|",
|
||||
"°": "°",
|
||||
"×": "×", # times
|
||||
"÷": "÷", # div
|
||||
"±": "±", # pm
|
||||
"∓": "∓", # mp
|
||||
# Ellipsis symbols
|
||||
'…': '…', # ldots (horizontal)
|
||||
'⋮': '⋮', # vdots (vertical)
|
||||
'⋯': '⋯', # cdots (centered)
|
||||
'⋰': '⋰', # iddots (diagonal up)
|
||||
'⋱': '⋱', # ddots (diagonal down)
|
||||
|
||||
"…": "…", # ldots (horizontal)
|
||||
"⋮": "⋮", # vdots (vertical)
|
||||
"⋯": "⋯", # cdots (centered)
|
||||
"⋰": "⋰", # iddots (diagonal up)
|
||||
"⋱": "⋱", # ddots (diagonal down)
|
||||
# Greek letters (lowercase)
|
||||
'α': 'α', # alpha
|
||||
'β': 'β', # beta
|
||||
'γ': 'γ', # gamma
|
||||
'δ': 'δ', # delta
|
||||
'ε': 'ε', # epsilon
|
||||
'ζ': 'ζ', # zeta
|
||||
'η': 'η', # eta
|
||||
'θ': 'θ', # theta
|
||||
'ι': 'ι', # iota
|
||||
'κ': 'κ', # kappa
|
||||
'λ': 'λ', # lambda
|
||||
'μ': 'μ', # mu
|
||||
'ν': 'ν', # nu
|
||||
'ξ': 'ξ', # xi
|
||||
'ο': 'ο', # omicron
|
||||
'π': 'π', # pi
|
||||
'ρ': 'ρ', # rho
|
||||
'ς': 'ς', # final sigma
|
||||
'σ': 'σ', # sigma
|
||||
'τ': 'τ', # tau
|
||||
'υ': 'υ', # upsilon
|
||||
'φ': 'φ', # phi
|
||||
'χ': 'χ', # chi
|
||||
'ψ': 'ψ', # psi
|
||||
'ω': 'ω', # omega
|
||||
'ϕ': 'ϕ', # phi variant
|
||||
|
||||
"α": "α", # alpha
|
||||
"β": "β", # beta
|
||||
"γ": "γ", # gamma
|
||||
"δ": "δ", # delta
|
||||
"ε": "ε", # epsilon
|
||||
"ζ": "ζ", # zeta
|
||||
"η": "η", # eta
|
||||
"θ": "θ", # theta
|
||||
"ι": "ι", # iota
|
||||
"κ": "κ", # kappa
|
||||
"λ": "λ", # lambda
|
||||
"μ": "μ", # mu
|
||||
"ν": "ν", # nu
|
||||
"ξ": "ξ", # xi
|
||||
"ο": "ο", # omicron
|
||||
"π": "π", # pi
|
||||
"ρ": "ρ", # rho
|
||||
"ς": "ς", # final sigma
|
||||
"σ": "σ", # sigma
|
||||
"τ": "τ", # tau
|
||||
"υ": "υ", # upsilon
|
||||
"φ": "φ", # phi
|
||||
"χ": "χ", # chi
|
||||
"ψ": "ψ", # psi
|
||||
"ω": "ω", # omega
|
||||
"ϕ": "ϕ", # phi variant
|
||||
# Greek letters (uppercase)
|
||||
'Α': 'Α', # Alpha
|
||||
'Β': 'Β', # Beta
|
||||
'Γ': 'Γ', # Gamma
|
||||
'Δ': 'Δ', # Delta
|
||||
'Ε': 'Ε', # Epsilon
|
||||
'Ζ': 'Ζ', # Zeta
|
||||
'Η': 'Η', # Eta
|
||||
'Θ': 'Θ', # Theta
|
||||
'Ι': 'Ι', # Iota
|
||||
'Κ': 'Κ', # Kappa
|
||||
'Λ': 'Λ', # Lambda
|
||||
'Μ': 'Μ', # Mu
|
||||
'Ν': 'Ν', # Nu
|
||||
'Ξ': 'Ξ', # Xi
|
||||
'Ο': 'Ο', # Omicron
|
||||
'Π': 'Π', # Pi
|
||||
'Ρ': 'Ρ', # Rho
|
||||
'Σ': 'Σ', # Sigma
|
||||
'Τ': 'Τ', # Tau
|
||||
'Υ': 'Υ', # Upsilon
|
||||
'Φ': 'Φ', # Phi
|
||||
'Χ': 'Χ', # Chi
|
||||
'Ψ': 'Ψ', # Psi
|
||||
'Ω': 'Ω', # Omega
|
||||
|
||||
"Α": "Α", # Alpha
|
||||
"Β": "Β", # Beta
|
||||
"Γ": "Γ", # Gamma
|
||||
"Δ": "Δ", # Delta
|
||||
"Ε": "Ε", # Epsilon
|
||||
"Ζ": "Ζ", # Zeta
|
||||
"Η": "Η", # Eta
|
||||
"Θ": "Θ", # Theta
|
||||
"Ι": "Ι", # Iota
|
||||
"Κ": "Κ", # Kappa
|
||||
"Λ": "Λ", # Lambda
|
||||
"Μ": "Μ", # Mu
|
||||
"Ν": "Ν", # Nu
|
||||
"Ξ": "Ξ", # Xi
|
||||
"Ο": "Ο", # Omicron
|
||||
"Π": "Π", # Pi
|
||||
"Ρ": "Ρ", # Rho
|
||||
"Σ": "Σ", # Sigma
|
||||
"Τ": "Τ", # Tau
|
||||
"Υ": "Υ", # Upsilon
|
||||
"Φ": "Φ", # Phi
|
||||
"Χ": "Χ", # Chi
|
||||
"Ψ": "Ψ", # Psi
|
||||
"Ω": "Ω", # Omega
|
||||
# Math symbols
|
||||
'∅': '∅', # emptyset
|
||||
'∈': '∈', # in
|
||||
'∉': '∉', # notin
|
||||
'∋': '∋', # ni
|
||||
'∌': '∌', # nni
|
||||
'∑': '∑', # sum
|
||||
'∏': '∏', # prod
|
||||
'√': '√', # sqrt
|
||||
'∛': '∛', # cbrt
|
||||
'∜': '∜', # fourthroot
|
||||
'∞': '∞', # infty
|
||||
'∩': '∩', # cap
|
||||
'∪': '∪', # cup
|
||||
'∫': '∫', # int
|
||||
'∬': '∬', # iint
|
||||
'∭': '∭', # iiint
|
||||
'∮': '∮', # oint
|
||||
'⊂': '⊂', # subset
|
||||
'⊃': '⊃', # supset
|
||||
'⊄': '⊄', # nsubset
|
||||
'⊅': '⊅', # nsupset
|
||||
'⊆': '⊆', # subseteq
|
||||
'⊇': '⊇', # supseteq
|
||||
'⊈': '⊈', # nsubseteq
|
||||
'⊉': '⊉', # nsupseteq
|
||||
'≤': '≤', # leq
|
||||
'≥': '≥', # geq
|
||||
'≠': '≠', # neq
|
||||
'≡': '≡', # equiv
|
||||
'≈': '≈', # approx
|
||||
'≃': '≃', # simeq
|
||||
'≅': '≅', # cong
|
||||
'∂': '∂', # partial
|
||||
'∇': '∇', # nabla
|
||||
'∀': '∀', # forall
|
||||
'∃': '∃', # exists
|
||||
'∄': '∄', # nexists
|
||||
'¬': '¬', # neg/lnot
|
||||
'∧': '∧', # wedge/land
|
||||
'∨': '∨', # vee/lor
|
||||
'→': '→', # to/rightarrow
|
||||
'←': '←', # leftarrow
|
||||
'↔': '↔', # leftrightarrow
|
||||
'⇒': '⇒', # Rightarrow
|
||||
'⇐': '⇐', # Leftarrow
|
||||
'⇔': '⇔', # Leftrightarrow
|
||||
'↑': '↑', # uparrow
|
||||
'↓': '↓', # downarrow
|
||||
'⇑': '⇑', # Uparrow
|
||||
'⇓': '⇓', # Downarrow
|
||||
'↕': '↕', # updownarrow
|
||||
'⇕': '⇕', # Updownarrow
|
||||
'≠': '≠', # ne
|
||||
'≪': '≪', # ll
|
||||
'≫': '≫', # gg
|
||||
'⩽': '⩽', # leqslant
|
||||
'⩾': '⩾', # geqslant
|
||||
'⊥': '⊥', # perp
|
||||
'∥': '∥', # parallel
|
||||
'∠': '∠', # angle
|
||||
'△': '△', # triangle
|
||||
'□': '□', # square
|
||||
'◊': '◊', # diamond
|
||||
'♠': '♠', # spadesuit
|
||||
'♡': '♡', # heartsuit
|
||||
'♢': '♢', # diamondsuit
|
||||
'♣': '♣', # clubsuit
|
||||
'ℓ': 'ℓ', # ell
|
||||
'℘': '℘', # wp (Weierstrass p)
|
||||
'ℜ': 'ℜ', # Re (real part)
|
||||
'ℑ': 'ℑ', # Im (imaginary part)
|
||||
'ℵ': 'ℵ', # aleph
|
||||
'ℶ': 'ℶ', # beth
|
||||
"∅": "∅", # emptyset
|
||||
"∈": "∈", # in
|
||||
"∉": "∉", # notin
|
||||
"∋": "∋", # ni
|
||||
"∌": "∌", # nni
|
||||
"∑": "∑", # sum
|
||||
"∏": "∏", # prod
|
||||
"√": "√", # sqrt
|
||||
"∛": "∛", # cbrt
|
||||
"∜": "∜", # fourthroot
|
||||
"∞": "∞", # infty
|
||||
"∩": "∩", # cap
|
||||
"∪": "∪", # cup
|
||||
"∫": "∫", # int
|
||||
"∬": "∬", # iint
|
||||
"∭": "∭", # iiint
|
||||
"∮": "∮", # oint
|
||||
"⊂": "⊂", # subset
|
||||
"⊃": "⊃", # supset
|
||||
"⊄": "⊄", # nsubset
|
||||
"⊅": "⊅", # nsupset
|
||||
"⊆": "⊆", # subseteq
|
||||
"⊇": "⊇", # supseteq
|
||||
"⊈": "⊈", # nsubseteq
|
||||
"⊉": "⊉", # nsupseteq
|
||||
"≤": "≤", # leq
|
||||
"≥": "≥", # geq
|
||||
"≠": "≠", # neq
|
||||
"≡": "≡", # equiv
|
||||
"≈": "≈", # approx
|
||||
"≃": "≃", # simeq
|
||||
"≅": "≅", # cong
|
||||
"∂": "∂", # partial
|
||||
"∇": "∇", # nabla
|
||||
"∀": "∀", # forall
|
||||
"∃": "∃", # exists
|
||||
"∄": "∄", # nexists
|
||||
"¬": "¬", # neg/lnot
|
||||
"∧": "∧", # wedge/land
|
||||
"∨": "∨", # vee/lor
|
||||
"→": "→", # to/rightarrow
|
||||
"←": "←", # leftarrow
|
||||
"↔": "↔", # leftrightarrow
|
||||
"⇒": "⇒", # Rightarrow
|
||||
"⇐": "⇐", # Leftarrow
|
||||
"⇔": "⇔", # Leftrightarrow
|
||||
"↑": "↑", # uparrow
|
||||
"↓": "↓", # downarrow
|
||||
"⇑": "⇑", # Uparrow
|
||||
"⇓": "⇓", # Downarrow
|
||||
"↕": "↕", # updownarrow
|
||||
"⇕": "⇕", # Updownarrow
|
||||
"≠": "≠", # ne
|
||||
"≪": "≪", # ll
|
||||
"≫": "≫", # gg
|
||||
"⩽": "⩽", # leqslant
|
||||
"⩾": "⩾", # geqslant
|
||||
"⊥": "⊥", # perp
|
||||
"∥": "∥", # parallel
|
||||
"∠": "∠", # angle
|
||||
"△": "△", # triangle
|
||||
"□": "□", # square
|
||||
"◊": "◊", # diamond
|
||||
"♠": "♠", # spadesuit
|
||||
"♡": "♡", # heartsuit
|
||||
"♢": "♢", # diamondsuit
|
||||
"♣": "♣", # clubsuit
|
||||
"ℓ": "ℓ", # ell
|
||||
"℘": "℘", # wp (Weierstrass p)
|
||||
"ℜ": "ℜ", # Re (real part)
|
||||
"ℑ": "ℑ", # Im (imaginary part)
|
||||
"ℵ": "ℵ", # aleph
|
||||
"ℶ": "ℶ", # beth
|
||||
}
|
||||
|
||||
for entity, char in unicode_map.items():
|
||||
@@ -581,43 +612,44 @@ class Converter:
|
||||
# Also handle decimal entity format (&#NNNN;) for common characters
|
||||
# Convert decimal to hex-based lookup
|
||||
decimal_patterns = [
|
||||
(r'λ', 'λ'), # lambda (decimal 955 = hex 03BB)
|
||||
(r'⋮', '⋮'), # vdots (decimal 8942 = hex 22EE)
|
||||
(r'⋯', '⋯'), # cdots (decimal 8943 = hex 22EF)
|
||||
(r'…', '…'), # ldots (decimal 8230 = hex 2026)
|
||||
(r'∞', '∞'), # infty (decimal 8734 = hex 221E)
|
||||
(r'∑', '∑'), # sum (decimal 8721 = hex 2211)
|
||||
(r'∏', '∏'), # prod (decimal 8719 = hex 220F)
|
||||
(r'√', '√'), # sqrt (decimal 8730 = hex 221A)
|
||||
(r'∈', '∈'), # in (decimal 8712 = hex 2208)
|
||||
(r'∉', '∉'), # notin (decimal 8713 = hex 2209)
|
||||
(r'∩', '∩'), # cap (decimal 8745 = hex 2229)
|
||||
(r'∪', '∪'), # cup (decimal 8746 = hex 222A)
|
||||
(r'≤', '≤'), # leq (decimal 8804 = hex 2264)
|
||||
(r'≥', '≥'), # geq (decimal 8805 = hex 2265)
|
||||
(r'≠', '≠'), # neq (decimal 8800 = hex 2260)
|
||||
(r'≈', '≈'), # approx (decimal 8776 = hex 2248)
|
||||
(r'≡', '≡'), # equiv (decimal 8801 = hex 2261)
|
||||
(r"λ", "λ"), # lambda (decimal 955 = hex 03BB)
|
||||
(r"⋮", "⋮"), # vdots (decimal 8942 = hex 22EE)
|
||||
(r"⋯", "⋯"), # cdots (decimal 8943 = hex 22EF)
|
||||
(r"…", "…"), # ldots (decimal 8230 = hex 2026)
|
||||
(r"∞", "∞"), # infty (decimal 8734 = hex 221E)
|
||||
(r"∑", "∑"), # sum (decimal 8721 = hex 2211)
|
||||
(r"∏", "∏"), # prod (decimal 8719 = hex 220F)
|
||||
(r"√", "√"), # sqrt (decimal 8730 = hex 221A)
|
||||
(r"∈", "∈"), # in (decimal 8712 = hex 2208)
|
||||
(r"∉", "∉"), # notin (decimal 8713 = hex 2209)
|
||||
(r"∩", "∩"), # cap (decimal 8745 = hex 2229)
|
||||
(r"∪", "∪"), # cup (decimal 8746 = hex 222A)
|
||||
(r"≤", "≤"), # leq (decimal 8804 = hex 2264)
|
||||
(r"≥", "≥"), # geq (decimal 8805 = hex 2265)
|
||||
(r"≠", "≠"), # neq (decimal 8800 = hex 2260)
|
||||
(r"≈", "≈"), # approx (decimal 8776 = hex 2248)
|
||||
(r"≡", "≡"), # equiv (decimal 8801 = hex 2261)
|
||||
]
|
||||
|
||||
for pattern, char in decimal_patterns:
|
||||
mathml = mathml.replace(pattern, char)
|
||||
|
||||
# Step 8: Clean up extra whitespace
|
||||
mathml = re.sub(r'>\s+<', '><', mathml)
|
||||
mathml = re.sub(r">\s+<", "><", mathml)
|
||||
|
||||
return mathml
|
||||
|
||||
def _latex_to_mathml(self, latex_formula: str) -> str:
|
||||
def _latex_to_mathml(self, latex_formula: str, is_display: bool = False) -> str:
|
||||
"""Convert LaTeX formula to standard MathML.
|
||||
|
||||
Args:
|
||||
latex_formula: Pure LaTeX formula (without delimiters).
|
||||
is_display: True if display (block) formula, False if inline.
|
||||
|
||||
Returns:
|
||||
Standard MathML representation.
|
||||
"""
|
||||
return self._latex_to_mathml_cached(latex_formula)
|
||||
return self._latex_to_mathml_cached(latex_formula, is_display=is_display)
|
||||
|
||||
def _mathml_to_mml(self, mathml: str) -> str:
|
||||
"""Convert standard MathML to mml:math format with namespace prefix.
|
||||
|
||||
@@ -41,12 +41,23 @@ _COMMANDS_NEED_SPACE = {
|
||||
"log",
|
||||
"ln",
|
||||
"exp",
|
||||
# set relations (often glued by OCR)
|
||||
"in",
|
||||
"notin",
|
||||
"subset",
|
||||
"supset",
|
||||
"subseteq",
|
||||
"supseteq",
|
||||
"cap",
|
||||
"cup",
|
||||
# misc
|
||||
"partial",
|
||||
"nabla",
|
||||
}
|
||||
|
||||
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
||||
# Match LaTeX commands: \command (greedy match all letters)
|
||||
# The splitting logic in _split_glued_command_token will handle \inX -> \in X
|
||||
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
|
||||
|
||||
# stage2: differentials inside math segments
|
||||
@@ -65,6 +76,7 @@ def _split_glued_command_token(token: str) -> str:
|
||||
Examples:
|
||||
- \\cdotdS -> \\cdot dS
|
||||
- \\intdx -> \\int dx
|
||||
- \\inX -> \\in X (stop at uppercase letter)
|
||||
"""
|
||||
if not token.startswith("\\"):
|
||||
return token
|
||||
@@ -74,8 +86,8 @@ def _split_glued_command_token(token: str) -> str:
|
||||
return token
|
||||
|
||||
best = None
|
||||
# longest prefix that is in whitelist
|
||||
for i in range(1, len(body)):
|
||||
# Find longest prefix that is in whitelist
|
||||
for i in range(1, len(body) + 1):
|
||||
prefix = body[:i]
|
||||
if prefix in _COMMANDS_NEED_SPACE:
|
||||
best = prefix
|
||||
@@ -117,12 +129,22 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
|
||||
# Pattern 2: Spaces inside braces that follow _ or ^
|
||||
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
|
||||
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
|
||||
# BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them
|
||||
# must be preserved as they serve as command terminators (\in X != \inX)
|
||||
def clean_subscript_superscript_braces(match):
|
||||
operator = match.group(1) # _ or ^
|
||||
content = match.group(2) # content inside braces
|
||||
# Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
|
||||
# Only remove spaces between non-backslash characters
|
||||
cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content)
|
||||
if "\\" not in content:
|
||||
# No LaTeX commands: safe to remove all spaces
|
||||
cleaned = re.sub(r"\s+", "", content)
|
||||
else:
|
||||
# Contains LaTeX commands: remove spaces carefully
|
||||
# Keep spaces that follow a LaTeX command (e.g., \in X must keep the space)
|
||||
# Remove spaces everywhere else (e.g., x \in -> x\in is fine)
|
||||
# Strategy: remove spaces before \ and between non-command chars,
|
||||
# but preserve the space after \command when followed by a non-\ char
|
||||
cleaned = re.sub(r"\s+(?=\\)", "", content) # remove space before \cmd
|
||||
cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned) # remove space after non-letter non-\
|
||||
return f"{operator}{{{cleaned}}}"
|
||||
|
||||
# Match _{ ... } or ^{ ... }
|
||||
@@ -156,7 +178,7 @@ def _postprocess_math(expr: str) -> str:
|
||||
|
||||
Processing stages:
|
||||
0. Fix OCR number errors (spaces in numbers)
|
||||
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
|
||||
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS, \\inX -> \\in X)
|
||||
2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
|
||||
3. Normalize differentials (DISABLED by default to avoid breaking variables)
|
||||
|
||||
@@ -169,7 +191,7 @@ def _postprocess_math(expr: str) -> str:
|
||||
# stage0: fix OCR number errors (digits with spaces)
|
||||
expr = _fix_ocr_number_errors(expr)
|
||||
|
||||
# stage1: split glued command tokens (e.g. \cdotdS)
|
||||
# stage1: split glued command tokens (e.g. \cdotdS, \inX)
|
||||
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
|
||||
|
||||
# stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)
|
||||
|
||||
Reference in New Issue
Block a user