fix: post-handling for OCR
This commit is contained in:
@@ -136,6 +136,7 @@ class Converter:
|
|||||||
"""Get cached XSLT transform for MathML to mml: conversion."""
|
"""Get cached XSLT transform for MathML to mml: conversion."""
|
||||||
if cls._mml_xslt_transform is None:
|
if cls._mml_xslt_transform is None:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
|
xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
|
||||||
cls._mml_xslt_transform = etree.XSLT(xslt_doc)
|
cls._mml_xslt_transform = etree.XSLT(xslt_doc)
|
||||||
return cls._mml_xslt_transform
|
return cls._mml_xslt_transform
|
||||||
@@ -197,14 +198,17 @@ class Converter:
|
|||||||
return ConvertResult(latex="", mathml="", mml="")
|
return ConvertResult(latex="", mathml="", mml="")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Detect if formula is display (block) or inline
|
||||||
|
is_display = self._is_display_formula(md_text)
|
||||||
|
|
||||||
# Extract the LaTeX formula content (remove delimiters)
|
# Extract the LaTeX formula content (remove delimiters)
|
||||||
latex_formula = self._extract_latex_formula(md_text)
|
latex_formula = self._extract_latex_formula(md_text)
|
||||||
|
|
||||||
# Preprocess formula for better conversion (fix array specifiers, etc.)
|
# Preprocess formula for better conversion (fix array specifiers, etc.)
|
||||||
preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula)
|
preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula)
|
||||||
|
|
||||||
# Convert to MathML
|
# Convert to MathML (pass display flag to use correct delimiters)
|
||||||
mathml = self._latex_to_mathml(preprocessed_formula)
|
mathml = self._latex_to_mathml(preprocessed_formula, is_display=is_display)
|
||||||
|
|
||||||
# Convert MathML to mml:math format (with namespace prefix)
|
# Convert MathML to mml:math format (with namespace prefix)
|
||||||
mml = self._mathml_to_mml(mathml)
|
mml = self._mathml_to_mml(mathml)
|
||||||
@@ -248,8 +252,8 @@ class Converter:
|
|||||||
consistency across all conversion paths. This fixes common issues that
|
consistency across all conversion paths. This fixes common issues that
|
||||||
cause Pandoc conversion to fail.
|
cause Pandoc conversion to fail.
|
||||||
|
|
||||||
Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py),
|
Note: OCR errors (number errors, command spacing) are fixed earlier in the
|
||||||
so we don't need to handle them here.
|
pipeline (in ocr_service.py), so we don't need to handle them here.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
latex_formula: Pure LaTeX formula.
|
latex_formula: Pure LaTeX formula.
|
||||||
@@ -271,6 +275,26 @@ class Converter:
|
|||||||
|
|
||||||
return latex_formula
|
return latex_formula
|
||||||
|
|
||||||
|
def _is_display_formula(self, text: str) -> bool:
|
||||||
|
"""Check if the formula is a display (block) formula.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Text containing LaTeX formula with delimiters.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if display formula ($$...$$ or \\[...\\]), False if inline.
|
||||||
|
"""
|
||||||
|
text = text.strip()
|
||||||
|
|
||||||
|
# Display math delimiters: $$...$$ or \[...\]
|
||||||
|
if text.startswith("$$") and text.endswith("$$"):
|
||||||
|
return True
|
||||||
|
if text.startswith("\\[") and text.endswith("\\]"):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Inline math delimiters: $...$ or \(...\)
|
||||||
|
return False
|
||||||
|
|
||||||
def _extract_latex_formula(self, text: str) -> str:
|
def _extract_latex_formula(self, text: str) -> str:
|
||||||
"""Extract LaTeX formula from text by removing delimiters.
|
"""Extract LaTeX formula from text by removing delimiters.
|
||||||
|
|
||||||
@@ -299,18 +323,30 @@ class Converter:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@lru_cache(maxsize=256)
|
@lru_cache(maxsize=256)
|
||||||
def _latex_to_mathml_cached(latex_formula: str) -> str:
|
def _latex_to_mathml_cached(latex_formula: str, is_display: bool = False) -> str:
|
||||||
"""Cached conversion of LaTeX formula to MathML.
|
"""Cached conversion of LaTeX formula to MathML.
|
||||||
|
|
||||||
Uses Pandoc for conversion to ensure Word compatibility.
|
Uses Pandoc for conversion to ensure Word compatibility.
|
||||||
Pandoc generates standard MathML that Word can properly import.
|
Pandoc generates standard MathML that Word can properly import.
|
||||||
|
|
||||||
Uses LRU cache to avoid recomputing for repeated formulas.
|
Args:
|
||||||
|
latex_formula: Pure LaTeX formula (without delimiters).
|
||||||
|
is_display: True if display (block) formula, False if inline.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Standard MathML representation.
|
||||||
"""
|
"""
|
||||||
|
# Use appropriate delimiters based on formula type
|
||||||
|
# Display formulas use $$...$$, inline formulas use $...$
|
||||||
|
if is_display:
|
||||||
|
pandoc_input = f"$${latex_formula}$$"
|
||||||
|
else:
|
||||||
|
pandoc_input = f"${latex_formula}$"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Use Pandoc for Word-compatible MathML (primary method)
|
# Use Pandoc for Word-compatible MathML (primary method)
|
||||||
mathml_html = pypandoc.convert_text(
|
mathml_html = pypandoc.convert_text(
|
||||||
f"${latex_formula}$",
|
pandoc_input,
|
||||||
"html",
|
"html",
|
||||||
format="markdown+tex_math_dollars",
|
format="markdown+tex_math_dollars",
|
||||||
extra_args=["--mathml"],
|
extra_args=["--mathml"],
|
||||||
@@ -322,8 +358,9 @@ class Converter:
|
|||||||
# Post-process for Word compatibility
|
# Post-process for Word compatibility
|
||||||
return Converter._postprocess_mathml_for_word(mathml)
|
return Converter._postprocess_mathml_for_word(mathml)
|
||||||
|
|
||||||
# If no match, return as-is
|
# If Pandoc didn't generate MathML (returned HTML instead), use fallback
|
||||||
return mathml_html.rstrip("\n")
|
# This happens when Pandoc's mathml output format is not available or fails
|
||||||
|
raise ValueError("Pandoc did not generate MathML, got HTML instead")
|
||||||
|
|
||||||
except Exception as pandoc_error:
|
except Exception as pandoc_error:
|
||||||
# Fallback: try latex2mathml (less Word-compatible)
|
# Fallback: try latex2mathml (less Word-compatible)
|
||||||
@@ -331,9 +368,7 @@ class Converter:
|
|||||||
mathml = latex_to_mathml(latex_formula)
|
mathml = latex_to_mathml(latex_formula)
|
||||||
return Converter._postprocess_mathml_for_word(mathml)
|
return Converter._postprocess_mathml_for_word(mathml)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}") from e
|
||||||
f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
|
|
||||||
) from e
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _postprocess_mathml_for_word(mathml: str) -> str:
|
def _postprocess_mathml_for_word(mathml: str) -> str:
|
||||||
@@ -357,20 +392,20 @@ class Converter:
|
|||||||
|
|
||||||
# Step 1: Remove <semantics> and <annotation> wrappers
|
# Step 1: Remove <semantics> and <annotation> wrappers
|
||||||
# These often cause Word import issues
|
# These often cause Word import issues
|
||||||
if '<semantics>' in mathml:
|
if "<semantics>" in mathml:
|
||||||
# Extract content between <semantics> and <annotation>
|
# Extract content between <semantics> and <annotation>
|
||||||
match = re.search(r'<semantics>(.*?)<annotation', mathml, re.DOTALL)
|
match = re.search(r"<semantics>(.*?)<annotation", mathml, re.DOTALL)
|
||||||
if match:
|
if match:
|
||||||
content = match.group(1).strip()
|
content = match.group(1).strip()
|
||||||
|
|
||||||
# Get the math element attributes
|
# Get the math element attributes
|
||||||
math_attrs = ""
|
math_attrs = ""
|
||||||
math_match = re.search(r'<math([^>]*)>', mathml)
|
math_match = re.search(r"<math([^>]*)>", mathml)
|
||||||
if math_match:
|
if math_match:
|
||||||
math_attrs = math_match.group(1)
|
math_attrs = math_match.group(1)
|
||||||
|
|
||||||
# Rebuild without semantics
|
# Rebuild without semantics
|
||||||
mathml = f'<math{math_attrs}>{content}</math>'
|
mathml = f"<math{math_attrs}>{content}</math>"
|
||||||
|
|
||||||
# Step 2: Remove unnecessary attributes that don't affect rendering
|
# Step 2: Remove unnecessary attributes that don't affect rendering
|
||||||
# These are verbose and Word doesn't need them
|
# These are verbose and Word doesn't need them
|
||||||
@@ -392,187 +427,183 @@ class Converter:
|
|||||||
]
|
]
|
||||||
|
|
||||||
for attr_pattern in unnecessary_attrs:
|
for attr_pattern in unnecessary_attrs:
|
||||||
mathml = re.sub(attr_pattern, '', mathml)
|
mathml = re.sub(attr_pattern, "", mathml)
|
||||||
|
|
||||||
# Step 3: Remove redundant single <mrow> wrapper at the top level
|
# Step 3: Remove redundant single <mrow> wrapper at the top level
|
||||||
# Pattern: <math ...><mrow>content</mrow></math>
|
# Pattern: <math ...><mrow>content</mrow></math>
|
||||||
# Simplify to: <math ...>content</math>
|
# Simplify to: <math ...>content</math>
|
||||||
mrow_pattern = r'(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)'
|
mrow_pattern = r"(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)"
|
||||||
match = re.search(mrow_pattern, mathml, re.DOTALL)
|
match = re.search(mrow_pattern, mathml, re.DOTALL)
|
||||||
if match:
|
if match:
|
||||||
# Check if there's only one mrow at the top level
|
# Check if there's only one mrow at the top level
|
||||||
content = match.group(2)
|
content = match.group(2)
|
||||||
# Only remove if the content doesn't have other top-level elements
|
# Only remove if the content doesn't have other top-level elements
|
||||||
if not re.search(r'</[^>]+>\s*<[^/]', content):
|
if not re.search(r"</[^>]+>\s*<[^/]", content):
|
||||||
mathml = f'{match.group(1)}{content}{match.group(3)}'
|
mathml = f"{match.group(1)}{content}{match.group(3)}"
|
||||||
|
|
||||||
# Step 4: Change display to block for better Word rendering
|
# Step 4: Change display to block for better Word rendering
|
||||||
mathml = mathml.replace('display="inline"', 'display="block"')
|
mathml = mathml.replace('display="inline"', 'display="block"')
|
||||||
|
|
||||||
# Step 5: If no display attribute, add it
|
# Step 5: If no display attribute, add it
|
||||||
if 'display=' not in mathml and '<math' in mathml:
|
if "display=" not in mathml and "<math" in mathml:
|
||||||
mathml = mathml.replace('<math', '<math display="block"', 1)
|
mathml = mathml.replace("<math", '<math display="block"', 1)
|
||||||
|
|
||||||
# Step 6: Ensure xmlns is present
|
# Step 6: Ensure xmlns is present
|
||||||
if 'xmlns=' not in mathml and '<math' in mathml:
|
if "xmlns=" not in mathml and "<math" in mathml:
|
||||||
mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
|
mathml = mathml.replace("<math", '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
|
||||||
|
|
||||||
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
|
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
|
||||||
unicode_map = {
|
unicode_map = {
|
||||||
# Basic operators
|
# Basic operators
|
||||||
'+': '+',
|
"+": "+",
|
||||||
'-': '-',
|
"-": "-",
|
||||||
'*': '*',
|
"*": "*",
|
||||||
'/': '/',
|
"/": "/",
|
||||||
'=': '=',
|
"=": "=",
|
||||||
'<': '<',
|
"<": "<",
|
||||||
'>': '>',
|
">": ">",
|
||||||
'(': '(',
|
"(": "(",
|
||||||
')': ')',
|
")": ")",
|
||||||
',': ',',
|
",": ",",
|
||||||
'.': '.',
|
".": ".",
|
||||||
'|': '|',
|
"|": "|",
|
||||||
'°': '°',
|
"°": "°",
|
||||||
'×': '×', # times
|
"×": "×", # times
|
||||||
'÷': '÷', # div
|
"÷": "÷", # div
|
||||||
'±': '±', # pm
|
"±": "±", # pm
|
||||||
'∓': '∓', # mp
|
"∓": "∓", # mp
|
||||||
|
|
||||||
# Ellipsis symbols
|
# Ellipsis symbols
|
||||||
'…': '…', # ldots (horizontal)
|
"…": "…", # ldots (horizontal)
|
||||||
'⋮': '⋮', # vdots (vertical)
|
"⋮": "⋮", # vdots (vertical)
|
||||||
'⋯': '⋯', # cdots (centered)
|
"⋯": "⋯", # cdots (centered)
|
||||||
'⋰': '⋰', # iddots (diagonal up)
|
"⋰": "⋰", # iddots (diagonal up)
|
||||||
'⋱': '⋱', # ddots (diagonal down)
|
"⋱": "⋱", # ddots (diagonal down)
|
||||||
|
|
||||||
# Greek letters (lowercase)
|
# Greek letters (lowercase)
|
||||||
'α': 'α', # alpha
|
"α": "α", # alpha
|
||||||
'β': 'β', # beta
|
"β": "β", # beta
|
||||||
'γ': 'γ', # gamma
|
"γ": "γ", # gamma
|
||||||
'δ': 'δ', # delta
|
"δ": "δ", # delta
|
||||||
'ε': 'ε', # epsilon
|
"ε": "ε", # epsilon
|
||||||
'ζ': 'ζ', # zeta
|
"ζ": "ζ", # zeta
|
||||||
'η': 'η', # eta
|
"η": "η", # eta
|
||||||
'θ': 'θ', # theta
|
"θ": "θ", # theta
|
||||||
'ι': 'ι', # iota
|
"ι": "ι", # iota
|
||||||
'κ': 'κ', # kappa
|
"κ": "κ", # kappa
|
||||||
'λ': 'λ', # lambda
|
"λ": "λ", # lambda
|
||||||
'μ': 'μ', # mu
|
"μ": "μ", # mu
|
||||||
'ν': 'ν', # nu
|
"ν": "ν", # nu
|
||||||
'ξ': 'ξ', # xi
|
"ξ": "ξ", # xi
|
||||||
'ο': 'ο', # omicron
|
"ο": "ο", # omicron
|
||||||
'π': 'π', # pi
|
"π": "π", # pi
|
||||||
'ρ': 'ρ', # rho
|
"ρ": "ρ", # rho
|
||||||
'ς': 'ς', # final sigma
|
"ς": "ς", # final sigma
|
||||||
'σ': 'σ', # sigma
|
"σ": "σ", # sigma
|
||||||
'τ': 'τ', # tau
|
"τ": "τ", # tau
|
||||||
'υ': 'υ', # upsilon
|
"υ": "υ", # upsilon
|
||||||
'φ': 'φ', # phi
|
"φ": "φ", # phi
|
||||||
'χ': 'χ', # chi
|
"χ": "χ", # chi
|
||||||
'ψ': 'ψ', # psi
|
"ψ": "ψ", # psi
|
||||||
'ω': 'ω', # omega
|
"ω": "ω", # omega
|
||||||
'ϕ': 'ϕ', # phi variant
|
"ϕ": "ϕ", # phi variant
|
||||||
|
|
||||||
# Greek letters (uppercase)
|
# Greek letters (uppercase)
|
||||||
'Α': 'Α', # Alpha
|
"Α": "Α", # Alpha
|
||||||
'Β': 'Β', # Beta
|
"Β": "Β", # Beta
|
||||||
'Γ': 'Γ', # Gamma
|
"Γ": "Γ", # Gamma
|
||||||
'Δ': 'Δ', # Delta
|
"Δ": "Δ", # Delta
|
||||||
'Ε': 'Ε', # Epsilon
|
"Ε": "Ε", # Epsilon
|
||||||
'Ζ': 'Ζ', # Zeta
|
"Ζ": "Ζ", # Zeta
|
||||||
'Η': 'Η', # Eta
|
"Η": "Η", # Eta
|
||||||
'Θ': 'Θ', # Theta
|
"Θ": "Θ", # Theta
|
||||||
'Ι': 'Ι', # Iota
|
"Ι": "Ι", # Iota
|
||||||
'Κ': 'Κ', # Kappa
|
"Κ": "Κ", # Kappa
|
||||||
'Λ': 'Λ', # Lambda
|
"Λ": "Λ", # Lambda
|
||||||
'Μ': 'Μ', # Mu
|
"Μ": "Μ", # Mu
|
||||||
'Ν': 'Ν', # Nu
|
"Ν": "Ν", # Nu
|
||||||
'Ξ': 'Ξ', # Xi
|
"Ξ": "Ξ", # Xi
|
||||||
'Ο': 'Ο', # Omicron
|
"Ο": "Ο", # Omicron
|
||||||
'Π': 'Π', # Pi
|
"Π": "Π", # Pi
|
||||||
'Ρ': 'Ρ', # Rho
|
"Ρ": "Ρ", # Rho
|
||||||
'Σ': 'Σ', # Sigma
|
"Σ": "Σ", # Sigma
|
||||||
'Τ': 'Τ', # Tau
|
"Τ": "Τ", # Tau
|
||||||
'Υ': 'Υ', # Upsilon
|
"Υ": "Υ", # Upsilon
|
||||||
'Φ': 'Φ', # Phi
|
"Φ": "Φ", # Phi
|
||||||
'Χ': 'Χ', # Chi
|
"Χ": "Χ", # Chi
|
||||||
'Ψ': 'Ψ', # Psi
|
"Ψ": "Ψ", # Psi
|
||||||
'Ω': 'Ω', # Omega
|
"Ω": "Ω", # Omega
|
||||||
|
|
||||||
# Math symbols
|
# Math symbols
|
||||||
'∅': '∅', # emptyset
|
"∅": "∅", # emptyset
|
||||||
'∈': '∈', # in
|
"∈": "∈", # in
|
||||||
'∉': '∉', # notin
|
"∉": "∉", # notin
|
||||||
'∋': '∋', # ni
|
"∋": "∋", # ni
|
||||||
'∌': '∌', # nni
|
"∌": "∌", # nni
|
||||||
'∑': '∑', # sum
|
"∑": "∑", # sum
|
||||||
'∏': '∏', # prod
|
"∏": "∏", # prod
|
||||||
'√': '√', # sqrt
|
"√": "√", # sqrt
|
||||||
'∛': '∛', # cbrt
|
"∛": "∛", # cbrt
|
||||||
'∜': '∜', # fourthroot
|
"∜": "∜", # fourthroot
|
||||||
'∞': '∞', # infty
|
"∞": "∞", # infty
|
||||||
'∩': '∩', # cap
|
"∩": "∩", # cap
|
||||||
'∪': '∪', # cup
|
"∪": "∪", # cup
|
||||||
'∫': '∫', # int
|
"∫": "∫", # int
|
||||||
'∬': '∬', # iint
|
"∬": "∬", # iint
|
||||||
'∭': '∭', # iiint
|
"∭": "∭", # iiint
|
||||||
'∮': '∮', # oint
|
"∮": "∮", # oint
|
||||||
'⊂': '⊂', # subset
|
"⊂": "⊂", # subset
|
||||||
'⊃': '⊃', # supset
|
"⊃": "⊃", # supset
|
||||||
'⊄': '⊄', # nsubset
|
"⊄": "⊄", # nsubset
|
||||||
'⊅': '⊅', # nsupset
|
"⊅": "⊅", # nsupset
|
||||||
'⊆': '⊆', # subseteq
|
"⊆": "⊆", # subseteq
|
||||||
'⊇': '⊇', # supseteq
|
"⊇": "⊇", # supseteq
|
||||||
'⊈': '⊈', # nsubseteq
|
"⊈": "⊈", # nsubseteq
|
||||||
'⊉': '⊉', # nsupseteq
|
"⊉": "⊉", # nsupseteq
|
||||||
'≤': '≤', # leq
|
"≤": "≤", # leq
|
||||||
'≥': '≥', # geq
|
"≥": "≥", # geq
|
||||||
'≠': '≠', # neq
|
"≠": "≠", # neq
|
||||||
'≡': '≡', # equiv
|
"≡": "≡", # equiv
|
||||||
'≈': '≈', # approx
|
"≈": "≈", # approx
|
||||||
'≃': '≃', # simeq
|
"≃": "≃", # simeq
|
||||||
'≅': '≅', # cong
|
"≅": "≅", # cong
|
||||||
'∂': '∂', # partial
|
"∂": "∂", # partial
|
||||||
'∇': '∇', # nabla
|
"∇": "∇", # nabla
|
||||||
'∀': '∀', # forall
|
"∀": "∀", # forall
|
||||||
'∃': '∃', # exists
|
"∃": "∃", # exists
|
||||||
'∄': '∄', # nexists
|
"∄": "∄", # nexists
|
||||||
'¬': '¬', # neg/lnot
|
"¬": "¬", # neg/lnot
|
||||||
'∧': '∧', # wedge/land
|
"∧": "∧", # wedge/land
|
||||||
'∨': '∨', # vee/lor
|
"∨": "∨", # vee/lor
|
||||||
'→': '→', # to/rightarrow
|
"→": "→", # to/rightarrow
|
||||||
'←': '←', # leftarrow
|
"←": "←", # leftarrow
|
||||||
'↔': '↔', # leftrightarrow
|
"↔": "↔", # leftrightarrow
|
||||||
'⇒': '⇒', # Rightarrow
|
"⇒": "⇒", # Rightarrow
|
||||||
'⇐': '⇐', # Leftarrow
|
"⇐": "⇐", # Leftarrow
|
||||||
'⇔': '⇔', # Leftrightarrow
|
"⇔": "⇔", # Leftrightarrow
|
||||||
'↑': '↑', # uparrow
|
"↑": "↑", # uparrow
|
||||||
'↓': '↓', # downarrow
|
"↓": "↓", # downarrow
|
||||||
'⇑': '⇑', # Uparrow
|
"⇑": "⇑", # Uparrow
|
||||||
'⇓': '⇓', # Downarrow
|
"⇓": "⇓", # Downarrow
|
||||||
'↕': '↕', # updownarrow
|
"↕": "↕", # updownarrow
|
||||||
'⇕': '⇕', # Updownarrow
|
"⇕": "⇕", # Updownarrow
|
||||||
'≠': '≠', # ne
|
"≠": "≠", # ne
|
||||||
'≪': '≪', # ll
|
"≪": "≪", # ll
|
||||||
'≫': '≫', # gg
|
"≫": "≫", # gg
|
||||||
'⩽': '⩽', # leqslant
|
"⩽": "⩽", # leqslant
|
||||||
'⩾': '⩾', # geqslant
|
"⩾": "⩾", # geqslant
|
||||||
'⊥': '⊥', # perp
|
"⊥": "⊥", # perp
|
||||||
'∥': '∥', # parallel
|
"∥": "∥", # parallel
|
||||||
'∠': '∠', # angle
|
"∠": "∠", # angle
|
||||||
'△': '△', # triangle
|
"△": "△", # triangle
|
||||||
'□': '□', # square
|
"□": "□", # square
|
||||||
'◊': '◊', # diamond
|
"◊": "◊", # diamond
|
||||||
'♠': '♠', # spadesuit
|
"♠": "♠", # spadesuit
|
||||||
'♡': '♡', # heartsuit
|
"♡": "♡", # heartsuit
|
||||||
'♢': '♢', # diamondsuit
|
"♢": "♢", # diamondsuit
|
||||||
'♣': '♣', # clubsuit
|
"♣": "♣", # clubsuit
|
||||||
'ℓ': 'ℓ', # ell
|
"ℓ": "ℓ", # ell
|
||||||
'℘': '℘', # wp (Weierstrass p)
|
"℘": "℘", # wp (Weierstrass p)
|
||||||
'ℜ': 'ℜ', # Re (real part)
|
"ℜ": "ℜ", # Re (real part)
|
||||||
'ℑ': 'ℑ', # Im (imaginary part)
|
"ℑ": "ℑ", # Im (imaginary part)
|
||||||
'ℵ': 'ℵ', # aleph
|
"ℵ": "ℵ", # aleph
|
||||||
'ℶ': 'ℶ', # beth
|
"ℶ": "ℶ", # beth
|
||||||
}
|
}
|
||||||
|
|
||||||
for entity, char in unicode_map.items():
|
for entity, char in unicode_map.items():
|
||||||
@@ -581,43 +612,44 @@ class Converter:
|
|||||||
# Also handle decimal entity format (&#NNNN;) for common characters
|
# Also handle decimal entity format (&#NNNN;) for common characters
|
||||||
# Convert decimal to hex-based lookup
|
# Convert decimal to hex-based lookup
|
||||||
decimal_patterns = [
|
decimal_patterns = [
|
||||||
(r'λ', 'λ'), # lambda (decimal 955 = hex 03BB)
|
(r"λ", "λ"), # lambda (decimal 955 = hex 03BB)
|
||||||
(r'⋮', '⋮'), # vdots (decimal 8942 = hex 22EE)
|
(r"⋮", "⋮"), # vdots (decimal 8942 = hex 22EE)
|
||||||
(r'⋯', '⋯'), # cdots (decimal 8943 = hex 22EF)
|
(r"⋯", "⋯"), # cdots (decimal 8943 = hex 22EF)
|
||||||
(r'…', '…'), # ldots (decimal 8230 = hex 2026)
|
(r"…", "…"), # ldots (decimal 8230 = hex 2026)
|
||||||
(r'∞', '∞'), # infty (decimal 8734 = hex 221E)
|
(r"∞", "∞"), # infty (decimal 8734 = hex 221E)
|
||||||
(r'∑', '∑'), # sum (decimal 8721 = hex 2211)
|
(r"∑", "∑"), # sum (decimal 8721 = hex 2211)
|
||||||
(r'∏', '∏'), # prod (decimal 8719 = hex 220F)
|
(r"∏", "∏"), # prod (decimal 8719 = hex 220F)
|
||||||
(r'√', '√'), # sqrt (decimal 8730 = hex 221A)
|
(r"√", "√"), # sqrt (decimal 8730 = hex 221A)
|
||||||
(r'∈', '∈'), # in (decimal 8712 = hex 2208)
|
(r"∈", "∈"), # in (decimal 8712 = hex 2208)
|
||||||
(r'∉', '∉'), # notin (decimal 8713 = hex 2209)
|
(r"∉", "∉"), # notin (decimal 8713 = hex 2209)
|
||||||
(r'∩', '∩'), # cap (decimal 8745 = hex 2229)
|
(r"∩", "∩"), # cap (decimal 8745 = hex 2229)
|
||||||
(r'∪', '∪'), # cup (decimal 8746 = hex 222A)
|
(r"∪", "∪"), # cup (decimal 8746 = hex 222A)
|
||||||
(r'≤', '≤'), # leq (decimal 8804 = hex 2264)
|
(r"≤", "≤"), # leq (decimal 8804 = hex 2264)
|
||||||
(r'≥', '≥'), # geq (decimal 8805 = hex 2265)
|
(r"≥", "≥"), # geq (decimal 8805 = hex 2265)
|
||||||
(r'≠', '≠'), # neq (decimal 8800 = hex 2260)
|
(r"≠", "≠"), # neq (decimal 8800 = hex 2260)
|
||||||
(r'≈', '≈'), # approx (decimal 8776 = hex 2248)
|
(r"≈", "≈"), # approx (decimal 8776 = hex 2248)
|
||||||
(r'≡', '≡'), # equiv (decimal 8801 = hex 2261)
|
(r"≡", "≡"), # equiv (decimal 8801 = hex 2261)
|
||||||
]
|
]
|
||||||
|
|
||||||
for pattern, char in decimal_patterns:
|
for pattern, char in decimal_patterns:
|
||||||
mathml = mathml.replace(pattern, char)
|
mathml = mathml.replace(pattern, char)
|
||||||
|
|
||||||
# Step 8: Clean up extra whitespace
|
# Step 8: Clean up extra whitespace
|
||||||
mathml = re.sub(r'>\s+<', '><', mathml)
|
mathml = re.sub(r">\s+<", "><", mathml)
|
||||||
|
|
||||||
return mathml
|
return mathml
|
||||||
|
|
||||||
def _latex_to_mathml(self, latex_formula: str) -> str:
|
def _latex_to_mathml(self, latex_formula: str, is_display: bool = False) -> str:
|
||||||
"""Convert LaTeX formula to standard MathML.
|
"""Convert LaTeX formula to standard MathML.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
latex_formula: Pure LaTeX formula (without delimiters).
|
latex_formula: Pure LaTeX formula (without delimiters).
|
||||||
|
is_display: True if display (block) formula, False if inline.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Standard MathML representation.
|
Standard MathML representation.
|
||||||
"""
|
"""
|
||||||
return self._latex_to_mathml_cached(latex_formula)
|
return self._latex_to_mathml_cached(latex_formula, is_display=is_display)
|
||||||
|
|
||||||
def _mathml_to_mml(self, mathml: str) -> str:
|
def _mathml_to_mml(self, mathml: str) -> str:
|
||||||
"""Convert standard MathML to mml:math format with namespace prefix.
|
"""Convert standard MathML to mml:math format with namespace prefix.
|
||||||
|
|||||||
@@ -41,12 +41,23 @@ _COMMANDS_NEED_SPACE = {
|
|||||||
"log",
|
"log",
|
||||||
"ln",
|
"ln",
|
||||||
"exp",
|
"exp",
|
||||||
|
# set relations (often glued by OCR)
|
||||||
|
"in",
|
||||||
|
"notin",
|
||||||
|
"subset",
|
||||||
|
"supset",
|
||||||
|
"subseteq",
|
||||||
|
"supseteq",
|
||||||
|
"cap",
|
||||||
|
"cup",
|
||||||
# misc
|
# misc
|
||||||
"partial",
|
"partial",
|
||||||
"nabla",
|
"nabla",
|
||||||
}
|
}
|
||||||
|
|
||||||
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
||||||
|
# Match LaTeX commands: \command (greedy match all letters)
|
||||||
|
# The splitting logic in _split_glued_command_token will handle \inX -> \in X
|
||||||
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
|
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
|
||||||
|
|
||||||
# stage2: differentials inside math segments
|
# stage2: differentials inside math segments
|
||||||
@@ -65,6 +76,7 @@ def _split_glued_command_token(token: str) -> str:
|
|||||||
Examples:
|
Examples:
|
||||||
- \\cdotdS -> \\cdot dS
|
- \\cdotdS -> \\cdot dS
|
||||||
- \\intdx -> \\int dx
|
- \\intdx -> \\int dx
|
||||||
|
- \\inX -> \\in X (stop at uppercase letter)
|
||||||
"""
|
"""
|
||||||
if not token.startswith("\\"):
|
if not token.startswith("\\"):
|
||||||
return token
|
return token
|
||||||
@@ -74,8 +86,8 @@ def _split_glued_command_token(token: str) -> str:
|
|||||||
return token
|
return token
|
||||||
|
|
||||||
best = None
|
best = None
|
||||||
# longest prefix that is in whitelist
|
# Find longest prefix that is in whitelist
|
||||||
for i in range(1, len(body)):
|
for i in range(1, len(body) + 1):
|
||||||
prefix = body[:i]
|
prefix = body[:i]
|
||||||
if prefix in _COMMANDS_NEED_SPACE:
|
if prefix in _COMMANDS_NEED_SPACE:
|
||||||
best = prefix
|
best = prefix
|
||||||
@@ -117,12 +129,22 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
|
|||||||
# Pattern 2: Spaces inside braces that follow _ or ^
|
# Pattern 2: Spaces inside braces that follow _ or ^
|
||||||
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
|
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
|
||||||
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
|
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
|
||||||
|
# BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them
|
||||||
|
# must be preserved as they serve as command terminators (\in X != \inX)
|
||||||
def clean_subscript_superscript_braces(match):
|
def clean_subscript_superscript_braces(match):
|
||||||
operator = match.group(1) # _ or ^
|
operator = match.group(1) # _ or ^
|
||||||
content = match.group(2) # content inside braces
|
content = match.group(2) # content inside braces
|
||||||
# Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
|
if "\\" not in content:
|
||||||
# Only remove spaces between non-backslash characters
|
# No LaTeX commands: safe to remove all spaces
|
||||||
cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content)
|
cleaned = re.sub(r"\s+", "", content)
|
||||||
|
else:
|
||||||
|
# Contains LaTeX commands: remove spaces carefully
|
||||||
|
# Keep spaces that follow a LaTeX command (e.g., \in X must keep the space)
|
||||||
|
# Remove spaces everywhere else (e.g., x \in -> x\in is fine)
|
||||||
|
# Strategy: remove spaces before \ and between non-command chars,
|
||||||
|
# but preserve the space after \command when followed by a non-\ char
|
||||||
|
cleaned = re.sub(r"\s+(?=\\)", "", content) # remove space before \cmd
|
||||||
|
cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned) # remove space after non-letter non-\
|
||||||
return f"{operator}{{{cleaned}}}"
|
return f"{operator}{{{cleaned}}}"
|
||||||
|
|
||||||
# Match _{ ... } or ^{ ... }
|
# Match _{ ... } or ^{ ... }
|
||||||
@@ -156,7 +178,7 @@ def _postprocess_math(expr: str) -> str:
|
|||||||
|
|
||||||
Processing stages:
|
Processing stages:
|
||||||
0. Fix OCR number errors (spaces in numbers)
|
0. Fix OCR number errors (spaces in numbers)
|
||||||
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
|
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS, \\inX -> \\in X)
|
||||||
2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
|
2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
|
||||||
3. Normalize differentials (DISABLED by default to avoid breaking variables)
|
3. Normalize differentials (DISABLED by default to avoid breaking variables)
|
||||||
|
|
||||||
@@ -169,7 +191,7 @@ def _postprocess_math(expr: str) -> str:
|
|||||||
# stage0: fix OCR number errors (digits with spaces)
|
# stage0: fix OCR number errors (digits with spaces)
|
||||||
expr = _fix_ocr_number_errors(expr)
|
expr = _fix_ocr_number_errors(expr)
|
||||||
|
|
||||||
# stage1: split glued command tokens (e.g. \cdotdS)
|
# stage1: split glued command tokens (e.g. \cdotdS, \inX)
|
||||||
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
|
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
|
||||||
|
|
||||||
# stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)
|
# stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)
|
||||||
|
|||||||
Reference in New Issue
Block a user