fix: update mathml

This commit is contained in:
liuyuanchuang
2026-02-04 15:49:13 +08:00
parent e31017cfe7
commit 56a02eb6da
3 changed files with 469 additions and 17 deletions

View File

@@ -296,29 +296,77 @@ class Converter:
def _latex_to_mathml_cached(latex_formula: str) -> str:
"""Cached conversion of LaTeX formula to MathML.
Uses Pandoc for conversion to ensure Word compatibility.
Pandoc generates standard MathML that Word can properly import.
Uses LRU cache to avoid recomputing for repeated formulas.
"""
try:
# Use latex2mathml library for conversion (fast, pure Python)
return latex_to_mathml(latex_formula)
except Exception as e:
# Fallback: try with Pandoc (slower, but more robust)
# Use Pandoc for Word-compatible MathML (primary method)
mathml_html = pypandoc.convert_text(
f"${latex_formula}$",
"html",
format="markdown+tex_math_dollars",
extra_args=["--mathml"],
)
# Extract just the <math> element from the HTML
match = Converter._RE_MATH_ELEMENT.search(mathml_html)
if match:
mathml = match.group(0)
# Post-process for Word compatibility
return Converter._postprocess_mathml_for_word(mathml)
# If no match, return as-is
return mathml_html.rstrip("\n")
except Exception as pandoc_error:
# Fallback: try latex2mathml (less Word-compatible)
try:
mathml_html = pypandoc.convert_text(
f"${latex_formula}$",
"html",
format="markdown+tex_math_dollars",
extra_args=["--mathml"],
)
# Extract just the <math> element from the HTML
match = Converter._RE_MATH_ELEMENT.search(mathml_html)
if match:
return match.group(0)
return mathml_html.rstrip("\n")
except Exception as pandoc_error:
mathml = latex_to_mathml(latex_formula)
return Converter._postprocess_mathml_for_word(mathml)
except Exception as e:
raise RuntimeError(
f"MathML conversion failed: {e}. Pandoc fallback also failed: {pandoc_error}"
f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
) from e
@staticmethod
def _postprocess_mathml_for_word(mathml: str) -> str:
"""Post-process MathML to improve Word compatibility.
Applies transformations to make MathML more compatible with Word:
- Change display="inline" to display="block" for better rendering
- Decode Unicode entities to actual characters (Word prefers this)
- Clean up unnecessary attributes
Args:
mathml: MathML string.
Returns:
Word-compatible MathML string.
"""
# Change display to block for better Word rendering
mathml = mathml.replace('display="inline"', 'display="block"')
# If no display attribute, add it
if 'display=' not in mathml and '<math' in mathml:
mathml = mathml.replace('<math', '<math display="block"', 1)
# Decode common Unicode entities to actual characters (Word prefers this)
unicode_map = {
'&#x0002B;': '+',
'&#x02026;': '',
'&#x022EE;': '',
'&#x0003D;': '=',
'&#x0007C;': '|',
'&#x0002C;': ',',
'&#x00028;': '(',
'&#x00029;': ')',
}
for entity, char in unicode_map.items():
mathml = mathml.replace(entity, char)
return mathml
def _latex_to_mathml(self, latex_formula: str) -> str:
"""Convert LaTeX formula to standard MathML.