fix: remove unnecessary MathML attributes and the redundant top-level <mrow> wrapper

This commit is contained in:
liuyuanchuang
2026-02-04 16:56:20 +08:00
parent f1229483bf
commit cd790231ec
5 changed files with 490 additions and 19 deletions

View File

@@ -339,8 +339,10 @@ class Converter:
def _postprocess_mathml_for_word(mathml: str) -> str:
"""Post-process MathML to improve Word compatibility.
Applies transformations to make MathML more compatible with Word:
Applies transformations to make MathML more compatible and concise:
- Remove <semantics> and <annotation> wrappers (Word doesn't need them)
- Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.)
- Remove redundant single <mrow> wrappers
- Change display="inline" to display="block" for better rendering
- Decode Unicode entities to actual characters (Word prefers this)
- Ensure proper namespace
@@ -349,7 +351,7 @@ class Converter:
mathml: MathML string.
Returns:
Word-compatible MathML string.
Simplified, Word-compatible MathML string.
"""
import re
@@ -370,18 +372,52 @@ class Converter:
# Rebuild without semantics
mathml = f'<math{math_attrs}>{content}</math>'
# Step 2: Change display to block for better Word rendering
# Step 2: Remove unnecessary attributes that don't affect rendering
# These are verbose and Word doesn't need them
unnecessary_attrs = [
r'\s+form="prefix"',
r'\s+form="postfix"',
r'\s+form="infix"',
r'\s+stretchy="true"',
r'\s+stretchy="false"',
r'\s+fence="true"',
r'\s+fence="false"',
r'\s+separator="true"',
r'\s+separator="false"',
r'\s+columnalign="[^"]*"',
r'\s+columnspacing="[^"]*"',
r'\s+rowspacing="[^"]*"',
r'\s+class="[^"]*"',
r'\s+style="[^"]*"',
]
for attr_pattern in unnecessary_attrs:
mathml = re.sub(attr_pattern, '', mathml)
# Step 3: Remove redundant single <mrow> wrapper at the top level
# Pattern: <math ...><mrow>content</mrow></math>
# Simplify to: <math ...>content</math>
mrow_pattern = r'(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)'
match = re.search(mrow_pattern, mathml, re.DOTALL)
if match:
# Check if there's only one mrow at the top level
content = match.group(2)
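# Only unwrap when no closing tag is directly followed by an opening tag anywhere in the content, i.e. there are no adjacent sibling elements at any nesting depth (stricter than a top-level-only check)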
if not re.search(r'</[^>]+>\s*<[^/]', content):
mathml = f'{match.group(1)}{content}{match.group(3)}'
# Step 4: Change display to block for better Word rendering
mathml = mathml.replace('display="inline"', 'display="block"')
# Step 3: If no display attribute, add it
# Step 5: If no display attribute, add it
if 'display=' not in mathml and '<math' in mathml:
mathml = mathml.replace('<math', '<math display="block"', 1)
# Step 4: Ensure xmlns is present
# Step 6: Ensure xmlns is present
if 'xmlns=' not in mathml and '<math' in mathml:
mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
# Step 5: Decode common Unicode entities to actual characters (Word prefers this)
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
unicode_map = {
'&#x0002B;': '+',
'&#x0002D;': '-',
@@ -402,11 +438,26 @@ class Converter:
'&#x03B3;': 'γ',
'&#x03C6;': 'φ',
'&#x03D5;': 'ϕ',
'&#x03B1;': 'α',
'&#x03B2;': 'β',
'&#x03B4;': 'δ',
'&#x03B5;': 'ε',
'&#x03B8;': 'θ',
'&#x03BB;': 'λ',
'&#x03BC;': 'μ',
'&#x03C0;': 'π',
'&#x03C1;': 'ρ',
'&#x03C3;': 'σ',
'&#x03C4;': 'τ',
'&#x03C9;': 'ω',
}
for entity, char in unicode_map.items():
mathml = mathml.replace(entity, char)
# Step 8: Clean up extra whitespace
mathml = re.sub(r'>\s+<', '><', mathml)
return mathml
def _latex_to_mathml(self, latex_formula: str) -> str: