fix: remove other attributes in MathML

This commit is contained in:
liuyuanchuang
2026-02-04 16:12:22 +08:00
parent 35419b2102
commit f1229483bf
3 changed files with 483 additions and 8 deletions

View File

@@ -340,9 +340,10 @@ class Converter:
"""Post-process MathML to improve Word compatibility.
Applies transformations to make MathML more compatible with Word:
- Remove <semantics> and <annotation> wrappers (Word doesn't need them)
- Change display="inline" to display="block" for better rendering
- Decode Unicode entities to actual characters (Word prefers this)
- Clean up unnecessary attributes
- Ensure proper namespace
Args:
mathml: MathML string.
@@ -350,23 +351,57 @@ class Converter:
Returns:
Word-compatible MathML string.
"""
# Change display to block for better Word rendering
import re
# Step 1: Remove <semantics> and <annotation> wrappers
# These often cause Word import issues
if '<semantics>' in mathml:
# Extract content between <semantics> and <annotation>
match = re.search(r'<semantics>(.*?)<annotation', mathml, re.DOTALL)
if match:
content = match.group(1).strip()
# Get the math element attributes
math_attrs = ""
math_match = re.search(r'<math([^>]*)>', mathml)
if math_match:
math_attrs = math_match.group(1)
# Rebuild without semantics
mathml = f'<math{math_attrs}>{content}</math>'
# Step 2: Change display to block for better Word rendering
mathml = mathml.replace('display="inline"', 'display="block"')
# If no display attribute, add it
# Step 3: If no display attribute, add it
if 'display=' not in mathml and '<math' in mathml:
mathml = mathml.replace('<math', '<math display="block"', 1)
# Decode common Unicode entities to actual characters (Word prefers this)
# Step 4: Ensure xmlns is present
if 'xmlns=' not in mathml and '<math' in mathml:
mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
# Step 5: Decode common Unicode entities to actual characters (Word prefers this)
unicode_map = {
'&#x0002B;': '+',
'&#x02026;': '…',
'&#x022EE;': '⋮',
'&#x0002D;': '-',
'&#x0002A;': '*',
'&#x0002F;': '/',
'&#x0003D;': '=',
'&#x0007C;': '|',
'&#x0002C;': ',',
'&#x0003C;': '<',
'&#x0003E;': '>',
'&#x00028;': '(',
'&#x00029;': ')',
'&#x0002C;': ',',
'&#x0002E;': '.',
'&#x0007C;': '|',
'&#x02026;': '…',
'&#x022EE;': '⋮',
'&#x022EF;': '⋯',
'&#x00B0;': '°',
'&#x03B3;': 'γ',
'&#x03C6;': 'φ',
'&#x03D5;': 'ϕ',
}
for entity, char in unicode_map.items():