fix: remove other attributes in MathML
This commit is contained in:
@@ -340,9 +340,10 @@ class Converter:
|
||||
"""Post-process MathML to improve Word compatibility.
|
||||
|
||||
Applies transformations to make MathML more compatible with Word:
|
||||
- Remove <semantics> and <annotation> wrappers (Word doesn't need them)
|
||||
- Change display="inline" to display="block" for better rendering
|
||||
- Decode Unicode entities to actual characters (Word prefers this)
|
||||
- Clean up unnecessary attributes
|
||||
- Ensure proper namespace
|
||||
|
||||
Args:
|
||||
mathml: MathML string.
|
||||
@@ -350,23 +351,57 @@ class Converter:
|
||||
Returns:
|
||||
Word-compatible MathML string.
|
||||
"""
|
||||
# Change display to block for better Word rendering
|
||||
import re
|
||||
|
||||
# Step 1: Remove <semantics> and <annotation> wrappers
|
||||
# These often cause Word import issues
|
||||
if '<semantics>' in mathml:
|
||||
# Extract content between <semantics> and <annotation>
|
||||
match = re.search(r'<semantics>(.*?)<annotation', mathml, re.DOTALL)
|
||||
if match:
|
||||
content = match.group(1).strip()
|
||||
|
||||
# Get the math element attributes
|
||||
math_attrs = ""
|
||||
math_match = re.search(r'<math([^>]*)>', mathml)
|
||||
if math_match:
|
||||
math_attrs = math_match.group(1)
|
||||
|
||||
# Rebuild without semantics
|
||||
mathml = f'<math{math_attrs}>{content}</math>'
|
||||
|
||||
# Step 2: Change display to block for better Word rendering
|
||||
mathml = mathml.replace('display="inline"', 'display="block"')
|
||||
|
||||
# If no display attribute, add it
|
||||
# Step 3: If no display attribute, add it
|
||||
if 'display=' not in mathml and '<math' in mathml:
|
||||
mathml = mathml.replace('<math', '<math display="block"', 1)
|
||||
|
||||
# Decode common Unicode entities to actual characters (Word prefers this)
|
||||
# Step 4: Ensure xmlns is present
|
||||
if 'xmlns=' not in mathml and '<math' in mathml:
|
||||
mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
|
||||
|
||||
# Step 5: Decode common Unicode entities to actual characters (Word prefers this)
|
||||
unicode_map = {
|
||||
'+': '+',
|
||||
'…': '⋯',
|
||||
'⋮': '⋮',
|
||||
'-': '-',
|
||||
'*': '*',
|
||||
'/': '/',
|
||||
'=': '=',
|
||||
'|': '|',
|
||||
',': ',',
|
||||
'<': '<',
|
||||
'>': '>',
|
||||
'(': '(',
|
||||
')': ')',
|
||||
',': ',',
|
||||
'.': '.',
|
||||
'|': '|',
|
||||
'…': '⋯',
|
||||
'⋮': '⋮',
|
||||
'⋯': '⋯',
|
||||
'°': '°',
|
||||
'γ': 'γ',
|
||||
'φ': 'φ',
|
||||
'ϕ': 'ϕ',
|
||||
}
|
||||
|
||||
for entity, char in unicode_map.items():
|
||||
|
||||
Reference in New Issue
Block a user