fix: remove other unnecessary MathML attributes
This commit is contained in:
@@ -339,8 +339,10 @@ class Converter:
|
||||
def _postprocess_mathml_for_word(mathml: str) -> str:
|
||||
"""Post-process MathML to improve Word compatibility.
|
||||
|
||||
Applies transformations to make MathML more compatible with Word:
|
||||
Applies transformations to make MathML more compatible and concise:
|
||||
- Remove <semantics> and <annotation> wrappers (Word doesn't need them)
|
||||
- Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.)
|
||||
- Remove redundant single <mrow> wrappers
|
||||
- Change display="inline" to display="block" for better rendering
|
||||
- Decode Unicode entities to actual characters (Word prefers this)
|
||||
- Ensure proper namespace
|
||||
@@ -349,7 +351,7 @@ class Converter:
|
||||
mathml: MathML string.
|
||||
|
||||
Returns:
|
||||
Word-compatible MathML string.
|
||||
Simplified, Word-compatible MathML string.
|
||||
"""
|
||||
import re
|
||||
|
||||
@@ -370,18 +372,52 @@ class Converter:
|
||||
# Rebuild without semantics
|
||||
mathml = f'<math{math_attrs}>{content}</math>'
|
||||
|
||||
# Step 2: Change display to block for better Word rendering
|
||||
# Step 2: Remove unnecessary attributes that don't affect rendering
|
||||
# These are verbose and Word doesn't need them
|
||||
unnecessary_attrs = [
|
||||
r'\s+form="prefix"',
|
||||
r'\s+form="postfix"',
|
||||
r'\s+form="infix"',
|
||||
r'\s+stretchy="true"',
|
||||
r'\s+stretchy="false"',
|
||||
r'\s+fence="true"',
|
||||
r'\s+fence="false"',
|
||||
r'\s+separator="true"',
|
||||
r'\s+separator="false"',
|
||||
r'\s+columnalign="[^"]*"',
|
||||
r'\s+columnspacing="[^"]*"',
|
||||
r'\s+rowspacing="[^"]*"',
|
||||
r'\s+class="[^"]*"',
|
||||
r'\s+style="[^"]*"',
|
||||
]
|
||||
|
||||
for attr_pattern in unnecessary_attrs:
|
||||
mathml = re.sub(attr_pattern, '', mathml)
|
||||
|
||||
# Step 3: Remove redundant single <mrow> wrapper at the top level
|
||||
# Pattern: <math ...><mrow>content</mrow></math>
|
||||
# Simplify to: <math ...>content</math>
|
||||
mrow_pattern = r'(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)'
|
||||
match = re.search(mrow_pattern, mathml, re.DOTALL)
|
||||
if match:
|
||||
# Check if there's only one mrow at the top level
|
||||
content = match.group(2)
|
||||
# Only unwrap when no closing tag is directly followed by an opening tag, i.e. the content has no adjacent sibling elements at any depth
|
||||
if not re.search(r'</[^>]+>\s*<[^/]', content):
|
||||
mathml = f'{match.group(1)}{content}{match.group(3)}'
|
||||
|
||||
# Step 4: Change display to block for better Word rendering
|
||||
mathml = mathml.replace('display="inline"', 'display="block"')
|
||||
|
||||
# Step 3: If no display attribute, add it
|
||||
# Step 5: If no display attribute, add it
|
||||
if 'display=' not in mathml and '<math' in mathml:
|
||||
mathml = mathml.replace('<math', '<math display="block"', 1)
|
||||
|
||||
# Step 4: Ensure xmlns is present
|
||||
# Step 6: Ensure xmlns is present
|
||||
if 'xmlns=' not in mathml and '<math' in mathml:
|
||||
mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
|
||||
|
||||
# Step 5: Decode common Unicode entities to actual characters (Word prefers this)
|
||||
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
|
||||
unicode_map = {
|
||||
'+': '+',
|
||||
'-': '-',
|
||||
@@ -402,11 +438,26 @@ class Converter:
|
||||
'γ': 'γ',
|
||||
'φ': 'φ',
|
||||
'ϕ': 'ϕ',
|
||||
'α': 'α',
|
||||
'β': 'β',
|
||||
'δ': 'δ',
|
||||
'ε': 'ε',
|
||||
'θ': 'θ',
|
||||
'λ': 'λ',
|
||||
'μ': 'μ',
|
||||
'π': 'π',
|
||||
'ρ': 'ρ',
|
||||
'σ': 'σ',
|
||||
'τ': 'τ',
|
||||
'ω': 'ω',
|
||||
}
|
||||
|
||||
for entity, char in unicode_map.items():
|
||||
mathml = mathml.replace(entity, char)
|
||||
|
||||
# Step 8: Clean up extra whitespace
|
||||
mathml = re.sub(r'>\s+<', '><', mathml)
|
||||
|
||||
return mathml
|
||||
|
||||
def _latex_to_mathml(self, latex_formula: str) -> str:
|
||||
|
||||
Reference in New Issue
Block a user