fix: update mathml

This commit is contained in:
liuyuanchuang
2026-02-04 15:49:13 +08:00
parent e31017cfe7
commit 56a02eb6da
3 changed files with 469 additions and 17 deletions

View File

@@ -296,14 +296,13 @@ class Converter:
def _latex_to_mathml_cached(latex_formula: str) -> str:
"""Cached conversion of LaTeX formula to MathML.
Uses Pandoc for conversion to ensure Word compatibility.
Pandoc generates standard MathML that Word can properly import.
Uses LRU cache to avoid recomputing for repeated formulas.
"""
try:
# Use latex2mathml library for conversion (fast, pure Python)
return latex_to_mathml(latex_formula)
except Exception as e:
# Fallback: try with Pandoc (slower, but more robust)
try:
# Use Pandoc for Word-compatible MathML (primary method)
mathml_html = pypandoc.convert_text(
f"${latex_formula}$",
"html",
@@ -313,13 +312,62 @@ class Converter:
# Extract just the <math> element from the HTML
match = Converter._RE_MATH_ELEMENT.search(mathml_html)
if match:
return match.group(0)
mathml = match.group(0)
# Post-process for Word compatibility
return Converter._postprocess_mathml_for_word(mathml)
# If no match, return as-is
return mathml_html.rstrip("\n")
except Exception as pandoc_error:
# Fallback: try latex2mathml (less Word-compatible)
try:
mathml = latex_to_mathml(latex_formula)
return Converter._postprocess_mathml_for_word(mathml)
except Exception as e:
raise RuntimeError(
f"MathML conversion failed: {e}. Pandoc fallback also failed: {pandoc_error}"
f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
) from e
@staticmethod
def _postprocess_mathml_for_word(mathml: str) -> str:
    """Post-process MathML to improve Word compatibility.

    The following transformations are applied, in order:

    1. Every ``display="inline"`` is rewritten to ``display="block"``.
    2. If the string contains no ``display=`` attribute at all, one is
       added to the first ``<math`` tag.
    3. A fixed set of hexadecimal character references is replaced by the
       literal characters they encode.

    Args:
        mathml: MathML string.

    Returns:
        The transformed MathML string.
    """
    # Change display to block for better Word rendering
    mathml = mathml.replace('display="inline"', 'display="block"')

    # If no display attribute, add it (first <math tag only)
    if 'display=' not in mathml and '<math' in mathml:
        mathml = mathml.replace('<math', '<math display="block"', 1)

    # Decode common character references to the characters they encode.
    # The two ellipsis entries are written as escapes so the mapping cannot
    # silently degrade to an empty string (which would delete the symbol
    # from the formula) if the file is re-encoded.
    unicode_map = {
        '&#x0002B;': '+',
        '&#x02026;': '\u2026',  # horizontal ellipsis
        '&#x022EE;': '\u22ee',  # vertical ellipsis
        '&#x0003D;': '=',
        '&#x0007C;': '|',
        '&#x0002C;': ',',
        '&#x00028;': '(',
        '&#x00029;': ')',
    }
    for entity, char in unicode_map.items():
        mathml = mathml.replace(entity, char)

    return mathml
def _latex_to_mathml(self, latex_formula: str) -> str:
"""Convert LaTeX formula to standard MathML.

202
docs/FORMAT_COMPARISON.md Normal file
View File

@@ -0,0 +1,202 @@
# MathML vs OMML 格式对比
## 快速选择指南
| 使用场景 | 推荐格式 | API 端点 |
|---------|---------|----------|
| 手动复制粘贴到 Word | MathML | `/image/ocr` 返回 `mathml` |
| 网页显示公式 | MathML | `/image/ocr` 返回 `mathml` |
| Office.js 插件开发 | OMML | `/convert/latex-to-omml` |
| Python 生成 Word 文档 | OMML | `/convert/latex-to-omml` |
| 跨平台显示 | MathML | `/image/ocr` 返回 `mathml` |
## 格式详解
### MathML (Mathematical Markup Language)
**标准**: W3C 标准
**浏览器支持**: Chrome, Firefox, Safari (原生支持)
**Word 支持**: 可粘贴 (Word 自动转换为 OMML)
#### 示例
```xml
<math xmlns="http://www.w3.org/1998/Math/MathML">
<mfrac>
<mi>a</mi>
<mi>b</mi>
</mfrac>
</math>
```
#### 优点
- ✅ 跨平台标准
- ✅ 浏览器原生支持
- ✅ 可读性好
- ✅ 可直接粘贴到 Word
#### 缺点
- ❌ Word 内部需要转换
- ❌ 渲染精度依赖 Word 转换器
### OMML (Office Math Markup Language)
**标准**: Microsoft 专有格式
**浏览器支持**: 不支持
**Word 支持**: 原生格式 (最佳兼容性)
#### 示例
```xml
<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math">
<m:f>
<m:num><m:r><m:t>a</m:t></m:r></m:num>
<m:den><m:r><m:t>b</m:t></m:r></m:den>
</m:f>
</m:oMath>
```
#### 优点
- ✅ Word 原生格式,渲染最准确
- ✅ 适合编程生成 Word 文档
- ✅ Office.js API 直接支持
#### 缺点
- ❌ 仅 Word 支持
- ❌ 可读性差
- ❌ 不能浏览器渲染
## API 使用示例
### 1. 获取 MathML (手动粘贴到 Word)
```bash
# OCR 识别图片,返回 MathML
curl -X POST "http://localhost:8000/api/v1/image/ocr" \
-H "Content-Type: application/json" \
-d '{
"image_url": "https://example.com/formula.png",
"model_name": "mineru"
}'
```
响应:
```json
{
"latex": "\\frac{a}{b}",
"markdown": "$\\frac{a}{b}$",
"mathml": "<math>...</math>", // 👈 复制这个粘贴到 Word
"mml": "<mml:math>...</mml:math>"
}
```
### 2. 获取 OMML (编程插入 Word)
```bash
# 转换 LaTeX 为 OMML
curl -X POST "http://localhost:8000/api/v1/convert/latex-to-omml" \
-H "Content-Type: application/json" \
-d '{
"latex": "\\frac{a}{b}"
}'
```
响应:
```json
{
"omml": "<m:oMath>...</m:oMath>" // 👈 用于编程插入
}
```
## 编程使用示例
### Python: 插入 OMML 到 Word
```python
from docx import Document
from docx.oxml import parse_xml
# 获取 OMML
import requests
response = requests.post(
"http://localhost:8000/api/v1/convert/latex-to-omml",
json={"latex": "\\frac{a}{b}"}
)
omml = response.json()["omml"]
# 插入到 Word 文档
doc = Document()
paragraph = doc.add_paragraph()
paragraph._element.append(parse_xml(omml))
doc.save("output.docx")
```
### JavaScript: Office Add-in 插入 OMML
```javascript
// 获取 OMML
const response = await fetch('http://localhost:8000/api/v1/convert/latex-to-omml', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ latex: '\\frac{a}{b}' })
});
const { omml } = await response.json();
// 插入到 Word
Office.context.document.setSelectedDataAsync(
omml,
{ coercionType: Office.CoercionType.Ooxml }
);
```
### Web: 显示 MathML
```html
<!DOCTYPE html>
<html>
<body>
<!-- MathML 可以直接在浏览器中渲染 -->
<math xmlns="http://www.w3.org/1998/Math/MathML">
<mfrac>
<mi>a</mi>
<mi>b</mi>
</mfrac>
</math>
</body>
</html>
```
## 性能对比
| 操作 | MathML | OMML |
|------|--------|------|
| 生成速度 | 快 (~100ms) | 慢 (~500ms, 需要 Pandoc) |
| 文件大小 | 较小 | 较大 |
| 转换质量 | 依赖转换器 | 原生最佳 |
## 常见问题
### Q1: 为什么我的 OMML 看起来很长?
**A**: OMML 包含了完整的命名空间和样式信息,所以比 MathML 长。这是正常的。
### Q2: 我应该使用哪个格式?
**A**:
- **手动操作** → MathML (复制粘贴)
- **编程操作** → OMML (API 插入)
### Q3: 能否将 MathML 转换为 OMML?
**A**: 可以!通过 LaTeX 中转, 使用我们的 API:
1. 先从 OCR 获取 `latex`
2. 再调用 `/convert/latex-to-omml` 获取 OMML
### Q4: OMML 能在浏览器显示吗?
**A**: 不能。OMML 是 Word 专用格式。浏览器显示请使用 MathML。
## 总结
- 📋 **用户复制粘贴** → 使用 MathML
- 💻 **编程生成文档** → 使用 OMML
- 🌐 **网页显示** → 使用 MathML
- 🔌 **Office 插件** → 使用 OMML

202
test_word_mathml.py Normal file
View File

@@ -0,0 +1,202 @@
"""Test Word-compatible MathML generation."""
from app.services.converter import Converter
def test_mathml_word_compatibility():
    """Check the MathML generated for a determinant-style matrix.

    Converts a fixed LaTeX formula with ``Converter.convert_to_formats`` and
    prints a report on the resulting ``mathml`` string.

    Hard checks (a failure makes the function return False):
        - the string contains ``display="block"``
        - none of the listed hexadecimal character references remain
        - the MathML namespace declaration is present

    Informational checks (reported with ``?``, never fail the run):
        - presence of fence elements/attributes
        - whether the structure looks Pandoc-generated

    Returns:
        bool: False if no MathML was produced or any hard check failed,
        True otherwise.
    """
    converter = Converter()

    print("=" * 80)
    print("Testing Word-Compatible MathML Generation")
    print("=" * 80)

    # Test case: Matrix with determinant (the problematic example)
    latex = r"""\left| \begin{array}{cccc} a_{11} & a_{12} & \dots & a_{1n} \\ \vdots & \vdots & & \vdots \\ a_{i1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a_{n1} & a_{n2} & \dots & a_{nn} \end{array} \right|"""

    print(f"\nLaTeX: {latex[:80]}...")
    print("\n" + "-" * 80)

    # Convert to formats
    result = converter.convert_to_formats(f"$${latex}$$")

    if not result.mathml:
        print("✗ No MathML generated")
        return False

    mathml = result.mathml
    # Tracks the hard checks so the return value matches what was printed.
    passed = True

    print("Checking Word compatibility features:")
    print("-" * 80)

    # Check 1: Display attribute
    if 'display="block"' in mathml:
        print("✓ Has display='block' attribute")
    else:
        print("✗ Missing or wrong display attribute")
        print(f" Found: {mathml[:100]}...")
        passed = False

    # Check 2: No Unicode entities for common symbols
    problematic_entities = ['&#x0002B;', '&#x02026;', '&#x022EE;', '&#x0003D;', '&#x0007C;']
    unicode_issues = [entity for entity in problematic_entities if entity in mathml]

    if unicode_issues:
        print(f"✗ Contains Unicode entities: {unicode_issues}")
        passed = False
    else:
        print("✓ No problematic Unicode entities")

    # Check 3: Uses mfenced for brackets (Word-friendly) -- informational only
    if '<mfenced' in mathml or '<mo fence="true"' in mathml or 'stretchy="true"' in mathml:
        print("✓ Uses fence elements")
    else:
        print("? No fence elements found (might be OK)")

    # Check 4: Has proper namespace
    if 'xmlns="http://www.w3.org/1998/Math/MathML"' in mathml:
        print("✓ Has MathML namespace")
    else:
        print("✗ Missing MathML namespace")
        passed = False

    # Show preview
    print("\n" + "-" * 80)
    print("MathML Preview (first 500 chars):")
    print("-" * 80)
    print(mathml[:500])
    if len(mathml) > 500:
        print("...")

    print("\n" + "-" * 80)
    print(f"Total length: {len(mathml)} characters")

    # Check if this looks like Pandoc-generated MathML -- informational only
    if 'mfenced' in mathml or 'columnalign' in mathml:
        print("✓ Appears to be Pandoc-generated (good for Word)")
    elif 'stretchy' in mathml and 'fence' in mathml:
        print("✓ Uses standard fence attributes")
    else:
        print("? MathML structure unclear")

    return passed
def test_simple_formulas():
    """Run quick Word-compatibility checks on a handful of simple formulas.

    Each formula is wrapped in ``$...$`` and passed to
    ``Converter.convert_to_formats``; the resulting ``mathml`` string is
    checked for ``display="block"``, for the absence of the ``+`` and ``=``
    character references, and for an ``xmlns=`` attribute. One report line is
    printed per formula.

    Returns:
        bool: True only if every formula passed every check and no
        conversion raised an exception.
    """
    converter = Converter()

    print("\n" + "=" * 80)
    print("Testing Simple Formulas")
    print("=" * 80)

    test_cases = [
        ("Fraction", r"\frac{a}{b}"),
        ("Square root", r"\sqrt{x^2 + y^2}"),
        ("Summation", r"\sum_{i=1}^{n} i"),
        ("Equation", r"E = mc^2"),
        ("Matrix", r"\begin{pmatrix} a & b \\ c & d \end{pmatrix}"),
    ]

    all_passed = True

    for name, latex in test_cases:
        print(f"\n{name}: ${latex}$")

        try:
            result = converter.convert_to_formats(f"${latex}$")
            mathml = result.mathml

            # Quick checks: (outcome, label shown when the check fails)
            checks = [
                ('display="block"' in mathml, "display=block"),
                ('&#x0002B;' not in mathml, "no +entity"),
                ('&#x0003D;' not in mathml, "no =entity"),
                ('xmlns=' in mathml, "namespace"),
            ]

            failed_checks = [label for ok, label in checks if not ok]
            # Same pass/fail markers as the rest of this script; the marker
            # was previously an empty string in both branches.
            status = "✗" if failed_checks else "✓"

            print(f" {status} Length: {len(mathml)} chars", end="")
            if failed_checks:
                print(f" | Issues: {', '.join(failed_checks)}")
                all_passed = False
            else:
                print(" | All checks passed")

        except Exception as e:
            # Also reached when result.mathml is None (the `in` tests raise).
            print(f" ✗ Error: {e}")
            all_passed = False

    return all_passed
def compare_with_reference():
    """Report structural features of the MathML generated for a 2x2 determinant.

    Despite the name, no reference document is loaded here: the function
    converts a fixed LaTeX formula with ``Converter.convert_to_formats`` and
    prints which markup features the resulting ``mathml`` string contains.

    Returns:
        bool: False if no MathML was produced or if any of the ``+``, ``=``
        or ``|`` hexadecimal character references is still present;
        True otherwise.
    """
    print("\n" + "=" * 80)
    print("Comparison with Reference MathML")
    print("=" * 80)

    converter = Converter()

    # Simple matrix example
    latex = r"\left| \begin{array}{cc} a & b \\ c & d \end{array} \right|"
    result = converter.convert_to_formats(f"$${latex}$$")
    our_mathml = result.mathml

    # Same guard as test_mathml_word_compatibility: without it the
    # substring tests below raise TypeError on a missing value.
    if not our_mathml:
        print("✗ No MathML generated")
        return False

    print("\nOur MathML structure:")
    print("-" * 80)

    # Analyze structure
    features = {
        "mfenced": "<mfenced" in our_mathml,
        "mo fence": '<mo fence="' in our_mathml or '<mo stretchy="true"' in our_mathml,
        "mtable": "<mtable" in our_mathml,
        "display block": 'display="block"' in our_mathml,
        "unicode entities": any(f"&#x{x};" in our_mathml for x in ["0002B", "0003D", "0007C"]),
    }

    print("Features:")
    for feature, present in features.items():
        # "unicode entities" is the only feature that is good when absent.
        ok = (not present) if feature == "unicode entities" else present
        status = "✓" if ok else "✗"
        print(f" {status} {feature}: {present}")

    print(f"\nLength: {len(our_mathml)} characters")
    print(f"Preview:\n{our_mathml[:300]}...")

    return not features["unicode entities"]
if __name__ == "__main__":
    # Script entry point: run the three checks in order, then print a
    # summary that reflects their combined boolean results.
    banner = "=" * 80
    print("Word-Compatible MathML Test Suite\n")

    try:
        # All three run unconditionally before the summary is evaluated.
        outcomes = (
            test_mathml_word_compatibility(),
            test_simple_formulas(),
            compare_with_reference(),
        )

        print("\n" + banner)
        print("SUMMARY")
        print(banner)

        if all(outcomes):
            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
            print("\nMathML should be Word-compatible!")
            print("Try copying the mathml output and pasting into Word.")
        else:
            print("✗✗✗ SOME TESTS FAILED ✗✗✗")
            print("\nMathML may not be fully Word-compatible.")

        print(banner)

    except KeyboardInterrupt:
        print("\n\nTests interrupted")
    except Exception as exc:
        # Report the error and the full traceback instead of crashing.
        print(f"\n\nTest error: {exc}")
        import traceback
        traceback.print_exc()