feature/converter #1
@@ -296,29 +296,77 @@ class Converter:
|
|||||||
def _latex_to_mathml_cached(latex_formula: str) -> str:
|
def _latex_to_mathml_cached(latex_formula: str) -> str:
|
||||||
"""Cached conversion of LaTeX formula to MathML.
|
"""Cached conversion of LaTeX formula to MathML.
|
||||||
|
|
||||||
|
Uses Pandoc for conversion to ensure Word compatibility.
|
||||||
|
Pandoc generates standard MathML that Word can properly import.
|
||||||
|
|
||||||
Uses LRU cache to avoid recomputing for repeated formulas.
|
Uses LRU cache to avoid recomputing for repeated formulas.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Use latex2mathml library for conversion (fast, pure Python)
|
# Use Pandoc for Word-compatible MathML (primary method)
|
||||||
return latex_to_mathml(latex_formula)
|
mathml_html = pypandoc.convert_text(
|
||||||
except Exception as e:
|
f"${latex_formula}$",
|
||||||
# Fallback: try with Pandoc (slower, but more robust)
|
"html",
|
||||||
|
format="markdown+tex_math_dollars",
|
||||||
|
extra_args=["--mathml"],
|
||||||
|
)
|
||||||
|
# Extract just the <math> element from the HTML
|
||||||
|
match = Converter._RE_MATH_ELEMENT.search(mathml_html)
|
||||||
|
if match:
|
||||||
|
mathml = match.group(0)
|
||||||
|
# Post-process for Word compatibility
|
||||||
|
return Converter._postprocess_mathml_for_word(mathml)
|
||||||
|
|
||||||
|
# If no match, return as-is
|
||||||
|
return mathml_html.rstrip("\n")
|
||||||
|
|
||||||
|
except Exception as pandoc_error:
|
||||||
|
# Fallback: try latex2mathml (less Word-compatible)
|
||||||
try:
|
try:
|
||||||
mathml_html = pypandoc.convert_text(
|
mathml = latex_to_mathml(latex_formula)
|
||||||
f"${latex_formula}$",
|
return Converter._postprocess_mathml_for_word(mathml)
|
||||||
"html",
|
except Exception as e:
|
||||||
format="markdown+tex_math_dollars",
|
|
||||||
extra_args=["--mathml"],
|
|
||||||
)
|
|
||||||
# Extract just the <math> element from the HTML
|
|
||||||
match = Converter._RE_MATH_ELEMENT.search(mathml_html)
|
|
||||||
if match:
|
|
||||||
return match.group(0)
|
|
||||||
return mathml_html.rstrip("\n")
|
|
||||||
except Exception as pandoc_error:
|
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"MathML conversion failed: {e}. Pandoc fallback also failed: {pandoc_error}"
|
f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
|
@staticmethod
def _postprocess_mathml_for_word(mathml: str) -> str:
    """Post-process MathML to improve Word compatibility.

    Transformations applied, in order:

    - Every ``display="inline"`` is rewritten to ``display="block"``.
    - If the opening ``<math ...>`` tag has no ``display`` attribute,
      ``display="block"`` is inserted right after ``<math``. Only the opening
      tag is inspected, so text such as ``display=`` inside the formula body
      does not suppress the attribute.
    - Numeric character references (hexadecimal ``&#x...;`` and decimal
      ``&#...;``) are decoded to the literal characters. References to the
      XML-reserved characters ``&``, ``<``, ``>``, ``"``, ``'``, to control
      characters and to invalid code points are left as they are so the
      markup stays well-formed.
    - The horizontal ellipsis (U+2026) is normalised to the midline
      ellipsis (U+22EF).

    Args:
        mathml: MathML string.

    Returns:
        Word-compatible MathML string. Empty input is returned unchanged.
    """
    # Local import keeps this block self-contained regardless of what the
    # module header already imports.
    import re

    if not mathml:
        return mathml

    # Change display to block for better Word rendering.
    mathml = mathml.replace('display="inline"', 'display="block"')

    # Add the attribute when the opening <math> tag does not carry one.
    opening = re.search(r"<math\b[^>]*>", mathml)
    if opening and not re.search(r"\bdisplay\s*=", opening.group(0)):
        insert_at = opening.start() + len("<math")
        mathml = mathml[:insert_at] + ' display="block"' + mathml[insert_at:]

    # Characters that must stay escaped to keep the XML well-formed.
    reserved = {"&", "<", ">", '"', "'"}

    def _decode_reference(ref):
        """Return the literal character for a numeric reference, or the
        reference itself when decoding would be unsafe or impossible."""
        digits = ref.group(1)
        try:
            if digits[0] in "xX":
                code_point = int(digits[1:], 16)
            else:
                code_point = int(digits)
            char = chr(code_point)
        except (ValueError, OverflowError):
            return ref.group(0)
        if char in reserved or code_point < 0x20 or 0xD800 <= code_point <= 0xDFFF:
            return ref.group(0)
        return char

    mathml = re.sub(r"&#([xX][0-9A-Fa-f]+|[0-9]+);", _decode_reference, mathml)

    # Horizontal ellipsis -> midline ellipsis, as in the original mapping.
    mathml = mathml.replace("\u2026", "\u22ef")

    return mathml
|
||||||
|
|
||||||
def _latex_to_mathml(self, latex_formula: str) -> str:
|
def _latex_to_mathml(self, latex_formula: str) -> str:
|
||||||
"""Convert LaTeX formula to standard MathML.
|
"""Convert LaTeX formula to standard MathML.
|
||||||
|
|||||||
202
docs/FORMAT_COMPARISON.md
Normal file
202
docs/FORMAT_COMPARISON.md
Normal file
@@ -0,0 +1,202 @@
|
|||||||
|
# MathML vs OMML 格式对比
|
||||||
|
|
||||||
|
## 快速选择指南
|
||||||
|
|
||||||
|
| 使用场景 | 推荐格式 | API 端点 |
|
||||||
|
|---------|---------|----------|
|
||||||
|
| 手动复制粘贴到 Word | MathML | `/image/ocr` 返回 `mathml` |
|
||||||
|
| 网页显示公式 | MathML | `/image/ocr` 返回 `mathml` |
|
||||||
|
| Office.js 插件开发 | OMML | `/convert/latex-to-omml` |
|
||||||
|
| Python 生成 Word 文档 | OMML | `/convert/latex-to-omml` |
|
||||||
|
| 跨平台显示 | MathML | `/image/ocr` 返回 `mathml` |
|
||||||
|
|
||||||
|
## 格式详解
|
||||||
|
|
||||||
|
### MathML (Mathematical Markup Language)
|
||||||
|
|
||||||
|
**标准**: W3C 标准
|
||||||
|
**浏览器支持**: Chrome, Firefox, Safari (原生支持)
|
||||||
|
**Word 支持**: 可粘贴 (Word 自动转换为 OMML)
|
||||||
|
|
||||||
|
#### 示例
|
||||||
|
```xml
|
||||||
|
<math xmlns="http://www.w3.org/1998/Math/MathML">
|
||||||
|
<mfrac>
|
||||||
|
<mi>a</mi>
|
||||||
|
<mi>b</mi>
|
||||||
|
</mfrac>
|
||||||
|
</math>
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 优点
|
||||||
|
- ✅ 跨平台标准
|
||||||
|
- ✅ 浏览器原生支持
|
||||||
|
- ✅ 可读性好
|
||||||
|
- ✅ 可直接粘贴到 Word
|
||||||
|
|
||||||
|
#### 缺点
|
||||||
|
- ❌ Word 内部需要转换
|
||||||
|
- ❌ 渲染精度依赖 Word 转换器
|
||||||
|
|
||||||
|
### OMML (Office Math Markup Language)
|
||||||
|
|
||||||
|
**标准**: Microsoft 专有格式
|
||||||
|
**浏览器支持**: 不支持
|
||||||
|
**Word 支持**: 原生格式 (最佳兼容性)
|
||||||
|
|
||||||
|
#### 示例
|
||||||
|
```xml
|
||||||
|
<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math">
|
||||||
|
<m:f>
|
||||||
|
<m:num><m:r><m:t>a</m:t></m:r></m:num>
|
||||||
|
<m:den><m:r><m:t>b</m:t></m:r></m:den>
|
||||||
|
</m:f>
|
||||||
|
</m:oMath>
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 优点
|
||||||
|
- ✅ Word 原生格式,渲染最准确
|
||||||
|
- ✅ 适合编程生成 Word 文档
|
||||||
|
- ✅ Office.js API 直接支持
|
||||||
|
|
||||||
|
#### 缺点
|
||||||
|
- ❌ 仅 Word 支持
|
||||||
|
- ❌ 可读性差
|
||||||
|
- ❌ 不能浏览器渲染
|
||||||
|
|
||||||
|
## API 使用示例
|
||||||
|
|
||||||
|
### 1. 获取 MathML (手动粘贴到 Word)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# OCR 识别图片,返回 MathML
|
||||||
|
curl -X POST "http://localhost:8000/api/v1/image/ocr" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"image_url": "https://example.com/formula.png",
|
||||||
|
"model_name": "mineru"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
响应:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"latex": "\\frac{a}{b}",
|
||||||
|
"markdown": "$\\frac{a}{b}$",
|
||||||
|
"mathml": "<math>...</math>", // 👈 复制这个粘贴到 Word
|
||||||
|
"mml": "<mml:math>...</mml:math>"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. 获取 OMML (编程插入 Word)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 转换 LaTeX 为 OMML
|
||||||
|
curl -X POST "http://localhost:8000/api/v1/convert/latex-to-omml" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"latex": "\\frac{a}{b}"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
响应:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"omml": "<m:oMath>...</m:oMath>" // 👈 用于编程插入
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## 编程使用示例
|
||||||
|
|
||||||
|
### Python: 插入 OMML 到 Word
|
||||||
|
|
||||||
|
```python
|
||||||
|
from docx import Document
|
||||||
|
from docx.oxml import parse_xml
|
||||||
|
|
||||||
|
# 获取 OMML
|
||||||
|
import requests
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:8000/api/v1/convert/latex-to-omml",
|
||||||
|
json={"latex": "\\frac{a}{b}"}
|
||||||
|
)
|
||||||
|
omml = response.json()["omml"]
|
||||||
|
|
||||||
|
# 插入到 Word 文档
|
||||||
|
doc = Document()
|
||||||
|
paragraph = doc.add_paragraph()
|
||||||
|
paragraph._element.append(parse_xml(omml))
|
||||||
|
doc.save("output.docx")
|
||||||
|
```
|
||||||
|
|
||||||
|
### JavaScript: Office Add-in 插入 OMML
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// 获取 OMML
|
||||||
|
const response = await fetch('http://localhost:8000/api/v1/convert/latex-to-omml', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify({ latex: '\\frac{a}{b}' })
|
||||||
|
});
|
||||||
|
const { omml } = await response.json();
|
||||||
|
|
||||||
|
// 插入到 Word
|
||||||
|
Office.context.document.setSelectedDataAsync(
|
||||||
|
omml,
|
||||||
|
{ coercionType: Office.CoercionType.Ooxml }
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Web: 显示 MathML
|
||||||
|
|
||||||
|
```html
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<!-- MathML 可以直接在浏览器中渲染 -->
|
||||||
|
<math xmlns="http://www.w3.org/1998/Math/MathML">
|
||||||
|
<mfrac>
|
||||||
|
<mi>a</mi>
|
||||||
|
<mi>b</mi>
|
||||||
|
</mfrac>
|
||||||
|
</math>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
```
|
||||||
|
|
||||||
|
## 性能对比
|
||||||
|
|
||||||
|
| 操作 | MathML | OMML |
|
||||||
|
|------|--------|------|
|
||||||
|
| 生成速度 | 使用 latex2mathml 时快 (~100ms);首选的 Pandoc 路径较慢 | 慢 (~500ms, 需要 Pandoc) |
|
||||||
|
| 文件大小 | 较小 | 较大 |
|
||||||
|
| 转换质量 | 依赖转换器 | 原生最佳 |
|
||||||
|
|
||||||
|
## 常见问题
|
||||||
|
|
||||||
|
### Q1: 为什么我的 OMML 看起来很长?
|
||||||
|
|
||||||
|
**A**: OMML 包含了完整的命名空间和样式信息,所以比 MathML 长。这是正常的。
|
||||||
|
|
||||||
|
### Q2: 我应该使用哪个格式?
|
||||||
|
|
||||||
|
**A**:
|
||||||
|
- **手动操作** → MathML (复制粘贴)
|
||||||
|
- **编程操作** → OMML (API 插入)
|
||||||
|
|
||||||
|
### Q3: 能否将 MathML 转换为 OMML?
|
||||||
|
|
||||||
|
**A**: 可以,但没有直接的 MathML → OMML 接口,需要通过 LaTeX 中转:
|
||||||
|
1. 先从 OCR 获取 `latex`
|
||||||
|
2. 再调用 `/convert/latex-to-omml` 获取 OMML
|
||||||
|
|
||||||
|
### Q4: OMML 能在浏览器显示吗?
|
||||||
|
|
||||||
|
**A**: 不能。OMML 是 Word 专用格式。浏览器显示请使用 MathML。
|
||||||
|
|
||||||
|
## 总结
|
||||||
|
|
||||||
|
- 📋 **用户复制粘贴** → 使用 MathML
|
||||||
|
- 💻 **编程生成文档** → 使用 OMML
|
||||||
|
- 🌐 **网页显示** → 使用 MathML
|
||||||
|
- 🔌 **Office 插件** → 使用 OMML
|
||||||
202
test_word_mathml.py
Normal file
202
test_word_mathml.py
Normal file
@@ -0,0 +1,202 @@
|
|||||||
|
"""Test Word-compatible MathML generation."""
|
||||||
|
|
||||||
|
from app.services.converter import Converter
|
||||||
|
|
||||||
|
|
||||||
|
def test_mathml_word_compatibility():
    """Check that MathML generated for a determinant-style matrix is Word-compatible.

    Converts one LaTeX formula through ``Converter.convert_to_formats`` and
    prints a report covering the ``display`` attribute, leftover numeric
    character references, fence elements and the MathML namespace.

    Returns:
        bool: ``False`` when no MathML was produced or when a mandatory check
        (display attribute, leftover entities, namespace) failed; ``True``
        otherwise. The fence and structure checks are informational only.
    """
    converter = Converter()

    print("=" * 80)
    print("Testing Word-Compatible MathML Generation")
    print("=" * 80)

    # Test case: Matrix with determinant (the problematic example)
    latex = r"""\left| \begin{array}{cccc} a_{11} & a_{12} & \dots & a_{1n} \\ \vdots & \vdots & & \vdots \\ a_{i1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a_{n1} & a_{n2} & \dots & a_{nn} \end{array} \right|"""

    print(f"\nLaTeX: {latex[:80]}...")
    print("\n" + "-" * 80)

    # Convert to formats
    result = converter.convert_to_formats(f"$${latex}$$")

    if not result.mathml:
        print("✗ No MathML generated")
        return False

    mathml = result.mathml
    passed = True

    print("Checking Word compatibility features:")
    print("-" * 80)

    # Check 1: Display attribute
    if 'display="block"' in mathml:
        print("✓ Has display='block' attribute")
    else:
        print("✗ Missing or wrong display attribute")
        print(f" Found: {mathml[:100]}...")
        passed = False

    # Check 2: No Unicode entities for common symbols.
    # These must be the numeric character references, not the literal
    # characters: a literal "=" occurs in every attribute, so testing for it
    # would flag all MathML.
    problematic_entities = [
        "&#x0002B;",  # +
        "&#x02026;",  # horizontal ellipsis
        "&#x022EE;",  # vertical ellipsis
        "&#x0003D;",  # =
        "&#x0007C;",  # |
    ]
    unicode_issues = [entity for entity in problematic_entities if entity in mathml]

    if unicode_issues:
        print(f"✗ Contains Unicode entities: {unicode_issues}")
        passed = False
    else:
        print("✓ No problematic Unicode entities")

    # Check 3: Uses mfenced for brackets (Word-friendly); informational only
    if '<mfenced' in mathml or '<mo fence="true"' in mathml or 'stretchy="true"' in mathml:
        print("✓ Uses fence elements")
    else:
        print("? No fence elements found (might be OK)")

    # Check 4: Has proper namespace
    if 'xmlns="http://www.w3.org/1998/Math/MathML"' in mathml:
        print("✓ Has MathML namespace")
    else:
        print("✗ Missing MathML namespace")
        passed = False

    # Show preview
    print("\n" + "-" * 80)
    print("MathML Preview (first 500 chars):")
    print("-" * 80)
    print(mathml[:500])
    if len(mathml) > 500:
        print("...")

    print("\n" + "-" * 80)
    print(f"Total length: {len(mathml)} characters")

    # Check if this looks like Pandoc-generated MathML (informational only)
    if 'mfenced' in mathml or 'columnalign' in mathml:
        print("✓ Appears to be Pandoc-generated (good for Word)")
    elif 'stretchy' in mathml and 'fence' in mathml:
        print("✓ Uses standard fence attributes")
    else:
        print("? MathML structure unclear")

    return passed
|
||||||
|
|
||||||
|
|
||||||
|
def test_simple_formulas():
    """Run quick Word-compatibility checks over a handful of simple formulas.

    Each formula is converted with ``Converter.convert_to_formats`` and the
    resulting MathML is checked for ``display="block"``, for leftover numeric
    character references of ``+`` and ``=``, and for an ``xmlns`` attribute.
    A conversion error is reported and counted as a failure.

    Returns:
        bool: ``True`` when every formula passed every check.
    """
    converter = Converter()

    print("\n" + "=" * 80)
    print("Testing Simple Formulas")
    print("=" * 80)

    test_cases = [
        ("Fraction", r"\frac{a}{b}"),
        ("Square root", r"\sqrt{x^2 + y^2}"),
        ("Summation", r"\sum_{i=1}^{n} i"),
        ("Equation", r"E = mc^2"),
        ("Matrix", r"\begin{pmatrix} a & b \\ c & d \end{pmatrix}"),
    ]

    all_passed = True

    for name, latex in test_cases:
        print(f"\n{name}: ${latex}$")

        try:
            result = converter.convert_to_formats(f"${latex}$")
            mathml = result.mathml

            # Quick checks. The entity checks look for the numeric character
            # references; literal "+" / "=" are legitimate MathML content
            # (and "=" appears in every attribute), so they must not be
            # flagged.
            checks = [
                ('display="block"' in mathml, "display=block"),
                ("&#x0002B;" not in mathml, "no +entity"),
                ("&#x0003D;" not in mathml, "no =entity"),
                ('xmlns=' in mathml, "namespace"),
            ]

            failed_checks = [label for ok, label in checks if not ok]
            status = "✗" if failed_checks else "✓"

            print(f" {status} Length: {len(mathml)} chars", end="")
            if failed_checks:
                print(f" | Issues: {', '.join(failed_checks)}")
                all_passed = False
            else:
                print(" | All checks passed")

        except Exception as e:
            # Best-effort report: a failing formula must not stop the others.
            print(f" ✗ Error: {e}")
            all_passed = False

    return all_passed
|
||||||
|
|
||||||
|
|
||||||
|
def compare_with_reference():
    """Print a structural summary of the MathML produced for a 2x2 determinant.

    Reports which structural features are present in the generated MathML and
    returns ``True`` when none of the probed numeric character references
    (for ``+``, ``=``, ``|``) remain in it.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("Comparison with Reference MathML")
    print(heavy_rule)

    converter = Converter()

    # Simple matrix example
    latex = r"\left| \begin{array}{cc} a & b \\ c & d \end{array} \right|"
    our_mathml = converter.convert_to_formats(f"$${latex}$$").mathml

    print("\nOur MathML structure:")
    print(light_rule)

    # Analyze structure
    has_fence_operator = '<mo fence="' in our_mathml or '<mo stretchy="true"' in our_mathml
    entity_codes = ["0002B", "0003D", "0007C"]
    features = {
        "mfenced": "<mfenced" in our_mathml,
        "mo fence": has_fence_operator,
        "mtable": "<mtable" in our_mathml,
        "display block": 'display="block"' in our_mathml,
        "unicode entities": any(f"&#x{code};" in our_mathml for code in entity_codes),
    }

    print("Features:")
    for feature, present in features.items():
        # Every feature is desirable except leftover entities.
        wanted = feature != "unicode entities"
        if present == wanted:
            status = "✓"
        else:
            status = "✗"
        print(f" {status} {feature}: {present}")

    print(f"\nLength: {len(our_mathml)} characters")
    print(f"Preview:\n{our_mathml[:300]}...")

    return not features["unicode entities"]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the three checks in order and print a summary.
    print("Word-Compatible MathML Test Suite\n")

    try:
        outcomes = [
            test_mathml_word_compatibility(),
            test_simple_formulas(),
            compare_with_reference(),
        ]

        banner = "=" * 80
        print("\n" + banner)
        print("SUMMARY")
        print(banner)

        if all(outcomes):
            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
            print("\nMathML should be Word-compatible!")
            print("Try copying the mathml output and pasting into Word.")
        else:
            print("✗✗✗ SOME TESTS FAILED ✗✗✗")
            print("\nMathML may not be fully Word-compatible.")

        print(banner)

    except KeyboardInterrupt:
        print("\n\nTests interrupted")
    except Exception as e:
        # Report the failure together with the full traceback.
        print(f"\n\nTest error: {e}")
        import traceback

        traceback.print_exc()
|
||||||
Reference in New Issue
Block a user