diff --git a/app/services/converter.py b/app/services/converter.py index 04f3d9d..40b0bf6 100644 --- a/app/services/converter.py +++ b/app/services/converter.py @@ -296,29 +296,77 @@ class Converter: def _latex_to_mathml_cached(latex_formula: str) -> str: """Cached conversion of LaTeX formula to MathML. + Uses Pandoc for conversion to ensure Word compatibility. + Pandoc generates standard MathML that Word can properly import. + Uses LRU cache to avoid recomputing for repeated formulas. """ try: - # Use latex2mathml library for conversion (fast, pure Python) - return latex_to_mathml(latex_formula) - except Exception as e: - # Fallback: try with Pandoc (slower, but more robust) + # Use Pandoc for Word-compatible MathML (primary method) + mathml_html = pypandoc.convert_text( + f"${latex_formula}$", + "html", + format="markdown+tex_math_dollars", + extra_args=["--mathml"], + ) + # Extract just the element from the HTML + match = Converter._RE_MATH_ELEMENT.search(mathml_html) + if match: + mathml = match.group(0) + # Post-process for Word compatibility + return Converter._postprocess_mathml_for_word(mathml) + + # If no match, return as-is + return mathml_html.rstrip("\n") + + except Exception as pandoc_error: + # Fallback: try latex2mathml (less Word-compatible) try: - mathml_html = pypandoc.convert_text( - f"${latex_formula}$", - "html", - format="markdown+tex_math_dollars", - extra_args=["--mathml"], - ) - # Extract just the element from the HTML - match = Converter._RE_MATH_ELEMENT.search(mathml_html) - if match: - return match.group(0) - return mathml_html.rstrip("\n") - except Exception as pandoc_error: + mathml = latex_to_mathml(latex_formula) + return Converter._postprocess_mathml_for_word(mathml) + except Exception as e: raise RuntimeError( - f"MathML conversion failed: {e}. Pandoc fallback also failed: {pandoc_error}" + f"MathML conversion failed: {pandoc_error}. 
latex2mathml fallback also failed: {e}" ) from e + + @staticmethod + def _postprocess_mathml_for_word(mathml: str) -> str: + """Post-process MathML to improve Word compatibility. + + Applies transformations to make MathML more compatible with Word: + - Change display="inline" to display="block" for better rendering + - Decode Unicode entities to actual characters (Word prefers this) + - Clean up unnecessary attributes + + Args: + mathml: MathML string. + + Returns: + Word-compatible MathML string. + """ + # Change display to block for better Word rendering + mathml = mathml.replace('display="inline"', 'display="block"') + + # If no display attribute, add it + if 'display=' not in mathml and ' str: """Convert LaTeX formula to standard MathML. diff --git a/docs/FORMAT_COMPARISON.md b/docs/FORMAT_COMPARISON.md new file mode 100644 index 0000000..3255726 --- /dev/null +++ b/docs/FORMAT_COMPARISON.md @@ -0,0 +1,202 @@ +# MathML vs OMML 格式对比 + +## 快速选择指南 + +| 使用场景 | 推荐格式 | API 端点 | +|---------|---------|----------| +| 手动复制粘贴到 Word | MathML | `/image/ocr` 返回 `mathml` | +| 网页显示公式 | MathML | `/image/ocr` 返回 `mathml` | +| Office.js 插件开发 | OMML | `/convert/latex-to-omml` | +| Python 生成 Word 文档 | OMML | `/convert/latex-to-omml` | +| 跨平台显示 | MathML | `/image/ocr` 返回 `mathml` | + +## 格式详解 + +### MathML (Mathematical Markup Language) + +**标准**: W3C 标准 +**浏览器支持**: Chrome, Firefox, Safari (原生支持) +**Word 支持**: 可粘贴 (Word 自动转换为 OMML) + +#### 示例 +```xml + + + a + b + + +``` + +#### 优点 +- ✅ 跨平台标准 +- ✅ 浏览器原生支持 +- ✅ 可读性好 +- ✅ 可直接粘贴到 Word + +#### 缺点 +- ❌ Word 内部需要转换 +- ❌ 渲染精度依赖 Word 转换器 + +### OMML (Office Math Markup Language) + +**标准**: Microsoft 专有格式 +**浏览器支持**: 不支持 +**Word 支持**: 原生格式 (最佳兼容性) + +#### 示例 +```xml + + + a + b + + +``` + +#### 优点 +- ✅ Word 原生格式,渲染最准确 +- ✅ 适合编程生成 Word 文档 +- ✅ Office.js API 直接支持 + +#### 缺点 +- ❌ 仅 Word 支持 +- ❌ 可读性差 +- ❌ 不能浏览器渲染 + +## API 使用示例 + +### 1. 
获取 MathML (手动粘贴到 Word) + +```bash +# OCR 识别图片,返回 MathML +curl -X POST "http://localhost:8000/api/v1/image/ocr" \ + -H "Content-Type: application/json" \ + -d '{ + "image_url": "https://example.com/formula.png", + "model_name": "mineru" + }' +``` + +响应: +```json +{ + "latex": "\\frac{a}{b}", + "markdown": "$\\frac{a}{b}$", + "mathml": "...", // 👈 复制这个粘贴到 Word + "mml": "..." +} +``` + +### 2. 获取 OMML (编程插入 Word) + +```bash +# 转换 LaTeX 为 OMML +curl -X POST "http://localhost:8000/api/v1/convert/latex-to-omml" \ + -H "Content-Type: application/json" \ + -d '{ + "latex": "\\frac{a}{b}" + }' +``` + +响应: +```json +{ + "omml": "..." // 👈 用于编程插入 +} +``` + +## 编程使用示例 + +### Python: 插入 OMML 到 Word + +```python +from docx import Document +from docx.oxml import parse_xml + +# 获取 OMML +import requests +response = requests.post( + "http://localhost:8000/api/v1/convert/latex-to-omml", + json={"latex": "\\frac{a}{b}"} +) +omml = response.json()["omml"] + +# 插入到 Word 文档 +doc = Document() +paragraph = doc.add_paragraph() +paragraph._element.append(parse_xml(omml)) +doc.save("output.docx") +``` + +### JavaScript: Office Add-in 插入 OMML + +```javascript +// 获取 OMML +const response = await fetch('http://localhost:8000/api/v1/convert/latex-to-omml', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ latex: '\\frac{a}{b}' }) +}); +const { omml } = await response.json(); + +// 插入到 Word +Office.context.document.setSelectedDataAsync( + omml, + { coercionType: Office.CoercionType.Ooxml } +); +``` + +### Web: 显示 MathML + +```html +<!DOCTYPE html> +<html> +<body> + <math xmlns="http://www.w3.org/1998/Math/MathML"> + <mfrac> + <mi>a</mi> + <mi>b</mi> + </mfrac> + </math> +</body> +</html> +``` + +## 性能对比 + +| 操作 | MathML | OMML | +|------|--------|------| +| 生成速度 | 快 (~100ms) | 慢 (~500ms, 需要 Pandoc) | +| 文件大小 | 较小 | 较大 | +| 转换质量 | 依赖转换器 | 原生最佳 | + +## 常见问题 + +### Q1: 为什么我的 OMML 看起来很长? + +**A**: OMML 包含了完整的命名空间和样式信息,所以比 MathML 长。这是正常的。 + +### Q2: 我应该使用哪个格式? + +**A**: +- **手动操作** → MathML (复制粘贴) +- **编程操作** → OMML (API 插入) + +### Q3: 能否将 MathML 转换为 OMML? + +**A**: 可以!使用我们的 API: +1. 
先从 OCR 获取 `latex` +2. 再调用 `/convert/latex-to-omml` 获取 OMML + +### Q4: OMML 能在浏览器显示吗? + +**A**: 不能。OMML 是 Word 专用格式。浏览器显示请使用 MathML。 + +## 总结 + +- 📋 **用户复制粘贴** → 使用 MathML +- 💻 **编程生成文档** → 使用 OMML +- 🌐 **网页显示** → 使用 MathML +- 🔌 **Office 插件** → 使用 OMML diff --git a/test_word_mathml.py b/test_word_mathml.py new file mode 100644 index 0000000..7a60a33 --- /dev/null +++ b/test_word_mathml.py @@ -0,0 +1,202 @@ +"""Test Word-compatible MathML generation.""" + +from app.services.converter import Converter + + +def test_mathml_word_compatibility(): + """Test that generated MathML is Word-compatible.""" + + converter = Converter() + + print("=" * 80) + print("Testing Word-Compatible MathML Generation") + print("=" * 80) + + # Test case: Matrix with determinant (the problematic example) + latex = r"""\left| \begin{array}{cccc} a_{11} & a_{12} & \dots & a_{1n} \\ \vdots & \vdots & & \vdots \\ a_{i1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a_{n1} & a_{n2} & \dots & a_{nn} \end{array} \right|""" + + print(f"\nLaTeX: {latex[:80]}...") + print("\n" + "-" * 80) + + # Convert to formats + result = converter.convert_to_formats(f"$${latex}$$") + + if not result.mathml: + print("✗ No MathML generated") + return False + + mathml = result.mathml + + print("Checking Word compatibility features:") + print("-" * 80) + + # Check 1: Display attribute + if 'display="block"' in mathml: + print("✓ Has display='block' attribute") + else: + print("✗ Missing or wrong display attribute") + print(f" Found: {mathml[:100]}...") + + # Check 2: No Unicode entities for common symbols + unicode_issues = [] + problematic_entities = ['+', '…', '⋮', '=', '|'] + for entity in problematic_entities: + if entity in mathml: + unicode_issues.append(entity) + + if unicode_issues: + print(f"✗ Contains Unicode entities: {unicode_issues}") + else: + print("✓ No problematic Unicode entities") + + # Check 3: Uses mfenced for brackets (Word-friendly) + if ' 500: + print("...") + + print("\n" + "-" * 80) + 
print(f"Total length: {len(mathml)} characters") + + # Check if this looks like Pandoc-generated MathML + if 'mfenced' in mathml or 'columnalign' in mathml: + print("✓ Appears to be Pandoc-generated (good for Word)") + elif 'stretchy' in mathml and 'fence' in mathml: + print("✓ Uses standard fence attributes") + else: + print("? MathML structure unclear") + + return True + + +def test_simple_formulas(): + """Test simple formulas for Word compatibility.""" + + converter = Converter() + + print("\n" + "=" * 80) + print("Testing Simple Formulas") + print("=" * 80) + + test_cases = [ + ("Fraction", r"\frac{a}{b}"), + ("Square root", r"\sqrt{x^2 + y^2}"), + ("Summation", r"\sum_{i=1}^{n} i"), + ("Equation", r"E = mc^2"), + ("Matrix", r"\begin{pmatrix} a & b \\ c & d \end{pmatrix}"), + ] + + all_passed = True + + for name, latex in test_cases: + print(f"\n{name}: ${latex}$") + + try: + result = converter.convert_to_formats(f"${latex}$") + mathml = result.mathml + + # Quick checks + checks = [ + ('display="block"' in mathml, "display=block"), + ('+' not in mathml, "no +entity"), + ('=' not in mathml, "no =entity"), + ('xmlns=' in mathml, "namespace"), + ] + + status = "✓" if all(check[0] for check in checks) else "✗" + failed_checks = [check[1] for check in checks if not check[0]] + + print(f" {status} Length: {len(mathml)} chars", end="") + if failed_checks: + print(f" | Issues: {', '.join(failed_checks)}") + all_passed = False + else: + print(" | All checks passed") + + except Exception as e: + print(f" ✗ Error: {e}") + all_passed = False + + return all_passed + + +def compare_with_reference(): + """Compare our MathML with reference Word-compatible MathML.""" + + print("\n" + "=" * 80) + print("Comparison with Reference MathML") + print("=" * 80) + + converter = Converter() + + # Simple matrix example + latex = r"\left| \begin{array}{cc} a & b \\ c & d \end{array} \right|" + + result = converter.convert_to_formats(f"$${latex}$$") + our_mathml = result.mathml + + 
print("\nOur MathML structure:") + print("-" * 80) + + # Analyze structure + features = { + "mfenced": "