diff --git a/app/services/converter.py b/app/services/converter.py index 041a9b5..1196d2f 100644 --- a/app/services/converter.py +++ b/app/services/converter.py @@ -340,9 +340,10 @@ class Converter: """Post-process MathML to improve Word compatibility. Applies transformations to make MathML more compatible with Word: + - Remove and wrappers (Word doesn't need them) - Change display="inline" to display="block" for better rendering - Decode Unicode entities to actual characters (Word prefers this) - - Clean up unnecessary attributes + - Ensure proper namespace Args: mathml: MathML string. @@ -350,23 +351,57 @@ class Converter: Returns: Word-compatible MathML string. """ - # Change display to block for better Word rendering + import re + + # Step 1: Remove and wrappers + # These often cause Word import issues + if '' in mathml: + # Extract content between and + match = re.search(r'(.*?)]*)>', mathml) + if math_match: + math_attrs = math_match.group(1) + + # Rebuild without semantics + mathml = f'{content}' + + # Step 2: Change display to block for better Word rendering mathml = mathml.replace('display="inline"', 'display="block"') - # If no display attribute, add it + # Step 3: If no display attribute, add it if 'display=' not in mathml and '', '(': '(', ')': ')', + ',': ',', + '.': '.', + '|': '|', + '…': '⋯', + '⋮': '⋮', + '⋯': '⋯', + '°': '°', + 'γ': 'γ', + 'φ': 'φ', + 'ϕ': 'ϕ', } for entity, char in unicode_map.items(): diff --git a/docs/WORD_MATHML_GUIDE.md b/docs/WORD_MATHML_GUIDE.md new file mode 100644 index 0000000..9cdfe56 --- /dev/null +++ b/docs/WORD_MATHML_GUIDE.md @@ -0,0 +1,204 @@ +# MathML 导入 Word 完整指南 + +## 问题诊断 + +如果 MathML 无法在 Word 中渲染,通常是以下原因: + +### 1. **MathML 格式问题** +- ❌ 包含 `` 和 `` 包装器 +- ❌ 使用 `display="inline"` 而不是 `display="block"` +- ❌ 缺少 `xmlns` 命名空间 +- ❌ 使用 HTML 实体编码而不是实际字符 + +### 2. 
**Word 粘贴方法不正确** +- ❌ 直接粘贴到正文 +- ❌ 使用"选择性粘贴" +- ❌ 粘贴位置不对 + +## 已修复的问题 + +我们的代码现在会自动: +✅ 移除 `<semantics>` 和 `<annotation>` 包装器 +✅ 设置 `display="block"` +✅ 添加正确的 `xmlns` 命名空间 +✅ 解码 Unicode 实体为实际字符 + +## Word 中正确的粘贴方法 + +### 方法 1:使用 MathType(推荐)✨ + +如果你安装了 MathType: + +1. 复制 MathML 内容 +2. 在 Word 中:**插入** → **对象** → **MathType 公式** +3. 在 MathType 中:**编辑** → **粘贴 MathML** +4. 点击"确定" + +### 方法 2:使用 Word 内置公式编辑器 + +#### 选项 A:Alt 文本方法(最可靠) + +1. 在 Word 中:**插入** → **公式** +2. 输入任意内容(如 `x`) +3. 选中公式,右键 → **公式选项** → **另存为新公式** +4. 取消,返回文档 +5. 右键公式 → **编辑替换文本** +6. 将 MathML 粘贴到替换文本框 +7. 按 Enter + +#### 选项 B:XML 方法(需要开发者模式) + +1. **文件** → **选项** → **自定义功能区** +2. 勾选"开发工具" +3. **开发工具** → **XML 映射** +4. 粘贴 MathML + +#### 选项 C:宏方法(高级) + +使用 VBA 宏: + +```vba +Sub InsertMathML() + Dim mathML As String + mathML = "..." ' 粘贴你的 MathML + + Selection.Range.InsertXML mathML +End Sub +``` + +### 方法 3:使用在线工具转换 + +1. 访问 https://www.mathcha.io/ +2. 粘贴 MathML +3. 导出为 Word 格式 + +## 测试你的 MathML + +运行诊断工具: + +```bash +python test_mathml_word_compatibility.py +``` + +这会检查: +- ✓ 命名空间是否正确 +- ✓ Display 属性 +- ✓ 是否有 semantics 包装器 +- ✓ Unicode 实体 + +## 示例:正确的 MathML 格式 + +```xml +<math xmlns="http://www.w3.org/1998/Math/MathML" display="block"> + <mrow> + <mi>γ</mi> + <mo>=</mo> + <mn>22.2</mn> + <mo>,</mo> + <mi>c</mi> + <mo>=</mo> + <mn>30.4</mn> + </mrow> +</math> +``` + +**不要有:** +```xml +<math xmlns="http://www.w3.org/1998/Math/MathML"> + <semantics> ❌ Word 可能不识别 + <mrow>...</mrow> + <annotation>...</annotation> ❌ Word 不需要 + </semantics> +</math> +``` + +## API 使用 + +### 获取 Word 兼容的 MathML + +```bash +curl -X POST "http://localhost:8000/api/v1/image/ocr" \ + -H "Content-Type: application/json" \ + -d '{ + "image_base64": "...", + "model_name": "mineru" + }' +``` + +响应中的 `mathml` 字段已经过优化,可以直接用于 Word。 + +### 如果还是不工作 + +1. **检查 Word 版本** + - Word 2010+ 支持 MathML + - Word Online 支持有限 + +2. **检查 MathML 内容** + ```bash + python test_mathml_word_compatibility.py + ``` + +3. **尝试 OMML 格式(Word 原生)** + ```bash + curl -X POST "http://localhost:8000/api/v1/convert/latex-to-omml" \ + -H "Content-Type: application/json" \ + -d '{"latex": "\\gamma = 22.2"}' + ``` + + OMML 是 Word 的原生格式,兼容性最好。 + +## 为什么 OMML 更好? 
+ +| 格式 | 用途 | Word 兼容性 | +|------|------|------------| +| **MathML** | Web 标准、跨平台 | ⭐⭐⭐ 需要转换 | +| **OMML** | Word 原生格式 | ⭐⭐⭐⭐⭐ 完美 | + +**建议**: +- 手动粘贴 → 使用 MathML +- 编程生成 Word 文档 → 使用 OMML + +## 常见错误 + +### 错误 1:粘贴后显示为文本 + +**原因**:粘贴位置不对或格式不对 + +**解决**: +1. 确保 MathML 以 `` 包装器(我们已移除) +2. 使用 OMML 格式 + +### 错误 3:部分显示不正确 + +**原因**:某些 LaTeX 命令不支持 + +**解决**: +1. 检查 LaTeX 语法 +2. 使用 Word 支持的标准命令 + +## 最终建议 + +**最简单的方法**:使用 OMML 格式 + +```bash +# 1. 获取 LaTeX +POST /api/v1/image/ocr +→ 获取 "latex" 字段 + +# 2. 转换为 OMML +POST /api/v1/convert/latex-to-omml +→ 获取 "omml" 字段 + +# 3. 使用 python-docx 或 Office.js 插入 +``` + +这样可以避免所有 MathML 兼容性问题! diff --git a/test_mathml_word_compatibility.py b/test_mathml_word_compatibility.py new file mode 100644 index 0000000..ef46fcc --- /dev/null +++ b/test_mathml_word_compatibility.py @@ -0,0 +1,236 @@ +"""Diagnostic tool for MathML Word compatibility issues.""" + +from app.services.converter import Converter + + +def diagnose_mathml(latex: str) -> dict: + """Diagnose MathML generation and Word compatibility. + + Args: + latex: LaTeX formula to convert. + + Returns: + Dictionary with diagnostic information. 
+ """ + converter = Converter() + + print("=" * 80) + print("MathML Word Compatibility Diagnostic") + print("=" * 80) + + print(f"\nInput LaTeX: {latex}") + + # Convert + try: + result = converter.convert_to_formats(f"${latex}$") + mathml = result.mathml + + print(f"\n✓ Conversion successful") + print(f"MathML length: {len(mathml)} characters") + + except Exception as e: + print(f"\n✗ Conversion failed: {e}") + return {"success": False, "error": str(e)} + + # Diagnostic checks + print("\n" + "-" * 80) + print("Word Compatibility Checks:") + print("-" * 80) + + issues = [] + + # Check 1: Has proper namespace + if 'xmlns="http://www.w3.org/1998/Math/MathML"' in mathml: + print("✓ Has correct MathML namespace") + else: + print("✗ Missing or incorrect MathML namespace") + issues.append("namespace") + + # Check 2: Display attribute + if 'display="block"' in mathml: + print("✓ Has display='block' attribute") + elif 'display="inline"' in mathml: + print("⚠ Has display='inline' (Word prefers 'block')") + issues.append("display_inline") + else: + print("✗ Missing display attribute") + issues.append("no_display") + + # Check 3: Check for problematic elements + if '' in mathml: + print("⚠ Contains element") + print(" Note: Word may ignore semantics wrapper") + issues.append("semantics") + + if ' element") + print(" Note: Word doesn't need annotation, may cause issues") + issues.append("annotation") + + # Check 4: Unicode entities + problematic_entities = ['&#x', '>', '<', '&'] + has_entities = any(entity in mathml for entity in problematic_entities) + if has_entities: + print("⚠ Contains encoded entities (Word prefers actual characters)") + issues.append("entities") + else: + print("✓ No problematic entities") + + # Check 5: Root element structure + if mathml.startswith(' element") + else: + print("✗ Doesn't start with element") + issues.append("no_math_root") + + # Check 6: Check for common Word-incompatible attributes + if 'class=' in mathml: + print("⚠ Contains 'class' 
attribute (Word ignores these)") + + if 'style=' in mathml: + print("⚠ Contains 'style' attribute (Word ignores these)") + + # Print MathML structure + print("\n" + "-" * 80) + print("MathML Structure:") + print("-" * 80) + + # Show first 500 chars + print(mathml[:500]) + if len(mathml) > 500: + print("...") + print(mathml[-200:]) + + # Recommendations + print("\n" + "-" * 80) + print("Recommendations:") + print("-" * 80) + + if not issues: + print("✓ MathML appears to be Word-compatible!") + print("\nHow to paste into Word:") + print(" 1. Copy the MathML XML") + print(" 2. In Word: Insert → Equation → Ink Equation") + print(" 3. Right-click the equation → 'Professional'") + print(" 4. Right-click again → 'Save as new equation'") + print("\nOR use Alt text method:") + print(" 1. Insert → Equation") + print(" 2. Type any formula") + print(" 3. Right-click → Edit Alt Text") + print(" 4. Paste MathML in Alt Text field") + else: + print("Issues found:") + if "semantics" in issues or "annotation" in issues: + print("\n1. Remove and wrappers") + print(" Word only needs the content inside") + + if "display_inline" in issues: + print("\n2. Change display='inline' to display='block'") + + if "entities" in issues: + print("\n3. Decode HTML entities to actual characters") + + if "namespace" in issues: + print("\n4. 
Add xmlns='http://www.w3.org/1998/Math/MathML'") + + return { + "success": True, + "mathml": mathml, + "issues": issues, + "length": len(mathml) + } + + +def test_simple_formula(): + """Test with a simple formula.""" + print("\nTest 1: Simple formula") + diagnose_mathml(r"\frac{a}{b}") + + +def test_complex_formula(): + """Test with a complex formula.""" + print("\n\nTest 2: Complex formula with matrix") + diagnose_mathml(r"\left| \begin{array}{cc} a & b \\ c & d \end{array} \right|") + + +def test_problematic_formula(): + """Test with the user's problematic formula.""" + print("\n\nTest 3: User's formula (after OCR fix)") + diagnose_mathml(r"\gamma = 22.2, c = 30.4, \phi = 25.4 ^ {\circ}") + + +def generate_clean_mathml(): + """Generate a clean MathML without semantics/annotation.""" + + print("\n" + "=" * 80) + print("Generating Clean MathML for Word") + print("=" * 80) + + converter = Converter() + latex = r"\gamma = 22.2, c = 30.4, \phi = 25.4 ^ {\circ}" + + result = converter.convert_to_formats(f"${latex}$") + mathml = result.mathml + + # Remove semantics wrapper if present + import re + + # Extract content from semantics if present + if '' in mathml: + print("\n⚠ Original has wrapper") + + # Try to extract just the mrow content + match = re.search(r'(.*?){content}' + + print("\nCleaned MathML (without semantics):") + print("-" * 80) + print(clean_mathml) + + print("\n✓ Try pasting this version into Word") + return clean_mathml + + print("\nGenerated MathML:") + print("-" * 80) + print(mathml) + + return mathml + + +if __name__ == "__main__": + print("MathML Word Compatibility Diagnostic Tool\n") + + try: + test_simple_formula() + test_complex_formula() + test_problematic_formula() + + print("\n\n") + clean = generate_clean_mathml() + + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + print("\nCommon reasons MathML doesn't work in Word:") + print(" 1. wrapper - Word may not parse it correctly") + print(" 2. 
element - Word doesn't need it") + print(" 3. HTML entities - Word prefers actual Unicode characters") + print(" 4. Missing xmlns attribute") + print(" 5. Wrong paste location in Word") + + print("\nBest practice for Word:") + print(" • Use simple MathML without semantics wrapper") + print(" • Include xmlns attribute") + print(" • Use display='block'") + print(" • Use actual characters, not entities") + + print("\n" + "=" * 80) + + except Exception as e: + print(f"\nError: {e}") + import traceback + traceback.print_exc()