From 720cd05add1c879347b990cd22a0d489acfd41f3 Mon Sep 17 00:00:00 2001
From: liuyuanchuang
Date: Wed, 4 Feb 2026 15:52:04 +0800
Subject: [PATCH] fix: handle mathml preprocess

---
 app/services/converter.py  |  16 ++-
 test_array_fix_complete.py | 254 +++++++++++++++++++++++++++++++++++++
 2 files changed, 264 insertions(+), 6 deletions(-)
 create mode 100644 test_array_fix_complete.py

diff --git a/app/services/converter.py b/app/services/converter.py
index 40b0bf6..0d69942 100644
--- a/app/services/converter.py
+++ b/app/services/converter.py
@@ -200,8 +200,11 @@ class Converter:
         # Extract the LaTeX formula content (remove delimiters)
         latex_formula = self._extract_latex_formula(md_text)
 
+        # Preprocess formula for better conversion (fix array specifiers, etc.)
+        preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula)
+
         # Convert to MathML
-        mathml = self._latex_to_mathml(latex_formula)
+        mathml = self._latex_to_mathml(preprocessed_formula)
 
         # Convert MathML to mml:math format (with namespace prefix)
         mml = self._mathml_to_mml(mathml)
@@ -234,15 +237,16 @@ class Converter:
             raise ValueError("LaTeX formula cannot be empty")
 
         # Preprocess formula using the same preprocessing as export
-        preprocessed = self._preprocess_formula_for_omml(latex_formula.strip())
+        preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip())
 
         return self._latex_to_omml(preprocessed)
 
-    def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
-        """Preprocess LaTeX formula for OMML conversion.
+    def _preprocess_formula_for_conversion(self, latex_formula: str) -> str:
+        """Preprocess LaTeX formula for any conversion (MathML, OMML, etc.).
 
         Applies the same preprocessing steps as preprocess_for_export to ensure
-        consistency. This fixes common issues that cause Pandoc OMML conversion to fail.
+        consistency across all conversion paths. This fixes common issues that
+        cause Pandoc conversion to fail.
 
         Args:
             latex_formula: Pure LaTeX formula.
@@ -254,7 +258,7 @@ class Converter:
         # 1. Convert matrix environments
         latex_formula = self._convert_matrix_environments(latex_formula)
 
-        # 2. Fix array column specifiers (remove spaces)
+        # 2. Fix array column specifiers (remove spaces) - THIS IS THE KEY FIX
         latex_formula = self._fix_array_column_specifiers(latex_formula)
 
         # 3. Fix brace spacing
diff --git a/test_array_fix_complete.py b/test_array_fix_complete.py
new file mode 100644
index 0000000..3fb88d1
--- /dev/null
+++ b/test_array_fix_complete.py
@@ -0,0 +1,254 @@
+"""Comprehensive test for array column specifier fix in all conversion paths."""
+
+from app.services.converter import Converter
+
+
+def test_problematic_array():
+    """Test the exact LaTeX that caused the error."""
+
+    print("=" * 80)
+    print("Testing Problematic Array (from error log)")
+    print("=" * 80)
+
+    converter = Converter()
+
+    # The exact LaTeX from the error log
+    latex = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""
+
+    print(f"\nLaTeX length: {len(latex)} characters")
+    print(f"Contains '{{c c c c}}': {'{c c c c}' in latex}")
+
+    # Test 1: Preprocessing
+    print("\n" + "-" * 80)
+    print("Test 1: Preprocessing")
+    print("-" * 80)
+
+    preprocessed = converter._preprocess_formula_for_conversion(latex)
+
+    if '{c c c c}' in preprocessed:
+        print("✗ FAILED: Spaces NOT removed from array specifiers")
+        print(f" Still found: {preprocessed[preprocessed.find('{c c c c}'):preprocessed.find('{c c c c}')+15]}")
+        return False
+    elif '{cccc}' in preprocessed:
+        print("✓ SUCCESS: Spaces removed from array specifiers")
+        print(" '{c c c c}' → '{cccc}'")
+    else:
+        print("? WARNING: Could not verify specifier fix")
+
+    # Test 2: MathML Conversion
+    print("\n" + "-" * 80)
+    print("Test 2: MathML Conversion (via convert_to_formats)")
+    print("-" * 80)
+
+    try:
+        result = converter.convert_to_formats(f"$${latex}$$")
+
+        if result.mathml:
+            print(f"✓ SUCCESS: MathML generated ({len(result.mathml)} chars)")
+
+            # Check for Word compatibility
+            if 'display="block"' in result.mathml:
+                print(" ✓ Has display='block' (Word-friendly)")
+
+            if '&#x0002B;' not in result.mathml and '&#x0003D;' not in result.mathml:
+                print(" ✓ No problematic Unicode entities")
+
+            print(f"\n MathML preview:\n {result.mathml[:200]}...")
+        else:
+            print("✗ FAILED: No MathML generated")
+            return False
+
+    except Exception as e:
+        print(f"✗ FAILED: MathML conversion error: {e}")
+        return False
+
+    # Test 3: OMML Conversion
+    print("\n" + "-" * 80)
+    print("Test 3: OMML Conversion")
+    print("-" * 80)
+
+    try:
+        omml = converter.convert_to_omml(latex)
+
+        if omml:
+            print(f"✓ SUCCESS: OMML generated ({len(omml)} chars)")
+
+            if 'oMath' in omml:
+                print(" ✓ Valid OMML structure")
+
+            print(f"\n OMML preview:\n {omml[:200]}...")
+        else:
+            print("✗ FAILED: No OMML generated")
+            return False
+
+    except Exception as e:
+        print(f"✗ FAILED: OMML conversion error: {e}")
+        return False
+
+    print("\n" + "=" * 80)
+    print("✓✓✓ ALL CONVERSION PATHS WORKING ✓✓✓")
+    print("=" * 80)
+
+    return True
+
+
+def test_simple_arrays():
+    """Test simple arrays with spaces in column specifiers."""
+
+    print("\n" + "=" * 80)
+    print("Testing Simple Arrays")
+    print("=" * 80)
+
+    converter = Converter()
+
+    test_cases = [
+        ("2x2 array", r"\begin{array}{c c} a & b \\ c & d \end{array}"),
+        ("3x3 array", r"\begin{array}{c c c} 1 & 2 & 3 \\ 4 & 5 & 6 \\ 7 & 8 & 9 \end{array}"),
+        ("Array with pipes", r"\left| \begin{array}{c c} a & b \\ c & d \end{array} \right|"),
+        ("Mixed alignment", r"\begin{array}{l r c} left & right & center \end{array}"),
+    ]
+
+    all_passed = True
+
+    for name, latex in test_cases:
+        print(f"\n{name}")
+        print("-" * 40)
+        print(f"LaTeX: {latex}")
+
+        # Check preprocessing
+        preprocessed = converter._preprocess_formula_for_conversion(latex)
+        has_spaces = any(f"{{{' '.join(chars)}}}" in preprocessed for chars in [['c', 'c'], ['c', 'c', 'c'], ['l', 'r', 'c']])
+
+        try:
+            result = converter.convert_to_formats(f"${latex}$")
+
+            if result.mathml and result.mml:
+                status = "✓" if not has_spaces else "✗"
+                print(f"{status} MathML: {len(result.mathml)} chars, MML: {len(result.mml)} chars")
+
+                if not has_spaces:
+                    print(" ✓ Array specifiers fixed")
+                else:
+                    print(" ✗ Array specifiers still have spaces")
+                    all_passed = False
+            else:
+                print("✗ Conversion failed")
+                all_passed = False
+
+        except Exception as e:
+            print(f"✗ Error: {e}")
+            all_passed = False
+
+    return all_passed
+
+
+def test_conversion_consistency():
+    """Test that all conversion paths use the same preprocessing."""
+
+    print("\n" + "=" * 80)
+    print("Testing Conversion Consistency")
+    print("=" * 80)
+
+    converter = Converter()
+
+    # Test formula with multiple issues
+    latex = r"""
+    \left\{ \begin{array}{l c}
+        \begin{vmatrix} a & b \\ c & d \end{vmatrix} & = ad - bc \\
+        \begin{cases} x & x > 0 \\ 0 & x \leq 0 \end{cases} & \text{sign}
+    \end{array} \right.
+    """.strip()
+
+    print("\nComplex formula with:")
+    print(" - array with spaces: {l c}")
+    print(" - vmatrix environment")
+    print(" - cases environment")
+
+    print("\n" + "-" * 80)
+    print("Preprocessing check:")
+    print("-" * 80)
+
+    preprocessed = converter._preprocess_formula_for_conversion(latex)
+
+    checks = {
+        "Array spaces removed": '{l c}' not in preprocessed and '{lc}' in preprocessed,
+        "vmatrix converted": 'vmatrix' not in preprocessed,
+        "cases converted": 'cases' not in preprocessed and 'array' in preprocessed,
+    }
+
+    for check, passed in checks.items():
+        status = "✓" if passed else "✗"
+        print(f"{status} {check}")
+
+    print("\n" + "-" * 80)
+    print("Conversion paths:")
+    print("-" * 80)
+
+    all_passed = True
+
+    # Test MathML
+    try:
+        result = converter.convert_to_formats(f"$${latex}$$")
+        print(f"✓ MathML: {len(result.mathml)} chars")
+        print(f"✓ MML: {len(result.mml)} chars")
+    except Exception as e:
+        print(f"✗ MathML failed: {e}")
+        all_passed = False
+
+    # Test OMML
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ OMML: {len(omml)} chars")
+    except Exception as e:
+        print(f"✗ OMML failed: {e}")
+        all_passed = False
+
+    return all_passed and all(checks.values())
+
+
+if __name__ == "__main__":
+    print("=" * 80)
+    print("COMPREHENSIVE ARRAY FIX TEST SUITE")
+    print("Testing all conversion paths with preprocessing")
+    print("=" * 80)
+
+    try:
+        test1 = test_problematic_array()
+        test2 = test_simple_arrays()
+        test3 = test_conversion_consistency()
+
+        print("\n" + "=" * 80)
+        print("FINAL SUMMARY")
+        print("=" * 80)
+
+        results = [
+            ("Problematic array fix", test1),
+            ("Simple arrays", test2),
+            ("Conversion consistency", test3),
+        ]
+
+        for name, passed in results:
+            status = "✓ PASS" if passed else "✗ FAIL"
+            print(f"{status}: {name}")
+
+        all_passed = all(result[1] for result in results)
+
+        print("\n" + "-" * 80)
+
+        if all_passed:
+            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
+            print("\nThe array column specifier fix is working in ALL conversion paths:")
+            print(" • MathML conversion (for Word paste)")
+            print(" • MML conversion (namespaced MathML)")
+            print(" • OMML conversion (Word native)")
+        else:
+            print("✗✗✗ SOME TESTS FAILED ✗✗✗")
+
+        print("=" * 80)
+
+    except KeyboardInterrupt:
+        print("\n\nTests interrupted")
+    except Exception as e:
+        print(f"\n\nTest error: {e}")
+        import traceback
+        traceback.print_exc()