From e31017cfe7b7c24e597a7a8ff26ba9cd8bdf31ad Mon Sep 17 00:00:00 2001
From: liuyuanchuang
Date: Wed, 4 Feb 2026 12:45:34 +0800
Subject: [PATCH] fix: add preprocess

Converter.convert_to_omml() now runs the stripped formula through a new
private helper, _preprocess_formula_for_omml(), before handing it to
_latex_to_omml(). The helper chains four existing Converter methods, in
this order:

  1. _convert_matrix_environments
  2. _fix_array_column_specifiers
  3. _fix_brace_spacing
  4. _convert_special_environments

Two standalone scripts, test_array_fix.py and test_omml_preprocessing.py,
print pass/fail reports for array, vmatrix, cases, aligned, simple and
nested formulas.
---
Review notes (ignored by git am):
- test_array_fix.py: the "Changed ..." message put \" inside an f-string
  replacement field, which is a SyntaxError and stopped the whole module
  from compiling; it is now a plain string literal with the same output.
- Dropped the f prefix from three print() strings with no placeholders.
- Added-line counts are unchanged, so the hunk headers and diffstat still
  hold; the blob ids on the "index" lines predate these edits.
- NOTE(review): the wrapping of the docstring context lines in the first
  converter.py hunk, and any runs of spaces inside string literals, were
  recovered from a whitespace-collapsed copy of this patch -- confirm
  with `git apply --check` before merging.

 app/services/converter.py  |  35 +++++-
 test_array_fix.py          | 102 +++++++++++++++++
 test_omml_preprocessing.py | 218 +++++++++++++++++++++++++++++++++++++
 3 files changed, 354 insertions(+), 1 deletion(-)
 create mode 100644 test_array_fix.py
 create mode 100644 test_omml_preprocessing.py

diff --git a/app/services/converter.py b/app/services/converter.py
index b5ff2ba..04f3d9d 100644
--- a/app/services/converter.py
+++ b/app/services/converter.py
@@ -217,6 +217,9 @@ class Converter:
         This is a separate method due to the performance overhead of OMML conversion,
         which requires creating a temporary DOCX file.
 
+        The formula is preprocessed using the same logic as export_to_file to ensure
+        proper conversion.
+
         Args:
             latex_formula: Pure LaTeX formula (without delimiters like $ or $$).
 
@@ -230,7 +233,37 @@ class Converter:
         if not latex_formula or not latex_formula.strip():
             raise ValueError("LaTeX formula cannot be empty")
 
-        return self._latex_to_omml(latex_formula.strip())
+        # Preprocess formula using the same preprocessing as export
+        preprocessed = self._preprocess_formula_for_omml(latex_formula.strip())
+
+        return self._latex_to_omml(preprocessed)
+
+    def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
+        """Preprocess LaTeX formula for OMML conversion.
+
+        Applies the same preprocessing steps as preprocess_for_export to ensure
+        consistency. This fixes common issues that cause Pandoc OMML conversion to fail.
+
+        Args:
+            latex_formula: Pure LaTeX formula.
+
+        Returns:
+            Preprocessed LaTeX formula.
+        """
+        # Use the same preprocessing methods as export
+        # 1. Convert matrix environments
+        latex_formula = self._convert_matrix_environments(latex_formula)
+
+        # 2. Fix array column specifiers (remove spaces)
+        latex_formula = self._fix_array_column_specifiers(latex_formula)
+
+        # 3. Fix brace spacing
+        latex_formula = self._fix_brace_spacing(latex_formula)
+
+        # 4. Convert special environments (cases, aligned)
+        latex_formula = self._convert_special_environments(latex_formula)
+
+        return latex_formula
 
     def _extract_latex_formula(self, text: str) -> str:
         """Extract LaTeX formula from text by removing delimiters.
diff --git a/test_array_fix.py b/test_array_fix.py
new file mode 100644
index 0000000..324239e
--- /dev/null
+++ b/test_array_fix.py
@@ -0,0 +1,102 @@
+"""Test script for array column specifier fix."""
+
+from app.services.converter import Converter
+
+
+def test_array_specifier_fix():
+    """Test that array column specifiers with spaces are fixed."""
+
+    converter = Converter()
+
+    # The problematic LaTeX from the error
+    latex_formula = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""
+
+    print("Testing array column specifier fix")
+    print("=" * 80)
+    print(f"\nOriginal LaTeX (first 200 chars):\n{latex_formula[:200]}...")
+
+    # Test preprocessing
+    print("\n" + "-" * 80)
+    print("Step 1: Preprocessing")
+    preprocessed = converter._preprocess_formula_for_omml(latex_formula)
+
+    # Check if spaces were removed from array specifiers
+    if "{c c c c}" in preprocessed:
+        print("✗ FAILED: Spaces not removed from array specifiers")
+        print(f"Found: {preprocessed[preprocessed.find('{c c c c}'):preprocessed.find('{c c c c}')+10]}")
+    elif "{cccc}" in preprocessed:
+        print("✓ SUCCESS: Spaces removed from array specifiers")
+        print("Changed '{c c c c}' → '{cccc}'")
+    else:
+        print("? Could not find array specifier in preprocessed output")
+
+    # Test OMML conversion
+    print("\n" + "-" * 80)
+    print("Step 2: OMML Conversion")
+    try:
+        omml = converter.convert_to_omml(latex_formula)
+        print("✓ SUCCESS: OMML conversion completed")
+        print(f"OMML length: {len(omml)} characters")
+        print(f"OMML preview (first 300 chars):\n{omml[:300]}...")
+
+        # Check if it contains oMath element
+        if "oMath" in omml:
+            print("\n✓ Valid OMML: Contains oMath element")
+        else:
+            print("\n✗ WARNING: OMML might be incomplete (no oMath element found)")
+
+    except Exception as e:
+        print("✗ FAILED: OMML conversion error")
+        print(f"Error: {e}")
+        return False
+
+    print("\n" + "=" * 80)
+    print("✓ All tests passed!")
+    return True
+
+
+def test_simple_array():
+    """Test with a simpler array example."""
+
+    converter = Converter()
+
+    print("\nTesting simple array")
+    print("=" * 80)
+
+    # Simple array with spaces in column specifier
+    latex_formula = r"\begin{array}{c c c} a & b & c \\ d & e & f \end{array}"
+
+    print(f"LaTeX: {latex_formula}")
+
+    try:
+        omml = converter.convert_to_omml(latex_formula)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+        print(f"Preview: {omml[:200]}...")
+        return True
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+if __name__ == "__main__":
+    print("Array Column Specifier Fix Test Suite\n")
+
+    try:
+        test1 = test_simple_array()
+        test2 = test_array_specifier_fix()
+
+        if test1 and test2:
+            print("\n" + "=" * 80)
+            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
+            print("=" * 80)
+        else:
+            print("\n" + "=" * 80)
+            print("✗✗✗ SOME TESTS FAILED ✗✗✗")
+            print("=" * 80)
+
+    except KeyboardInterrupt:
+        print("\n\nTests interrupted by user")
+    except Exception as e:
+        print(f"\n\nTest suite error: {e}")
+        import traceback
+        traceback.print_exc()
diff --git a/test_omml_preprocessing.py b/test_omml_preprocessing.py
new file mode 100644
index 0000000..b36616c
--- /dev/null
+++ b/test_omml_preprocessing.py
@@ -0,0 +1,218 @@
+"""Comprehensive test for OMML conversion with preprocessing."""
+
+from app.services.converter import Converter
+
+
+def test_case_1_array_with_spaces():
+    """Test: Array with spaces in column specifier (the original issue)."""
+    print("\n" + "=" * 80)
+    print("Test 1: Array with spaces in column specifier")
+    print("=" * 80)
+
+    converter = Converter()
+
+    # The problematic LaTeX from the error
+    latex = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""
+
+    print(f"LaTeX length: {len(latex)} chars")
+    print(f"Preview: {latex[:100]}...")
+
+    try:
+        omml = converter.convert_to_omml(latex)
+        print("\n✓ SUCCESS: Converted to OMML")
+        print(f"OMML length: {len(omml)} chars")
+
+        if "oMath" in omml:
+            print("✓ Valid OMML structure detected")
+
+        # Check preprocessing worked
+        preprocessed = converter._preprocess_formula_for_omml(latex)
+        if "{c c c c}" not in preprocessed and "{cccc}" in preprocessed:
+            print("✓ Array column specifiers fixed: '{c c c c}' → '{cccc}'")
+
+        return True
+
+    except Exception as e:
+        print(f"\n✗ FAILED: {e}")
+        return False
+
+
+def test_case_2_vmatrix():
+    """Test: vmatrix environment conversion."""
+    print("\n" + "=" * 80)
+    print("Test 2: vmatrix environment")
+    print("=" * 80)
+
+    converter = Converter()
+
+    latex = r"\begin{vmatrix} a & b \\ c & d \end{vmatrix}"
+    print(f"LaTeX: {latex}")
+
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+
+        # Check if vmatrix was converted
+        preprocessed = converter._preprocess_formula_for_omml(latex)
+        if "vmatrix" not in preprocessed and r"\left|" in preprocessed:
+            print("✓ vmatrix converted to \\left| ... \\right|")
+
+        return True
+
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+def test_case_3_cases_environment():
+    """Test: cases environment conversion."""
+    print("\n" + "=" * 80)
+    print("Test 3: cases environment")
+    print("=" * 80)
+
+    converter = Converter()
+
+    latex = r"f(x) = \begin{cases} x^2 & x \geq 0 \\ -x & x < 0 \end{cases}"
+    print(f"LaTeX: {latex}")
+
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+
+        # Check if cases was converted to array
+        preprocessed = converter._preprocess_formula_for_omml(latex)
+        if "cases" not in preprocessed and "array" in preprocessed:
+            print("✓ cases converted to array environment")
+
+        return True
+
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+def test_case_4_aligned_environment():
+    """Test: aligned environment conversion."""
+    print("\n" + "=" * 80)
+    print("Test 4: aligned environment")
+    print("=" * 80)
+
+    converter = Converter()
+
+    latex = r"\begin{aligned} x + y &= 5 \\ 2x - y &= 1 \end{aligned}"
+    print(f"LaTeX: {latex}")
+
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+
+        # Check if aligned was converted
+        preprocessed = converter._preprocess_formula_for_omml(latex)
+        if "aligned" not in preprocessed and "array" in preprocessed:
+            print("✓ aligned converted to array environment")
+        if "&" not in preprocessed or preprocessed.count("&") < latex.count("&"):
+            print("✓ Alignment markers removed")
+
+        return True
+
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+def test_case_5_simple_formula():
+    """Test: Simple formula (should work without preprocessing)."""
+    print("\n" + "=" * 80)
+    print("Test 5: Simple formula")
+    print("=" * 80)
+
+    converter = Converter()
+
+    latex = r"x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}"
+    print(f"LaTeX: {latex}")
+
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+        return True
+
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+def test_case_6_nested_structures():
+    """Test: Nested structures with multiple issues."""
+    print("\n" + "=" * 80)
+    print("Test 6: Nested structures")
+    print("=" * 80)
+
+    converter = Converter()
+
+    latex = r"\left\{ \begin{array}{l c} \begin{vmatrix} a & b \\ c & d \end{vmatrix} & = ad - bc \\ f(x) = \begin{cases} 1 & x > 0 \\ 0 & x \leq 0 \end{cases} & \text{step function} \end{array} \right."
+    print(f"LaTeX: {latex}")
+
+    try:
+        omml = converter.convert_to_omml(latex)
+        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
+
+        preprocessed = converter._preprocess_formula_for_omml(latex)
+        print("\nPreprocessing applied:")
+        if "vmatrix" not in preprocessed:
+            print(" ✓ vmatrix converted")
+        if "cases" not in preprocessed:
+            print(" ✓ cases converted")
+        if "{l c}" not in preprocessed and "{lc}" in preprocessed:
+            print(" ✓ Array specifiers fixed")
+
+        return True
+
+    except Exception as e:
+        print(f"✗ FAILED: {e}")
+        return False
+
+
+if __name__ == "__main__":
+    print("=" * 80)
+    print("OMML CONVERSION TEST SUITE")
+    print("Testing preprocessing and conversion")
+    print("=" * 80)
+
+    results = []
+
+    try:
+        results.append(("Simple formula", test_case_5_simple_formula()))
+        results.append(("Array with spaces", test_case_1_array_with_spaces()))
+        results.append(("vmatrix", test_case_2_vmatrix()))
+        results.append(("cases", test_case_3_cases_environment()))
+        results.append(("aligned", test_case_4_aligned_environment()))
+        results.append(("Nested structures", test_case_6_nested_structures()))
+
+        # Summary
+        print("\n" + "=" * 80)
+        print("TEST SUMMARY")
+        print("=" * 80)
+
+        passed = sum(1 for _, result in results if result)
+        total = len(results)
+
+        for name, result in results:
+            status = "✓ PASS" if result else "✗ FAIL"
+            print(f"{status}: {name}")
+
+        print("\n" + "-" * 80)
+        print(f"Total: {passed}/{total} tests passed")
+
+        if passed == total:
+            print("\n✓✓✓ ALL TESTS PASSED ✓✓✓")
+        else:
+            print(f"\n✗✗✗ {total - passed} TESTS FAILED ✗✗✗")
+
+        print("=" * 80)
+
+    except KeyboardInterrupt:
+        print("\n\nTests interrupted by user")
+    except Exception as e:
+        print(f"\n\nTest suite error: {e}")
+        import traceback
+        traceback.print_exc()