fix: add preprocess

This commit is contained in:
liuyuanchuang
2026-02-04 12:45:34 +08:00
parent 69f9a70ae5
commit e31017cfe7
3 changed files with 354 additions and 1 deletions

View File

@@ -217,6 +217,9 @@ class Converter:
This is a separate method due to the performance overhead of OMML conversion,
which requires creating a temporary DOCX file.
The formula is preprocessed using the same logic as export_to_file to ensure
proper conversion.
Args:
latex_formula: Pure LaTeX formula (without delimiters like $ or $$).
@@ -230,7 +233,37 @@ class Converter:
if not latex_formula or not latex_formula.strip():
raise ValueError("LaTeX formula cannot be empty")
return self._latex_to_omml(latex_formula.strip())
# Preprocess formula using the same preprocessing as export
preprocessed = self._preprocess_formula_for_omml(latex_formula.strip())
return self._latex_to_omml(preprocessed)
def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
    """Run the OMML preprocessing passes over a LaTeX formula.

    The formula is threaded through four helper passes; each pass receives
    the output of the previous one. The passes are intended to match the
    preprocessing used on the export path so that OMML conversion sees the
    same cleaned-up LaTeX (NOTE(review): the export path is not visible
    from this block -- confirm the two stay in sync).

    Args:
        latex_formula: Pure LaTeX formula.

    Returns:
        The formula after every pass has been applied.
    """
    # Applied strictly in this order, each pass feeding the next.
    passes = (
        self._convert_matrix_environments,   # 1. matrix environments
        self._fix_array_column_specifiers,   # 2. array column specifiers (remove spaces)
        self._fix_brace_spacing,             # 3. brace spacing
        self._convert_special_environments,  # 4. special environments (cases, aligned)
    )
    formula = latex_formula
    for apply_pass in passes:
        formula = apply_pass(formula)
    return formula
def _extract_latex_formula(self, text: str) -> str:
"""Extract LaTeX formula from text by removing delimiters.

102
test_array_fix.py Normal file
View File

@@ -0,0 +1,102 @@
"""Test script for array column specifier fix."""
from app.services.converter import Converter
def test_array_specifier_fix():
    """Check preprocessing and OMML conversion of arrays with spaced column specifiers.

    Runs two steps against a formula whose inner ``array`` environments use
    the column specifier ``{c c c c}``:

    1. Preprocess the formula and report whether ``{c c c c}`` was rewritten
       to ``{cccc}``.
    2. Convert the original formula to OMML and report on the output.

    Returns:
        True when the conversion completes and step 1 did not find a leftover
        ``{c c c c}``; False when the conversion raises or the spaced
        specifier survived preprocessing.
    """
    converter = Converter()
    # The problematic LaTeX from the error
    latex_formula = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""
    spaced_spec = "{c c c c}"
    print("Testing array column specifier fix")
    print("=" * 80)
    print(f"\nOriginal LaTeX (first 200 chars):\n{latex_formula[:200]}...")

    # Test preprocessing
    print("\n" + "-" * 80)
    print("Step 1: Preprocessing")
    preprocessed = converter._preprocess_formula_for_omml(latex_formula)

    # Check if spaces were removed from array specifiers. Only an explicit
    # leftover of the spaced form counts as a failure; the "could not find"
    # case stays informational.
    specifiers_ok = True
    if spaced_spec in preprocessed:
        specifiers_ok = False
        print("✗ FAILED: Spaces not removed from array specifiers")
        start = preprocessed.find(spaced_spec)
        print(f"Found: {preprocessed[start:start + 10]}")
    elif "{cccc}" in preprocessed:
        print("✓ SUCCESS: Spaces removed from array specifiers")
        # Plain literal: the braces are text, so no f-string is needed (the
        # previous f-string had a backslash inside a replacement field,
        # which is a syntax error).
        print("Changed '{c c c c}' -> '{cccc}'")
    else:
        print("? Could not find array specifier in preprocessed output")

    # Test OMML conversion
    print("\n" + "-" * 80)
    print("Step 2: OMML Conversion")
    try:
        omml = converter.convert_to_omml(latex_formula)
        print("✓ SUCCESS: OMML conversion completed")
        print(f"OMML length: {len(omml)} characters")
        print(f"OMML preview (first 300 chars):\n{omml[:300]}...")
        # Check if it contains oMath element
        if "oMath" in omml:
            print("\n✓ Valid OMML: Contains oMath element")
        else:
            print("\n✗ WARNING: OMML might be incomplete (no oMath element found)")
    except Exception as e:
        print("✗ FAILED: OMML conversion error")
        print(f"Error: {e}")
        return False

    print("\n" + "=" * 80)
    if not specifiers_ok:
        # Step 1 already reported the failure; do not claim success.
        return False
    print("✓ All tests passed!")
    return True
def test_simple_array():
    """Convert a small array with a spaced column specifier to OMML.

    Returns:
        True if ``convert_to_omml`` returned without raising, False if an
        exception was caught (its message is printed).
    """
    conv = Converter()
    print("\nTesting simple array")
    print("=" * 80)
    # Simple array with spaces in column specifier
    formula = r"\begin{array}{c c c} a & b & c \\ d & e & f \end{array}"
    print(f"LaTeX: {formula}")
    succeeded = False
    try:
        result = conv.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(result)} chars)")
        print(f"Preview: {result[:200]}...")
        succeeded = True
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
    return succeeded
if __name__ == "__main__":
    # Script entry point: run both checks and print an overall verdict.
    print("Array Column Specifier Fix Test Suite\n")
    try:
        # Both checks always run so that each prints its own report.
        simple_ok = test_simple_array()
        nested_ok = test_array_specifier_fix()
        separator = "=" * 80
        if simple_ok and nested_ok:
            verdict = "✓✓✓ ALL TESTS PASSED ✓✓✓"
        else:
            verdict = "✗✗✗ SOME TESTS FAILED ✗✗✗"
        print("\n" + separator)
        print(verdict)
        print(separator)
    except KeyboardInterrupt:
        print("\n\nTests interrupted by user")
    except Exception as exc:
        print(f"\n\nTest suite error: {exc}")
        import traceback
        traceback.print_exc()

218
test_omml_preprocessing.py Normal file
View File

@@ -0,0 +1,218 @@
"""Comprehensive test for OMML conversion with preprocessing."""
from app.services.converter import Converter
def test_case_1_array_with_spaces():
    """Test 1: arrays whose column specifiers contain spaces (the original issue).

    Converts the formula to OMML, then runs the preprocessing step on its own
    to report whether ``{c c c c}`` was rewritten to ``{cccc}``.

    Returns:
        True if conversion and the follow-up checks ran without raising,
        False if an exception was caught. The printed checks are
        informational and do not change the result.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("Test 1: Array with spaces in column specifier")
    print(rule)
    conv = Converter()
    # The problematic LaTeX from the error
    formula = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""
    print(f"LaTeX length: {len(formula)} chars")
    print(f"Preview: {formula[:100]}...")
    passed = False
    try:
        result = conv.convert_to_omml(formula)
        print("\n✓ SUCCESS: Converted to OMML")
        print(f"OMML length: {len(result)} chars")
        if "oMath" in result:
            print("✓ Valid OMML structure detected")
        # Re-run preprocessing alone to inspect the intermediate LaTeX.
        cleaned = conv._preprocess_formula_for_omml(formula)
        spaced_gone = "{c c c c}" not in cleaned
        if spaced_gone and "{cccc}" in cleaned:
            print("✓ Array column specifiers fixed: '{c c c c}''{cccc}'")
        passed = True
    except Exception as exc:
        print(f"\n✗ FAILED: {exc}")
    return passed
def test_case_2_vmatrix():
    """Test 2: OMML conversion of a ``vmatrix`` environment.

    After converting, the preprocessing step is run on its own to report
    whether ``vmatrix`` was replaced by a ``\\left|`` delimiter form.

    Returns:
        True if nothing raised, False if an exception was caught. The
        printed check is informational only.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("Test 2: vmatrix environment")
    print(rule)
    conv = Converter()
    formula = r"\begin{vmatrix} a & b \\ c & d \end{vmatrix}"
    print(f"LaTeX: {formula}")
    passed = False
    try:
        result = conv.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(result)} chars)")
        # Inspect the preprocessed LaTeX for the vmatrix rewrite.
        cleaned = conv._preprocess_formula_for_omml(formula)
        env_gone = "vmatrix" not in cleaned
        if env_gone and r"\left|" in cleaned:
            print("✓ vmatrix converted to \\left| ... \\right|")
        passed = True
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
    return passed
def test_case_3_cases_environment():
    """Test 3: OMML conversion of a ``cases`` environment.

    After converting, the preprocessing step is run on its own to report
    whether ``cases`` was replaced by an ``array`` environment.

    Returns:
        True if nothing raised, False if an exception was caught. The
        printed check is informational only.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("Test 3: cases environment")
    print(rule)
    conv = Converter()
    formula = r"f(x) = \begin{cases} x^2 & x \geq 0 \\ -x & x < 0 \end{cases}"
    print(f"LaTeX: {formula}")
    passed = False
    try:
        result = conv.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(result)} chars)")
        # Inspect the preprocessed LaTeX for the cases -> array rewrite.
        cleaned = conv._preprocess_formula_for_omml(formula)
        env_gone = "cases" not in cleaned
        if env_gone and "array" in cleaned:
            print("✓ cases converted to array environment")
        passed = True
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
    return passed
def test_case_4_aligned_environment():
    """Test 4: OMML conversion of an ``aligned`` environment.

    After converting, the preprocessing step is run on its own to report
    whether ``aligned`` was replaced by ``array`` and whether the number of
    ``&`` markers went down.

    Returns:
        True if nothing raised, False if an exception was caught. The
        printed checks are informational only.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("Test 4: aligned environment")
    print(rule)
    conv = Converter()
    formula = r"\begin{aligned} x + y &= 5 \\ 2x - y &= 1 \end{aligned}"
    print(f"LaTeX: {formula}")
    passed = False
    try:
        result = conv.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(result)} chars)")
        # Inspect the preprocessed LaTeX for the aligned -> array rewrite.
        cleaned = conv._preprocess_formula_for_omml(formula)
        env_gone = "aligned" not in cleaned
        if env_gone and "array" in cleaned:
            print("✓ aligned converted to array environment")
        # Markers count as removed when none remain or fewer remain than
        # the input had.
        markers_kept = "&" in cleaned and cleaned.count("&") >= formula.count("&")
        if not markers_kept:
            print("✓ Alignment markers removed")
        passed = True
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
    return passed
def test_case_5_simple_formula():
    """Test 5: OMML conversion of a simple formula (the quadratic formula).

    Returns:
        True if ``convert_to_omml`` returned without raising, False if an
        exception was caught (its message is printed).
    """
    rule = "=" * 80
    print("\n" + rule)
    print("Test 5: Simple formula")
    print(rule)
    conv = Converter()
    formula = r"x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}"
    print(f"LaTeX: {formula}")
    passed = False
    try:
        result = conv.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(result)} chars)")
        passed = True
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
    return passed
def test_case_6_nested_structures():
    """Test 6: OMML conversion of nested ``array``/``vmatrix``/``cases`` input.

    After converting, the preprocessing step is run on its own and a line is
    printed for each rewrite that can be observed in its output.

    Returns:
        True if nothing raised, False if an exception was caught. The
        printed checks are informational only.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("Test 6: Nested structures")
    print(rule)
    conv = Converter()
    formula = r"\left\{ \begin{array}{l c} \begin{vmatrix} a & b \\ c & d \end{vmatrix} & = ad - bc \\ f(x) = \begin{cases} 1 & x > 0 \\ 0 & x \leq 0 \end{cases} & \text{step function} \end{array} \right."
    print(f"LaTeX: {formula}")
    passed = False
    try:
        result = conv.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(result)} chars)")
        cleaned = conv._preprocess_formula_for_omml(formula)
        print("\nPreprocessing applied:")
        # (observed?, message) pairs, reported in this order.
        observations = (
            ("vmatrix" not in cleaned, " ✓ vmatrix converted"),
            ("cases" not in cleaned, " ✓ cases converted"),
            ("{l c}" not in cleaned and "{lc}" in cleaned, " ✓ Array specifiers fixed"),
        )
        for observed, message in observations:
            if observed:
                print(message)
        passed = True
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
    return passed
if __name__ == "__main__":
    # Script entry point: run every case, then print a per-case summary.
    banner = "=" * 80
    print(banner)
    print("OMML CONVERSION TEST SUITE")
    print("Testing preprocessing and conversion")
    print(banner)
    results = []
    try:
        # Cases run in this order; the simple formula goes first.
        suite = (
            ("Simple formula", test_case_5_simple_formula),
            ("Array with spaces", test_case_1_array_with_spaces),
            ("vmatrix", test_case_2_vmatrix),
            ("cases", test_case_3_cases_environment),
            ("aligned", test_case_4_aligned_environment),
            ("Nested structures", test_case_6_nested_structures),
        )
        for label, case in suite:
            results.append((label, case()))

        # Summary
        print("\n" + banner)
        print("TEST SUMMARY")
        print(banner)
        total = len(results)
        passed = len([outcome for _, outcome in results if outcome])
        for label, outcome in results:
            status = "✓ PASS" if outcome else "✗ FAIL"
            print(f"{status}: {label}")
        print("\n" + "-" * 80)
        print(f"Total: {passed}/{total} tests passed")
        if passed != total:
            print(f"\n✗✗✗ {total - passed} TESTS FAILED ✗✗✗")
        else:
            print("\n✓✓✓ ALL TESTS PASSED ✓✓✓")
        print(banner)
    except KeyboardInterrupt:
        print("\n\nTests interrupted by user")
    except Exception as exc:
        print(f"\n\nTest suite error: {exc}")
        import traceback
        traceback.print_exc()