fix: handle mathml preprocess

This commit is contained in:
liuyuanchuang
2026-02-04 15:52:04 +08:00
parent 56a02eb6da
commit 720cd05add
2 changed files with 264 additions and 6 deletions

View File

@@ -200,8 +200,11 @@ class Converter:
# Extract the LaTeX formula content (remove delimiters)
latex_formula = self._extract_latex_formula(md_text)
# Preprocess formula for better conversion (fix array specifiers, etc.)
preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula)
# Convert to MathML
mathml = self._latex_to_mathml(latex_formula)
mathml = self._latex_to_mathml(preprocessed_formula)
# Convert MathML to mml:math format (with namespace prefix)
mml = self._mathml_to_mml(mathml)
@@ -234,15 +237,16 @@ class Converter:
raise ValueError("LaTeX formula cannot be empty")
# Preprocess formula using the same preprocessing as export
preprocessed = self._preprocess_formula_for_omml(latex_formula.strip())
preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip())
return self._latex_to_omml(preprocessed)
def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
"""Preprocess LaTeX formula for OMML conversion.
def _preprocess_formula_for_conversion(self, latex_formula: str) -> str:
"""Preprocess LaTeX formula for any conversion (MathML, OMML, etc.).
Applies the same preprocessing steps as preprocess_for_export to ensure
consistency. This fixes common issues that cause Pandoc OMML conversion to fail.
consistency across all conversion paths. This fixes common issues that
cause Pandoc conversion to fail.
Args:
latex_formula: Pure LaTeX formula.
@@ -254,7 +258,7 @@ class Converter:
# 1. Convert matrix environments
latex_formula = self._convert_matrix_environments(latex_formula)
# 2. Fix array column specifiers (remove spaces)
# 2. Fix array column specifiers (remove spaces) - THIS IS THE KEY FIX
latex_formula = self._fix_array_column_specifiers(latex_formula)
# 3. Fix brace spacing

254
test_array_fix_complete.py Normal file
View File

@@ -0,0 +1,254 @@
"""Comprehensive test for array column specifier fix in all conversion paths."""
from app.services.converter import Converter
def test_problematic_array():
    """Run the determinant formula from the error log through every conversion path.

    The formula nests several ``array`` environments whose column specifier
    is written with spaces (``{c c c c}``). Three stages are exercised in
    order, and the function stops at the first one that fails:

    1. ``Converter._preprocess_formula_for_conversion`` must no longer leave
       ``{c c c c}`` in its output.
    2. ``Converter.convert_to_formats`` must yield a non-empty ``mathml``.
    3. ``Converter.convert_to_omml`` must yield a non-empty result.

    Returns:
        bool: ``True`` when all three stages succeed, ``False`` otherwise.
        Progress is reported on stdout; nothing is asserted, so the boolean
        is the only machine-readable outcome.
    """
    banner = "=" * 80
    rule = "-" * 80

    print(banner)
    print("Testing Problematic Array (from error log)")
    print(banner)

    converter = Converter()

    # The exact LaTeX from the error log
    latex = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""

    # Keep the specifiers in plain variables: embedding escaped quotes inside
    # an f-string replacement field is a SyntaxError before Python 3.12.
    spaced_spec = "{c c c c}"
    compact_spec = "{cccc}"

    print(f"\nLaTeX length: {len(latex)} characters")
    print(f"Contains '{spaced_spec}': {spaced_spec in latex}")

    # Test 1: Preprocessing
    print("\n" + rule)
    print("Test 1: Preprocessing")
    print(rule)

    preprocessed = converter._preprocess_formula_for_conversion(latex)

    leftover_at = preprocessed.find(spaced_spec)
    if leftover_at != -1:
        print("✗ FAILED: Spaces NOT removed from array specifiers")
        print(f" Still found: {preprocessed[leftover_at:leftover_at + 15]}")
        return False
    if compact_spec in preprocessed:
        print("✓ SUCCESS: Spaces removed from array specifiers")
        print(f" '{spaced_spec}' → '{compact_spec}'")
    else:
        # Neither form present: the preprocessor may have rewritten the
        # environment differently, so this is reported but not fatal.
        print("? WARNING: Could not verify specifier fix")

    # Test 2: MathML Conversion
    print("\n" + rule)
    print("Test 2: MathML Conversion (via convert_to_formats)")
    print(rule)

    try:
        result = converter.convert_to_formats(f"$${latex}$$")
        if result.mathml:
            print(f"✓ SUCCESS: MathML generated ({len(result.mathml)} chars)")

            # Check for Word compatibility
            if 'display="block"' in result.mathml:
                print(" ✓ Has display='block' (Word-friendly)")
            # Look for the numeric-entity spellings of '+' and '=' rather than
            # the literal characters: '=' is always present in the markup
            # (attributes), so a literal check could never succeed.
            # NOTE(review): entity spellings assumed to be the ones the
            # converter's Word post-processing targets - confirm.
            if "&#x0002B;" not in result.mathml and "&#x0003D;" not in result.mathml:
                print(" ✓ No problematic Unicode entities")

            print(f"\n MathML preview:\n {result.mathml[:200]}...")
        else:
            print("✗ FAILED: No MathML generated")
            return False
    except Exception as e:
        # Broad on purpose: any failure in the conversion path is a test failure.
        print(f"✗ FAILED: MathML conversion error: {e}")
        return False

    # Test 3: OMML Conversion
    print("\n" + rule)
    print("Test 3: OMML Conversion")
    print(rule)

    try:
        omml = converter.convert_to_omml(latex)
        if omml:
            print(f"✓ SUCCESS: OMML generated ({len(omml)} chars)")
            if "oMath" in omml:
                print(" ✓ Valid OMML structure")
            print(f"\n OMML preview:\n {omml[:200]}...")
        else:
            print("✗ FAILED: No OMML generated")
            return False
    except Exception as e:
        print(f"✗ FAILED: OMML conversion error: {e}")
        return False

    print("\n" + banner)
    print("✓✓✓ ALL CONVERSION PATHS WORKING ✓✓✓")
    print(banner)

    return True
def test_simple_arrays():
    """Check small ``array`` formulas whose column specifiers contain spaces.

    For each sample the formula is preprocessed with
    ``Converter._preprocess_formula_for_conversion`` and then converted with
    ``Converter.convert_to_formats`` (wrapped in ``$...$``). A case passes
    when both ``mathml`` and ``mml`` are non-empty and none of the known
    spaced specifiers (``{c c}``, ``{c c c}``, ``{l r c}``) survives
    preprocessing.

    Returns:
        bool: ``True`` if every case passes, ``False`` if any case fails or
        raises. Results are also printed to stdout.
    """
    print("\n" + "=" * 80)
    print("Testing Simple Arrays")
    print("=" * 80)

    converter = Converter()

    test_cases = [
        ("2x2 array", r"\begin{array}{c c} a & b \\ c & d \end{array}"),
        ("3x3 array", r"\begin{array}{c c c} 1 & 2 & 3 \\ 4 & 5 & 6 \\ 7 & 8 & 9 \end{array}"),
        ("Array with pipes", r"\left| \begin{array}{c c} a & b \\ c & d \end{array} \right|"),
        ("Mixed alignment", r"\begin{array}{l r c} left & right & center \end{array}"),
    ]

    # Spaced specifiers used by the samples above; none may survive preprocessing.
    spaced_specs = ("{c c}", "{c c c}", "{l r c}")

    all_passed = True
    for name, latex in test_cases:
        print(f"\n{name}")
        print("-" * 40)
        print(f"LaTeX: {latex}")

        # Check preprocessing
        preprocessed = converter._preprocess_formula_for_conversion(latex)
        has_spaces = any(spec in preprocessed for spec in spaced_specs)

        try:
            result = converter.convert_to_formats(f"${latex}$")
            if result.mathml and result.mml:
                # Visible marker so the summary line itself shows the outcome.
                status = "✗" if has_spaces else "✓"
                print(f"{status} MathML: {len(result.mathml)} chars, MML: {len(result.mml)} chars")

                if has_spaces:
                    print(" ✗ Array specifiers still have spaces")
                    all_passed = False
                else:
                    print(" ✓ Array specifiers fixed")
            else:
                print("✗ Conversion failed")
                all_passed = False
        except Exception as e:
            # Broad on purpose: any conversion error counts as a failed case,
            # and the remaining cases should still run.
            print(f"✗ Error: {e}")
            all_passed = False

    return all_passed
def test_conversion_consistency():
    """Check that the MathML/MML and OMML paths both handle one mixed formula.

    The formula combines an ``array`` with a spaced specifier (``{l c}``),
    a ``vmatrix`` environment and a ``cases`` environment. The output of
    ``Converter._preprocess_formula_for_conversion`` is inspected with plain
    substring checks, then the formula is sent through
    ``Converter.convert_to_formats`` and ``Converter.convert_to_omml``.

    Returns:
        bool: ``True`` only if every preprocessing check holds and both
        conversion calls return non-empty output without raising.
    """
    banner = "=" * 80
    rule = "-" * 80

    print("\n" + banner)
    print("Testing Conversion Consistency")
    print(banner)

    converter = Converter()

    # Test formula with multiple issues
    latex = r"""
\left\{ \begin{array}{l c}
\begin{vmatrix} a & b \\ c & d \end{vmatrix} & = ad - bc \\
\begin{cases} x & x > 0 \\ 0 & x \leq 0 \end{cases} & \text{sign}
\end{array} \right.
""".strip()

    print("\nComplex formula with:")
    print(" - array with spaces: {l c}")
    print(" - vmatrix environment")
    print(" - cases environment")

    print("\n" + rule)
    print("Preprocessing check:")
    print(rule)

    preprocessed = converter._preprocess_formula_for_conversion(latex)

    checks = {
        "Array spaces removed": '{l c}' not in preprocessed and '{lc}' in preprocessed,
        "vmatrix converted": 'vmatrix' not in preprocessed,
        "cases converted": 'cases' not in preprocessed and 'array' in preprocessed,
    }

    for check, passed in checks.items():
        # Visible marker so each line states its own outcome.
        status = "✓" if passed else "✗"
        print(f"{status} {check}")

    print("\n" + rule)
    print("Conversion paths:")
    print(rule)

    all_passed = True

    # Test MathML
    try:
        result = converter.convert_to_formats(f"$${latex}$$")
        if result.mathml and result.mml:
            print(f"✓ MathML: {len(result.mathml)} chars")
            print(f"✓ MML: {len(result.mml)} chars")
        else:
            # Empty output is a failure, not a zero-length success.
            print("✗ MathML failed: empty MathML/MML output")
            all_passed = False
    except Exception as e:
        print(f"✗ MathML failed: {e}")
        all_passed = False

    # Test OMML
    try:
        omml = converter.convert_to_omml(latex)
        if omml:
            print(f"✓ OMML: {len(omml)} chars")
        else:
            print("✗ OMML failed: empty OMML output")
            all_passed = False
    except Exception as e:
        print(f"✗ OMML failed: {e}")
        all_passed = False

    return all_passed and all(checks.values())
if __name__ == "__main__":
    # Script entry point: run the three checks in order, print a summary, and
    # report the outcome through the process exit status so that shells and
    # CI jobs can detect a failure (0 = all passed, 1 = failure or error,
    # 130 = interrupted by the user).
    import sys
    import traceback

    print("=" * 80)
    print("COMPREHENSIVE ARRAY FIX TEST SUITE")
    print("Testing all conversion paths with preprocessing")
    print("=" * 80)

    exit_code = 1
    try:
        test1 = test_problematic_array()
        test2 = test_simple_arrays()
        test3 = test_conversion_consistency()

        print("\n" + "=" * 80)
        print("FINAL SUMMARY")
        print("=" * 80)

        results = [
            ("Problematic array fix", test1),
            ("Simple arrays", test2),
            ("Conversion consistency", test3),
        ]

        for name, passed in results:
            status = "✓ PASS" if passed else "✗ FAIL"
            print(f"{status}: {name}")

        all_passed = all(passed for _, passed in results)

        print("\n" + "-" * 80)
        if all_passed:
            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
            print("\nThe array column specifier fix is working in ALL conversion paths:")
            print(" • MathML conversion (for Word paste)")
            print(" • MML conversion (namespaced MathML)")
            print(" • OMML conversion (Word native)")
            exit_code = 0
        else:
            print("✗✗✗ SOME TESTS FAILED ✗✗✗")
        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\nTests interrupted")
        exit_code = 130
    except Exception as e:
        # Top-level boundary: show the full traceback, then fail the run.
        print(f"\n\nTest error: {e}")
        traceback.print_exc()

    sys.exit(exit_code)