Files
doc_processer/test_latex_space_cleaning.py

155 lines
5.2 KiB
Python
Raw Normal View History

2026-02-05 13:32:13 +08:00
"""Test LaTeX syntax space cleaning functionality.
Tests the _clean_latex_syntax_spaces() function which removes
unwanted spaces in LaTeX syntax that are common OCR errors.
"""
import re
def _clean_latex_syntax_spaces(expr: str) -> str:
    r"""Remove stray spaces from LaTeX syntax (common OCR errors).

    The rules run in this order:

    1. Rejoin a backslash with its command name: ``\ alpha`` -> ``\alpha``.
       This runs first so the later rules can recognise commands such as
       ``\ frac`` that OCR split apart.
    2. Drop whitespace around ``_`` and ``^``.
    3. Drop whitespace inside ``_{...}`` / ``^{...}``.  A single space is
       kept where it terminates a command name in front of a letter
       (``\alpha i``), because removing it would create a different,
       undefined command (``\alphai``).  Whitespace directly after a
       backslash (``\ ``) is left alone.
    4. Trim whitespace inside and between the two ``\frac`` arguments.
    5. Drop whitespace between a command and its opening brace, and
       directly after that brace.

    Known limitations (unchanged from the previous implementation):
    the ``\frac`` rule only matches arguments that contain no ``}``, so
    nested braces such as ``\frac { a_{i} } { b }`` are not tidied by it,
    and rule 5 does not remove whitespace in front of the closing brace
    (``\sqrt { x }`` becomes ``\sqrt{x }``).  Spaces outside these
    constructs (e.g. around ``+`` or ``=``) are never touched.

    Args:
        expr: LaTeX source, typically produced by OCR.

    Returns:
        The expression with the whitespace described above removed.
    """
    # Rule 1: "\ alpha" -> "\alpha".  Must precede the other rules so that
    # "\ frac { a } { b }" is seen as "\frac" by rule 4.
    expr = re.sub(r'\\\s+([a-zA-Z]+)', r'\\\1', expr)

    # Rule 2: spaces around _ and ^
    expr = re.sub(r'\s*_\s*', '_', expr)
    expr = re.sub(r'\s*\^\s*', '^', expr)

    # Rule 3: spaces inside braces that follow _ or ^
    def clean_script_braces(match):
        operator = match.group(1)
        content = match.group(2)
        # First alternative: a command followed by whitespace and a letter;
        # keep exactly one space so the command name stays terminated.
        # Second alternative: any other whitespace run not preceded by a
        # backslash; remove it.
        cleaned = re.sub(
            r'(\\[a-zA-Z]+)\s+(?=[a-zA-Z])|(?<!\\)\s+',
            lambda m: f"{m.group(1)} " if m.group(1) else "",
            content,
        )
        return f"{operator}{{{cleaned}}}"

    expr = re.sub(r'([_^])\{([^}]+)\}', clean_script_braces, expr)

    # Rule 4: spaces inside \frac arguments
    def clean_frac_braces(match):
        numerator = match.group(1).strip()
        denominator = match.group(2).strip()
        return f"\\frac{{{numerator}}}{{{denominator}}}"

    expr = re.sub(
        r'\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}',
        clean_frac_braces,
        expr,
    )

    # Rule 5: spaces after LaTeX commands before (and just inside) the brace
    expr = re.sub(r'(\\[a-zA-Z]+)\s*\{\s*', r'\1{', expr)
    return expr
# Test cases: each entry is (input expression, expected output, description).
# The runner below feeds the input to _clean_latex_syntax_spaces() and
# compares the result with the expected output.
test_cases = [
    # Subscripts: spaces around "_" and inside the following braces
    (r"a _ {i 1}", r"a_{i1}", "subscript with spaces"),
    (r"x _ { n }", r"x_{n}", "subscript with spaces around"),
    (r"a_{i 1}", r"a_{i1}", "subscript braces with spaces"),
    (r"y _ { i j k }", r"y_{ijk}", "subscript multiple spaces"),
    # Superscripts: spaces around "^" and inside the following braces
    (r"x ^ {2 3}", r"x^{23}", "superscript with spaces"),
    (r"a ^ { n }", r"a^{n}", "superscript with spaces around"),
    (r"e^{ 2 x }", r"e^{2x}", "superscript expression with spaces"),
    # Fractions: spaces around and inside the two \frac arguments
    (r"\frac { a } { b }", r"\frac{a}{b}", "fraction with spaces"),
    (r"\frac{ x + y }{ z }", r"\frac{x+y}{z}", "fraction expression with spaces"),
    (r"\frac { 1 } { 2 }", r"\frac{1}{2}", "fraction numbers with spaces"),
    # LaTeX commands: space after the backslash or before the opening brace
    (r"\ alpha", r"\alpha", "command with space after backslash"),
    (r"\ beta + \ gamma", r"\beta+\gamma", "multiple commands with spaces"),
    (r"\sqrt { x }", r"\sqrt{x}", "sqrt with space before brace"),
    (r"\sin { x }", r"\sin{x}", "sin with space"),
    # Combined cases: several constructs in one expression
    (r"a _ {i 1} + b ^ {2 3}", r"a_{i1}+b^{23}", "subscript and superscript"),
    (r"\frac { a _ {i} } { b ^ {2} }", r"\frac{a_{i}}{b^{2}}", "fraction with sub/superscripts"),
    (r"x _ { \alpha }", r"x_{\alpha}", "subscript with LaTeX command"),
    (r"y ^ { \beta + 1 }", r"y^{\beta+1}", "superscript with expression"),
    # Edge cases - should preserve necessary spaces
    # NOTE(review): the expected values below have every space removed, which
    # contradicts this heading; _clean_latex_syntax_spaces() has no rule that
    # strips spaces around "+", "=" or between plain tokens, and "\intxdx"
    # would read as a single command name. Confirm the intended expectations.
    (r"a + b", r"a+b", "arithmetic operators (space removed)"),
    (r"\int x dx", r"\intxdx", "integral (spaces removed - might be too aggressive)"),
    (r"f(x) = x^2", r"f(x)=x^2", "function definition (spaces removed)"),
    # LaTeX commands should be preserved
    (r"\lambda_{1}", r"\lambda_{1}", "lambda with subscript (already clean)"),
    (r"\vdots", r"\vdots", "vdots (should not be affected)"),
    (r"\alpha \beta \gamma", r"\alpha\beta\gamma", "Greek letters (spaces removed between commands)"),
]
# Banner for the table-driven run.
print("=" * 80, "LaTeX Syntax Space Cleaning Test", "=" * 80, sep="\n")

# Counters read by the summary section further down. A "close" result
# (equal once spaces are ignored) is counted as failed AND as a warning.
passed = failed = warnings = 0

for source, wanted, label in test_cases:
    actual = _clean_latex_syntax_spaces(source)
    exact = actual == wanted

    if exact:
        passed += 1
        status = "✅ PASS"
    elif actual.replace(" ", "") == wanted.replace(" ", ""):
        # Differs only in spacing: still a failure, but flagged separately.
        failed += 1
        warnings += 1
        status = "⚠️ CLOSE"
    else:
        failed += 1
        status = "❌ FAIL"

    print(f"{status} {label:40s}")
    print(f" Input: {source}")
    print(f" Expected: {wanted}")
    print(f" Got: {actual}")
    if not exact:
        print(" >>> Mismatch!")
    print()
# Re-run the single expression from the original user report on its own.
print("=" * 80, "USER'S SPECIFIC EXAMPLE", "=" * 80, sep="\n")

user_example = r"a _ {i 1}"
expected_output = r"a_{i1}"
result = _clean_latex_syntax_spaces(user_example)

if result == expected_output:
    verdict = "✅ CORRECT"
else:
    verdict = "❌ INCORRECT"

for caption, value in (
    ("Input", user_example),
    ("Expected", expected_output),
    ("Got", result),
    ("Status", verdict),
):
    print(f"{caption}: {value}")
# Totals gathered by the test loop above; "Failed" includes the "Close" ones.
print()
print("=" * 80, "SUMMARY", "=" * 80, sep="\n")
print(f"Total tests: {len(test_cases)}")
print(f"✅ Passed: {passed}")
print(f"❌ Failed: {failed}")
print(f"⚠️ Close: {warnings}")

if failed:
    print(f"\n⚠️ {failed} test(s) failed")
else:
    print("\n✅ All tests passed!")
# Closing notes describing what the cleaner does, printed verbatim.
print()
print("=" * 80, "IMPORTANT NOTES", "=" * 80, sep="\n")

note_lines = (
    r"1. Subscript/superscript spaces: a _ {i 1} -> a_{i1}",
    r"2. Fraction spaces: \frac { a } { b } -> \frac{a}{b}",
    r"3. Command spaces: \ alpha -> \alpha",
    "4. This might remove some intentional spaces in expressions",
    r"5. LaTeX commands inside braces are preserved (e.g., _{\alpha})",
    "If any edge cases are broken, the patterns can be adjusted to be more conservative.",
)
# A leading and a trailing newline surround the text, as in a triple-quoted
# block that starts and ends on its own line.
print("\n" + "\n".join(note_lines) + "\n")
print("=" * 80)