155 lines
5.2 KiB
Python
155 lines
5.2 KiB
Python
"""Test LaTeX syntax space cleaning functionality.

Tests the _clean_latex_syntax_spaces() function, which removes
unwanted spaces in LaTeX syntax that are common OCR errors.
"""
|
|||
|
|
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Whitespace inside a sub/superscript group.  The first alternative keeps
# exactly one space where a control word (e.g. ``\alpha``) is directly
# followed by a letter, because that space is what terminates the command
# name.  The second alternative drops every other run of whitespace, except
# whitespace sitting immediately after a backslash.
_SCRIPT_GAP = re.compile(r'(\\[a-zA-Z]+)\s+(?=[a-zA-Z])|(?<!\\)\s+')

# Opening of a braced sub/superscript group: ``_{`` or ``^{``.
_SCRIPT_OPEN = re.compile(r'[_^]\{')

# A control word followed (after optional whitespace) by an opening brace.
_COMMAND_OPEN = re.compile(r'(\\[a-zA-Z]+)\s*\{')

# Optional whitespace followed by the opening brace of a further argument.
_NEXT_ARGUMENT = re.compile(r'\s*\{')

# Commands whose argument is typeset in text mode, where a trailing space is
# visible in the output.  For these only the leading padding is removed.
_TEXT_MODE_COMMANDS = frozenset({
    '\\text', '\\mbox', '\\textrm', '\\textbf', '\\textit', '\\textnormal',
})


def _find_closing_brace(text: str, open_index: int) -> int:
    """Return the index of the ``}`` matching ``text[open_index]``, or -1."""
    depth = 0
    index = open_index
    length = len(text)
    while index < length:
        char = text[index]
        if char == '\\':
            # Skip the escaped character so that ``\{`` and ``\}`` are not
            # counted as grouping braces.
            index += 2
            continue
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return index
        index += 1
    return -1


def _script_gap_replacement(match) -> str:
    """Replacement callback for ``_SCRIPT_GAP``."""
    command = match.group(1)
    if command:
        # Control word followed by a letter: keep one separating space.
        return command + ' '
    return ''


def _squeeze_script_groups(expr: str) -> str:
    """Remove whitespace inside every balanced ``_{...}`` / ``^{...}`` group."""
    pieces = []
    pos = 0
    while True:
        found = _SCRIPT_OPEN.search(expr, pos)
        if found is None:
            pieces.append(expr[pos:])
            return ''.join(pieces)
        # Copy everything up to and including the opening brace.
        pieces.append(expr[pos:found.end()])
        close = _find_closing_brace(expr, found.end() - 1)
        if close < 0:
            # Unbalanced group: leave its content untouched.
            pos = found.end()
            continue
        inner = expr[found.end():close]
        pieces.append(_SCRIPT_GAP.sub(_script_gap_replacement, inner))
        pieces.append('}')
        pos = close + 1


def _strip_argument_padding(inner: str, command: str) -> str:
    """Strip the padding around one command argument."""
    if command in _TEXT_MODE_COMMANDS:
        return inner.lstrip()
    trimmed = inner.strip()
    trailing_backslashes = len(trimmed) - len(trimmed.rstrip('\\'))
    if trailing_backslashes % 2:
        # The argument ended with a backslash followed by whitespace; put one
        # space back so the backslash does not end up escaping the closing
        # brace.
        trimmed += ' '
    return trimmed


def _tighten_command_arguments(expr: str) -> str:
    """Remove padding around the braced arguments of LaTeX commands.

    ``\\cmd { x }`` becomes ``\\cmd{x}``; for ``\\frac`` the second argument
    is handled as well.  Arguments are located with brace matching, so
    nested groups such as ``\\frac { a_{i} } { b^{2} }`` are supported.
    """
    pieces = []
    pos = 0
    while True:
        found = _COMMAND_OPEN.search(expr, pos)
        if found is None:
            pieces.append(expr[pos:])
            return ''.join(pieces)
        command = found.group(1)
        pieces.append(expr[pos:found.start()])
        pieces.append(command)
        brace = found.end() - 1
        remaining = 2 if command == '\\frac' else 1
        while remaining:
            close = _find_closing_brace(expr, brace)
            if close < 0:
                # Unbalanced braces: only drop the padding after ``{``.
                pieces.append('{')
                pos = brace + 1
                while pos < len(expr) and expr[pos].isspace():
                    pos += 1
                break
            # Clean nested commands first, then trim this argument.
            inner = _tighten_command_arguments(expr[brace + 1:close])
            pieces.append('{' + _strip_argument_padding(inner, command) + '}')
            pos = close + 1
            remaining -= 1
            if remaining:
                following = _NEXT_ARGUMENT.match(expr, pos)
                if following is None:
                    break
                brace = following.end() - 1


def _clean_latex_syntax_spaces(expr: str) -> str:
    r"""Clean unwanted spaces in LaTeX syntax (common OCR errors).

    The following clean-ups are applied, in this order:

    1. A backslash separated from its command name is re-joined
       (``\ alpha`` -> ``\alpha``).  This runs first so the later steps see
       complete commands.
    2. Whitespace around ``_`` and ``^`` is removed.
    3. Whitespace inside braced sub/superscripts is removed, except for the
       single space that separates a command from a following letter
       (``x_{\alpha i}`` is kept; it is not fused into ``x_{\alphai}``).
    4. Padding between a command and its braced argument(s), and just inside
       those braces, is removed (``\sqrt { x }`` -> ``\sqrt{x}``,
       ``\frac { a } { b }`` -> ``\frac{a}{b}``).

    Spaces elsewhere (for example around ``+`` or ``=``, or between a command
    and a following word as in ``\int x dx``) are left untouched.

    Args:
        expr: LaTeX expression, possibly containing stray spaces.

    Returns:
        The expression with the syntax-level spaces removed.
    """
    # Step 1: spaces after a backslash.
    expr = re.sub(r'\\\s+([a-zA-Z]+)', r'\\\1', expr)

    # Step 2: spaces around _ and ^.
    expr = re.sub(r'\s*_\s*', '_', expr)
    expr = re.sub(r'\s*\^\s*', '^', expr)

    # Step 3: spaces inside braces that follow _ or ^.
    expr = _squeeze_script_groups(expr)

    # Step 4: spaces between commands and their arguments / inside them.
    return _tighten_command_arguments(expr)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Test cases: (OCR input, expected cleaned output, description).
#
# Only syntax-level spaces are expected to disappear: around _ and ^, inside
# sub/superscript braces, after a backslash, and around command arguments.
# Spaces around operators and the space that terminates a command name are
# expected to survive, because removing the latter produces invalid LaTeX
# (e.g. "\int x dx" must never become "\intxdx").
test_cases = [
    # Subscripts with spaces
    (r"a _ {i 1}", r"a_{i1}", "subscript with spaces"),
    (r"x _ { n }", r"x_{n}", "subscript with spaces around"),
    (r"a_{i 1}", r"a_{i1}", "subscript braces with spaces"),
    (r"y _ { i j k }", r"y_{ijk}", "subscript multiple spaces"),

    # Superscripts with spaces
    (r"x ^ {2 3}", r"x^{23}", "superscript with spaces"),
    (r"a ^ { n }", r"a^{n}", "superscript with spaces around"),
    (r"e^{ 2 x }", r"e^{2x}", "superscript expression with spaces"),

    # Fractions with spaces
    (r"\frac { a } { b }", r"\frac{a}{b}", "fraction with spaces"),
    (r"\frac{ x + y }{ z }", r"\frac{x + y}{z}", "fraction expression with spaces"),
    (r"\frac { 1 } { 2 }", r"\frac{1}{2}", "fraction numbers with spaces"),

    # LaTeX commands with spaces
    (r"\ alpha", r"\alpha", "command with space after backslash"),
    (r"\ beta + \ gamma", r"\beta + \gamma", "multiple commands with spaces"),
    (r"\sqrt { x }", r"\sqrt{x}", "sqrt with space before brace"),
    (r"\sin { x }", r"\sin{x}", "sin with space"),

    # Combined cases
    (r"a _ {i 1} + b ^ {2 3}", r"a_{i1} + b^{23}", "subscript and superscript"),
    (r"\frac { a _ {i} } { b ^ {2} }", r"\frac{a_{i}}{b^{2}}", "fraction with sub/superscripts"),
    (r"x _ { \alpha }", r"x_{\alpha}", "subscript with LaTeX command"),
    (r"y ^ { \beta + 1 }", r"y^{\beta+1}", "superscript with expression"),
    (r"x _ { \alpha i }", r"x_{\alpha i}", "subscript command followed by letter"),

    # Edge cases - should preserve necessary spaces
    (r"a + b", r"a + b", "arithmetic operators (spaces preserved)"),
    (r"\int x dx", r"\int x dx", "integral (command separator preserved)"),
    (r"f(x) = x^2", r"f(x) = x^2", "function definition (spaces preserved)"),

    # LaTeX commands should be preserved
    (r"\lambda_{1}", r"\lambda_{1}", "lambda with subscript (already clean)"),
    (r"\vdots", r"\vdots", "vdots (should not be affected)"),
    (r"\alpha \beta \gamma", r"\alpha \beta \gamma", "Greek letters (spaces between commands preserved)"),
]


def main() -> int:
    """Run every test case, print a report, and return a process exit code.

    Returns:
        0 when every case produced exactly the expected output, 1 otherwise.
    """
    print("=" * 80)
    print("LaTeX Syntax Space Cleaning Test")
    print("=" * 80)

    passed = 0
    failed = 0
    close = 0  # failures that differ from the expectation only in spaces

    for original, expected, description in test_cases:
        result = _clean_latex_syntax_spaces(original)

        if result == expected:
            status = "✅ PASS"
            passed += 1
        else:
            status = "❌ FAIL"
            failed += 1
            # Check if it's close but not exact
            if result.replace(" ", "") == expected.replace(" ", ""):
                status = "⚠️ CLOSE"
                close += 1

        print(f"{status} {description:40s}")
        print(f" Input: {original}")
        print(f" Expected: {expected}")
        print(f" Got: {result}")
        if result != expected:
            print(" >>> Mismatch!")
        print()

    print("=" * 80)
    print("USER'S SPECIFIC EXAMPLE")
    print("=" * 80)

    user_example = r"a _ {i 1}"
    expected_output = r"a_{i1}"
    result = _clean_latex_syntax_spaces(user_example)

    print(f"Input: {user_example}")
    print(f"Expected: {expected_output}")
    print(f"Got: {result}")
    print(f"Status: {'✅ CORRECT' if result == expected_output else '❌ INCORRECT'}")

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total tests: {len(test_cases)}")
    print(f"✅ Passed: {passed}")
    print(f"❌ Failed: {failed}")
    print(f"⚠️ Close: {close}")

    if failed == 0:
        print("\n✅ All tests passed!")
    else:
        print(f"\n⚠️ {failed} test(s) failed")

    print("\n" + "=" * 80)
    print("IMPORTANT NOTES")
    print("=" * 80)
    print("""
1. ✅ Subscript/superscript spaces: a _ {i 1} -> a_{i1}
2. ✅ Fraction spaces: \\frac { a } { b } -> \\frac{a}{b}
3. ✅ Command spaces: \\ alpha -> \\alpha
4. ⚠️ This might remove some intentional spaces in expressions
5. ⚠️ LaTeX commands inside braces are preserved (e.g., _{\\alpha})

If any edge cases are broken, the patterns can be adjusted to be more conservative.
""")

    print("=" * 80)

    # Surface failures to the caller (shell, CI) instead of always exiting 0.
    return 1 if failed else 0


if __name__ == "__main__":
    raise SystemExit(main())
|