diff --git a/app/services/converter.py b/app/services/converter.py index 0d69942..041a9b5 100644 --- a/app/services/converter.py +++ b/app/services/converter.py @@ -248,17 +248,19 @@ class Converter: consistency across all conversion paths. This fixes common issues that cause Pandoc conversion to fail. + Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py), + so we don't need to handle them here. + Args: latex_formula: Pure LaTeX formula. Returns: Preprocessed LaTeX formula. """ - # Use the same preprocessing methods as export # 1. Convert matrix environments latex_formula = self._convert_matrix_environments(latex_formula) - # 2. Fix array column specifiers (remove spaces) - THIS IS THE KEY FIX + # 2. Fix array column specifiers (remove spaces) latex_formula = self._fix_array_column_specifiers(latex_formula) # 3. Fix brace spacing diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 35435bf..2a68033 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -85,6 +85,8 @@ def _split_glued_command_token(token: str) -> str: def _postprocess_math(expr: str) -> str: """Postprocess a *math* expression (already inside $...$ or $$...$$).""" + # stage0: fix OCR number errors (digits with spaces) + expr = _fix_ocr_number_errors(expr) # stage1: split glued command tokens (e.g. \cdotdS) expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr) # stage2: normalize differentials (keep conservative) @@ -93,6 +95,42 @@ def _postprocess_math(expr: str) -> str: return expr +def _fix_ocr_number_errors(expr: str) -> str: + """Fix common OCR errors in LaTeX math expressions. + + OCR often splits numbers incorrectly, especially decimals: + - "2 2. 2" should be "22.2" + - "3 0. 4" should be "30.4" + - "1 5 0" should be "150" + + This function merges digit sequences that are separated by spaces. + + Args: + expr: LaTeX math expression. 
+ + Returns: + LaTeX expression with number errors fixed. + """ + # Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)" + # Example: "2 2. 2" → "22.2" + expr = re.sub(r'(\d)\s+(\d+)\.\s*(\d+)', r'\1\2.\3', expr) + + # Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)" + # Example: "22. 2" → "22.2" + expr = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', expr) + + # Fix pattern 3: "digit space digit" (no decimal point, within same number context) + # Be careful: only merge if followed by decimal point or comma/end + # Example: "1 5 0" → "150" when followed by comma or end + expr = re.sub(r'(\d)\s+(\d)(?=\s*[,\)]|$)', r'\1\2', expr) + + # Fix pattern 4: Multiple spaces in decimal numbers + # Example: "2 2 . 2" → "22.2" + expr = re.sub(r'(\d)\s+(\d)(?=\s*\.)', r'\1\2', expr) + + return expr + + def _postprocess_markdown(markdown_content: str) -> str: """Apply LaTeX postprocessing only within $...$ / $$...$$ segments.""" if not markdown_content: diff --git a/test_ocr_number_fix.py b/test_ocr_number_fix.py new file mode 100644 index 0000000..688327d --- /dev/null +++ b/test_ocr_number_fix.py @@ -0,0 +1,294 @@ +"""Test OCR number error fixing.""" + +from app.services.converter import Converter + + +def test_ocr_number_errors(): + """Test fixing of common OCR number errors.""" + + print("=" * 80) + print("Testing OCR Number Error Fixes") + print("=" * 80) + + converter = Converter() + + # Test cases from the error + test_cases = [ + { + "name": "Original error case", + "latex": r"\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}", + "expected_fixes": ["22.2", "30.4", "25.4"], + "should_not_have": ["2 2", "3 0", "2 5"], + }, + { + "name": "Simple decimal with space", + "latex": r"x = 3. 14", + "expected_fixes": ["3.14"], + "should_not_have": ["3. 14"], + }, + { + "name": "Multiple decimals", + "latex": r"a = 1 2. 5, b = 9. 8 7", + "expected_fixes": ["12.5", "9.87"], + "should_not_have": ["1 2", "9. 
def test_mathml_quality():
    """Convert the reported OCR formula to MathML and print quality checks.

    The formula is wrapped in ``$...$`` and passed to
    ``Converter.convert_to_formats``; the ``mathml`` attribute of the result is
    then inspected with plain substring checks.

    Returns:
        True when every check passes, False otherwise.
    """
    print("\n" + "=" * 80)
    print("Testing MathML Quality After OCR Fix")
    print("=" * 80)

    converter = Converter()

    # The problematic LaTeX from the error
    latex = r"\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}"
    print(f"\nOriginal LaTeX: {latex}")

    mathml = converter.convert_to_formats(f"${latex}$").mathml
    print(f"\nMathML length: {len(mathml)} chars")

    print("\nQuality checks:")
    print("-" * 80)

    checks = {
        "No separate digits for decimals": "22.2" in mathml,
        # A bare "." test can never pass together with the "22.2" checks, so
        # look for the decimal point emitted as its own identifier element.
        # NOTE(review): assumes the converter writes it exactly as
        # "<mi>.</mi>" (no attributes or whitespace) - confirm against output.
        "No dot as identifier": "<mi>.</mi>" not in mathml,
        "Properly formatted numbers": "30.4" in mathml,
        "Has namespace": 'xmlns=' in mathml,
        "Display block": 'display="block"' in mathml,
    }

    for label, ok in checks.items():
        print(f"{'✓' if ok else '✗'} {label}")

    # Show a preview
    print("\n" + "-" * 80)
    print("MathML preview:")
    print("-" * 80)
    print(mathml[:400])
    if len(mathml) > 400:
        print("...")

    return all(checks.values())
def compare_before_after():
    """Compare the MathML produced for an OCR-damaged formula and its clean form.

    Both formulas are wrapped in ``$...$`` and passed to
    ``Converter.convert_to_formats``; the ``mathml`` attribute of each result is
    inspected with substring checks and the findings are printed.

    Returns:
        True when the OCR-damaged input yields "22.2" and no stand-alone dot
        identifier, False otherwise.
    """
    print("\n" + "=" * 80)
    print("Before/After Comparison")
    print("=" * 80)

    converter = Converter()

    # Simulate OCR error
    ocr_latex = r"\gamma = 2 2. 2, c = 3 0. 4"
    correct_latex = r"\gamma = 22.2, c = 30.4"

    print(f"\nOCR LaTeX: {ocr_latex}")
    print(f"Correct LaTeX: {correct_latex}")

    ocr_mathml = converter.convert_to_formats(f"${ocr_latex}$").mathml
    correct_mathml = converter.convert_to_formats(f"${correct_latex}$").mathml

    print("\n" + "-" * 80)
    print("MathML comparison:")
    print("-" * 80)

    ocr_has_decimal = "22.2" in ocr_mathml
    correct_has_decimal = "22.2" in correct_mathml

    # A bare "." test is true for any output containing "22.2", which made the
    # success condition below unsatisfiable. Look for the decimal point emitted
    # as its own identifier element instead.
    # NOTE(review): assumes the converter writes it exactly as "<mi>.</mi>" -
    # confirm against real output.
    dot_identifier = "<mi>.</mi>"
    ocr_has_dot_error = dot_identifier in ocr_mathml
    correct_has_dot_error = dot_identifier in correct_mathml

    print(f"OCR output has proper decimals: {'✓' if ocr_has_decimal else '✗'}")
    print(f"Correct output has proper decimals: {'✓' if correct_has_decimal else '✗'}")
    print(f"OCR output has dot errors: {'✗ Yes' if ocr_has_dot_error else '✓ No'}")
    print(f"Correct output has dot errors: {'✗ Yes' if correct_has_dot_error else '✓ No'}")

    fix_works = ocr_has_decimal and not ocr_has_dot_error
    if fix_works:
        print("\n✓ OCR fix is working! Output quality matches correct input.")
    else:
        print("\n✗ OCR fix may need improvement.")
    return fix_works
def test_ocr_postprocessing():
    """Run ``_postprocess_markdown`` over sample OCR output and report results.

    Each scenario gives an input string plus any of: substrings that must
    appear in the output, substrings that must not appear, and a flag asking
    for the output to equal the input exactly.

    Returns:
        True when every expectation of every scenario holds, False otherwise.
    """
    banner = "=" * 80
    print(banner)
    print("Testing OCR Postprocessing Pipeline")
    print(banner)

    # Simulated OCR output containing the typical split-number errors.
    scenarios = [
        {
            "name": "Inline formula with decimal errors",
            "input": r"The value is $\gamma = 2 2. 2$ and $c = 3 0. 4$.",
            "should_have": ["22.2", "30.4"],
            "should_not_have": ["2 2", "3 0"],
        },
        {
            "name": "Display formula with decimal errors",
            "input": r"$$\phi = 2 5. 4 ^ {\circ}$$",
            "should_have": ["25.4"],
            "should_not_have": ["2 5"],
        },
        {
            "name": "Multiple formulas",
            "input": r"$a = 1 2. 5$, $b = 9. 8 7$, and $c = 1 5 0$",
            "should_have": ["12.5", "9.87", "150"],
            "should_not_have": ["1 2", "9. 8", "1 5"],
        },
        {
            "name": "Mixed content (text + formulas)",
            "input": r"The equation $x = 3. 14$ is approximately pi. Then $y = 2 7. 3$.",
            "should_have": ["3.14", "27.3"],
            "should_not_have": ["3. 14", "2 7"],
        },
        {
            "name": "Normal arithmetic (should not be affected)",
            "input": r"$2 + 3 = 5$ and $10 - 7 = 3$",
            "should_stay": True,
        },
    ]

    failures = 0

    for number, case in enumerate(scenarios, 1):
        raw = case["input"]
        print(f"\nTest {number}: {case['name']}")
        print("-" * 80)
        print(f"Input: {raw}")

        processed = _postprocess_markdown(raw)
        print(f"Output: {processed}")

        # Substrings that have to be present after postprocessing.
        for wanted in case.get("should_have", ()):
            if wanted in processed:
                print(f" ✓ Contains '{wanted}'")
            else:
                print(f" ✗ Missing '{wanted}'")
                failures += 1

        # Substrings that have to be gone after postprocessing.
        for banned in case.get("should_not_have", ()):
            if banned in processed:
                print(f" ✗ Still has '{banned}'")
                failures += 1
            else:
                print(f" ✓ Removed '{banned}'")

        # Inputs that must pass through untouched.
        if case.get("should_stay"):
            if raw != processed:
                print(" ✗ Should not change but did")
                failures += 1
            else:
                print(" ✓ Correctly unchanged")

    return failures == 0
def test_edge_cases():
    """Check that ``_postprocess_markdown`` leaves valid formulas intact.

    Whitespace changes are tolerated, but the text with spaces removed and the
    sequence of number tokens must both be identical before and after
    postprocessing. The number comparison is needed because wrongly merging
    "2 3" into "23" only removes a space and would otherwise go unnoticed.

    Returns:
        True when every formula is preserved, False otherwise.
    """
    import re

    # Integer or decimal literal; used to compare the numbers on both sides.
    number_token = re.compile(r"\d+(?:\.\d+)?")

    print("\n" + "=" * 80)
    print("Testing Edge Cases")
    print("=" * 80)

    test_cases = [
        {
            "name": "Arithmetic operations",
            "input": r"$2 + 3 = 5$ and $10 - 7 = 3$",
            "should_stay": True,
        },
        {
            "name": "Multiplication",
            "input": r"$2 \times 3 = 6$",
            "should_stay": True,
        },
        {
            "name": "Exponents",
            "input": r"$x ^ 2 + y ^ 2 = r ^ 2$",
            "should_stay": True,
        },
        {
            "name": "Fractions",
            "input": r"$\frac{1}{2} + \frac{3}{4}$",
            "should_stay": True,
        },
        {
            "name": "Subscripts",
            "input": r"$x _ 1 + x _ 2$",
            "should_stay": True,
        },
    ]

    all_passed = True

    for test in test_cases:
        source = test['input']
        print(f"\n{test['name']}")
        print(f" Input: {source}")

        output = _postprocess_markdown(source)
        print(f" Output: {output}")

        if test.get('should_stay'):
            same_text = output.replace(" ", "") == source.replace(" ", "")
            same_numbers = number_token.findall(output) == number_token.findall(source)
            if same_text and same_numbers:
                print(" ✓ Structure preserved")
            else:
                print(" ✗ Structure changed unexpectedly")
                all_passed = False

    return all_passed
def test_performance():
    """Time ``_postprocess_markdown`` on a markdown string with ~200 formulas.

    Returns:
        True when processing takes less than one second, False otherwise.
    """
    import time

    print("\n" + "=" * 80)
    print("Testing Performance")
    print("=" * 80)

    # 100 lines with two formulas each, all containing split decimals.
    large_content = "".join(
        f"Formula {i}: $x = {i} {i}. {i}$ and $y = {i*2} {i*2}. {i*2}$\n"
        for i in range(100)
    )

    print(f"\nContent size: {len(large_content)} characters")
    print("Number of formulas: ~200")

    # perf_counter is monotonic and high resolution; time.time() can jump when
    # the system clock is adjusted, which would distort the measurement.
    start = time.perf_counter()
    _postprocess_markdown(large_content)
    elapsed = time.perf_counter() - start

    print(f"Processing time: {elapsed*1000:.2f}ms")

    if elapsed < 1.0:
        print("✓ Performance is acceptable (< 1s)")
        return True

    print("✗ Performance may need optimization")
    return False


if __name__ == "__main__":
    # Script entry point: run the four suites in order, then print a summary.
    print("OCR Pipeline Integration Test Suite\n")

    try:
        results = [
            ("OCR postprocessing", test_ocr_postprocessing()),
            ("Real-world case", test_real_world_case()),
            ("Edge cases", test_edge_cases()),
            ("Performance", test_performance()),
        ]

        print("\n" + "=" * 80)
        print("SUMMARY")
        print("=" * 80)

        for name, passed in results:
            status = "✓ PASS" if passed else "✗ FAIL"
            print(f"{status}: {name}")

        print("\n" + "-" * 80)

        if all(passed for _, passed in results):
            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
            print("\nOCR number error fixing is integrated into the pipeline!")
            print("\nFlow:")
            print(" 1. OCR recognizes image → produces Markdown with LaTeX")
            print(" 2. _postprocess_markdown() fixes number errors")
            print(" 3. Clean LaTeX is used for all conversions")
            print("\nBenefits:")
            print(" • Fixed once at the source")
            print(" • All output formats benefit (MathML, MML, OMML)")
            print(" • Better performance (no repeated fixes)")
        else:
            print("✗✗✗ SOME TESTS FAILED ✗✗✗")

        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\nTests interrupted")
    except Exception as e:
        # Top-level boundary of the script: report the error with a traceback.
        print(f"\n\nTest error: {e}")
        import traceback
        traceback.print_exc()