fix: add post markdown
This commit is contained in:
@@ -248,17 +248,19 @@ class Converter:
|
|||||||
consistency across all conversion paths. This fixes common issues that
|
consistency across all conversion paths. This fixes common issues that
|
||||||
cause Pandoc conversion to fail.
|
cause Pandoc conversion to fail.
|
||||||
|
|
||||||
|
Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py),
|
||||||
|
so we don't need to handle them here.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
latex_formula: Pure LaTeX formula.
|
latex_formula: Pure LaTeX formula.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Preprocessed LaTeX formula.
|
Preprocessed LaTeX formula.
|
||||||
"""
|
"""
|
||||||
# Use the same preprocessing methods as export
|
|
||||||
# 1. Convert matrix environments
|
# 1. Convert matrix environments
|
||||||
latex_formula = self._convert_matrix_environments(latex_formula)
|
latex_formula = self._convert_matrix_environments(latex_formula)
|
||||||
|
|
||||||
# 2. Fix array column specifiers (remove spaces) - THIS IS THE KEY FIX
|
# 2. Fix array column specifiers (remove spaces)
|
||||||
latex_formula = self._fix_array_column_specifiers(latex_formula)
|
latex_formula = self._fix_array_column_specifiers(latex_formula)
|
||||||
|
|
||||||
# 3. Fix brace spacing
|
# 3. Fix brace spacing
|
||||||
|
|||||||
@@ -85,6 +85,8 @@ def _split_glued_command_token(token: str) -> str:
|
|||||||
|
|
||||||
def _postprocess_math(expr: str) -> str:
|
def _postprocess_math(expr: str) -> str:
|
||||||
"""Postprocess a *math* expression (already inside $...$ or $$...$$)."""
|
"""Postprocess a *math* expression (already inside $...$ or $$...$$)."""
|
||||||
|
# stage0: fix OCR number errors (digits with spaces)
|
||||||
|
expr = _fix_ocr_number_errors(expr)
|
||||||
# stage1: split glued command tokens (e.g. \cdotdS)
|
# stage1: split glued command tokens (e.g. \cdotdS)
|
||||||
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
|
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
|
||||||
# stage2: normalize differentials (keep conservative)
|
# stage2: normalize differentials (keep conservative)
|
||||||
@@ -93,6 +95,42 @@ def _postprocess_math(expr: str) -> str:
|
|||||||
return expr
|
return expr
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_ocr_number_errors(expr: str) -> str:
|
||||||
|
"""Fix common OCR errors in LaTeX math expressions.
|
||||||
|
|
||||||
|
OCR often splits numbers incorrectly, especially decimals:
|
||||||
|
- "2 2. 2" should be "22.2"
|
||||||
|
- "3 0. 4" should be "30.4"
|
||||||
|
- "1 5 0" should be "150"
|
||||||
|
|
||||||
|
This function merges digit sequences that are separated by spaces.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
expr: LaTeX math expression.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
LaTeX expression with number errors fixed.
|
||||||
|
"""
|
||||||
|
# Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)"
|
||||||
|
# Example: "2 2. 2" → "22.2"
|
||||||
|
expr = re.sub(r'(\d)\s+(\d+)\.\s*(\d+)', r'\1\2.\3', expr)
|
||||||
|
|
||||||
|
# Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)"
|
||||||
|
# Example: "22. 2" → "22.2"
|
||||||
|
expr = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', expr)
|
||||||
|
|
||||||
|
# Fix pattern 3: "digit space digit" (no decimal point, within same number context)
|
||||||
|
# Be careful: only merge if followed by decimal point or comma/end
|
||||||
|
# Example: "1 5 0" → "150" when followed by comma or end
|
||||||
|
expr = re.sub(r'(\d)\s+(\d)(?=\s*[,\)]|$)', r'\1\2', expr)
|
||||||
|
|
||||||
|
# Fix pattern 4: Multiple spaces in decimal numbers
|
||||||
|
# Example: "2 2 . 2" → "22.2"
|
||||||
|
expr = re.sub(r'(\d)\s+(\d)(?=\s*\.)', r'\1\2', expr)
|
||||||
|
|
||||||
|
return expr
|
||||||
|
|
||||||
|
|
||||||
def _postprocess_markdown(markdown_content: str) -> str:
|
def _postprocess_markdown(markdown_content: str) -> str:
|
||||||
"""Apply LaTeX postprocessing only within $...$ / $$...$$ segments."""
|
"""Apply LaTeX postprocessing only within $...$ / $$...$$ segments."""
|
||||||
if not markdown_content:
|
if not markdown_content:
|
||||||
|
|||||||
294
test_ocr_number_fix.py
Normal file
294
test_ocr_number_fix.py
Normal file
@@ -0,0 +1,294 @@
|
|||||||
|
"""Test OCR number error fixing."""
|
||||||
|
|
||||||
|
from app.services.converter import Converter
|
||||||
|
|
||||||
|
|
||||||
|
def test_ocr_number_errors():
    """Run the OCR number-fix over known bad inputs and report each check.

    Every case lists substrings that must appear in the fixed output and
    substrings that must be gone. Results are printed per case.

    Returns:
        True when every check of every case succeeded, False otherwise.
    """
    banner = "=" * 80
    print(banner)
    print("Testing OCR Number Error Fixes")
    print(banner)

    converter = Converter()

    # (name, latex, expected_fixes, should_not_have)
    cases = [
        (
            "Original error case",
            r"\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}",
            ["22.2", "30.4", "25.4"],
            ["2 2", "3 0", "2 5"],
        ),
        (
            "Simple decimal with space",
            r"x = 3. 14",
            ["3.14"],
            ["3. 14"],
        ),
        (
            "Multiple decimals",
            r"a = 1 2. 5, b = 9. 8 7",
            ["12.5", "9.87"],
            ["1 2", "9. 8"],
        ),
        (
            "Large numbers with spaces",
            r"n = 1 5 0, m = 2 0 0 0",
            ["150", "2000"],
            ["1 5", "2 0 0"],
        ),
        (
            "Don't merge across operators",
            r"2 + 3 = 5",
            ["2 + 3 = 5"],  # Should stay the same
            ["23=5"],
        ),
    ]

    all_passed = True

    for index, (name, latex, wanted_parts, banned_parts) in enumerate(cases, 1):
        print(f"\nTest {index}: {name}")
        print("-" * 80)
        print(f"Input: {latex}")

        # NOTE(review): assumes Converter exposes _fix_ocr_number_errors - confirm.
        fixed = converter._fix_ocr_number_errors(latex)
        print(f"Fixed: {fixed}")

        # Collect the verdict lines first; they are printed together below.
        report = []

        for wanted in wanted_parts:
            found = wanted in fixed
            report.append(f"✓ Contains '{wanted}'" if found else f"✗ Missing '{wanted}'")
            all_passed = all_passed and found

        for banned in banned_parts:
            gone = banned not in fixed
            report.append(f"✓ Removed '{banned}'" if gone else f"✗ Still has '{banned}'")
            all_passed = all_passed and gone

        for line in report:
            print(f" {line}")

    return all_passed
|
||||||
|
|
||||||
|
|
||||||
|
def test_mathml_quality():
    """Convert the problematic formula to MathML and inspect the markup.

    Prints one line per quality indicator and a preview of at most the
    first 400 characters of the MathML.

    Returns:
        True when every quality indicator holds, False otherwise.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("Testing MathML Quality After OCR Fix")
    print(heavy_rule)

    converter = Converter()

    # The problematic LaTeX from the error
    latex = r"\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}"
    print(f"\nOriginal LaTeX: {latex}")

    # Convert to MathML (the formula is wrapped as inline math)
    mathml = converter.convert_to_formats(f"${latex}$").mathml
    print(f"\nMathML length: {len(mathml)} chars")

    print("\nQuality checks:")
    print(light_rule)

    checks = {
        "No separate digits for decimals": "<mn>22.2</mn>" in mathml or "22.2" in mathml,
        "No dot as identifier": "<mi>.</mi>" not in mathml,
        "Properly formatted numbers": "<mn>30.4</mn>" in mathml or "30.4" in mathml,
        "Has namespace": 'xmlns=' in mathml,
        "Display block": 'display="block"' in mathml,
    }

    for label, ok in checks.items():
        marker = "✓" if ok else "✗"
        print(f"{marker} {label}")
    all_passed = all(checks.values())

    # Show a preview, with an ellipsis when the markup was truncated
    preview_limit = 400
    print("\n" + light_rule)
    print("MathML preview:")
    print(light_rule)
    print(mathml[:preview_limit])
    if len(mathml) > preview_limit:
        print("...")

    return all_passed
|
||||||
|
|
||||||
|
|
||||||
|
def test_edge_cases():
    """Test edge cases for OCR number fixing.

    Each case uses one of three expectations:
    - ``should_stay``: the output must equal this string exactly.
    - ``should_become``: this substring must appear in the output.
    - ``should_have``: every listed substring must appear in the output.

    Returns:
        True when every expectation of every case held, False otherwise.
    """
    print("\n" + "=" * 80)
    print("Testing Edge Cases")
    print("=" * 80)

    converter = Converter()

    test_cases = [
        {
            "name": "Should NOT merge: arithmetic",
            "input": r"2 + 3 = 5",
            "should_stay": "2 + 3 = 5",
        },
        {
            "name": "Should NOT merge: multiplication",
            "input": r"2 \times 3",
            "should_stay": r"2 \times 3",
        },
        {
            "name": "Should merge: decimal at end",
            "input": r"x = 1 2. 5",
            "should_become": "12.5",
        },
        {
            "name": "Should merge: multiple spaces",
            "input": r"n = 1 2 . 3 4",
            # Must be a list: a bare string would be checked character by
            # character and pass even when the number was not merged.
            "should_have": ["12.34"],
        },
        {
            "name": "Complex: mixed scenarios",
            "input": r"a = 1 2. 3 + 4 5. 6 - 7",
            "should_have": ["12.3", "45.6", "- 7"],
        },
    ]

    all_passed = True

    for test in test_cases:
        print(f"\n{test['name']}")
        print(f" Input: {test['input']}")

        # NOTE(review): assumes Converter exposes _fix_ocr_number_errors - confirm.
        fixed = converter._fix_ocr_number_errors(test['input'])
        print(f" Output: {fixed}")

        if 'should_stay' in test:
            if fixed == test['should_stay']:
                print(f" ✓ Correctly unchanged")
            else:
                print(f" ✗ Should stay '{test['should_stay']}' but got '{fixed}'")
                all_passed = False

        if 'should_become' in test:
            if test['should_become'] in fixed:
                print(f" ✓ Contains '{test['should_become']}'")
            else:
                print(f" ✗ Should contain '{test['should_become']}'")
                all_passed = False

        if 'should_have' in test:
            expected_items = test['should_have']
            # Guard against a bare string so it is never iterated per character.
            if isinstance(expected_items, str):
                expected_items = [expected_items]

            for expected in expected_items:
                if expected in fixed:
                    print(f" ✓ Contains '{expected}'")
                else:
                    print(f" ✗ Should contain '{expected}'")
                    all_passed = False

    return all_passed
|
||||||
|
|
||||||
|
|
||||||
|
def compare_before_after():
    """Compare the MathML of an OCR-damaged formula with its clean form.

    Both formulas are converted; the report shows whether each output
    contains the decimal "22.2" and whether it contains a stray
    ``<mi>.</mi>`` element.

    Returns:
        True when the OCR-damaged input yields the decimal and no stray dot
        element, False otherwise.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("Before/After Comparison")
    print(heavy_rule)

    converter = Converter()

    # Simulate OCR error next to the formula it should have been
    ocr_latex = r"\gamma = 2 2. 2, c = 3 0. 4"
    correct_latex = r"\gamma = 22.2, c = 30.4"

    print(f"\nOCR LaTeX: {ocr_latex}")
    print(f"Correct LaTeX: {correct_latex}")

    # Convert both
    ocr_result = converter.convert_to_formats(f"${ocr_latex}$")
    correct_result = converter.convert_to_formats(f"${correct_latex}$")

    print("\n" + light_rule)
    print("MathML comparison:")
    print(light_rule)

    # Check if they produce similar quality output
    decimal_text = "22.2"
    stray_dot = "<mi>.</mi>"

    ocr_has_decimal = decimal_text in ocr_result.mathml
    correct_has_decimal = decimal_text in correct_result.mathml
    ocr_has_dot_error = stray_dot in ocr_result.mathml
    correct_has_dot_error = stray_dot in correct_result.mathml

    print(f"OCR output has proper decimals: {'✓' if ocr_has_decimal else '✗'}")
    print(f"Correct output has proper decimals: {'✓' if correct_has_decimal else '✗'}")
    print(f"OCR output has dot errors: {'✗ Yes' if ocr_has_dot_error else '✓ No'}")
    print(f"Correct output has dot errors: {'✗ Yes' if correct_has_dot_error else '✓ No'}")

    if ocr_has_dot_error or not ocr_has_decimal:
        print("\n✗ OCR fix may need improvement.")
        return False

    print("\n✓ OCR fix is working! Output quality matches correct input.")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the four checks in order, then print a summary.
    print("OCR Number Error Fix Test Suite\n")

    try:
        # The list is built before anything is printed, so all four checks
        # run (and print their own reports) ahead of the summary.
        results = [
            ("OCR error fixes", test_ocr_number_errors()),
            ("MathML quality", test_mathml_quality()),
            ("Edge cases", test_edge_cases()),
            ("Before/after comparison", compare_before_after()),
        ]

        heavy_rule = "=" * 80
        print("\n" + heavy_rule)
        print("SUMMARY")
        print(heavy_rule)

        for label, ok in results:
            verdict = "✓ PASS" if ok else "✗ FAIL"
            print(f"{verdict}: {label}")

        print("\n" + "-" * 80)

        if all(ok for _, ok in results):
            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
            print("\nOCR number errors are being fixed automatically!")
            print("Examples:")
            print(" • '2 2. 2' → '22.2'")
            print(" • '3 0. 4' → '30.4'")
            print(" • '1 5 0' → '150'")
        else:
            print("✗✗✗ SOME TESTS FAILED ✗✗✗")

        print(heavy_rule)

    except KeyboardInterrupt:
        print("\n\nTests interrupted")
    except Exception as e:
        # Top-level boundary: report the error with its traceback.
        import traceback

        print(f"\n\nTest error: {e}")
        traceback.print_exc()
|
||||||
265
test_ocr_pipeline.py
Normal file
265
test_ocr_pipeline.py
Normal file
@@ -0,0 +1,265 @@
|
|||||||
|
"""Test OCR number error fixing in the complete pipeline."""
|
||||||
|
|
||||||
|
from app.services.ocr_service import _postprocess_markdown
|
||||||
|
|
||||||
|
|
||||||
|
def test_ocr_postprocessing():
    """Feed simulated OCR markdown through ``_postprocess_markdown``.

    A case may list substrings that must appear (``should_have``),
    substrings that must be gone (``should_not_have``), or set
    ``should_stay`` to require that the text comes back unchanged.

    Returns:
        True when every expectation of every case held, False otherwise.
    """
    banner = "=" * 80
    print(banner)
    print("Testing OCR Postprocessing Pipeline")
    print(banner)

    # Simulate OCR output with common errors
    test_cases = [
        {
            "name": "Inline formula with decimal errors",
            "input": r"The value is $\gamma = 2 2. 2$ and $c = 3 0. 4$.",
            "should_have": ["22.2", "30.4"],
            "should_not_have": ["2 2", "3 0"],
        },
        {
            "name": "Display formula with decimal errors",
            "input": r"$$\phi = 2 5. 4 ^ {\circ}$$",
            "should_have": ["25.4"],
            "should_not_have": ["2 5"],
        },
        {
            "name": "Multiple formulas",
            "input": r"$a = 1 2. 5$, $b = 9. 8 7$, and $c = 1 5 0$",
            "should_have": ["12.5", "9.87", "150"],
            "should_not_have": ["1 2", "9. 8", "1 5"],
        },
        {
            "name": "Mixed content (text + formulas)",
            "input": r"The equation $x = 3. 14$ is approximately pi. Then $y = 2 7. 3$.",
            "should_have": ["3.14", "27.3"],
            "should_not_have": ["3. 14", "2 7"],
        },
        {
            "name": "Normal arithmetic (should not be affected)",
            "input": r"$2 + 3 = 5$ and $10 - 7 = 3$",
            "should_stay": True,
        },
    ]

    all_passed = True

    for number, case in enumerate(test_cases, 1):
        source = case['input']

        print(f"\nTest {number}: {case['name']}")
        print("-" * 80)
        print(f"Input: {source}")

        output = _postprocess_markdown(source)
        print(f"Output: {output}")

        # A missing key simply contributes no checks.
        for wanted in case.get('should_have', ()):
            if wanted in output:
                print(f" ✓ Contains '{wanted}'")
                continue
            print(f" ✗ Missing '{wanted}'")
            all_passed = False

        for banned in case.get('should_not_have', ()):
            if banned not in output:
                print(f" ✓ Removed '{banned}'")
                continue
            print(f" ✗ Still has '{banned}'")
            all_passed = False

        if case.get('should_stay'):
            if source != output:
                print(" ✗ Should not change but did")
                all_passed = False
            else:
                print(" ✓ Correctly unchanged")

    return all_passed
|
||||||
|
|
||||||
|
|
||||||
|
def test_real_world_case():
    """Check the exact display formula taken from the error report.

    The formula is postprocessed; the three merged decimals must appear
    and the three split digit pairs must be gone.

    Returns:
        True when all six checks hold, False otherwise.
    """
    heavy_rule = "=" * 80
    print("\n" + heavy_rule)
    print("Testing Real-World Error Case")
    print(heavy_rule)

    # The exact input from the error report
    ocr_output = r"$$\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}$$"

    print("\nOCR Output (with errors):")
    print(f" {ocr_output}")

    fixed = _postprocess_markdown(ocr_output)

    print("\nAfter Postprocessing:")
    print(f" {fixed}")

    # Presence checks first, then absence checks; insertion order drives
    # the order of the printed report.
    checks = {f"Has {number}": number in fixed for number in ("22.2", "30.4", "25.4")}
    checks.update(
        {f"No '{fragment}'": fragment not in fixed for fragment in ("2 2", "3 0", "2 5")}
    )

    print("\nQuality Checks:")
    print("-" * 80)

    for label, ok in checks.items():
        marker = "✓" if ok else "✗"
        print(f"{marker} {label}")
    all_passed = all(checks.values())

    if all_passed:
        print("\n✓ Real-world case fixed successfully!")
    else:
        print("\n✗ Real-world case still has issues")

    return all_passed
|
||||||
|
|
||||||
|
|
||||||
|
def test_edge_cases():
    """Make sure valid formulas survive postprocessing structurally intact.

    Each input is postprocessed and compared with the original after all
    space characters are stripped from both, so spacing changes are
    tolerated and any other change is reported.

    Returns:
        True when every formula kept its structure, False otherwise.
    """
    heavy_rule = "=" * 80
    print("\n" + heavy_rule)
    print("Testing Edge Cases")
    print(heavy_rule)

    test_cases = [
        {
            "name": "Arithmetic operations",
            "input": r"$2 + 3 = 5$ and $10 - 7 = 3$",
            "should_stay": True,
        },
        {
            "name": "Multiplication",
            "input": r"$2 \times 3 = 6$",
            "should_stay": True,
        },
        {
            "name": "Exponents",
            "input": r"$x ^ 2 + y ^ 2 = r ^ 2$",
            "should_stay": True,
        },
        {
            "name": "Fractions",
            "input": r"$\frac{1}{2} + \frac{3}{4}$",
            "should_stay": True,
        },
        {
            "name": "Subscripts",
            "input": r"$x _ 1 + x _ 2$",
            "should_stay": True,
        },
    ]

    all_passed = True

    for case in test_cases:
        source = case['input']
        print(f"\n{case['name']}")
        print(f" Input: {source}")

        output = _postprocess_markdown(source)
        print(f" Output: {output}")

        if not case.get('should_stay'):
            continue

        # For these cases, we allow some whitespace changes but structure should stay
        same_structure = output.replace(" ", "") == source.replace(" ", "")
        if same_structure:
            print(" ✓ Structure preserved")
        else:
            print(" ✗ Structure changed unexpectedly")
            all_passed = False

    return all_passed
|
||||||
|
|
||||||
|
|
||||||
|
def test_performance():
    """Test performance with large content.

    Builds 100 lines of markdown with two inline formulas each, runs
    ``_postprocess_markdown`` over the whole text once, and reports the
    elapsed time.

    Returns:
        True when processing took less than one second, False otherwise.
    """
    import time

    print("\n" + "=" * 80)
    print("Testing Performance")
    print("=" * 80)

    # Create a large markdown with many formulas; join avoids repeated
    # string reallocation while building the fixture.
    large_content = "".join(
        f"Formula {i}: $x = {i} {i}. {i}$ and $y = {i*2} {i*2}. {i*2}$\n"
        for i in range(100)
    )

    print(f"\nContent size: {len(large_content)} characters")
    print(f"Number of formulas: ~200")

    # perf_counter is monotonic and high-resolution, so the measurement is
    # not disturbed by system clock adjustments.
    start = time.perf_counter()
    _postprocess_markdown(large_content)
    elapsed = time.perf_counter() - start

    print(f"Processing time: {elapsed*1000:.2f}ms")

    if elapsed < 1.0:
        print("✓ Performance is acceptable (< 1s)")
        return True
    else:
        print("✗ Performance may need optimization")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the four pipeline checks, then print a summary.
    print("OCR Pipeline Integration Test Suite\n")

    try:
        # The list is built before anything is printed, so all four checks
        # run (and print their own reports) ahead of the summary.
        results = [
            ("OCR postprocessing", test_ocr_postprocessing()),
            ("Real-world case", test_real_world_case()),
            ("Edge cases", test_edge_cases()),
            ("Performance", test_performance()),
        ]

        heavy_rule = "=" * 80
        print("\n" + heavy_rule)
        print("SUMMARY")
        print(heavy_rule)

        for label, ok in results:
            verdict = "✓ PASS" if ok else "✗ FAIL"
            print(f"{verdict}: {label}")

        print("\n" + "-" * 80)

        if all(ok for _, ok in results):
            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
            print("\nOCR number error fixing is integrated into the pipeline!")
            print("\nFlow:")
            print(" 1. OCR recognizes image → produces Markdown with LaTeX")
            print(" 2. _postprocess_markdown() fixes number errors")
            print(" 3. Clean LaTeX is used for all conversions")
            print("\nBenefits:")
            print(" • Fixed once at the source")
            print(" • All output formats benefit (MathML, MML, OMML)")
            print(" • Better performance (no repeated fixes)")
        else:
            print("✗✗✗ SOME TESTS FAILED ✗✗✗")

        print(heavy_rule)

    except KeyboardInterrupt:
        print("\n\nTests interrupted")
    except Exception as e:
        # Top-level boundary: report the error with its traceback.
        import traceback

        print(f"\n\nTest error: {e}")
        traceback.print_exc()
|
||||||
Reference in New Issue
Block a user