fix: add post markdown

This commit is contained in:
liuyuanchuang
2026-02-04 16:04:18 +08:00
parent 720cd05add
commit 61fd5441b7
4 changed files with 601 additions and 2 deletions

View File

@@ -248,17 +248,19 @@ class Converter:
consistency across all conversion paths. This fixes common issues that consistency across all conversion paths. This fixes common issues that
cause Pandoc conversion to fail. cause Pandoc conversion to fail.
Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py),
so we don't need to handle them here.
Args: Args:
latex_formula: Pure LaTeX formula. latex_formula: Pure LaTeX formula.
Returns: Returns:
Preprocessed LaTeX formula. Preprocessed LaTeX formula.
""" """
# Use the same preprocessing methods as export
# 1. Convert matrix environments # 1. Convert matrix environments
latex_formula = self._convert_matrix_environments(latex_formula) latex_formula = self._convert_matrix_environments(latex_formula)
# 2. Fix array column specifiers (remove spaces) - THIS IS THE KEY FIX # 2. Fix array column specifiers (remove spaces)
latex_formula = self._fix_array_column_specifiers(latex_formula) latex_formula = self._fix_array_column_specifiers(latex_formula)
# 3. Fix brace spacing # 3. Fix brace spacing

View File

@@ -85,6 +85,8 @@ def _split_glued_command_token(token: str) -> str:
def _postprocess_math(expr: str) -> str: def _postprocess_math(expr: str) -> str:
"""Postprocess a *math* expression (already inside $...$ or $$...$$).""" """Postprocess a *math* expression (already inside $...$ or $$...$$)."""
# stage0: fix OCR number errors (digits with spaces)
expr = _fix_ocr_number_errors(expr)
# stage1: split glued command tokens (e.g. \cdotdS) # stage1: split glued command tokens (e.g. \cdotdS)
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr) expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
# stage2: normalize differentials (keep conservative) # stage2: normalize differentials (keep conservative)
@@ -93,6 +95,42 @@ def _postprocess_math(expr: str) -> str:
return expr return expr
def _fix_ocr_number_errors(expr: str) -> str:
    """Rejoin numbers that OCR split into whitespace-separated digit groups.

    OCR often emits every glyph as its own token, so ``22.2`` arrives as
    ``2 2. 2`` and ``150`` as ``1 5 0``.  The repair is conservative:
    whitespace is removed only where the context shows that the digits
    belong to one number.  Rules, applied in order:

    1. Whitespace around a decimal point that sits between two digits is
       dropped: ``3. 14`` / ``3 . 14`` -> ``3.14``.
    2. A whole run of digit groups directly in front of a decimal point is
       joined: ``2 2.2`` -> ``22.2``.  A run that itself starts right after
       a decimal point is left alone, so ``1.2 3.4`` is not fused into the
       invalid ``1.23.4``.
    3. A whole run of digit groups followed by ``,``, ``)`` or the end of
       the expression is joined: ``1 5 0,`` -> ``150,``.

    Digits separated by operators or commands (``2 + 3``, ``2 \\times 3``)
    are never merged.

    Args:
        expr: LaTeX math expression.

    Returns:
        The expression with split numbers rejoined.
    """
    if not expr:
        return expr

    def _join(match):
        # Drop every whitespace character inside the matched digit run.
        return "".join(match.group(0).split())

    # Rule 1: "digit <ws> . <ws> digit" -> "digit.digit".
    expr = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', expr)

    # Rule 2: integer part in front of a decimal point.  Matching the whole
    # run at once (instead of one digit pair per pass) is what lets three or
    # more groups collapse.  The lookbehind keeps the run from starting in
    # the fractional part of a preceding number.
    expr = re.sub(r'(?<![\d.])\d+(?:\s+\d+)+(?=\s*\.)', _join, expr)

    # Rule 3: run terminated by a comma, a closing parenthesis or the end.
    expr = re.sub(r'\d+(?:\s+\d+)+(?=\s*[,\)]|$)', _join, expr)

    return expr
def _postprocess_markdown(markdown_content: str) -> str: def _postprocess_markdown(markdown_content: str) -> str:
"""Apply LaTeX postprocessing only within $...$ / $$...$$ segments.""" """Apply LaTeX postprocessing only within $...$ / $$...$$ segments."""
if not markdown_content: if not markdown_content:

294
test_ocr_number_fix.py Normal file
View File

@@ -0,0 +1,294 @@
"""Test OCR number error fixing."""
from app.services.converter import Converter
def test_ocr_number_errors():
    """Run the OCR number fixer over known-bad samples and print a report.

    Every sample lists substrings that must be present in the fixed output
    (``expected_fixes``) and substrings that must be gone
    (``should_not_have``).

    Returns:
        True only when every check on every sample holds.
    """
    banner = "=" * 80
    print(banner)
    print("Testing OCR Number Error Fixes")
    print(banner)

    fixer = Converter()

    # Samples taken from the reported error plus a few neighbouring shapes.
    samples = [
        {
            "name": "Original error case",
            "latex": r"\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}",
            "expected_fixes": ["22.2", "30.4", "25.4"],
            "should_not_have": ["2 2", "3 0", "2 5"],
        },
        {
            "name": "Simple decimal with space",
            "latex": r"x = 3. 14",
            "expected_fixes": ["3.14"],
            "should_not_have": ["3. 14"],
        },
        {
            "name": "Multiple decimals",
            "latex": r"a = 1 2. 5, b = 9. 8 7",
            "expected_fixes": ["12.5", "9.87"],
            "should_not_have": ["1 2", "9. 8"],
        },
        {
            "name": "Large numbers with spaces",
            "latex": r"n = 1 5 0, m = 2 0 0 0",
            "expected_fixes": ["150", "2000"],
            "should_not_have": ["1 5", "2 0 0"],
        },
        {
            "name": "Don't merge across operators",
            "latex": r"2 + 3 = 5",
            "expected_fixes": ["2 + 3 = 5"],  # input must come back untouched
            "should_not_have": ["23=5"],
        },
    ]

    success = True
    for number, sample in enumerate(samples, start=1):
        source = sample["latex"]
        print(f"\nTest {number}: {sample['name']}")
        print("-" * 80)
        print(f"Input: {source}")

        result = fixer._fix_ocr_number_errors(source)
        print(f"Fixed: {result}")

        # Collect one report line per check, then print them together.
        report = []
        for wanted in sample["expected_fixes"]:
            present = wanted in result
            report.append(
                f"✓ Contains '{wanted}'" if present else f"✗ Missing '{wanted}'"
            )
            success = success and present
        for banned in sample["should_not_have"]:
            gone = banned not in result
            report.append(
                f"✓ Removed '{banned}'" if gone else f"✗ Still has '{banned}'"
            )
            success = success and gone

        for line in report:
            print(f" {line}")

    return success
def test_mathml_quality():
    """Convert the reported OCR sample to MathML and check the markup.

    The sample formula is wrapped in ``$...$`` and passed to
    ``Converter.convert_to_formats``; the ``mathml`` attribute of the result
    is inspected with substring checks.  Each check is printed with a
    pass/fail marker, followed by the first 400 characters of the MathML.

    Returns:
        True when every quality check holds, False otherwise.
    """
    print("\n" + "=" * 80)
    print("Testing MathML Quality After OCR Fix")
    print("=" * 80)

    converter = Converter()

    # The problematic LaTeX from the error
    latex = r"\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}"
    print(f"\nOriginal LaTeX: {latex}")

    # Convert to MathML
    result = converter.convert_to_formats(f"${latex}$")
    mathml = result.mathml
    print(f"\nMathML length: {len(mathml)} chars")

    # Check quality indicators
    print("\nQuality checks:")
    print("-" * 80)
    checks = {
        # A match on "<mn>22.2</mn>" always implies a match on "22.2", so
        # the plain substring test alone is equivalent to the former "or".
        "No separate digits for decimals": "22.2" in mathml,
        "No dot as identifier": "<mi>.</mi>" not in mathml,
        "Properly formatted numbers": "30.4" in mathml,
        "Has namespace": 'xmlns=' in mathml,
        "Display block": 'display="block"' in mathml,
    }

    for check, passed in checks.items():
        # Distinct markers: previously both branches printed an empty
        # string, so a failed check looked identical to a passed one.
        status = "✓" if passed else "✗"
        print(f"{status} {check}")
    all_passed = all(checks.values())

    # Show a preview
    print("\n" + "-" * 80)
    print("MathML preview:")
    print("-" * 80)
    print(mathml[:400])
    if len(mathml) > 400:
        print("...")

    return all_passed
def test_edge_cases():
    """Check the OCR number fixer on inputs that are easy to get wrong.

    A case may carry any of three expectations:

    * ``should_stay``   - the output must equal this string exactly;
    * ``should_become`` - this substring must appear in the output;
    * ``should_have``   - every listed substring must appear in the output.

    Returns:
        True when every expectation of every case holds.
    """
    print("\n" + "=" * 80)
    print("Testing Edge Cases")
    print("=" * 80)

    converter = Converter()

    test_cases = [
        {
            "name": "Should NOT merge: arithmetic",
            "input": r"2 + 3 = 5",
            "should_stay": "2 + 3 = 5",
        },
        {
            "name": "Should NOT merge: multiplication",
            "input": r"2 \times 3",
            "should_stay": r"2 \times 3",
        },
        {
            "name": "Should merge: decimal at end",
            "input": r"x = 1 2. 5",
            "should_become": "12.5",
        },
        {
            "name": "Should merge: multiple spaces",
            "input": r"n = 1 2 . 3 4",
            "should_have": ["12.34"],
        },
        {
            "name": "Complex: mixed scenarios",
            "input": r"a = 1 2. 3 + 4 5. 6 - 7",
            "should_have": ["12.3", "45.6", "- 7"],
        },
    ]

    all_passed = True
    for test in test_cases:
        print(f"\n{test['name']}")
        print(f" Input: {test['input']}")

        fixed = converter._fix_ocr_number_errors(test['input'])
        print(f" Output: {fixed}")

        if 'should_stay' in test:
            if fixed == test['should_stay']:
                print(" ✓ Correctly unchanged")
            else:
                print(f" ✗ Should stay '{test['should_stay']}' but got '{fixed}'")
                all_passed = False

        if 'should_become' in test:
            if test['should_become'] in fixed:
                print(f" ✓ Contains '{test['should_become']}'")
            else:
                print(f" ✗ Should contain '{test['should_become']}'")
                all_passed = False

        # A bare string would be iterated character by character, turning
        # the check into "does the output contain '1', '2', '.', ...", which
        # passes almost regardless of the output.  Treat it as one item.
        expected_items = test.get('should_have', [])
        if isinstance(expected_items, str):
            expected_items = [expected_items]
        for expected in expected_items:
            if expected in fixed:
                print(f" ✓ Contains '{expected}'")
            else:
                print(f" ✗ Should contain '{expected}'")
                all_passed = False

    return all_passed
def compare_before_after():
    """Compare the MathML for an OCR-damaged formula with the clean one.

    Both formulas are wrapped in ``$...$`` and passed to
    ``Converter.convert_to_formats``.  The ``mathml`` of each result is
    checked for the merged decimal ``22.2`` and for a decimal point that
    was emitted as an identifier (``<mi>.</mi>``).

    Returns:
        True when the OCR-damaged input yields the merged decimal and no
        dot-identifier, False otherwise.
    """
    print("\n" + "=" * 80)
    print("Before/After Comparison")
    print("=" * 80)

    converter = Converter()

    # Simulate OCR error
    ocr_latex = r"\gamma = 2 2. 2, c = 3 0. 4"
    correct_latex = r"\gamma = 22.2, c = 30.4"
    print(f"\nOCR LaTeX: {ocr_latex}")
    print(f"Correct LaTeX: {correct_latex}")

    # Convert both
    ocr_result = converter.convert_to_formats(f"${ocr_latex}$")
    correct_result = converter.convert_to_formats(f"${correct_latex}$")

    print("\n" + "-" * 80)
    print("MathML comparison:")
    print("-" * 80)

    # Check if they produce similar quality output
    ocr_has_decimal = "22.2" in ocr_result.mathml
    correct_has_decimal = "22.2" in correct_result.mathml
    ocr_has_dot_error = "<mi>.</mi>" in ocr_result.mathml
    correct_has_dot_error = "<mi>.</mi>" in correct_result.mathml

    # Distinct markers: previously both outcomes printed an empty string,
    # so these two lines carried no information.
    print(f"OCR output has proper decimals: {'✓' if ocr_has_decimal else '✗'}")
    print(f"Correct output has proper decimals: {'✓' if correct_has_decimal else '✗'}")
    print(f"OCR output has dot errors: {'✗ Yes' if ocr_has_dot_error else '✓ No'}")
    print(f"Correct output has dot errors: {'✗ Yes' if correct_has_dot_error else '✓ No'}")

    if ocr_has_decimal and not ocr_has_dot_error:
        print("\n✓ OCR fix is working! Output quality matches correct input.")
        return True

    print("\n✗ OCR fix may need improvement.")
    return False
if __name__ == "__main__":
    # Script entry point: run every check, print a summary and report the
    # outcome through the process exit status (0 only when all passed), so
    # that a failing run is visible to shells and CI.
    import sys
    import traceback

    print("OCR Number Error Fix Test Suite\n")

    exit_code = 1
    try:
        # All four checks run before the summary is printed.
        results = [
            ("OCR error fixes", test_ocr_number_errors()),
            ("MathML quality", test_mathml_quality()),
            ("Edge cases", test_edge_cases()),
            ("Before/after comparison", compare_before_after()),
        ]

        print("\n" + "=" * 80)
        print("SUMMARY")
        print("=" * 80)

        for name, passed in results:
            status = "✓ PASS" if passed else "✗ FAIL"
            print(f"{status}: {name}")

        all_passed = all(passed for _, passed in results)

        print("\n" + "-" * 80)
        if all_passed:
            exit_code = 0
            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
            print("\nOCR number errors are being fixed automatically!")
            print("Examples:")
            # Input and output are separated by an arrow; without it the two
            # quoted values ran together and were unreadable.
            print("'2 2. 2' → '22.2'")
            print("'3 0. 4' → '30.4'")
            print("'1 5 0' → '150'")
        else:
            print("✗✗✗ SOME TESTS FAILED ✗✗✗")
        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\nTests interrupted")
    except Exception as e:
        # Top-level boundary: report the failure with its traceback.
        print(f"\n\nTest error: {e}")
        traceback.print_exc()

    sys.exit(exit_code)

265
test_ocr_pipeline.py Normal file
View File

@@ -0,0 +1,265 @@
"""Test OCR number error fixing in the complete pipeline."""
from app.services.ocr_service import _postprocess_markdown
def test_ocr_postprocessing():
    """Feed simulated OCR markdown through ``_postprocess_markdown``.

    A sample may list substrings that must be present (``should_have``),
    substrings that must be absent (``should_not_have``), or set
    ``should_stay`` to require that the text comes back unchanged.

    Returns:
        True only when every expectation of every sample holds.
    """
    rule = "=" * 80
    print(rule)
    print("Testing OCR Postprocessing Pipeline")
    print(rule)

    # Markdown as OCR would emit it, including the typical digit splits.
    samples = [
        {
            "name": "Inline formula with decimal errors",
            "input": r"The value is $\gamma = 2 2. 2$ and $c = 3 0. 4$.",
            "should_have": ["22.2", "30.4"],
            "should_not_have": ["2 2", "3 0"],
        },
        {
            "name": "Display formula with decimal errors",
            "input": r"$$\phi = 2 5. 4 ^ {\circ}$$",
            "should_have": ["25.4"],
            "should_not_have": ["2 5"],
        },
        {
            "name": "Multiple formulas",
            "input": r"$a = 1 2. 5$, $b = 9. 8 7$, and $c = 1 5 0$",
            "should_have": ["12.5", "9.87", "150"],
            "should_not_have": ["1 2", "9. 8", "1 5"],
        },
        {
            "name": "Mixed content (text + formulas)",
            "input": r"The equation $x = 3. 14$ is approximately pi. Then $y = 2 7. 3$.",
            "should_have": ["3.14", "27.3"],
            "should_not_have": ["3. 14", "2 7"],
        },
        {
            "name": "Normal arithmetic (should not be affected)",
            "input": r"$2 + 3 = 5$ and $10 - 7 = 3$",
            "should_stay": True,
        },
    ]

    ok = True
    for index, sample in enumerate(samples, start=1):
        raw = sample['input']
        print(f"\nTest {index}: {sample['name']}")
        print("-" * 80)
        print(f"Input: {raw}")

        processed = _postprocess_markdown(raw)
        print(f"Output: {processed}")

        # A missing key simply means "nothing to check" for that category.
        for wanted in sample.get('should_have', ()):
            if wanted not in processed:
                print(f" ✗ Missing '{wanted}'")
                ok = False
            else:
                print(f" ✓ Contains '{wanted}'")

        for banned in sample.get('should_not_have', ()):
            if banned in processed:
                print(f" ✗ Still has '{banned}'")
                ok = False
            else:
                print(f" ✓ Removed '{banned}'")

        if sample.get('should_stay'):
            if raw != processed:
                print(" ✗ Should not change but did")
                ok = False
            else:
                print(" ✓ Correctly unchanged")

    return ok
def test_real_world_case():
    """Run the formula from the error report through ``_postprocess_markdown``.

    The output must contain the three merged decimals and none of the
    split digit pairs.  Each check is printed with a pass/fail marker.

    Returns:
        True when every check holds, False otherwise.
    """
    print("\n" + "=" * 80)
    print("Testing Real-World Error Case")
    print("=" * 80)

    # The exact input from the error report
    ocr_output = r"$$\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}$$"
    print("\nOCR Output (with errors):")
    print(f" {ocr_output}")

    # Apply postprocessing
    fixed = _postprocess_markdown(ocr_output)
    print("\nAfter Postprocessing:")
    print(f" {fixed}")

    # Check if fixed
    checks = {
        "Has 22.2": "22.2" in fixed,
        "Has 30.4": "30.4" in fixed,
        "Has 25.4": "25.4" in fixed,
        "No '2 2'": "2 2" not in fixed,
        "No '3 0'": "3 0" not in fixed,
        "No '2 5'": "2 5" not in fixed,
    }

    print("\nQuality Checks:")
    print("-" * 80)
    for check, passed in checks.items():
        # Distinct markers: previously both branches printed an empty
        # string, so a failed check looked identical to a passed one.
        status = "✓" if passed else "✗"
        print(f"{status} {check}")
    all_passed = all(checks.values())

    if all_passed:
        print("\n✓ Real-world case fixed successfully!")
    else:
        print("\n✗ Real-world case still has issues")

    return all_passed
def test_edge_cases():
    """Verify that already-valid formulas survive ``_postprocess_markdown``.

    The comparison ignores space characters: the output only has to match
    the input once every ``" "`` has been removed from both.

    Returns:
        True when no formula changed structurally.
    """
    print("\n" + "=" * 80)
    print("Testing Edge Cases")
    print("=" * 80)

    valid_formulas = [
        {
            "name": "Arithmetic operations",
            "input": r"$2 + 3 = 5$ and $10 - 7 = 3$",
            "should_stay": True,
        },
        {
            "name": "Multiplication",
            "input": r"$2 \times 3 = 6$",
            "should_stay": True,
        },
        {
            "name": "Exponents",
            "input": r"$x ^ 2 + y ^ 2 = r ^ 2$",
            "should_stay": True,
        },
        {
            "name": "Fractions",
            "input": r"$\frac{1}{2} + \frac{3}{4}$",
            "should_stay": True,
        },
        {
            "name": "Subscripts",
            "input": r"$x _ 1 + x _ 2$",
            "should_stay": True,
        },
    ]

    failures = 0
    for case in valid_formulas:
        original = case['input']
        print(f"\n{case['name']}")
        print(f" Input: {original}")

        processed = _postprocess_markdown(original)
        print(f" Output: {processed}")

        if not case.get('should_stay'):
            continue

        # Spacing may legitimately change; the remaining characters may not.
        if processed.replace(" ", "") != original.replace(" ", ""):
            print(" ✗ Structure changed unexpectedly")
            failures += 1
        else:
            print(" ✓ Structure preserved")

    return failures == 0
def test_performance():
    """Time ``_postprocess_markdown`` on a document with about 200 formulas.

    The document has 100 lines, each holding two inline formulas with
    OCR-style split numbers.

    Returns:
        True when one pass over the document takes less than one second,
        False otherwise.
    """
    import time

    print("\n" + "=" * 80)
    print("Testing Performance")
    print("=" * 80)

    # Create a large markdown with many formulas.  Joining once avoids the
    # quadratic cost of growing a string with += inside the loop.
    large_content = "".join(
        f"Formula {i}: $x = {i} {i}. {i}$ and $y = {i*2} {i*2}. {i*2}$\n"
        for i in range(100)
    )

    print(f"\nContent size: {len(large_content)} characters")
    print("Number of formulas: ~200")

    # perf_counter is the clock intended for measuring short durations; it
    # is monotonic, unlike time.time(), which follows system clock changes.
    start = time.perf_counter()
    _postprocess_markdown(large_content)
    elapsed = time.perf_counter() - start

    print(f"Processing time: {elapsed*1000:.2f}ms")

    if elapsed < 1.0:
        print("✓ Performance is acceptable (< 1s)")
        return True

    print("✗ Performance may need optimization")
    return False
if __name__ == "__main__":
    # Script entry point: run every check, print a summary and report the
    # outcome through the process exit status (0 only when all passed), so
    # that a failing run is visible to shells and CI.
    import sys
    import traceback

    print("OCR Pipeline Integration Test Suite\n")

    exit_code = 1
    try:
        # All four checks run before the summary is printed.
        results = [
            ("OCR postprocessing", test_ocr_postprocessing()),
            ("Real-world case", test_real_world_case()),
            ("Edge cases", test_edge_cases()),
            ("Performance", test_performance()),
        ]

        print("\n" + "=" * 80)
        print("SUMMARY")
        print("=" * 80)

        for name, passed in results:
            status = "✓ PASS" if passed else "✗ FAIL"
            print(f"{status}: {name}")

        all_passed = all(passed for _, passed in results)

        print("\n" + "-" * 80)
        if all_passed:
            exit_code = 0
            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
            print("\nOCR number error fixing is integrated into the pipeline!")
            print("\nFlow:")
            print(" 1. OCR recognizes image → produces Markdown with LaTeX")
            print(" 2. _postprocess_markdown() fixes number errors")
            print(" 3. Clean LaTeX is used for all conversions")
            print("\nBenefits:")
            print(" • Fixed once at the source")
            print(" • All output formats benefit (MathML, MML, OMML)")
            print(" • Better performance (no repeated fixes)")
        else:
            print("✗✗✗ SOME TESTS FAILED ✗✗✗")
        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\nTests interrupted")
    except Exception as e:
        # Top-level boundary: report the failure with its traceback.
        print(f"\n\nTest error: {e}")
        traceback.print_exc()

    sys.exit(exit_code)