diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 2a68033..26d6c48 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -346,7 +346,8 @@ class MineruOCRService(OCRServiceBase): if "results" in result and "image" in result["results"]: markdown_content = result["results"]["image"].get("md_content", "") - # markdown_content = _postprocess_markdown(markdown_content) + # Apply postprocessing to fix OCR errors + markdown_content = _postprocess_markdown(markdown_content) # Convert to other formats if converter is available latex = "" diff --git a/test_mineru_fix.py b/test_mineru_fix.py new file mode 100644 index 0000000..edbe620 --- /dev/null +++ b/test_mineru_fix.py @@ -0,0 +1,105 @@ +"""Quick test to verify MinerU postprocessing is enabled.""" + +from app.services.ocr_service import _postprocess_markdown + + +def test_mineru_postprocessing(): + """Test that postprocessing works for MinerU output.""" + + print("=" * 80) + print("Testing MinerU Postprocessing") + print("=" * 80) + + # Simulate MinerU OCR output (with number errors) + mineru_markdown = r"""$$ +\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ} +$$""" + + print("\nMinerU OCR Output (raw):") + print(mineru_markdown) + + # Apply postprocessing + fixed = _postprocess_markdown(mineru_markdown) + + print("\nAfter Postprocessing:") + print(fixed) + + print("\n" + "-" * 80) + print("Verification:") + print("-" * 80) + + checks = [ + ("Has '22.2'", "22.2" in fixed), + ("Has '30.4'", "30.4" in fixed), + ("Has '25.4'", "25.4" in fixed), + ("No '2 2'", "2 2" not in fixed), + ("No '3 0'", "3 0" not in fixed), + ("No '2 5'", "2 5" not in fixed), + ] + + all_passed = True + for check_name, passed in checks: + status = "✓" if passed else "✗" + print(f"{status} {check_name}") + if not passed: + all_passed = False + + if all_passed: + print("\n✓✓✓ MinerU postprocessing is working! ✓✓✓") + else: + print("\n✗✗✗ MinerU postprocessing has issues ✗✗✗") + + return all_passed + + +def test_expected_api_response(): + """Test what the API response should look like.""" + + print("\n" + "=" * 80) + print("Expected API Response Format") + print("=" * 80) + + ocr_output = r"$$\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}$$" + fixed = _postprocess_markdown(ocr_output) + + print("\nBefore postprocessing:") + print(f" markdown: {ocr_output}") + + print("\nAfter postprocessing (what API should return):") + print(f" markdown: {fixed}") + + print("\nExpected changes:") + print(" • '2 2. 2' → '22.2'") + print(" • '3 0. 4' → '30.4'") + print(" • '2 5. 4' → '25.4'") + + print("\n" + "-" * 80) + print("Note: The API should return the FIXED markdown") + print(" All other formats (latex, mathml, mml) are derived from this") + print("-" * 80) + + +if __name__ == "__main__": + print("MinerU Postprocessing Verification\n") + + try: + test1 = test_mineru_postprocessing() + test_expected_api_response() + + print("\n" + "=" * 80) + + if test1: + print("✓ MinerU postprocessing is NOW ENABLED") + print("\nNext steps:") + print(" 1. Restart the server") + print(" 2. Test with the same request") + print(" 3. The markdown field should now have '22.2' instead of '2 2. 2'") + else: + print("✗ There may still be issues") + + print("=" * 80) + + except Exception as e: + print(f"\nError: {e}") + import traceback + traceback.print_exc()