fix: mineru postprocess handling

This commit is contained in:
liuyuanchuang
2026-02-04 16:07:04 +08:00
parent 61fd5441b7
commit 35419b2102
2 changed files with 107 additions and 1 deletions

View File

@@ -346,7 +346,8 @@ class MineruOCRService(OCRServiceBase):
if "results" in result and "image" in result["results"]:
markdown_content = result["results"]["image"].get("md_content", "")
# markdown_content = _postprocess_markdown(markdown_content)
# Apply postprocessing to fix OCR errors
markdown_content = _postprocess_markdown(markdown_content)
# Convert to other formats if converter is available
latex = ""

105
test_mineru_fix.py Normal file
View File

@@ -0,0 +1,105 @@
"""Quick test to verify MinerU postprocessing is enabled."""
from app.services.ocr_service import _postprocess_markdown
def test_mineru_postprocessing():
    """Check that ``_postprocess_markdown`` rejoins OCR-split numbers.

    Runs a sample display-math block whose numbers contain stray spaces
    (for example ``2 2. 2``) through ``_postprocess_markdown``, prints the
    raw and processed text, and prints one marked line per check.

    Returns:
        bool: True when every check passed, False otherwise.
    """
    print("=" * 80)
    print("Testing MinerU Postprocessing")
    print("=" * 80)

    # Simulate MinerU OCR output (with number errors)
    mineru_markdown = r"""$$
\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}
$$"""

    print("\nMinerU OCR Output (raw):")
    print(mineru_markdown)

    # Apply postprocessing
    fixed = _postprocess_markdown(mineru_markdown)

    print("\nAfter Postprocessing:")
    print(fixed)

    print("\n" + "-" * 80)
    print("Verification:")
    print("-" * 80)

    # Each entry is (label, outcome): the joined numbers must be present and
    # the split forms must be gone.
    checks = [
        ("Has '22.2'", "22.2" in fixed),
        ("Has '30.4'", "30.4" in fixed),
        ("Has '25.4'", "25.4" in fixed),
        ("No '2 2'", "2 2" not in fixed),
        ("No '3 0'", "3 0" not in fixed),
        ("No '2 5'", "2 5" not in fixed),
    ]

    for check_name, passed in checks:
        # Distinct markers so a failing check is visible in the output.
        status = "✓" if passed else "✗"
        print(f"{status} {check_name}")

    all_passed = all(passed for _, passed in checks)

    if all_passed:
        print("\n✓✓✓ MinerU postprocessing is working! ✓✓✓")
    else:
        print("\n✗✗✗ MinerU postprocessing has issues ✗✗✗")

    return all_passed
def test_expected_api_response():
    """Print a before/after view of the markdown the API should return.

    Runs a single-line display-math sample through ``_postprocess_markdown``
    and prints the input, the processed output, and the number rewrites that
    are expected. This function only prints; it makes no assertions and
    returns None.
    """
    print("\n" + "=" * 80)
    print("Expected API Response Format")
    print("=" * 80)

    ocr_output = r"$$\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}$$"
    fixed = _postprocess_markdown(ocr_output)

    print("\nBefore postprocessing:")
    print(f" markdown: {ocr_output}")

    print("\nAfter postprocessing (what API should return):")
    print(f" markdown: {fixed}")

    # Separate the OCR form from the expected form so each line reads as
    # "before → after".
    print("\nExpected changes:")
    print("'2 2. 2' → '22.2'")
    print("'3 0. 4' → '30.4'")
    print("'2 5. 4' → '25.4'")

    print("\n" + "-" * 80)
    print("Note: The API should return the FIXED markdown")
    print(" All other formats (latex, mathml, mml) are derived from this")
    print("-" * 80)
if __name__ == "__main__":
    # Script entry point: run both checks, print a summary, and exit with a
    # non-zero status on failure so shells and CI jobs can detect it.
    print("MinerU Postprocessing Verification\n")

    try:
        test1 = test_mineru_postprocessing()
        test_expected_api_response()
    except Exception as e:
        # Top-level boundary: report the error with its traceback, then fail.
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
        raise SystemExit(1)

    print("\n" + "=" * 80)
    if test1:
        print("✓ MinerU postprocessing is NOW ENABLED")
        print("\nNext steps:")
        print(" 1. Restart the server")
        print(" 2. Test with the same request")
        print(" 3. The markdown field should now have '22.2' instead of '2 2. 2'")
    else:
        print("✗ There may still be issues")
    print("=" * 80)

    if not test1:
        # The summary above is for humans; the exit status is for callers.
        raise SystemExit(1)