fix: MinerU post-processing handling
This commit is contained in:
@@ -346,7 +346,8 @@ class MineruOCRService(OCRServiceBase):
|
|||||||
if "results" in result and "image" in result["results"]:
|
if "results" in result and "image" in result["results"]:
|
||||||
markdown_content = result["results"]["image"].get("md_content", "")
|
markdown_content = result["results"]["image"].get("md_content", "")
|
||||||
|
|
||||||
# markdown_content = _postprocess_markdown(markdown_content)
|
# Apply postprocessing to fix OCR errors
|
||||||
|
markdown_content = _postprocess_markdown(markdown_content)
|
||||||
|
|
||||||
# Convert to other formats if converter is available
|
# Convert to other formats if converter is available
|
||||||
latex = ""
|
latex = ""
|
||||||
|
|||||||
105
test_mineru_fix.py
Normal file
105
test_mineru_fix.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
"""Quick test to verify MinerU postprocessing is enabled."""
|
||||||
|
|
||||||
|
from app.services.ocr_service import _postprocess_markdown
|
||||||
|
|
||||||
|
|
||||||
|
def test_mineru_postprocessing():
    """Check that ``_postprocess_markdown`` repairs split numbers in MinerU-style output.

    A display-math snippet whose decimals were split apart (``2 2. 2``) is
    passed through ``_postprocess_markdown``. The raw text, the processed
    text and a per-check report are printed, and the function then asserts
    that every check passed so that a regression fails the test under
    pytest instead of only printing a cross mark.

    Returns:
        bool: ``True`` when every check passed. The value is kept because
        the ``__main__`` runner reads it; it can only be ``False`` when
        assertions are disabled (``python -O``).

    Raises:
        AssertionError: If an expected number is missing from the processed
            text, or a split number is still present in it.
    """
    banner = "=" * 80
    rule = "-" * 80

    print(banner)
    print("Testing MinerU Postprocessing")
    print(banner)

    # Simulate MinerU OCR output (with number errors)
    mineru_markdown = r"""$$
\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}
$$"""

    print("\nMinerU OCR Output (raw):")
    print(mineru_markdown)

    # Apply postprocessing
    fixed = _postprocess_markdown(mineru_markdown)

    print("\nAfter Postprocessing:")
    print(fixed)

    print("\n" + rule)
    print("Verification:")
    print(rule)

    checks = [
        ("Has '22.2'", "22.2" in fixed),
        ("Has '30.4'", "30.4" in fixed),
        ("Has '25.4'", "25.4" in fixed),
        ("No '2 2'", "2 2" not in fixed),
        ("No '3 0'", "3 0" not in fixed),
        ("No '2 5'", "2 5" not in fixed),
    ]

    # Report every check before failing so the full picture is visible
    # in the captured output.
    failed = []
    for check_name, passed in checks:
        status = "✓" if passed else "✗"
        print(f"{status} {check_name}")
        if not passed:
            failed.append(check_name)

    all_passed = not failed

    if all_passed:
        print("\n✓✓✓ MinerU postprocessing is working! ✓✓✓")
    else:
        print("\n✗✗✗ MinerU postprocessing has issues ✗✗✗")

    # Without this assertion the test could never fail under pytest: the
    # original only returned the flag, which pytest ignores.
    assert all_passed, (
        "MinerU postprocessing checks failed: " + ", ".join(failed)
    )

    return all_passed
|
||||||
|
|
||||||
|
|
||||||
|
def test_expected_api_response():
    """Print a before/after view of the markdown the API is expected to return.

    Runs a single-line display-math sample through
    ``_postprocess_markdown`` and prints the input, the processed output
    and the list of expected substitutions. Nothing is asserted and
    nothing is returned; the function is purely illustrative.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("Expected API Response Format")
    print(heavy_rule)

    raw_sample = r"$$\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}$$"
    processed_sample = _postprocess_markdown(raw_sample)

    print("\nBefore postprocessing:")
    print(f" markdown: {raw_sample}")

    print("\nAfter postprocessing (what API should return):")
    print(f" markdown: {processed_sample}")

    print("\nExpected changes:")
    expected_changes = (
        ("2 2. 2", "22.2"),
        ("3 0. 4", "30.4"),
        ("2 5. 4", "25.4"),
    )
    for split_form, joined_form in expected_changes:
        print(f" • '{split_form}' → '{joined_form}'")

    print("\n" + light_rule)
    print("Note: The API should return the FIXED markdown")
    print(" All other formats (latex, mathml, mml) are derived from this")
    print(light_rule)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run both checks and print a human-readable summary.
    print("MinerU Postprocessing Verification\n")

    try:
        postprocessing_ok = test_mineru_postprocessing()
        test_expected_api_response()

        summary_rule = "=" * 80
        print("\n" + summary_rule)

        if not postprocessing_ok:
            print("✗ There may still be issues")
        else:
            print("✓ MinerU postprocessing is NOW ENABLED")
            print("\nNext steps:")
            next_steps = (
                " 1. Restart the server",
                " 2. Test with the same request",
                " 3. The markdown field should now have '22.2' instead of '2 2. 2'",
            )
            for step in next_steps:
                print(step)

        print(summary_rule)

    except Exception as exc:
        # Top-level boundary: report the failure with its traceback rather
        # than letting the script die with a bare stack dump.
        print(f"\nError: {exc}")
        import traceback

        traceback.print_exc()
|
||||||
Reference in New Issue
Block a user