106 lines
2.9 KiB
Python
106 lines
2.9 KiB
Python
|
|
"""Quick test to verify MinerU postprocessing is enabled."""
|
||
|
|
|
||
|
|
from app.services.ocr_service import _postprocess_markdown
|
||
|
|
|
||
|
|
|
||
|
|
def test_mineru_postprocessing():
    """Run the postprocessor on a MinerU-style sample and report the outcome.

    A display-math block whose numbers are split by stray spaces (for
    example ``2 2. 2``) is passed through ``_postprocess_markdown``.  The
    result is probed with six substring checks: the three merged numbers
    must be present and the three split digit pairs must be absent.  The
    input, the output and the status of each check are printed.

    Returns:
        bool: ``True`` when every check passes, ``False`` otherwise.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("Testing MinerU Postprocessing")
    print(heavy_rule)

    # Three-line ``$$`` block imitating OCR output with broken numbers.
    raw_sample = "\n".join(
        (
            "$$",
            r"\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}",
            "$$",
        )
    )
    print("\nMinerU OCR Output (raw):")
    print(raw_sample)

    cleaned = _postprocess_markdown(raw_sample)
    print("\nAfter Postprocessing:")
    print(cleaned)

    print(f"\n{light_rule}")
    print("Verification:")
    print(light_rule)

    # Merged numbers that must appear, then split pairs that must be gone.
    results = [
        (f"Has '{number}'", number in cleaned)
        for number in ("22.2", "30.4", "25.4")
    ]
    results += [
        (f"No '{pair}'", pair not in cleaned)
        for pair in ("2 2", "3 0", "2 5")
    ]

    for label, ok in results:
        mark = "✓" if ok else "✗"
        print(f"{mark} {label}")

    if not all(ok for _, ok in results):
        print("\n✗✗✗ MinerU postprocessing has issues ✗✗✗")
        return False

    print("\n✓✓✓ MinerU postprocessing is working! ✓✓✓")
    return True
|
||
|
|
|
||
|
|
|
||
|
|
def test_expected_api_response():
    """Print a before/after view of the markdown the API is meant to return.

    A single-line ``$$...$$`` sample is passed through
    ``_postprocess_markdown``; the raw and processed strings are printed,
    followed by the list of expected number corrections and a short note.
    Nothing is asserted and nothing is returned.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(f"\n{heavy_rule}")
    print("Expected API Response Format")
    print(heavy_rule)

    raw_sample = r"$$\gamma = 2 2. 2, c = 3 0. 4, \phi = 2 5. 4 ^ {\circ}$$"
    cleaned = _postprocess_markdown(raw_sample)

    print("\nBefore postprocessing:")
    print(f" markdown: {raw_sample}")

    print("\nAfter postprocessing (what API should return):")
    print(f" markdown: {cleaned}")

    print("\nExpected changes:")
    corrections = (
        ("2 2. 2", "22.2"),
        ("3 0. 4", "30.4"),
        ("2 5. 4", "25.4"),
    )
    for garbled, merged in corrections:
        print(f" • '{garbled}' → '{merged}'")

    print(f"\n{light_rule}")
    print("Note: The API should return the FIXED markdown")
    print(" All other formats (latex, mathml, mml) are derived from this")
    print(light_rule)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Script entry point: run both demonstrations, print a summary, and
    # report the outcome through the process exit status so shells and CI
    # jobs can tell a failed verification from a successful one.
    print("MinerU Postprocessing Verification\n")

    succeeded = False
    try:
        succeeded = test_mineru_postprocessing()
        test_expected_api_response()

        print("\n" + "=" * 80)

        if succeeded:
            print("✓ MinerU postprocessing is NOW ENABLED")
            print("\nNext steps:")
            print(" 1. Restart the server")
            print(" 2. Test with the same request")
            print(" 3. The markdown field should now have '22.2' instead of '2 2. 2'")
        else:
            print("✗ There may still be issues")

        print("=" * 80)

    except Exception as e:
        # Top-level boundary: show the error and full traceback, then fall
        # through to the non-zero exit below instead of exiting cleanly.
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
        succeeded = False

    if not succeeded:
        # Previously the script always exited with status 0, even when the
        # checks failed or an exception was printed.
        raise SystemExit(1)
|