feat: add omml api
This commit is contained in:
@@ -1,10 +1,10 @@
|
|||||||
"""Markdown to DOCX conversion endpoint."""
|
"""Format conversion endpoints."""
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
from fastapi.responses import Response
|
from fastapi.responses import Response
|
||||||
|
|
||||||
from app.core.dependencies import get_converter
|
from app.core.dependencies import get_converter
|
||||||
from app.schemas.convert import MarkdownToDocxRequest
|
from app.schemas.convert import MarkdownToDocxRequest, LatexToOmmlRequest, LatexToOmmlResponse
|
||||||
from app.services.converter import Converter
|
from app.services.converter import Converter
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
@@ -28,3 +28,39 @@ async def convert_markdown_to_docx(
|
|||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
|
raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)
async def convert_latex_to_omml(
    request: LatexToOmmlRequest,
    converter: Converter = Depends(get_converter),
) -> LatexToOmmlResponse:
    """Convert LaTeX formula to OMML (Office Math Markup Language).

    OMML is the math format used by Microsoft Word and other Office applications.
    This endpoint is separate from the main OCR endpoint due to the performance
    overhead of OMML conversion (requires creating a temporary DOCX file).

    Args:
        request: Contains the LaTeX formula to convert (without $ or $$ delimiters).
        converter: Conversion service injected through ``get_converter``.

    Returns:
        OMML representation of the formula.

    Raises:
        HTTPException: 400 when the formula is empty or whitespace-only, or when
            a ``ValueError`` is raised during conversion; 503 when a
            ``RuntimeError`` is raised during conversion.

    Example:
        ```bash
        curl -X POST "http://localhost:8000/api/v1/convert/latex-to-omml" \\
             -H "Content-Type: application/json" \\
             -d '{"latex": "\\\\frac{a}{b} + \\\\sqrt{c}"}'
        ```
    """
    if not request.latex or not request.latex.strip():
        raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")

    try:
        omml = converter.convert_to_omml(request.latex)
        # Built inside the try so a ValueError raised while constructing the
        # response is still reported as a 400, exactly as before.
        return LatexToOmmlResponse(omml=omml)
    except ValueError as e:
        # Chain explicitly so the original error is kept as __cause__.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e)) from e
|
||||||
|
|||||||
@@ -2,12 +2,11 @@
|
|||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
|
|
||||||
from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service, get_converter
|
from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service
|
||||||
from app.schemas.image import ImageOCRRequest, ImageOCRResponse, LatexToOmmlRequest, LatexToOmmlResponse
|
from app.schemas.image import ImageOCRRequest, ImageOCRResponse
|
||||||
from app.services.image_processor import ImageProcessor
|
from app.services.image_processor import ImageProcessor
|
||||||
from app.services.layout_detector import LayoutDetector
|
from app.services.layout_detector import LayoutDetector
|
||||||
from app.services.ocr_service import OCRService, MineruOCRService
|
from app.services.ocr_service import OCRService, MineruOCRService
|
||||||
from app.services.converter import Converter
|
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
@@ -31,7 +30,7 @@ async def process_image_ocr(
|
|||||||
4. Convert output to LaTeX, Markdown, and MathML formats
|
4. Convert output to LaTeX, Markdown, and MathML formats
|
||||||
|
|
||||||
Note: OMML conversion is not included due to performance overhead.
|
Note: OMML conversion is not included due to performance overhead.
|
||||||
Use the /latex-to-omml endpoint to convert LaTeX to OMML separately.
|
Use the /convert/latex-to-omml endpoint to convert LaTeX to OMML separately.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
image = image_processor.preprocess(
|
image = image_processor.preprocess(
|
||||||
@@ -55,32 +54,3 @@ async def process_image_ocr(
|
|||||||
mathml=ocr_result.get("mathml", ""),
|
mathml=ocr_result.get("mathml", ""),
|
||||||
mml=ocr_result.get("mml", ""),
|
mml=ocr_result.get("mml", ""),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)
async def convert_latex_to_omml(
    request: LatexToOmmlRequest,
    converter: Converter = Depends(get_converter),
) -> LatexToOmmlResponse:
    """Convert LaTeX formula to OMML (Office Math Markup Language).

    OMML is the math format used by Microsoft Word and other Office applications.
    This endpoint is separate from the main OCR endpoint due to the performance
    overhead of OMML conversion (requires creating a temporary DOCX file).

    Args:
        request: Contains the LaTeX formula to convert (without $ or $$ delimiters).
        converter: Conversion service injected through ``get_converter``.

    Returns:
        OMML representation of the formula.

    Raises:
        HTTPException: 400 when the formula is empty or whitespace-only, or when
            a ``ValueError`` is raised during conversion; 503 when a
            ``RuntimeError`` is raised during conversion.
    """
    if not request.latex or not request.latex.strip():
        raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")

    try:
        omml = converter.convert_to_omml(request.latex)
        # Built inside the try so a ValueError raised while constructing the
        # response is still reported as a 400, exactly as before.
        return LatexToOmmlResponse(omml=omml)
    except ValueError as e:
        # Chain explicitly so the original error is kept as __cause__.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e)) from e
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
"""Request and response schemas for markdown to DOCX conversion endpoint."""
|
"""Request and response schemas for format conversion endpoints."""
|
||||||
|
|
||||||
from pydantic import BaseModel, Field, field_validator
|
from pydantic import BaseModel, Field, field_validator
|
||||||
|
|
||||||
@@ -17,3 +17,23 @@ class MarkdownToDocxRequest(BaseModel):
|
|||||||
raise ValueError("Markdown content cannot be empty")
|
raise ValueError("Markdown content cannot be empty")
|
||||||
return v
|
return v
|
||||||
|
|
||||||
|
|
||||||
|
class LatexToOmmlRequest(BaseModel):
    """Payload accepted by the LaTeX to OMML conversion endpoint."""

    # Required field: the formula text, supplied without $ or $$ delimiters.
    latex: str = Field(
        ...,
        description="Pure LaTeX formula (without $ or $$ delimiters)",
    )

    @field_validator("latex")
    @classmethod
    def validate_latex_not_empty(cls, v: str) -> str:
        """Reject empty or whitespace-only formulas; return the value unchanged otherwise."""
        if v and v.strip():
            return v
        raise ValueError("LaTeX formula cannot be empty")
|
||||||
|
|
||||||
|
|
||||||
|
class LatexToOmmlResponse(BaseModel):
    """Payload returned by the LaTeX to OMML conversion endpoint."""

    # Falls back to an empty string when no value is supplied.
    omml: str = Field(
        default="",
        description="OMML (Office Math Markup Language) representation",
    )
|
||||||
|
|
||||||
|
|||||||
@@ -47,14 +47,3 @@ class ImageOCRResponse(BaseModel):
|
|||||||
layout_info: LayoutInfo = Field(default_factory=LayoutInfo)
|
layout_info: LayoutInfo = Field(default_factory=LayoutInfo)
|
||||||
recognition_mode: str = Field("", description="Recognition mode used: mixed_recognition or formula_recognition")
|
recognition_mode: str = Field("", description="Recognition mode used: mixed_recognition or formula_recognition")
|
||||||
|
|
||||||
|
|
||||||
class LatexToOmmlRequest(BaseModel):
    """Payload accepted by the LaTeX to OMML conversion endpoint."""

    # Required field: the formula text, supplied without $ or $$ delimiters.
    latex: str = Field(
        ...,
        description="Pure LaTeX formula (without $ or $$ delimiters)",
    )
|
|
||||||
|
|
||||||
|
|
||||||
class LatexToOmmlResponse(BaseModel):
    """Payload returned by the LaTeX to OMML conversion endpoint."""

    # Falls back to an empty string when no value is supplied.
    omml: str = Field(
        default="",
        description="OMML (Office Math Markup Language) representation",
    )
|
|
||||||
|
|||||||
112
test_omml_api.py
Normal file
112
test_omml_api.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
"""Test script for OMML conversion API endpoint."""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def test_latex_to_omml():
    """Exercise the /convert/latex-to-omml endpoint with sample formulas.

    Each sample is POSTed to the endpoint on localhost:8000 and the outcome is
    printed. Failures are reported on stdout rather than raised, so one bad
    case does not stop the remaining ones.
    """
    endpoint = "http://localhost:8000/api/v1/convert/latex-to-omml"
    heavy_rule = "=" * 80

    # Formulas sent to the endpoint, in order.
    samples = [
        {"name": "Simple fraction", "latex": "\\frac{a}{b}"},
        {
            "name": "Quadratic formula",
            "latex": "x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}",
        },
        {
            "name": "Integral",
            "latex": "\\int_0^\\infty e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}",
        },
        {
            "name": "Matrix",
            "latex": "\\begin{matrix} a & b \\\\ c & d \\end{matrix}",
        },
    ]

    print("Testing OMML Conversion API")
    print(heavy_rule)

    for number, sample in enumerate(samples, 1):
        label = sample["name"]
        formula = sample["latex"]

        print(f"\nTest {number}: {label}")
        print("-" * 80)
        print(f"LaTeX: {formula}")

        try:
            reply = requests.post(
                endpoint,
                json={"latex": formula},
                headers={"Content-Type": "application/json"},
                timeout=10,
            )

            # Anything other than 200 is reported and the next sample is tried.
            if reply.status_code != 200:
                print(f"✗ Status: {reply.status_code}")
                print(f"Error: {reply.text}")
                continue

            converted = reply.json().get("omml", "")
            print(f"✓ Status: {reply.status_code}")
            print(f"OMML length: {len(converted)} characters")
            print(f"OMML preview: {converted[:150]}...")
        except requests.exceptions.RequestException as e:
            print(f"✗ Request failed: {e}")
        except Exception as e:
            print(f"✗ Error: {e}")

    print("\n" + heavy_rule)
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_input():
    """Test error handling with invalid input.

    Sends an empty formula and a body without the ``latex`` field to the
    endpoint on localhost:8000, prints each response, and asserts that the
    server answered with a client-error (4xx) status.

    Raises:
        AssertionError: If an invalid payload is not rejected with a 4xx status.
    """
    print("\nTesting Error Handling")
    print("=" * 80)

    base_url = "http://localhost:8000/api/v1/convert/latex-to-omml"

    # (label, JSON body) pairs that the endpoint is expected to reject.
    invalid_payloads = [
        ("Empty LaTeX", {"latex": ""}),
        ("Missing LaTeX field", {}),
    ]

    for label, payload in invalid_payloads:
        print(f"\nTest: {label}")
        response = requests.post(
            base_url,
            json=payload,
            headers={"Content-Type": "application/json"},
            # Without a timeout an unresponsive server would hang the script;
            # the value matches the one used in test_latex_to_omml.
            timeout=10,
        )
        print(f"Status: {response.status_code}")
        print(f"Response: {response.json()}")

        # Checked after printing so the response is visible even on failure.
        assert 400 <= response.status_code < 500, (
            f"{label}: expected a 4xx status, got {response.status_code}"
        )

    print("\n" + "=" * 80)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Banner printed before any request is sent.
    for banner_line in (
        "OMML API Test Suite",
        "Make sure the API server is running on http://localhost:8000",
    ):
        print(banner_line)
    print()

    try:
        # Run the conversion checks first, then the invalid-input checks.
        for run_suite in (test_latex_to_omml, test_invalid_input):
            run_suite()
        print("\n✓ All tests completed!")
    except KeyboardInterrupt:
        print("\n\n✗ Tests interrupted by user")
    except Exception as e:
        # Top-level boundary of the script: report the failure, no traceback.
        print(f"\n✗ Test suite failed: {e}")
|
||||||
Reference in New Issue
Block a user