Files
doc_processer/app/api/v1/endpoints/image.py
2026-02-04 12:00:06 +08:00

87 lines
3.3 KiB
Python

"""Image OCR endpoint."""
from fastapi import APIRouter, Depends, HTTPException
from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service, get_converter
from app.schemas.image import ImageOCRRequest, ImageOCRResponse, LatexToOmmlRequest, LatexToOmmlResponse
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector
from app.services.ocr_service import OCRService, MineruOCRService
from app.services.converter import Converter
router = APIRouter()
@router.post("/ocr", response_model=ImageOCRResponse)
async def process_image_ocr(
request: ImageOCRRequest,
image_processor: ImageProcessor = Depends(get_image_processor),
layout_detector: LayoutDetector = Depends(get_layout_detector),
mineru_service: MineruOCRService = Depends(get_mineru_ocr_service),
paddle_service: OCRService = Depends(get_ocr_service),
) -> ImageOCRResponse:
"""Process an image and extract content as LaTeX, Markdown, and MathML.
The processing pipeline:
1. Load and preprocess image (add 30% whitespace padding)
2. Detect layout using DocLayout-YOLO
3. Based on layout:
- If plain text exists: use PP-DocLayoutV2 for mixed recognition
- Otherwise: use PaddleOCR-VL with formula prompt
4. Convert output to LaTeX, Markdown, and MathML formats
Note: OMML conversion is not included due to performance overhead.
Use the /latex-to-omml endpoint to convert LaTeX to OMML separately.
"""
image = image_processor.preprocess(
image_url=request.image_url,
image_base64=request.image_base64,
)
try:
if request.model_name == "mineru":
ocr_result = mineru_service.recognize(image)
elif request.model_name == "paddle":
ocr_result = paddle_service.recognize(image)
else:
raise HTTPException(status_code=400, detail="Invalid model name")
except RuntimeError as e:
raise HTTPException(status_code=503, detail=str(e))
return ImageOCRResponse(
latex=ocr_result.get("latex", ""),
markdown=ocr_result.get("markdown", ""),
mathml=ocr_result.get("mathml", ""),
mml=ocr_result.get("mml", ""),
)
@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)
async def convert_latex_to_omml(
request: LatexToOmmlRequest,
converter: Converter = Depends(get_converter),
) -> LatexToOmmlResponse:
"""Convert LaTeX formula to OMML (Office Math Markup Language).
OMML is the math format used by Microsoft Word and other Office applications.
This endpoint is separate from the main OCR endpoint due to the performance
overhead of OMML conversion (requires creating a temporary DOCX file).
Args:
request: Contains the LaTeX formula to convert (without $ or $$ delimiters).
Returns:
OMML representation of the formula.
"""
if not request.latex or not request.latex.strip():
raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")
try:
omml = converter.convert_to_omml(request.latex)
return LatexToOmmlResponse(omml=omml)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except RuntimeError as e:
raise HTTPException(status_code=503, detail=str(e))