Files
doc_processer/app/api/v1/endpoints/image.py

87 lines
3.3 KiB
Python
Raw Normal View History

2025-12-29 17:34:58 +08:00
"""Image OCR endpoint."""
from fastapi import APIRouter, Depends, HTTPException
2026-02-04 12:00:06 +08:00
from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service, get_converter
from app.schemas.image import ImageOCRRequest, ImageOCRResponse, LatexToOmmlRequest, LatexToOmmlResponse
2025-12-29 17:34:58 +08:00
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector
2026-01-05 17:30:54 +08:00
from app.services.ocr_service import OCRService, MineruOCRService
2026-02-04 12:00:06 +08:00
from app.services.converter import Converter
2025-12-29 17:34:58 +08:00
# Router on which the endpoints defined in this module are registered
# (via the @router.post decorators below).
router = APIRouter()
@router.post("/ocr", response_model=ImageOCRResponse)
async def process_image_ocr(
    request: ImageOCRRequest,
    image_processor: ImageProcessor = Depends(get_image_processor),
    layout_detector: LayoutDetector = Depends(get_layout_detector),
    mineru_service: MineruOCRService = Depends(get_mineru_ocr_service),
    paddle_service: OCRService = Depends(get_ocr_service),
) -> ImageOCRResponse:
    """Run OCR on an image and return the recognized content.

    Steps performed by this handler:
      1. Pick the OCR back-end named by ``request.model_name``
         (``"mineru"`` or ``"paddle"``).
      2. Load and preprocess the image from ``request.image_url`` or
         ``request.image_base64`` via ``image_processor.preprocess``.
      3. Call ``recognize`` on the selected back-end.
      4. Copy the ``latex``, ``markdown``, ``mathml`` and ``mml`` entries of
         the result into the response; a missing entry becomes ``""``.

    Note: OMML conversion is not included due to performance overhead.
    Use the /latex-to-omml endpoint to convert LaTeX to OMML separately.

    Args:
        request: OCR request carrying the image source and the model name.
        image_processor: Service used to load/preprocess the image.
        layout_detector: Injected but not used by this handler; kept so the
            endpoint's dependency signature stays unchanged.
            NOTE(review): presumably layout detection now happens inside the
            OCR services -- confirm before removing.
        mineru_service: OCR back-end used when ``model_name == "mineru"``.
        paddle_service: OCR back-end used when ``model_name == "paddle"``.

    Returns:
        The recognized content in the formats listed above.

    Raises:
        HTTPException: 400 if ``request.model_name`` is not a known back-end;
            503 if the selected back-end raises ``RuntimeError``.
    """
    # Validate the model name first so an invalid request is rejected before
    # any image loading/preprocessing work is done.
    if request.model_name == "mineru":
        ocr_service = mineru_service
    elif request.model_name == "paddle":
        ocr_service = paddle_service
    else:
        raise HTTPException(status_code=400, detail="Invalid model name")

    image = image_processor.preprocess(
        image_url=request.image_url,
        image_base64=request.image_base64,
    )

    # Only the recognition call is guarded: a RuntimeError here is reported
    # as "service unavailable", chained to the original error for debugging.
    try:
        ocr_result = ocr_service.recognize(image)
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e)) from e

    return ImageOCRResponse(
        latex=ocr_result.get("latex", ""),
        markdown=ocr_result.get("markdown", ""),
        mathml=ocr_result.get("mathml", ""),
        mml=ocr_result.get("mml", ""),
    )
2026-02-04 12:00:06 +08:00
@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)
async def convert_latex_to_omml(
    request: LatexToOmmlRequest,
    converter: Converter = Depends(get_converter),
) -> LatexToOmmlResponse:
    """Convert LaTeX formula to OMML (Office Math Markup Language).

    OMML is the math format used by Microsoft Word and other Office
    applications. This endpoint is separate from the main OCR endpoint due
    to the performance overhead of OMML conversion (requires creating a
    temporary DOCX file).

    Args:
        request: Contains the LaTeX formula to convert (without $ or $$
            delimiters).
        converter: Service whose ``convert_to_omml`` performs the conversion.

    Returns:
        OMML representation of the formula.

    Raises:
        HTTPException: 400 if the formula is missing/blank or the converter
            raises ``ValueError``; 503 if the converter raises
            ``RuntimeError``.
    """
    latex = request.latex
    # Reject a missing, empty, or whitespace-only formula.
    if not (latex and latex.strip()):
        raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")

    # Keep the try body limited to the conversion call so that only converter
    # failures are mapped to these status codes.
    try:
        omml = converter.convert_to_omml(latex)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e)) from e

    # Built outside the try so an error while constructing the response is
    # not misreported as a client-side 400.
    return LatexToOmmlResponse(omml=omml)