app/api/v1/endpoints/image.py

"""Image OCR endpoint."""

from fastapi import APIRouter, Depends, HTTPException

from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service, get_converter
from app.schemas.image import ImageOCRRequest, ImageOCRResponse, LatexToOmmlRequest, LatexToOmmlResponse
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector
from app.services.ocr_service import OCRService, MineruOCRService
from app.services.converter import Converter

# Router on which the endpoints defined in this module are registered
# (see the ``@router.post`` decorators below).
router = APIRouter()


@router.post("/ocr", response_model=ImageOCRResponse)
async def process_image_ocr(
    request: ImageOCRRequest,
    image_processor: ImageProcessor = Depends(get_image_processor),
    layout_detector: LayoutDetector = Depends(get_layout_detector),
    mineru_service: MineruOCRService = Depends(get_mineru_ocr_service),
    paddle_service: OCRService = Depends(get_ocr_service),
) -> ImageOCRResponse:
    """Run OCR on an image and return the recognized content.

    The processing pipeline:
    1. Select the OCR backend named by ``request.model_name``
       (``"mineru"`` or ``"paddle"``).
    2. Preprocess the image supplied via ``request.image_url`` or
       ``request.image_base64``.
    3. Run the selected backend's ``recognize`` on the preprocessed image.
    4. Build the response from the ``latex``, ``markdown``, ``mathml`` and
       ``mml`` entries of the backend result; a missing entry becomes an
       empty string.

    Note: OMML conversion is not included due to performance overhead.
    Use the /latex-to-omml endpoint to convert LaTeX to OMML separately.

    Args:
        request: OCR request carrying the image source and the model name.
        image_processor: Service used to preprocess the request image.
        layout_detector: Injected but not used by this handler; kept so the
            endpoint signature and its dependency resolution stay unchanged.
        mineru_service: Backend used when ``model_name`` is ``"mineru"``.
        paddle_service: Backend used when ``model_name`` is ``"paddle"``.

    Returns:
        The recognized content in the formats listed above.

    Raises:
        HTTPException: 400 if ``model_name`` is not a supported backend;
            503 if the selected backend raises ``RuntimeError``.
    """
    # Pick the backend first so a request with an unsupported model name is
    # rejected before any image preprocessing work is done.
    if request.model_name == "mineru":
        ocr_backend = mineru_service
    elif request.model_name == "paddle":
        ocr_backend = paddle_service
    else:
        raise HTTPException(status_code=400, detail="Invalid model name")

    image = image_processor.preprocess(
        image_url=request.image_url,
        image_base64=request.image_base64,
    )

    try:
        ocr_result = ocr_backend.recognize(image)
    except RuntimeError as e:
        # Chain the original error so the root cause stays in the traceback.
        raise HTTPException(status_code=503, detail=str(e)) from e

    return ImageOCRResponse(
        latex=ocr_result.get("latex", ""),
        markdown=ocr_result.get("markdown", ""),
        mathml=ocr_result.get("mathml", ""),
        mml=ocr_result.get("mml", ""),
    )


@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)
async def convert_latex_to_omml(
    request: LatexToOmmlRequest,
    converter: Converter = Depends(get_converter),
) -> LatexToOmmlResponse:
    """Convert LaTeX formula to OMML (Office Math Markup Language).

    OMML is the math format used by Microsoft Word and other Office applications.
    This endpoint is separate from the main OCR endpoint due to the performance
    overhead of OMML conversion (requires creating a temporary DOCX file).

    Args:
        request: Contains the LaTeX formula to convert (without $ or $$ delimiters).
        converter: Service whose ``convert_to_omml`` performs the conversion.

    Returns:
        OMML representation of the formula.

    Raises:
        HTTPException: 400 if the formula is missing, empty or whitespace-only,
            or if the conversion raises ``ValueError``; 503 if the conversion
            raises ``RuntimeError``.
    """
    # Reject missing/blank input up front; the first check also guards the
    # ``.strip()`` call against a falsy non-string value such as ``None``.
    if not request.latex or not request.latex.strip():
        raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")

    try:
        omml = converter.convert_to_omml(request.latex)
        return LatexToOmmlResponse(omml=omml)
    except ValueError as e:
        # Chain the original error so the root cause stays in the traceback.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e)) from e
init repo 2025-12-29 17:34:58 +08:00			`"""Image OCR endpoint."""`

			`from fastapi import APIRouter, Depends, HTTPException`

feat: optimize the format convert 2026-02-04 12:00:06 +08:00			`from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service, get_converter`
			`from app.schemas.image import ImageOCRRequest, ImageOCRResponse, LatexToOmmlRequest, LatexToOmmlResponse`
init repo 2025-12-29 17:34:58 +08:00			`from app.services.image_processor import ImageProcessor`
			`from app.services.layout_detector import LayoutDetector`
feat: add mineru model 2026-01-05 17:30:54 +08:00			`from app.services.ocr_service import OCRService, MineruOCRService`
feat: optimize the format convert 2026-02-04 12:00:06 +08:00			`from app.services.converter import Converter`
init repo 2025-12-29 17:34:58 +08:00
			`router = APIRouter()`


			`@router.post("/ocr", response_model=ImageOCRResponse)`
			`async def process_image_ocr(`
			`request: ImageOCRRequest,`
			`image_processor: ImageProcessor = Depends(get_image_processor),`
			`layout_detector: LayoutDetector = Depends(get_layout_detector),`
feat: add mineru model 2026-01-05 17:30:54 +08:00			`mineru_service: MineruOCRService = Depends(get_mineru_ocr_service),`
			`paddle_service: OCRService = Depends(get_ocr_service),`
init repo 2025-12-29 17:34:58 +08:00			`) -> ImageOCRResponse:`
			`"""Process an image and extract content as LaTeX, Markdown, and MathML.`

			`The processing pipeline:`
			`1. Load and preprocess image (add 30% whitespace padding)`
			`2. Detect layout using DocLayout-YOLO`
			`3. Based on layout:`
			`- If plain text exists: use PP-DocLayoutV2 for mixed recognition`
			`- Otherwise: use PaddleOCR-VL with formula prompt`
			`4. Convert output to LaTeX, Markdown, and MathML formats`
feat: optimize the format convert 2026-02-04 12:00:06 +08:00
			`Note: OMML conversion is not included due to performance overhead.`
			`Use the /latex-to-omml endpoint to convert LaTeX to OMML separately.`
init repo 2025-12-29 17:34:58 +08:00			`"""`

fix: refact logic 2025-12-31 17:38:32 +08:00			`image = image_processor.preprocess(`
			`image_url=request.image_url,`
			`image_base64=request.image_base64,`
			`)`
init repo 2025-12-29 17:34:58 +08:00
			`try:`
feat: add mineru model 2026-01-05 17:30:54 +08:00			`if request.model_name == "mineru":`
			`ocr_result = mineru_service.recognize(image)`
			`elif request.model_name == "paddle":`
			`ocr_result = paddle_service.recognize(image)`
			`else:`
			`raise HTTPException(status_code=400, detail="Invalid model name")`
init repo 2025-12-29 17:34:58 +08:00			`except RuntimeError as e:`
			`raise HTTPException(status_code=503, detail=str(e))`

			`return ImageOCRResponse(`
			`latex=ocr_result.get("latex", ""),`
			`markdown=ocr_result.get("markdown", ""),`
			`mathml=ocr_result.get("mathml", ""),`
feat: optimize the format convert 2026-02-04 12:00:06 +08:00			`mml=ocr_result.get("mml", ""),`
init repo 2025-12-29 17:34:58 +08:00			`)`
feat: optimize the format convert 2026-02-04 12:00:06 +08:00

			`@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)`
			`async def convert_latex_to_omml(`
			`request: LatexToOmmlRequest,`
			`converter: Converter = Depends(get_converter),`
			`) -> LatexToOmmlResponse:`
			`"""Convert LaTeX formula to OMML (Office Math Markup Language).`

			`OMML is the math format used by Microsoft Word and other Office applications.`
			`This endpoint is separate from the main OCR endpoint due to the performance`
			`overhead of OMML conversion (requires creating a temporary DOCX file).`

			`Args:`
			`request: Contains the LaTeX formula to convert (without $ or $$ delimiters).`

			`Returns:`
			`OMML representation of the formula.`
			`"""`
			`if not request.latex or not request.latex.strip():`
			`raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")`

			`try:`
			`omml = converter.convert_to_omml(request.latex)`
			`return LatexToOmmlResponse(omml=omml)`
			`except ValueError as e:`
			`raise HTTPException(status_code=400, detail=str(e))`
			`except RuntimeError as e:`
			`raise HTTPException(status_code=503, detail=str(e))`