"""Image OCR endpoint.""" import time import uuid from fastapi import APIRouter, Depends, HTTPException, Request, Response from app.core.dependencies import ( get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service, get_glmocr_service, ) from app.core.logging_config import get_logger, RequestIDAdapter from app.schemas.image import ImageOCRRequest, ImageOCRResponse from app.services.image_processor import ImageProcessor from app.services.layout_detector import LayoutDetector from app.services.ocr_service import OCRService, MineruOCRService, GLMOCRService router = APIRouter() logger = get_logger() @router.post("/ocr", response_model=ImageOCRResponse) async def process_image_ocr( request: ImageOCRRequest, http_request: Request, response: Response, image_processor: ImageProcessor = Depends(get_image_processor), layout_detector: LayoutDetector = Depends(get_layout_detector), mineru_service: MineruOCRService = Depends(get_mineru_ocr_service), paddle_service: OCRService = Depends(get_ocr_service), glmocr_service: GLMOCRService = Depends(get_glmocr_service), ) -> ImageOCRResponse: """Process an image and extract content as LaTeX, Markdown, and MathML. The processing pipeline: 1. Load and preprocess image (add 30% whitespace padding) 2. Detect layout using DocLayout-YOLO 3. Based on layout: - If plain text exists: use PP-DocLayoutV2 for mixed recognition - Otherwise: use PaddleOCR-VL with formula prompt 4. Convert output to LaTeX, Markdown, and MathML formats Note: OMML conversion is not included due to performance overhead. Use the /convert/latex-to-omml endpoint to convert LaTeX to OMML separately. """ # Get or generate request ID request_id = http_request.headers.get("x-request-id", str(uuid.uuid4())) response.headers["x-request-id"] = request_id # Create logger adapter with request_id log = RequestIDAdapter(logger, {"request_id": request_id}) log.request_id = request_id try: log.info("Starting image OCR processing") # Preprocess image preprocess_start = time.time() image = image_processor.preprocess( image_url=request.image_url, image_base64=request.image_base64, ) preprocess_time = time.time() - preprocess_start log.debug(f"Image preprocessing completed in {preprocess_time:.3f}s") # Layout detection layout_start = time.time() layout_info = layout_detector.detect(image) layout_time = time.time() - layout_start log.info(f"Layout detection completed in {layout_time:.3f}s") # OCR recognition ocr_start = time.time() if layout_info.MixedRecognition: recognition_method = "MixedRecognition (MinerU)" log.info(f"Using {recognition_method}") ocr_result = mineru_service.recognize(image) else: recognition_method = "FormulaOnly (GLMOCR)" log.info(f"Using {recognition_method}") ocr_result = glmocr_service.recognize(image) ocr_time = time.time() - ocr_start total_time = time.time() - preprocess_start log.info(f"OCR processing completed - Method: {recognition_method}, " f"Layout time: {layout_time:.3f}s, OCR time: {ocr_time:.3f}s, " f"Total time: {total_time:.3f}s") except RuntimeError as e: log.error(f"OCR processing failed: {str(e)}", exc_info=True) raise HTTPException(status_code=503, detail=str(e)) except Exception as e: log.error(f"Unexpected error during OCR processing: {str(e)}", exc_info=True) raise HTTPException(status_code=500, detail="Internal server error") return ImageOCRResponse( latex=ocr_result.get("latex", ""), markdown=ocr_result.get("markdown", ""), mathml=ocr_result.get("mathml", ""), mml=ocr_result.get("mml", ""), )