"""Image OCR endpoint."""

import time
import uuid
from io import BytesIO

import cv2
from fastapi import APIRouter, Depends, HTTPException, Request, Response

from app.core.dependencies import (
    get_image_processor,
    get_layout_detector,
    get_ocr_service,
    get_mineru_ocr_service,
    get_glmocr_service,
)
from app.core.config import get_settings
from app.core.logging_config import get_logger, RequestIDAdapter
from app.schemas.image import ImageOCRRequest, ImageOCRResponse
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector
from app.services.ocr_service import OCRService, MineruOCRService, GLMOCRService

settings = get_settings()

router = APIRouter()
logger = get_logger()

# Substrings of a GLM-OCR error message that this endpoint treats as
# "the request exceeded the model's token limit" and therefore retries
# with MinerU instead of failing the request.
_TOKEN_LIMIT_MARKERS = ("max_model_len", "decoder prompt", "BadRequestError")


def _is_token_limit_error(error_msg: str) -> bool:
    """Return True if *error_msg* contains any of the token-limit markers."""
    return any(marker in error_msg for marker in _TOKEN_LIMIT_MARKERS)


def _encode_png_stream(image) -> BytesIO:
    """Encode *image* as PNG and return it as an in-memory byte stream.

    Args:
        image: Image in a form accepted by ``cv2.imencode`` (presumably the
            array returned by ``ImageProcessor.preprocess`` -- confirm).

    Returns:
        A ``BytesIO`` holding the PNG bytes, positioned at the start.

    Raises:
        RuntimeError: If OpenCV reports that encoding failed.
    """
    success, encoded_image = cv2.imencode(".png", image)
    if not success:
        raise RuntimeError("Failed to encode image")
    # A freshly constructed BytesIO is already positioned at offset 0,
    # so no explicit seek is needed before handing it to a reader.
    return BytesIO(encoded_image.tobytes())


@router.post("/ocr", response_model=ImageOCRResponse)
async def process_image_ocr(
    request: ImageOCRRequest,
    http_request: Request,
    response: Response,
    image_processor: ImageProcessor = Depends(get_image_processor),
    layout_detector: LayoutDetector = Depends(get_layout_detector),
    mineru_service: MineruOCRService = Depends(get_mineru_ocr_service),
    paddle_service: OCRService = Depends(get_ocr_service),
    glmocr_service: GLMOCRService = Depends(get_glmocr_service),
) -> ImageOCRResponse:
    """Process an image and extract content as LaTeX, Markdown, and MathML.

    The processing pipeline:
    1. Load the image from ``image_url`` or ``image_base64``.
    2. Run layout detection. When ``settings.is_padding`` is enabled the
       detector receives a padded copy; recognition always uses the
       unpadded image.
    3. Based on the layout result:
       - If mixed recognition is indicated: recognize with MinerU.
       - Otherwise: recognize with GLM-OCR, falling back to MinerU when
         GLM-OCR fails with what looks like a token-limit error.
    4. Return the ``latex``, ``markdown``, ``mathml`` and ``mml`` fields of
       the recognition result (missing fields become empty strings).

    Note: OMML conversion is not included due to performance overhead.
    Use the /convert/latex-to-omml endpoint to convert LaTeX to OMML separately.

    Args:
        request: Request body carrying ``image_url`` and/or ``image_base64``.
        http_request: Incoming HTTP request; read for the ``x-request-id``
            header.
        response: Outgoing response; the request ID is echoed in its
            ``x-request-id`` header.
        image_processor: Loads the image and optionally pads it.
        layout_detector: Produces the layout info that selects the OCR path.
        mineru_service: Recognizer for the mixed path and the fallback.
        paddle_service: Injected but not used by this handler; kept so the
            endpoint's dependency signature stays unchanged.
        glmocr_service: Recognizer tried first on the formula-only path.

    Returns:
        The recognition result wrapped in an ``ImageOCRResponse``.

    Raises:
        HTTPException: 503 when the pipeline raises ``RuntimeError``; 500 for
            any other unexpected error. An ``HTTPException`` raised inside
            the pipeline is propagated unchanged.
    """
    # Get or generate request ID
    request_id = http_request.headers.get("x-request-id", str(uuid.uuid4()))
    response.headers["x-request-id"] = request_id

    # Create logger adapter with request_id
    log = RequestIDAdapter(logger, {"request_id": request_id})
    log.request_id = request_id

    try:
        log.info("Starting image OCR processing")

        # Durations use perf_counter: it is monotonic, so measurements are
        # not distorted by wall-clock adjustments during the request.
        preprocess_start = time.perf_counter()

        # Load only; padding (if any) is applied to a separate copy below.
        image = image_processor.preprocess(
            image_url=request.image_url,
            image_base64=request.image_base64,
        )

        # Apply padding only for layout detection
        processed_image = image
        if image_processor and settings.is_padding:
            processed_image = image_processor.add_padding(image)

        preprocess_time = time.perf_counter() - preprocess_start
        log.debug(f"Image loading completed in {preprocess_time:.3f}s")

        # Layout detection (using padded image if padding is enabled)
        layout_start = time.perf_counter()
        layout_info = layout_detector.detect(processed_image)
        layout_time = time.perf_counter() - layout_start
        log.info(f"Layout detection completed in {layout_time:.3f}s")

        # OCR recognition (use original image without padding)
        ocr_start = time.perf_counter()

        if layout_info.MixedRecognition:
            recognition_method = "MixedRecognition (MinerU)"
            log.info(f"Using {recognition_method}")
            ocr_result = mineru_service.recognize(_encode_png_stream(image))
        else:
            recognition_method = "FormulaOnly (GLMOCR)"
            log.info(f"Using {recognition_method}")

            # Try GLM-OCR first, fallback to MinerU if token limit exceeded
            try:
                ocr_result = glmocr_service.recognize(image)
            except Exception as e:
                error_msg = str(e)
                if not _is_token_limit_error(error_msg):
                    # Not a token-limit failure: let the outer handlers
                    # translate it into an HTTP error.
                    raise

                log.warning(f"GLM-OCR failed due to token limit: {error_msg}")
                log.info("Falling back to MinerU for recognition")
                recognition_method = "FormulaOnly (MinerU fallback)"
                ocr_result = mineru_service.recognize(_encode_png_stream(image))

        ocr_time = time.perf_counter() - ocr_start
        total_time = time.perf_counter() - preprocess_start

        log.info(f"OCR processing completed - Method: {recognition_method}, "
                 f"Layout time: {layout_time:.3f}s, OCR time: {ocr_time:.3f}s, "
                 f"Total time: {total_time:.3f}s")

    except HTTPException:
        # If a collaborator already raised an HTTPException, keep its status
        # code and detail instead of masking it as a generic 500 below.
        raise
    except RuntimeError as e:
        log.error(f"OCR processing failed: {str(e)}", exc_info=True)
        raise HTTPException(status_code=503, detail=str(e)) from e
    except Exception as e:
        log.error(f"Unexpected error during OCR processing: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error") from e

    return ImageOCRResponse(
        latex=ocr_result.get("latex", ""),
        markdown=ocr_result.get("markdown", ""),
        mathml=ocr_result.get("mathml", ""),
        mml=ocr_result.get("mml", ""),
    )