"""Image OCR endpoint.""" import time import uuid from fastapi import APIRouter, Depends, HTTPException, Request, Response from app.core.dependencies import ( get_glmocr_endtoend_service, get_image_processor, ) from app.core.logging_config import RequestIDAdapter, get_logger from app.schemas.image import ImageOCRRequest, ImageOCRResponse from app.services.image_processor import ImageProcessor from app.services.ocr_service import GLMOCREndToEndService router = APIRouter() logger = get_logger() @router.post("/ocr", response_model=ImageOCRResponse) async def process_image_ocr( request: ImageOCRRequest, http_request: Request, response: Response, image_processor: ImageProcessor = Depends(get_image_processor), glmocr_service: GLMOCREndToEndService = Depends(get_glmocr_endtoend_service), ) -> ImageOCRResponse: """Process an image and extract content as LaTeX, Markdown, and MathML. The processing pipeline: 1. Load and preprocess image 2. Detect layout regions using PP-DocLayoutV3 3. Crop each region and recognize with GLM-OCR via vLLM (task-specific prompts) 4. Aggregate region results into Markdown 5. Convert to LaTeX, Markdown, and MathML formats Note: OMML conversion is not included due to performance overhead. Use the /convert/latex-to-omml endpoint to convert LaTeX to OMML separately. """ request_id = http_request.headers.get("x-request-id", str(uuid.uuid4())) response.headers["x-request-id"] = request_id log = RequestIDAdapter(logger, {"request_id": request_id}) log.request_id = request_id try: log.info("Starting image OCR processing") start = time.time() image = image_processor.preprocess( image_url=request.image_url, image_base64=request.image_base64, ) ocr_result = glmocr_service.recognize(image) log.info(f"OCR completed in {time.time() - start:.3f}s") except RuntimeError as e: log.error(f"OCR processing failed: {str(e)}", exc_info=True) raise HTTPException(status_code=503, detail=str(e)) except Exception as e: log.error(f"Unexpected error during OCR processing: {str(e)}", exc_info=True) raise HTTPException(status_code=500, detail="Internal server error") return ImageOCRResponse( latex=ocr_result.get("latex", ""), markdown=ocr_result.get("markdown", ""), mathml=ocr_result.get("mathml", ""), mml=ocr_result.get("mml", ""), )