"""PaddleOCR-VL client service for text and formula recognition."""

from typing import Optional

import numpy as np
from paddleocr import PaddleOCRVL

from app.core.config import get_settings
from app.services.layout_detector import LayoutDetector
from app.services.image_processor import ImageProcessor
from app.services.converter import Converter

settings = get_settings()


class OCRService:
    """Service for OCR using PaddleOCR-VL."""

    # Pipeline shared by every OCRService instance; created lazily by
    # _get_pipeline() on first use.
    _pipeline: Optional[PaddleOCRVL] = None
    # Not referenced by the code in this module; kept so that any external
    # code reading the attribute keeps working.
    _layout_detector: Optional[LayoutDetector] = None

    def __init__(
        self,
        vl_server_url: str,
        layout_detector: LayoutDetector,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Initialize OCR service.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL. A falsy
                value (e.g. "" or None) falls back to
                ``settings.paddleocr_vl_url``.
            layout_detector: Layout detector instance.
            image_processor: Image processor instance.
            converter: Converter instance used to derive the LaTeX and MathML
                outputs from the recognized markdown.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
        self.layout_detector = layout_detector
        self.image_processor = image_processor
        self.converter = converter

    def _get_pipeline(self) -> PaddleOCRVL:
        """Get or create PaddleOCR-VL pipeline.

        The pipeline is cached on the class, so the instance that triggers
        creation decides the server URL used by all later instances.

        Returns:
            PaddleOCRVL pipeline instance.
        """
        if OCRService._pipeline is None:
            OCRService._pipeline = PaddleOCRVL(
                vl_rec_backend="vllm-server",
                vl_rec_server_url=self.vl_server_url,
                layout_detection_model_name="PP-DocLayoutV2",
            )
        return OCRService._pipeline

    def _predict_markdown(self, image: np.ndarray, **predict_kwargs) -> str:
        """Run the pipeline on ``image`` and concatenate the markdown texts.

        Args:
            image: Input image as numpy array in BGR format.
            **predict_kwargs: Keyword arguments forwarded unchanged to
                ``pipeline.predict``.

        Returns:
            The ``markdown_texts`` entries of all results, joined in order.
        """
        pipeline = self._get_pipeline()
        output = pipeline.predict(image, **predict_kwargs)
        return "".join(res.markdown.get("markdown_texts", "") for res in output)

    def recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.

        This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
        recognition of mixed content.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml' keys.

        Raises:
            RuntimeError: If pipeline creation, prediction or format
                conversion fails; the original exception is chained.
        """
        try:
            markdown_content = self._predict_markdown(image, use_layout_detection=True)
            convert_result = self.converter.convert_to_formats(markdown_content)
            return {
                "markdown": markdown_content,
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

    def recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula/math content using PaddleOCR-VL with prompt.

        This mode uses PaddleOCR-VL directly with a formula recognition
        prompt and layout detection disabled.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml' keys.

        Raises:
            RuntimeError: If pipeline creation, prediction or format
                conversion fails; the original exception is chained.
        """
        try:
            markdown_content = self._predict_markdown(
                image, use_layout_detection=False, prompt_label="formula"
            )
            convert_result = self.converter.convert_to_formats(markdown_content)
            return {
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
                "markdown": markdown_content,
            }
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using PaddleOCR-VL.

        The layout detector decides whether the image is handled as mixed
        content or as a single formula.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml' keys.

        Raises:
            RuntimeError: Propagated from recognize_mixed / recognize_formula.
        """
        # The padded copy is used for layout detection only; recognition
        # below runs on the original image.
        # NOTE(review): confirm that recognizing the unpadded image is intended.
        padded_image = self.image_processor.add_padding(image)
        layout_info = self.layout_detector.detect(padded_image)
        if layout_info.MixedRecognition:
            return self.recognize_mixed(image)
        return self.recognize_formula(image)


if __name__ == "__main__":
    import cv2

    image_path = "test/image.png"

    image_processor = ImageProcessor(padding_ratio=0.15)
    layout_detector = LayoutDetector()
    # NOTE(review): assumes Converter can be constructed without arguments;
    # adjust if its constructor requires configuration.
    ocr_service = OCRService(
        vl_server_url=settings.paddleocr_vl_url,
        layout_detector=layout_detector,
        image_processor=image_processor,
        converter=Converter(),
    )

    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable or missing file by returning None
    # rather than raising.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    ocr_result = ocr_service.recognize(image)
    print(ocr_result)