"""PaddleOCR-VL client service for text and formula recognition.""" import numpy as np import cv2 import requests from io import BytesIO from app.core.config import get_settings from paddleocr import PaddleOCRVL from typing import Optional from app.services.layout_detector import LayoutDetector from app.services.image_processor import ImageProcessor from app.services.converter import Converter from abc import ABC, abstractmethod settings = get_settings() class OCRServiceBase(ABC): @abstractmethod def recognize(self, image: np.ndarray) -> dict: pass class OCRService(OCRServiceBase): """Service for OCR using PaddleOCR-VL.""" _pipeline: Optional[PaddleOCRVL] = None _layout_detector: Optional[LayoutDetector] = None def __init__( self, vl_server_url: str, layout_detector: LayoutDetector, image_processor: ImageProcessor, converter: Converter, ): """Initialize OCR service. Args: vl_server_url: URL of the vLLM server for PaddleOCR-VL. layout_detector: Layout detector instance. image_processor: Image processor instance. """ self.vl_server_url = vl_server_url or settings.paddleocr_vl_url self.layout_detector = layout_detector self.image_processor = image_processor self.converter = converter def _get_pipeline(self): """Get or create PaddleOCR-VL pipeline. Returns: PaddleOCRVL pipeline instance. """ if OCRService._pipeline is None: OCRService._pipeline = PaddleOCRVL( vl_rec_backend="vllm-server", vl_rec_server_url=self.vl_server_url, layout_detection_model_name="PP-DocLayoutV2", ) return OCRService._pipeline def _recognize_mixed(self, image: np.ndarray) -> dict: """Recognize mixed content (text + formulas) using PP-DocLayoutV2. This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware recognition of mixed content. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'markdown', 'latex', 'mathml' keys. """ try: pipeline = self._get_pipeline() output = pipeline.predict(image, use_layout_detection=True) markdown_content = "" for res in output: markdown_content += res.markdown.get("markdown_texts", "") convert_result = self.converter.convert_to_formats(markdown_content) return { "markdown": markdown_content, "latex": convert_result.latex, "mathml": convert_result.mathml, } except Exception as e: raise RuntimeError(f"Mixed recognition failed: {e}") from e def _recognize_formula(self, image: np.ndarray) -> dict: """Recognize formula/math content using PaddleOCR-VL with prompt. This mode uses PaddleOCR-VL directly with a formula recognition prompt. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'latex', 'markdown', 'mathml' keys. """ try: pipeline = self._get_pipeline() output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula") markdown_content = "" for res in output: markdown_content += res.markdown.get("markdown_texts", "") convert_result = self.converter.convert_to_formats(markdown_content) return { "latex": convert_result.latex, "mathml": convert_result.mathml, "markdown": markdown_content, } except Exception as e: raise RuntimeError(f"Formula recognition failed: {e}") from e def recognize(self, image: np.ndarray) -> dict: """Recognize content using PaddleOCR-VL. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'latex', 'markdown', 'mathml' keys. """ padded_image = self.image_processor.add_padding(image) layout_info = self.layout_detector.detect(padded_image) if layout_info.MixedRecognition: return self._recognize_mixed(image) else: return self._recognize_formula(image) class MineruOCRService(OCRServiceBase): """Service for OCR using local file_parse API.""" def __init__( self, api_url: str = "http://127.0.0.1:8000/file_parse", converter: Optional[Converter] = None, ): """Initialize Local API service. Args: api_url: URL of the local file_parse API endpoint. converter: Optional converter instance for format conversion. """ self.api_url = api_url self.converter = converter def recognize(self, image: np.ndarray) -> dict: """Recognize content using local file_parse API. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'markdown', 'latex', 'mathml' keys. """ try: # Convert numpy array to image bytes success, encoded_image = cv2.imencode('.png', image) if not success: raise RuntimeError("Failed to encode image") image_bytes = BytesIO(encoded_image.tobytes()) # Prepare multipart form data files = { 'files': ('image.png', image_bytes, 'image/png') } data = { 'return_middle_json': 'false', 'return_model_output': 'false', 'return_md': 'true', 'return_images': 'false', 'end_page_id': '99999', 'parse_method': 'auto', 'start_page_id': '0', 'lang_list': 'en', 'server_url': 'string', 'return_content_list': 'false', 'backend': 'hybrid-auto-engine', 'table_enable': 'true', 'response_format_zip': 'false', 'formula_enable': 'true', } # Make API request response = requests.post( self.api_url, files=files, data=data, headers={'accept': 'application/json'}, timeout=30 ) response.raise_for_status() result = response.json() # Extract markdown content from response markdown_content = "" if 'results' in result and 'image' in result['results']: markdown_content = result['results']['image'].get('md_content', '') # Convert to other formats if converter is available latex = "" mathml = "" if self.converter and markdown_content: convert_result = self.converter.convert_to_formats(markdown_content) latex = convert_result.latex mathml = convert_result.mathml return { "markdown": markdown_content, "latex": latex, "mathml": mathml, } except requests.RequestException as e: raise RuntimeError(f"Local API request failed: {e}") from e except Exception as e: raise RuntimeError(f"Recognition failed: {e}") from e if __name__ == "__main__": mineru_service = MineruOCRService() image = cv2.imread("test/complex_formula.png") image_numpy = np.array(image) ocr_result = mineru_service.recognize(image_numpy) print(ocr_result)