"""PaddleOCR-VL client service for text and formula recognition.""" import re import numpy as np import cv2 import requests from io import BytesIO from app.core.config import get_settings from paddleocr import PaddleOCRVL from typing import Optional from app.services.layout_detector import LayoutDetector from app.services.image_processor import ImageProcessor from app.services.converter import Converter from abc import ABC, abstractmethod settings = get_settings() _COMMANDS_NEED_SPACE = { # operators / calculus "cdot", "times", "div", "pm", "mp", "int", "iint", "iiint", "oint", "sum", "prod", "lim", # common functions "sin", "cos", "tan", "cot", "sec", "csc", "log", "ln", "exp", # misc "partial", "nabla", } _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL) _COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+") # stage2: differentials inside math segments _DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(? str: """Split OCR-glued LaTeX command token by whitelist longest-prefix. Examples: - \\cdotdS -> \\cdot dS - \\intdx -> \\int dx """ if not token.startswith("\\"): return token body = token[1:] if len(body) < 2: return token best = None # longest prefix that is in whitelist for i in range(1, len(body)): prefix = body[:i] if prefix in _COMMANDS_NEED_SPACE: best = prefix if not best: return token suffix = body[len(best) :] if not suffix: return token return f"\\{best} {suffix}" def _postprocess_math(expr: str) -> str: """Postprocess a *math* expression (already inside $...$ or $$...$$).""" # stage1: split glued command tokens (e.g. \cdotdS) expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr) # stage2: normalize differentials (keep conservative) expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr) expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr) return expr def _postprocess_markdown(markdown_content: str) -> str: """Apply LaTeX postprocessing only within $...$ / $$...$$ segments.""" if not markdown_content: return markdown_content def _fix_segment(m: re.Match) -> str: seg = m.group(0) if seg.startswith("$$") and seg.endswith("$$"): return f"$${_postprocess_math(seg[2:-2])}$$" if seg.startswith("$") and seg.endswith("$"): return f"${_postprocess_math(seg[1:-1])}$" return seg return _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content) class OCRServiceBase(ABC): @abstractmethod def recognize(self, image: np.ndarray) -> dict: pass class OCRService(OCRServiceBase): """Service for OCR using PaddleOCR-VL.""" _pipeline: Optional[PaddleOCRVL] = None _layout_detector: Optional[LayoutDetector] = None def __init__( self, vl_server_url: str, layout_detector: LayoutDetector, image_processor: ImageProcessor, converter: Converter, ): """Initialize OCR service. Args: vl_server_url: URL of the vLLM server for PaddleOCR-VL. layout_detector: Layout detector instance. image_processor: Image processor instance. """ self.vl_server_url = vl_server_url or settings.paddleocr_vl_url self.layout_detector = layout_detector self.image_processor = image_processor self.converter = converter def _get_pipeline(self): """Get or create PaddleOCR-VL pipeline. Returns: PaddleOCRVL pipeline instance. """ if OCRService._pipeline is None: OCRService._pipeline = PaddleOCRVL( vl_rec_backend="vllm-server", vl_rec_server_url=self.vl_server_url, layout_detection_model_name="PP-DocLayoutV2", ) return OCRService._pipeline def _recognize_mixed(self, image: np.ndarray) -> dict: """Recognize mixed content (text + formulas) using PP-DocLayoutV2. This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware recognition of mixed content. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'markdown', 'latex', 'mathml' keys. """ try: pipeline = self._get_pipeline() output = pipeline.predict(image, use_layout_detection=True) markdown_content = "" for res in output: markdown_content += res.markdown.get("markdown_texts", "") markdown_content = _postprocess_markdown(markdown_content) convert_result = self.converter.convert_to_formats(markdown_content) return { "markdown": markdown_content, "latex": convert_result.latex, "mathml": convert_result.mathml, "mml": convert_result.mml, } except Exception as e: raise RuntimeError(f"Mixed recognition failed: {e}") from e def _recognize_formula(self, image: np.ndarray) -> dict: """Recognize formula/math content using PaddleOCR-VL with prompt. This mode uses PaddleOCR-VL directly with a formula recognition prompt. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'latex', 'markdown', 'mathml' keys. """ try: pipeline = self._get_pipeline() output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula") markdown_content = "" for res in output: markdown_content += res.markdown.get("markdown_texts", "") markdown_content = _postprocess_markdown(markdown_content) convert_result = self.converter.convert_to_formats(markdown_content) return { "latex": convert_result.latex, "mathml": convert_result.mathml, "mml": convert_result.mml, "markdown": markdown_content, } except Exception as e: raise RuntimeError(f"Formula recognition failed: {e}") from e def recognize(self, image: np.ndarray) -> dict: """Recognize content using PaddleOCR-VL. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'latex', 'markdown', 'mathml' keys. """ padded_image = self.image_processor.add_padding(image) layout_info = self.layout_detector.detect(padded_image) if layout_info.MixedRecognition: return self._recognize_mixed(image) else: return self._recognize_formula(image) class MineruOCRService(OCRServiceBase): """Service for OCR using local file_parse API.""" def __init__( self, api_url: str = "http://127.0.0.1:8000/file_parse", image_processor: Optional[ImageProcessor] = None, converter: Optional[Converter] = None, ): """Initialize Local API service. Args: api_url: URL of the local file_parse API endpoint. converter: Optional converter instance for format conversion. """ self.api_url = api_url self.image_processor = image_processor self.converter = converter def recognize(self, image: np.ndarray) -> dict: """Recognize content using local file_parse API. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'markdown', 'latex', 'mathml' keys. """ try: if self.image_processor: image = self.image_processor.add_padding(image) # Convert numpy array to image bytes success, encoded_image = cv2.imencode(".png", image) if not success: raise RuntimeError("Failed to encode image") image_bytes = BytesIO(encoded_image.tobytes()) # Prepare multipart form data files = {"files": ("image.png", image_bytes, "image/png")} data = { "return_middle_json": "false", "return_model_output": "false", "return_md": "true", "return_images": "false", "end_page_id": "99999", "start_page_id": "0", "lang_list": "en", "server_url": "string", "return_content_list": "false", "backend": "hybrid-auto-engine", "table_enable": "true", "response_format_zip": "false", "formula_enable": "true", "parse_method": "ocr", } # Make API request response = requests.post(self.api_url, files=files, data=data, headers={"accept": "application/json"}, timeout=30) response.raise_for_status() result = response.json() # Extract markdown content from response markdown_content = "" if "results" in result and "image" in result["results"]: markdown_content = result["results"]["image"].get("md_content", "") # markdown_content = _postprocess_markdown(markdown_content) # Convert to other formats if converter is available latex = "" mathml = "" mml = "" if self.converter and markdown_content: convert_result = self.converter.convert_to_formats(markdown_content) latex = convert_result.latex mathml = convert_result.mathml mml = convert_result.mml return { "markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml, } except requests.RequestException as e: raise RuntimeError(f"Local API request failed: {e}") from e except Exception as e: raise RuntimeError(f"Recognition failed: {e}") from e if __name__ == "__main__": mineru_service = MineruOCRService() image = cv2.imread("test/complex_formula.png") image_numpy = np.array(image) ocr_result = mineru_service.recognize(image_numpy) print(ocr_result)