# doc_processer/app/services/ocr_service.py

"""PaddleOCR-VL client service for text and formula recognition."""

import numpy as np
import cv2
import requests
from io import BytesIO
from app.core.config import get_settings
from paddleocr import PaddleOCRVL
from typing import Optional
from app.services.layout_detector import LayoutDetector
from app.services.image_processor import ImageProcessor
from app.services.converter import Converter
from abc import ABC, abstractmethod

settings = get_settings()

class OCRServiceBase(ABC):
    """Abstract interface shared by the OCR service implementations.

    Concrete subclasses must implement :meth:`recognize`.
    """

    @abstractmethod
    def recognize(self, image: np.ndarray) -> dict:
        """Run recognition on ``image`` and return the results as a dict.

        Args:
            image: Input image as a numpy array.

        Returns:
            Dict of recognition results; the implementations in this module
            return 'markdown', 'latex' and 'mathml' keys.
        """


class OCRService(OCRServiceBase):
    """Service for OCR using PaddleOCR-VL.

    A layout detector decides, per image, whether the content is treated as
    mixed text/formula content or as a single formula; the matching
    PaddleOCR-VL prediction mode is then run and its markdown output is
    converted to LaTeX and MathML by the injected converter.
    """

    # Pipeline shared by every instance of the class and created lazily by
    # `_get_pipeline`. NOTE: it is built with the server URL of whichever
    # instance triggers creation first; instances constructed later with a
    # different `vl_server_url` reuse that same pipeline.
    _pipeline: Optional[PaddleOCRVL] = None
    # Not referenced anywhere in this class; instances use the
    # `layout_detector` attribute assigned in `__init__` instead.
    _layout_detector: Optional[LayoutDetector] = None

    def __init__(
        self,
        vl_server_url: str,
        layout_detector: LayoutDetector,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Initialize OCR service.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL. A falsy
                value falls back to ``settings.paddleocr_vl_url``.
            layout_detector: Layout detector instance.
            image_processor: Image processor instance.
            converter: Converter used to derive LaTeX and MathML from the
                recognized markdown.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
        self.layout_detector = layout_detector
        self.image_processor = image_processor
        self.converter = converter

    def _get_pipeline(self):
        """Get or create PaddleOCR-VL pipeline.

        Returns:
            PaddleOCRVL pipeline instance (cached on the class).
        """
        if OCRService._pipeline is None:
            OCRService._pipeline = PaddleOCRVL(
                vl_rec_backend="vllm-server",
                vl_rec_server_url=self.vl_server_url,
                layout_detection_model_name="PP-DocLayoutV2",
            )
        return OCRService._pipeline

    def _predict_markdown(self, image: np.ndarray, **predict_kwargs) -> str:
        """Run the pipeline on ``image`` and concatenate the markdown output.

        Args:
            image: Input image as numpy array in BGR format.
            **predict_kwargs: Keyword arguments forwarded unchanged to the
                pipeline's ``predict`` call.

        Returns:
            The 'markdown_texts' of every prediction result, joined in order
            (results without that key contribute an empty string).
        """
        pipeline = self._get_pipeline()
        output = pipeline.predict(image, **predict_kwargs)
        return "".join(res.markdown.get("markdown_texts", "") for res in output)

    def _recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.

        This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
        recognition of mixed content.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml' keys.

        Raises:
            RuntimeError: If pipeline creation, prediction or conversion fails.
        """
        try:
            markdown_content = self._predict_markdown(
                image, use_layout_detection=True
            )
            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
                "markdown": markdown_content,
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

    def _recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula/math content using PaddleOCR-VL with prompt.

        This mode uses PaddleOCR-VL directly with a formula recognition
        prompt and with layout detection disabled.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml' keys.

        Raises:
            RuntimeError: If pipeline creation, prediction or conversion fails.
        """
        try:
            markdown_content = self._predict_markdown(
                image, use_layout_detection=False, prompt_label="formula"
            )
            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
                "markdown": markdown_content,
            }
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using PaddleOCR-VL.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml' keys.

        Raises:
            RuntimeError: If the selected recognition mode fails.
        """
        # Layout detection runs on a padded copy of the image; recognition
        # below is handed the original, unpadded image.
        # NOTE(review): confirm that recognizing the unpadded image is intended.
        padded_image = self.image_processor.add_padding(image)
        layout_info = self.layout_detector.detect(padded_image)
        if layout_info.MixedRecognition:
            return self._recognize_mixed(image)
        return self._recognize_formula(image)


class MineruOCRService(OCRServiceBase):
    """Service for OCR using local file_parse API.

    The image is PNG-encoded, uploaded as multipart form data to the
    configured endpoint, and the markdown found in the JSON response is
    optionally converted to LaTeX and MathML.
    """

    # File name under which the encoded image is uploaded.
    _UPLOAD_FILENAME = "image.png"
    # Key looked up under 'results' in the response. It equals the stem of
    # _UPLOAD_FILENAME, so the two must be changed together (presumably the
    # API keys its results by uploaded file stem; TODO confirm).
    _RESULT_KEY = "image"

    # Form fields sent with every request.
    # NOTE(review): 'server_url': 'string' looks like a placeholder value;
    # confirm it is what the endpoint expects for this backend.
    _FORM_FIELDS = {
        'return_middle_json': 'false',
        'return_model_output': 'false',
        'return_md': 'true',
        'return_images': 'false',
        'end_page_id': '99999',
        'parse_method': 'auto',
        'start_page_id': '0',
        'lang_list': 'en',
        'server_url': 'string',
        'return_content_list': 'false',
        'backend': 'hybrid-auto-engine',
        'table_enable': 'true',
        'response_format_zip': 'false',
        'formula_enable': 'true',
    }

    def __init__(
        self,
        api_url: str = "http://127.0.0.1:8000/file_parse",
        converter: Optional[Converter] = None,
        timeout: float = 30,
    ):
        """Initialize Local API service.

        Args:
            api_url: URL of the local file_parse API endpoint.
            converter: Optional converter instance for format conversion.
            timeout: Timeout in seconds passed to the HTTP request.
        """
        self.api_url = api_url
        self.converter = converter
        self.timeout = timeout

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using local file_parse API.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml' keys. 'latex' and
            'mathml' are empty strings when no converter is configured or
            no markdown was returned.

        Raises:
            RuntimeError: If the HTTP request fails, or if encoding the
                image, decoding the response or converting the markdown
                fails.
        """
        try:
            # Convert numpy array to image bytes
            success, encoded_image = cv2.imencode('.png', image)
            if not success:
                raise RuntimeError("Failed to encode image")

            image_bytes = BytesIO(encoded_image.tobytes())

            # Prepare multipart form data
            files = {
                'files': (self._UPLOAD_FILENAME, image_bytes, 'image/png')
            }

            # Make API request; a copy of the shared form fields is sent so
            # the class-level template can never be altered by a request.
            response = requests.post(
                self.api_url,
                files=files,
                data=dict(self._FORM_FIELDS),
                headers={'accept': 'application/json'},
                timeout=self.timeout,
            )
            response.raise_for_status()

            result = response.json()

            # Extract markdown content from response
            markdown_content = ""
            if 'results' in result and self._RESULT_KEY in result['results']:
                markdown_content = result['results'][self._RESULT_KEY].get(
                    'md_content', ''
                )

            # Convert to other formats if converter is available
            latex = ""
            mathml = ""
            if self.converter and markdown_content:
                convert_result = self.converter.convert_to_formats(markdown_content)
                latex = convert_result.latex
                mathml = convert_result.mathml

            return {
                "markdown": markdown_content,
                "latex": latex,
                "mathml": mathml,
            }

        except requests.RequestException as e:
            raise RuntimeError(f"Local API request failed: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Recognition failed: {e}") from e


if __name__ == "__main__":
    # Manual smoke test: send one sample image to the default local
    # file_parse endpoint and print the recognition result.
    mineru_service = MineruOCRService()
    image_path = "test/complex_formula.png"
    image = cv2.imread(image_path)
    # cv2.imread signals a missing or unreadable file by returning None
    # instead of raising, so fail here with a clear message.
    if image is None:
        raise SystemExit(f"Could not read image: {image_path}")
    ocr_result = mineru_service.recognize(image)
    print(ocr_result)