# app/services/ocr_service.py

"""PaddleOCR-VL client service for text and formula recognition."""

import numpy as np
from app.core.config import get_settings
from paddleocr import PaddleOCRVL
from typing import Optional
from app.services.layout_detector import LayoutDetector
from app.services.image_processor import ImageProcessor
from app.services.converter import Converter

settings = get_settings()


class OCRService:
    """Service for OCR using PaddleOCR-VL.

    The PaddleOCR-VL pipeline is created lazily on first use and cached on the
    class, so all instances share a single pipeline. The cached pipeline keeps
    the server URL of whichever instance triggered its creation.
    """

    # Class-level pipeline cache, populated by _get_pipeline().
    _pipeline: Optional[PaddleOCRVL] = None
    # Not read or written anywhere in this class; kept for compatibility.
    _layout_detector: Optional[LayoutDetector] = None

    def __init__(
        self,
        vl_server_url: Optional[str],
        layout_detector: LayoutDetector,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Initialize OCR service.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL. A falsy
                value (``None`` or ``""``) falls back to
                ``settings.paddleocr_vl_url``.
            layout_detector: Layout detector instance.
            image_processor: Image processor instance.
            converter: Converter instance used to derive the 'latex' and
                'mathml' outputs from the recognized markdown.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
        self.layout_detector = layout_detector
        self.image_processor = image_processor
        self.converter = converter

    def _get_pipeline(self):
        """Get or create PaddleOCR-VL pipeline.

        The pipeline is stored on the class, so it is built at most once and
        then reused by every instance.

        Returns:
            PaddleOCRVL pipeline instance.
        """
        if OCRService._pipeline is None:
            OCRService._pipeline = PaddleOCRVL(
                vl_rec_backend="vllm-server",
                vl_rec_server_url=self.vl_server_url,
                layout_detection_model_name="PP-DocLayoutV2",
            )
        return OCRService._pipeline

    def _predict_markdown(self, image: np.ndarray, **predict_kwargs) -> str:
        """Run the pipeline on ``image`` and return the concatenated markdown.

        Args:
            image: Input image as numpy array in BGR format.
            **predict_kwargs: Keyword arguments forwarded to
                ``pipeline.predict``.

        Returns:
            The 'markdown_texts' entries of all results joined in order;
            results without that key contribute an empty string.
        """
        pipeline = self._get_pipeline()
        output = pipeline.predict(image, **predict_kwargs)
        return "".join(res.markdown.get("markdown_texts", "") for res in output)

    def recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.

        This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
        recognition of mixed content.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml' keys.

        Raises:
            RuntimeError: If pipeline creation, prediction or format
                conversion fails; the original exception is chained.
        """
        try:
            markdown_content = self._predict_markdown(
                image, use_layout_detection=True
            )
            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
                "markdown": markdown_content,
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

    def recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula/math content using PaddleOCR-VL with prompt.

        This mode uses PaddleOCR-VL directly with a formula recognition prompt
        and with layout detection disabled.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml' keys.

        Raises:
            RuntimeError: If pipeline creation, prediction or format
                conversion fails; the original exception is chained.
        """
        try:
            markdown_content = self._predict_markdown(
                image, use_layout_detection=False, prompt_label="formula"
            )
            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
                "markdown": markdown_content,
            }
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using PaddleOCR-VL.

        Layout detection decides between mixed-content and formula-only
        recognition.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml' keys.

        Raises:
            RuntimeError: Propagated from the selected recognition mode.
        """
        # The padded copy is used for layout detection only; recognition
        # itself runs on the original, unpadded image.
        padded_image = self.image_processor.add_padding(image)
        layout_info = self.layout_detector.detect(padded_image)
        if layout_info.MixedRecognition:
            return self.recognize_mixed(image)
        return self.recognize_formula(image)


if __name__ == "__main__":
    # Manual smoke test: run recognition on a sample image and print the result.
    import cv2

    image_path = "test/image.png"
    image_processor = ImageProcessor(padding_ratio=0.15)
    layout_detector = LayoutDetector()
    # OCRService.__init__ has four required parameters; omitting
    # vl_server_url or converter raises TypeError before anything runs.
    ocr_service = OCRService(
        vl_server_url=settings.paddleocr_vl_url,
        layout_detector=layout_detector,
        image_processor=image_processor,
        # NOTE(review): assumes Converter can be built without arguments — confirm.
        converter=Converter(),
    )
    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable/missing file by returning None
    # instead of raising, so fail here with a clear message.
    if image is None:
        raise SystemExit(f"Could not read image: {image_path}")
    ocr_result = ocr_service.recognize(image)
    print(ocr_result)
init repo 2025-12-29 17:34:58 +08:00			`"""PaddleOCR-VL client service for text and formula recognition."""`

			`import numpy as np`
			`from app.core.config import get_settings`
fix: refact logic 2025-12-31 17:38:32 +08:00			`from paddleocr import PaddleOCRVL`
			`from typing import Optional`
			`from app.services.layout_detector import LayoutDetector`
			`from app.services.image_processor import ImageProcessor`
			`from app.services.converter import Converter`
init repo 2025-12-29 17:34:58 +08:00
			`settings = get_settings()`


			`class OCRService:`
			`"""Service for OCR using PaddleOCR-VL."""`

fix: refact logic 2025-12-31 17:38:32 +08:00			`_pipeline: Optional[PaddleOCRVL] = None`
			`_layout_detector: Optional[LayoutDetector] = None`
init repo 2025-12-29 17:34:58 +08:00
			`def __init__(`
			`self,`
fix: refact logic 2025-12-31 17:38:32 +08:00			`vl_server_url: str,`
			`layout_detector: LayoutDetector,`
			`image_processor: ImageProcessor,`
			`converter: Converter,`
init repo 2025-12-29 17:34:58 +08:00			`):`
			`"""Initialize OCR service.`

			`Args:`
			`vl_server_url: URL of the vLLM server for PaddleOCR-VL.`
fix: refact logic 2025-12-31 17:38:32 +08:00			`layout_detector: Layout detector instance.`
			`image_processor: Image processor instance.`
init repo 2025-12-29 17:34:58 +08:00			`"""`
			`self.vl_server_url = vl_server_url or settings.paddleocr_vl_url`
fix: refact logic 2025-12-31 17:38:32 +08:00			`self.layout_detector = layout_detector`
			`self.image_processor = image_processor`
			`self.converter = converter`
			`def _get_pipeline(self):`
init repo 2025-12-29 17:34:58 +08:00			`"""Get or create PaddleOCR-VL pipeline.`

			`Returns:`
			`PaddleOCRVL pipeline instance.`
			`"""`
fix: refact logic 2025-12-31 17:38:32 +08:00			`if OCRService._pipeline is None:`
			`OCRService._pipeline = PaddleOCRVL(`
init repo 2025-12-29 17:34:58 +08:00			`vl_rec_backend="vllm-server",`
			`vl_rec_server_url=self.vl_server_url,`
			`layout_detection_model_name="PP-DocLayoutV2",`
			`)`
fix: refact logic 2025-12-31 17:38:32 +08:00			`return OCRService._pipeline`
init repo 2025-12-29 17:34:58 +08:00
			`def recognize_mixed(self, image: np.ndarray) -> dict:`
			`"""Recognize mixed content (text + formulas) using PP-DocLayoutV2.`

			`This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware`
			`recognition of mixed content.`

			`Args:`
			`image: Input image as numpy array in BGR format.`

			`Returns:`
			`Dict with 'markdown', 'latex', 'mathml' keys.`
			`"""`
			`try:`
			`pipeline = self._get_pipeline()`

fix: refact logic 2025-12-31 17:38:32 +08:00			`output = pipeline.predict(image, use_layout_detection=True)`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`markdown_content = ""`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`for res in output:`
			`markdown_content += res.markdown.get("markdown_texts", "")`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`convert_result = self.converter.convert_to_formats(markdown_content)`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`return {`
			`"markdown": markdown_content,`
			`"latex": convert_result.latex,`
			`"mathml": convert_result.mathml,`
			`}`
init repo 2025-12-29 17:34:58 +08:00			`except Exception as e:`
			`raise RuntimeError(f"Mixed recognition failed: {e}") from e`

			`def recognize_formula(self, image: np.ndarray) -> dict:`
			`"""Recognize formula/math content using PaddleOCR-VL with prompt.`

			`This mode uses PaddleOCR-VL directly with a formula recognition prompt.`

			`Args:`
			`image: Input image as numpy array in BGR format.`

			`Returns:`
			`Dict with 'latex', 'markdown', 'mathml' keys.`
			`"""`
			`try:`
fix: refact logic 2025-12-31 17:38:32 +08:00			`pipeline = self._get_pipeline()`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula")`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`markdown_content = ""`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`for res in output:`
			`markdown_content += res.markdown.get("markdown_texts", "")`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`convert_result = self.converter.convert_to_formats(markdown_content)`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`return {`
			`"latex": convert_result.latex,`
			`"mathml": convert_result.mathml,`
			`"markdown": markdown_content,`
			`}`
init repo 2025-12-29 17:34:58 +08:00			`except Exception as e:`
			`raise RuntimeError(f"Formula recognition failed: {e}") from e`

fix: refact logic 2025-12-31 17:38:32 +08:00			`def recognize(self, image: np.ndarray) -> dict:`
			`"""Recognize content using PaddleOCR-VL.`
init repo 2025-12-29 17:34:58 +08:00
			`Args:`
			`image: Input image as numpy array in BGR format.`

			`Returns:`
fix: refact logic 2025-12-31 17:38:32 +08:00			`Dict with 'latex', 'markdown', 'mathml' keys.`
init repo 2025-12-29 17:34:58 +08:00			`"""`
fix: refact logic 2025-12-31 17:38:32 +08:00			`padded_image = self.image_processor.add_padding(image)`
			`layout_info = self.layout_detector.detect(padded_image)`
			`if layout_info.MixedRecognition:`
			`return self.recognize_mixed(image)`
init repo 2025-12-29 17:34:58 +08:00			`else:`
fix: refact logic 2025-12-31 17:38:32 +08:00			`return self.recognize_formula(image)`


			`if __name__ == "__main__":`
			`import cv2`
			`from app.services.image_processor import ImageProcessor`
			`from app.services.layout_detector import LayoutDetector`
			`image_processor = ImageProcessor(padding_ratio=0.15)`
			`layout_detector = LayoutDetector()`
			`ocr_service = OCRService(image_processor=image_processor, layout_detector=layout_detector)`
			`image = cv2.imread("test/image.png")`
			`ocr_result = ocr_service.recognize(image)`
			`print(ocr_result)`