2025-12-29 17:34:58 +08:00
|
|
|
"""PaddleOCR-VL client service for text and formula recognition."""
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
from app.core.config import get_settings
|
2025-12-31 17:38:32 +08:00
|
|
|
from paddleocr import PaddleOCRVL
|
|
|
|
|
from typing import Optional
|
|
|
|
|
from app.services.layout_detector import LayoutDetector
|
|
|
|
|
from app.services.image_processor import ImageProcessor
|
|
|
|
|
from app.services.converter import Converter
|
2025-12-29 17:34:58 +08:00
|
|
|
|
|
|
|
|
settings = get_settings()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class OCRService:
    """Service for OCR using PaddleOCR-VL.

    The service first runs layout detection on a padded copy of the input
    image and then dispatches to either mixed-content recognition or
    formula-only recognition. Recognised markdown is passed to the injected
    converter to obtain LaTeX and MathML.
    """

    # Pipeline shared by every OCRService instance; created lazily by
    # _get_pipeline() on first use and stored on the class.
    _pipeline: Optional[PaddleOCRVL] = None
    # Class-level slot that is neither read nor written inside this class.
    _layout_detector: Optional[LayoutDetector] = None

    def __init__(
        self,
        vl_server_url: Optional[str],
        layout_detector: LayoutDetector,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Initialize OCR service.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL. A falsy
                value (``None`` or ``""``) falls back to
                ``settings.paddleocr_vl_url``.
            layout_detector: Layout detector instance.
            image_processor: Image processor instance.
            converter: Converter instance; its ``convert_to_formats`` method
                is called with the recognised markdown.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
        self.layout_detector = layout_detector
        self.image_processor = image_processor
        self.converter = converter

    def _get_pipeline(self) -> PaddleOCRVL:
        """Get or create PaddleOCR-VL pipeline.

        The pipeline is stored on the class, so it is built only once and
        with the ``vl_server_url`` of whichever instance calls this first;
        later instances reuse it regardless of their own URL.

        Returns:
            PaddleOCRVL pipeline instance.
        """
        if OCRService._pipeline is None:
            OCRService._pipeline = PaddleOCRVL(
                vl_rec_backend="vllm-server",
                vl_rec_server_url=self.vl_server_url,
                layout_detection_model_name="PP-DocLayoutV2",
            )
        return OCRService._pipeline

    def _predict_markdown(self, image: np.ndarray, **predict_kwargs) -> str:
        """Run the pipeline on an image and concatenate the markdown output.

        Args:
            image: Input image as numpy array.
            **predict_kwargs: Extra keyword arguments forwarded unchanged to
                ``pipeline.predict``.

        Returns:
            The ``"markdown_texts"`` entries of all results joined in order;
            results without that key contribute an empty string.
        """
        pipeline = self._get_pipeline()
        output = pipeline.predict(image, **predict_kwargs)
        return "".join(res.markdown.get("markdown_texts", "") for res in output)

    def recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.

        This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
        recognition of mixed content.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml' keys.

        Raises:
            RuntimeError: If prediction or conversion fails; the original
                exception is chained as the cause.
        """
        try:
            markdown_content = self._predict_markdown(
                image, use_layout_detection=True
            )
            convert_result = self.converter.convert_to_formats(markdown_content)
            return {
                "markdown": markdown_content,
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

    def recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula/math content using PaddleOCR-VL with prompt.

        This mode uses PaddleOCR-VL directly with a formula recognition
        prompt and with layout detection switched off.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'mathml', 'markdown' keys.

        Raises:
            RuntimeError: If prediction or conversion fails; the original
                exception is chained as the cause.
        """
        try:
            markdown_content = self._predict_markdown(
                image, use_layout_detection=False, prompt_label="formula"
            )
            convert_result = self.converter.convert_to_formats(markdown_content)
            return {
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
                "markdown": markdown_content,
            }
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using PaddleOCR-VL.

        Layout detection decides which recognition mode is used: mixed
        recognition when ``layout_info.MixedRecognition`` is truthy, formula
        recognition otherwise.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml' keys.

        Raises:
            RuntimeError: Propagated from the selected recognition mode.
        """
        padded_image = self.image_processor.add_padding(image)
        layout_info = self.layout_detector.detect(padded_image)
        # NOTE(review): the padded copy is only used for layout detection;
        # recognition runs on the original image. Confirm this is intended.
        if layout_info.MixedRecognition:
            return self.recognize_mixed(image)
        return self.recognize_formula(image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run the full recognition flow on a local sample
    # image and print the resulting dict.
    import cv2

    from app.services.converter import Converter
    from app.services.image_processor import ImageProcessor
    from app.services.layout_detector import LayoutDetector

    image_processor = ImageProcessor(padding_ratio=0.15)
    layout_detector = LayoutDetector()
    # NOTE(review): assumes Converter can be built without arguments - confirm
    # against app.services.converter.
    converter = Converter()

    # OCRService requires all four collaborators; omitting vl_server_url or
    # converter raises TypeError before any recognition happens.
    ocr_service = OCRService(
        vl_server_url=settings.paddleocr_vl_url,
        layout_detector=layout_detector,
        image_processor=image_processor,
        converter=converter,
    )

    image_path = "test/image.png"
    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable or missing file by returning None
    # instead of raising, so fail here with a clear message.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    ocr_result = ocr_service.recognize(image)
    print(ocr_result)
|