feat: add mineru model

2026-01-05 17:30:54 +08:00
parent 3870c108b2
commit 6ea37c9380
5 changed files with 142 additions and 10 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -1,17 +1,26 @@
 """PaddleOCR-VL client service for text and formula recognition."""

 import numpy as np
+import cv2
+import requests
+from io import BytesIO
 from app.core.config import get_settings
 from paddleocr import PaddleOCRVL
 from typing import Optional
 from app.services.layout_detector import LayoutDetector
 from app.services.image_processor import ImageProcessor
 from app.services.converter import Converter
+from abc import ABC, abstractmethod

 settings = get_settings()

+class OCRServiceBase(ABC):
+    """Common interface for OCR backends: ``recognize`` maps an image to a dict."""
+    @abstractmethod
+    def recognize(self, image: np.ndarray) -> dict: ...

-class OCRService:
+
+class OCRService(OCRServiceBase):
    """Service for OCR using PaddleOCR-VL."""

    _pipeline: Optional[PaddleOCRVL] = None
@@ -50,7 +59,7 @@ class OCRService:
            )
        return OCRService._pipeline

-    def recognize_mixed(self, image: np.ndarray) -> dict:
+    def _recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.

        This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
@@ -82,7 +91,7 @@ class OCRService:
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

-    def recognize_formula(self, image: np.ndarray) -> dict:
+    def _recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula/math content using PaddleOCR-VL with prompt.

        This mode uses PaddleOCR-VL directly with a formula recognition prompt.
@@ -125,6 +134,109 @@ class OCRService:
        padded_image = self.image_processor.add_padding(image)
        layout_info = self.layout_detector.detect(padded_image)
        if layout_info.MixedRecognition:
-            return self.recognize_mixed(image)
+            return self._recognize_mixed(image)
        else:
-            return self.recognize_formula(image)
+            return self._recognize_formula(image)
+
+
+class MineruOCRService(OCRServiceBase):
+    """OCR backend that delegates recognition to a local ``file_parse`` HTTP API."""
+
+    # The result is looked up under the upload's file stem (presumably how the
+    # server keys its results — confirm), so both derive from this constant.
+    _UPLOAD_STEM = "image"
+
+    def __init__(
+        self,
+        api_url: str = "http://127.0.0.1:8000/file_parse",
+        converter: Optional[Converter] = None,
+        timeout: float = 30,
+    ):
+        """Store the endpoint configuration; no request is made here.
+
+        Args:
+            api_url: URL of the local file_parse API endpoint.
+            converter: Optional converter used to derive LaTeX and MathML from
+                the returned markdown; without it both stay empty strings.
+            timeout: Timeout in seconds passed to ``requests.post``.
+        """
+        self.api_url = api_url
+        self.converter = converter
+        self.timeout = timeout
+
+    def recognize(self, image: np.ndarray) -> dict:
+        """Recognize content using the local file_parse API.
+
+        Args:
+            image: Input image as numpy array in BGR format.
+
+        Returns:
+            Dict with 'markdown', 'latex', 'mathml' keys.
+
+        Raises:
+            RuntimeError: If encoding, the HTTP request or the conversion fails.
+        """
+        try:
+            encoded_ok, png_buffer = cv2.imencode(".png", image)
+            if not encoded_ok:
+                raise RuntimeError("Failed to encode image")
+
+            upload_name = f"{self._UPLOAD_STEM}.png"
+            files = {"files": (upload_name, png_buffer.tobytes(), "image/png")}
+            # Multipart form fields; every value is sent as a string.
+            data = {
+                "return_middle_json": "false",
+                "return_model_output": "false",
+                "return_md": "true",
+                "return_images": "false",
+                "end_page_id": "99999",
+                "parse_method": "auto",
+                "start_page_id": "0",
+                "lang_list": "en",
+                "server_url": "string",  # NOTE(review): placeholder value? confirm
+                "return_content_list": "false",
+                "backend": "hybrid-auto-engine",
+                "table_enable": "true",
+                "response_format_zip": "false",
+                "formula_enable": "true",
+            }
+            response = requests.post(
+                self.api_url,
+                files=files,
+                data=data,
+                headers={"accept": "application/json"},
+                timeout=self.timeout,
+            )
+            response.raise_for_status()
+
+            # Tolerate missing or null sections instead of failing on the lookup.
+            results = response.json().get("results") or {}
+            entry = results.get(self._UPLOAD_STEM) or {}
+            markdown_content = entry.get("md_content") or ""
+
+            latex = mathml = ""
+            if self.converter and markdown_content:
+                converted = self.converter.convert_to_formats(markdown_content)
+                latex = converted.latex
+                mathml = converted.mathml
+            return {
+                "markdown": markdown_content,
+                "latex": latex,
+                "mathml": mathml,
+            }
+        except requests.RequestException as e:
+            raise RuntimeError(f"Local API request failed: {e}") from e
+        except RuntimeError:
+            raise  # already meaningful (e.g. encode failure); do not re-wrap
+        except Exception as e:
+            raise RuntimeError(f"Recognition failed: {e}") from e
+
+
+
+
+if __name__ == "__main__":
+    # Manual smoke test: needs a reachable file_parse server and the sample image.
+    image = cv2.imread("test/complex_formula.png")
+    if image is None:  # cv2.imread signals an unreadable file by returning None
+        raise SystemExit("Could not read test/complex_formula.png")
+    print(MineruOCRService().recognize(image))