feat: add glm ocr

2026-02-06 15:06:50 +08:00
parent c372a4afbe
commit f0ad0a4c77
5 changed files with 133 additions and 32 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -481,6 +481,92 @@ class OCRService(OCRServiceBase):
            return self._recognize_formula(image)


+class GLMOCRService(OCRServiceBase):
+    """Formula OCR backed by a GLM vision model behind an OpenAI-compatible API.
+
+    The service pads the input image, sends it as a base64 PNG data URL in a
+    chat completion request, rewrites the LaTeX delimiters of the reply to
+    ``$$`` and converts the result to the other formats via ``converter``.
+    """
+
+    # Value passed as ``model`` in the chat completion request.
+    MODEL_NAME = "glm-ocr"
+    # Text part of the user message that accompanies the image.
+    FORMULA_PROMPT = "Formula Recognition:"
+    # ``timeout`` handed to the OpenAI client.
+    REQUEST_TIMEOUT = 3600
+
+    def __init__(
+        self,
+        vl_server_url: str,
+        image_processor: ImageProcessor,
+        converter: Converter,
+    ):
+        """Initialize GLM OCR service.
+
+        Args:
+            vl_server_url: Base URL of the OpenAI-compatible server. When empty
+                (or otherwise falsy), ``settings.glm_ocr_url`` is used instead.
+            image_processor: Image processor instance (used for padding).
+            converter: Converter instance for format conversion.
+        """
+        self.vl_server_url = vl_server_url or settings.glm_ocr_url
+        self.image_processor = image_processor
+        self.converter = converter
+        self.openai_client = OpenAI(
+            api_key="EMPTY",
+            base_url=self.vl_server_url,
+            timeout=self.REQUEST_TIMEOUT,
+        )
+
+    @staticmethod
+    def _normalize_delimiters(text: str) -> str:
+        r"""Return ``text`` with its math delimiters rewritten to ``$$``.
+
+        Surrounding whitespace is stripped first so that a leading newline does
+        not hide the delimiter. Then:
+
+        * text starting with ``\[`` or ``\(``: every ``\[``, ``\]``, ``\(`` and
+          ``\)`` becomes ``$$``. A doubled backslash is consumed as one token
+          and kept, so a row break with spacing such as ``\\[2pt]`` is not
+          mistaken for an opening ``\[``;
+        * text starting with ``$``: returned as is;
+        * anything else: wrapped in ``$$...$$``.
+
+        Args:
+            text: Raw model output.
+
+        Returns:
+            The formula string using ``$$`` delimiters.
+        """
+        import re  # local import keeps this block self-contained
+
+        text = text.strip()
+        if text.startswith((r"\[", r"\(")):
+            return re.sub(
+                r"\\\\|\\[\[\]()]",
+                lambda m: m.group(0) if m.group(0) == "\\\\" else "$$",
+                text,
+            )
+        if text.startswith("$"):
+            return text
+        return f"$${text}$$"
+
+    def _recognize_formula(self, image: np.ndarray) -> dict:
+        """Recognize formula/math content with the GLM model.
+
+        Args:
+            image: Input image as numpy array in BGR format.
+
+        Returns:
+            Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
+
+        Raises:
+            RuntimeError: If any step (encoding, request, conversion) fails;
+                the original exception is chained as the cause.
+        """
+        try:
+            padded_image = self.image_processor.add_padding(image)
+
+            # Ship the image inline as a PNG data URL.
+            success, encoded_image = cv2.imencode(".png", padded_image)
+            if not success:
+                raise RuntimeError("Failed to encode image")
+            image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
+            image_url = f"data:image/png;base64,{image_base64}"
+
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": image_url}},
+                        {"type": "text", "text": self.FORMULA_PROMPT},
+                    ],
+                }
+            ]
+            response = self.openai_client.chat.completions.create(
+                model=self.MODEL_NAME,
+                messages=messages,
+                temperature=0.0,
+            )
+
+            raw_content = response.choices[0].message.content
+            if raw_content is None:
+                # Fail with a readable message instead of an AttributeError.
+                raise RuntimeError("Model returned no text content")
+
+            markdown_content = self._normalize_delimiters(raw_content)
+            markdown_content = _postprocess_markdown(markdown_content)
+            convert_result = self.converter.convert_to_formats(markdown_content)
+
+            return {
+                "latex": convert_result.latex,
+                "mathml": convert_result.mathml,
+                "mml": convert_result.mml,
+                "markdown": markdown_content,
+            }
+        except Exception as e:
+            raise RuntimeError(f"GLM formula recognition failed: {e}") from e
+
+    def recognize(self, image: np.ndarray) -> dict:
+        """Recognize content in ``image``; delegates to formula recognition.
+
+        Args:
+            image: Input image as numpy array in BGR format.
+
+        Returns:
+            Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
+        """
+        return self._recognize_formula(image)
+
+
 class MineruOCRService(OCRServiceBase):
    """Service for OCR using local file_parse API."""

@@ -490,6 +576,7 @@ class MineruOCRService(OCRServiceBase):
        image_processor: Optional[ImageProcessor] = None,
        converter: Optional[Converter] = None,
        paddleocr_vl_url: str = "http://localhost:8001/v1",
+        layout_detector: Optional[LayoutDetector] = None,
    ):
        """Initialize Local API service.

@@ -573,7 +660,7 @@ class MineruOCRService(OCRServiceBase):
            Dict with 'markdown', 'latex', 'mathml' keys.
        """
        try:
-            if self.image_processor:
+            if self.image_processor and get_settings().is_padding:
                image = self.image_processor.add_padding(image)

            # Convert numpy array to image bytes
@@ -647,7 +734,7 @@ class MineruOCRService(OCRServiceBase):

 if __name__ == "__main__":
    mineru_service = MineruOCRService()
-    image = cv2.imread("test/complex_formula.png")
+    image = cv2.imread("test/formula2.jpg")
    image_numpy = np.array(image)
    ocr_result = mineru_service.recognize(image_numpy)
    print(ocr_result)