feat: add glm ocr

2026-02-06 15:06:50 +08:00
parent c372a4afbe
commit f0ad0a4c77
5 changed files with 133 additions and 32 deletions
--- a/app/services/layout_detector.py
+++ b/app/services/layout_detector.py
@@ -87,11 +87,11 @@ class LayoutDetector:
    def _get_layout_detector(self):
        """Get or create LayoutDetection instance."""
        if LayoutDetector._layout_detector is None:
-            LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")
+            LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV3")
        return LayoutDetector._layout_detector

    def detect(self, image: np.ndarray) -> LayoutInfo:
-        """Detect layout of the image using PP-DocLayoutV2.
+        """Detect layout of the image using PP-DocLayoutV3.

        Args:
            image: Input image as numpy array.
@@ -125,13 +125,14 @@ class LayoutDetector:
            # Normalize label to region type
            region_type = self.LABEL_TO_TYPE.get(label, "text")

-            regions.append(LayoutRegion(
-                type=region_type,
-                bbox=coordinate,
-                confidence=score,
-                score=score,
-            ))
-
+            regions.append(
+                LayoutRegion(
+                    type=region_type,
+                    bbox=coordinate,
+                    confidence=score,
+                    score=score,
+                )
+            )

        mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions)

@@ -144,14 +145,14 @@ if __name__ == "__main__":
    from app.services.image_processor import ImageProcessor
    from app.services.converter import Converter
    from app.services.ocr_service import OCRService
-    
+
    settings = get_settings()
-    
+
    # Initialize dependencies
    layout_detector = LayoutDetector()
    image_processor = ImageProcessor(padding_ratio=settings.image_padding_ratio)
    converter = Converter()
-    
+
    # Initialize OCR service
    ocr_service = OCRService(
        vl_server_url=settings.paddleocr_vl_url,
@@ -159,20 +160,20 @@ if __name__ == "__main__":
        image_processor=image_processor,
        converter=converter,
    )
-    
+
    # Load test image
-    image_path = "test/complex_formula.png"
+    image_path = "test/timeout.jpg"
    image = cv2.imread(image_path)
-    
+
    if image is None:
        print(f"Failed to load image: {image_path}")
    else:
        print(f"Image loaded: {image.shape}")
-        
+
        # Run OCR recognition
        result = ocr_service.recognize(image)
-        
+
        print("\n=== OCR Result ===")
        print(f"Markdown:\n{result['markdown']}")
        print(f"\nLaTeX:\n{result['latex']}")
-        print(f"\nMathML:\n{result['mathml']}")
+        print(f"\nMathML:\n{result['mathml']}")