feat: add glm ocr

This commit is contained in:
liuyuanchuang
2026-02-06 15:06:50 +08:00
parent c372a4afbe
commit f0ad0a4c77
5 changed files with 133 additions and 32 deletions

View File

@@ -87,11 +87,11 @@ class LayoutDetector:
def _get_layout_detector(self):
"""Get or create LayoutDetection instance."""
if LayoutDetector._layout_detector is None:
LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")
LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV3")
return LayoutDetector._layout_detector
def detect(self, image: np.ndarray) -> LayoutInfo:
"""Detect layout of the image using PP-DocLayoutV2.
"""Detect layout of the image using PP-DocLayoutV3.
Args:
image: Input image as numpy array.
@@ -125,13 +125,14 @@ class LayoutDetector:
# Normalize label to region type
region_type = self.LABEL_TO_TYPE.get(label, "text")
regions.append(LayoutRegion(
type=region_type,
bbox=coordinate,
confidence=score,
score=score,
))
regions.append(
LayoutRegion(
type=region_type,
bbox=coordinate,
confidence=score,
score=score,
)
)
mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions)
@@ -144,14 +145,14 @@ if __name__ == "__main__":
from app.services.image_processor import ImageProcessor
from app.services.converter import Converter
from app.services.ocr_service import OCRService
settings = get_settings()
# Initialize dependencies
layout_detector = LayoutDetector()
image_processor = ImageProcessor(padding_ratio=settings.image_padding_ratio)
converter = Converter()
# Initialize OCR service
ocr_service = OCRService(
vl_server_url=settings.paddleocr_vl_url,
@@ -159,20 +160,20 @@ if __name__ == "__main__":
image_processor=image_processor,
converter=converter,
)
# Load test image
image_path = "test/complex_formula.png"
image_path = "test/timeout.jpg"
image = cv2.imread(image_path)
if image is None:
print(f"Failed to load image: {image_path}")
else:
print(f"Image loaded: {image.shape}")
# Run OCR recognition
result = ocr_service.recognize(image)
print("\n=== OCR Result ===")
print(f"Markdown:\n{result['markdown']}")
print(f"\nLaTeX:\n{result['latex']}")
print(f"\nMathML:\n{result['mathml']}")
print(f"\nMathML:\n{result['mathml']}")