"""PP-DocLayoutV3 wrapper for document layout detection."""

import numpy as np
from paddleocr import LayoutDetection

from app.core.config import get_settings
from app.schemas.image import LayoutInfo, LayoutRegion
from app.services.layout_postprocess import apply_layout_postprocess

settings = get_settings()


class LayoutDetector:
    """Layout detector backed by the ``PP-DocLayoutV3`` model.

    The underlying ``LayoutDetection`` instance is created lazily on first
    use and cached on the class, so every ``LayoutDetector`` instance shares
    the same loaded model.
    """

    # Shared model instance; populated by ``_get_layout_detector``.
    _layout_detector: LayoutDetection | None = None

    # Minimum score a "text" region needs for the page to be flagged as
    # requiring mixed recognition. Override on a subclass to tune.
    MIXED_RECOGNITION_MIN_SCORE: float = 0.3

    # Class ID to label mapping, used only as a fallback when a detected box
    # carries no "label" of its own.
    # NOTE(review): this table was originally documented as the
    # PP-DocLayoutV2 label set while the model loaded below is
    # PP-DocLayoutV3 - confirm the IDs still line up.
    CLS_ID_TO_LABEL: dict[int, str] = {
        0: "abstract",
        1: "algorithm",
        2: "aside_text",
        3: "chart",
        4: "content",
        5: "display_formula",
        6: "doc_title",
        7: "figure_title",
        8: "footer",
        9: "footer_image",
        10: "footnote",
        11: "formula_number",
        12: "header",
        13: "header_image",
        14: "image",
        15: "inline_formula",
        16: "number",
        17: "paragraph_title",
        18: "reference",
        19: "reference_content",
        20: "seal",
        21: "table",
        22: "text",
        23: "vertical_text",
        24: "vision_footnote",
    }

    # Mapping from raw labels to normalized region types. Labels missing
    # from this table are treated as "text" by ``_to_region``.
    LABEL_TO_TYPE: dict[str, str] = {
        # Text types
        "abstract": "text",
        "algorithm": "text",
        "aside_text": "text",
        "content": "text",
        "doc_title": "text",
        "footer": "text",
        "footnote": "text",
        "header": "text",
        "number": "text",
        "paragraph_title": "text",
        "reference": "text",
        "reference_content": "text",
        "text": "text",
        "vertical_text": "text",
        "vision_footnote": "text",
        # Formula types
        "display_formula": "formula",
        "inline_formula": "formula",
        # formula_number is a plain text annotation "(2.9)" next to a formula,
        # not a formula itself — use text prompt so vLLM returns plain text
        "formula_number": "text",
        # Table types
        "table": "table",
        # Figure types
        "chart": "figure",
        "figure_title": "figure",
        "footer_image": "figure",
        "header_image": "figure",
        "image": "figure",
        "seal": "figure",
    }

    def __init__(self):
        """Initialize the layout detector and eagerly load the shared model."""
        _ = self._get_layout_detector()

    def _get_layout_detector(self) -> LayoutDetection:
        """Return the shared ``LayoutDetection`` instance, creating it once."""
        if LayoutDetector._layout_detector is None:
            LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV3")
        return LayoutDetector._layout_detector

    @staticmethod
    def _extract_boxes(result) -> list:
        """Pull the list of detected boxes out of a raw ``predict`` result.

        Expected format:
        ``[{'input_path': ..., 'page_index': None, 'boxes': [...]}]``.
        Only the first entry is inspected. Anything that does not match the
        expected shape (including a missing or ``None`` "boxes" value)
        yields an empty list.
        """
        if not isinstance(result, list) or not result:
            return []
        first_result = result[0]
        if not isinstance(first_result, dict):
            return []
        return first_result.get("boxes") or []

    def _to_region(self, box) -> LayoutRegion:
        """Convert one raw detection box into a ``LayoutRegion``."""
        cls_id = box.get("cls_id")
        # Prefer the label supplied with the box; fall back to the class-ID
        # table, and finally to "other" when neither is available.
        label = box.get("label") or self.CLS_ID_TO_LABEL.get(cls_id, "other")
        score = box.get("score", 0.0)
        coordinate = box.get("coordinate", [0, 0, 0, 0])

        # Normalize label to region type; unknown labels default to "text".
        region_type = self.LABEL_TO_TYPE.get(label, "text")

        return LayoutRegion(
            type=region_type,
            native_label=label,
            bbox=coordinate,
            confidence=score,
            score=score,
        )

    def detect(self, image: np.ndarray) -> LayoutInfo:
        """Detect layout of the image using PP-DocLayoutV3.

        Args:
            image: Input image as numpy array. Its first two ``shape``
                entries are read as ``(height, width)``.

        Returns:
            LayoutInfo with the detected regions and the ``MixedRecognition``
            flag, which is true when at least one "text" region scores above
            ``MIXED_RECOGNITION_MIN_SCORE``.
        """
        layout_detector = self._get_layout_detector()
        result = layout_detector.predict(image)

        boxes = self._extract_boxes(result)

        # Apply GLM-OCR layout post-processing (NMS, containment, unclip, clamp)
        if boxes:
            h, w = image.shape[:2]
            boxes = apply_layout_postprocess(
                boxes,
                img_size=(w, h),
                layout_nms=True,
                layout_unclip_ratio=None,
                layout_merge_bboxes_mode="large",
            )

        regions: list[LayoutRegion] = [self._to_region(box) for box in boxes]

        min_score = self.MIXED_RECOGNITION_MIN_SCORE
        mixed_recognition = any(
            region.type == "text" and region.score > min_score for region in regions
        )

        return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition)


if __name__ == "__main__":
    # Manual smoke test: run the end-to-end OCR service on a sample image.
    import cv2

    from app.services.converter import Converter
    from app.services.image_processor import ImageProcessor
    from app.services.ocr_service import GLMOCREndToEndService

    # Initialize dependencies (reuses the module-level ``settings``).
    layout_detector = LayoutDetector()
    image_processor = ImageProcessor(padding_ratio=settings.image_padding_ratio)
    converter = Converter()

    # Initialize OCR service
    ocr_service = GLMOCREndToEndService(
        vl_server_url=settings.glm_ocr_url,
        layout_detector=layout_detector,
        image_processor=image_processor,
        converter=converter,
    )

    # Load test image
    image_path = "test/image2.png"
    image = cv2.imread(image_path)

    # cv2.imread returns None instead of raising when the file cannot be read.
    if image is None:
        print(f"Failed to load image: {image_path}")
    else:
        print(f"Image loaded: {image.shape}")

        # Run OCR recognition
        result = ocr_service.recognize(image)

        print("\n=== OCR Result ===")
        print(f"Markdown:\n{result['markdown']}")
        print(f"\nLaTeX:\n{result['latex']}")
        print(f"\nMathML:\n{result['mathml']}")