# app/services/layout_detector.py

"""PP-DocLayoutV2 wrapper for document layout detection."""

import numpy as np

from app.schemas.image import LayoutInfo, LayoutRegion
from app.core.config import get_settings
from paddleocr import LayoutDetection
from typing import Optional

settings = get_settings()


class LayoutDetector:
    """Run PP-DocLayoutV2 layout detection and normalize its output.

    The underlying ``LayoutDetection`` model is stored on the class, so it is
    created once and then reused by every ``LayoutDetector`` instance.
    """

    # Shared model instance; stays ``None`` until ``_get_layout_detector``
    # is called for the first time.
    _layout_detector: Optional[LayoutDetection] = None

    # A "text" region must score strictly above this value for ``detect`` to
    # set the ``MixedRecognition`` flag.
    MIXED_RECOGNITION_MIN_SCORE: float = 0.85

    # PP-DocLayoutV2 class ID to label mapping
    CLS_ID_TO_LABEL: dict[int, str] = {
        0: "abstract",
        1: "algorithm",
        2: "aside_text",
        3: "chart",
        4: "content",
        5: "display_formula",
        6: "doc_title",
        7: "figure_title",
        8: "footer",
        9: "footer_image",
        10: "footnote",
        11: "formula_number",
        12: "header",
        13: "header_image",
        14: "image",
        15: "inline_formula",
        16: "number",
        17: "paragraph_title",
        18: "reference",
        19: "reference_content",
        20: "seal",
        21: "table",
        22: "text",
        23: "vertical_text",
        24: "vision_footnote",
    }

    # Mapping from raw labels to normalized region types
    LABEL_TO_TYPE: dict[str, str] = {
        # Text types
        "abstract": "text",
        "algorithm": "text",
        "aside_text": "text",
        "content": "text",
        "doc_title": "text",
        "footer": "text",
        "footnote": "text",
        "header": "text",
        "number": "text",
        "paragraph_title": "text",
        "reference": "text",
        "reference_content": "text",
        "text": "text",
        "vertical_text": "text",
        "vision_footnote": "text",
        # Formula types
        "display_formula": "formula",
        "inline_formula": "formula",
        "formula_number": "formula",
        # Table types
        "table": "table",
        # Figure types
        "chart": "figure",
        "figure_title": "figure",
        "footer_image": "figure",
        "header_image": "figure",
        "image": "figure",
        "seal": "figure",
    }

    def __init__(self):
        """Initialize the detector, creating the shared model if needed."""
        # Called for its side effect only: the model is created here rather
        # than on the first ``detect`` call.
        self._get_layout_detector()

    def _get_layout_detector(self):
        """Return the shared ``LayoutDetection`` instance, creating it once.

        Returns:
            The class-level ``LayoutDetection`` model for "PP-DocLayoutV2".
        """
        if LayoutDetector._layout_detector is None:
            LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")
        return LayoutDetector._layout_detector

    @staticmethod
    def _extract_boxes(result) -> list:
        """Return the box list of the first prediction, or ``[]``.

        Expected result format:
        ``[{'input_path': ..., 'page_index': None, 'boxes': [...]}]``.
        Only the first entry of ``result`` is read. Anything that does not
        match this format (not a list, empty list, first entry not a dict,
        missing or ``None`` "boxes") yields an empty list.
        """
        if not isinstance(result, list) or not result:
            return []
        first_result = result[0]
        if not isinstance(first_result, dict):
            return []
        boxes = first_result.get("boxes")
        # Explicit ``None`` check (not truthiness) so the stored value is
        # returned unchanged whenever one is present.
        return [] if boxes is None else boxes

    def _to_region(self, box: dict) -> LayoutRegion:
        """Convert one raw detection box into a ``LayoutRegion``.

        The box's own "label" is preferred; when it is missing or empty the
        label is looked up from "cls_id", falling back to "other". Labels
        that have no entry in ``LABEL_TO_TYPE`` are treated as "text".
        """
        label = box.get("label") or self.CLS_ID_TO_LABEL.get(box.get("cls_id"), "other")
        score = box.get("score", 0.0)
        return LayoutRegion(
            type=self.LABEL_TO_TYPE.get(label, "text"),
            bbox=box.get("coordinate", [0, 0, 0, 0]),
            # The same detection score is passed for both fields.
            confidence=score,
            score=score,
        )

    def detect(self, image: np.ndarray) -> LayoutInfo:
        """Detect layout of the image using PP-DocLayoutV2.

        Args:
            image: Input image as numpy array.

        Returns:
            LayoutInfo holding one region per detected box, plus a
            ``MixedRecognition`` flag that is true when at least one region
            of type "text" scores above ``MIXED_RECOGNITION_MIN_SCORE``.
        """
        result = self._get_layout_detector().predict(image)

        regions: list[LayoutRegion] = [
            self._to_region(box) for box in self._extract_boxes(result)
        ]

        min_score = self.MIXED_RECOGNITION_MIN_SCORE
        mixed_recognition = any(
            region.type == "text" and region.score > min_score
            for region in regions
        )

        return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition)


def _main() -> None:
    """Run the OCR pipeline on a single image and print the results.

    The image path is taken from the first command-line argument when one is
    given; otherwise ``test/complex_formula.png`` is used. The body lives in a
    function so its variables stay local instead of rebinding module-level
    names such as ``settings``.
    """
    # Script-only dependencies are imported here, so they are loaded only
    # when this function runs and not when the module is imported.
    import sys

    import cv2
    from app.core.config import get_settings
    from app.services.image_processor import ImageProcessor
    from app.services.converter import Converter
    from app.services.ocr_service import OCRService

    settings = get_settings()

    # Initialize dependencies
    layout_detector = LayoutDetector()
    image_processor = ImageProcessor(padding_ratio=settings.image_padding_ratio)
    converter = Converter()

    # Initialize OCR service
    ocr_service = OCRService(
        vl_server_url=settings.paddleocr_vl_url,
        layout_detector=layout_detector,
        image_processor=image_processor,
        converter=converter,
    )

    # Load test image
    image_path = sys.argv[1] if len(sys.argv) > 1 else "test/complex_formula.png"
    image = cv2.imread(image_path)

    # The load is treated as failed when ``imread`` hands back ``None``.
    if image is None:
        print(f"Failed to load image: {image_path}")
        return

    print(f"Image loaded: {image.shape}")

    # Run OCR recognition
    result = ocr_service.recognize(image)

    print("\n=== OCR Result ===")
    print(f"Markdown:\n{result['markdown']}")
    print(f"\nLaTeX:\n{result['latex']}")
    print(f"\nMathML:\n{result['mathml']}")


if __name__ == "__main__":
    _main()
fix: refact logic 2025-12-31 17:38:32 +08:00			`"""PP-DocLayoutV2 wrapper for document layout detection."""`
init repo 2025-12-29 17:34:58 +08:00
			`import numpy as np`

			`from app.schemas.image import LayoutInfo, LayoutRegion`
fix: add package 2025-12-29 20:02:07 +08:00			`from app.core.config import get_settings`
fix: refact logic 2025-12-31 17:38:32 +08:00			`from paddleocr import LayoutDetection`
			`from typing import Optional`
fix: add package 2025-12-29 20:02:07 +08:00
			`settings = get_settings()`
init repo 2025-12-29 17:34:58 +08:00

			`class LayoutDetector:`
fix: refact logic 2025-12-31 17:38:32 +08:00			`"""Layout detector for PP-DocLayoutV2."""`

			`_layout_detector: Optional[LayoutDetection] = None`

			`# PP-DocLayoutV2 class ID to label mapping`
			`CLS_ID_TO_LABEL: dict[int, str] = {`
			`0: "abstract",`
			`1: "algorithm",`
			`2: "aside_text",`
			`3: "chart",`
			`4: "content",`
			`5: "display_formula",`
			`6: "doc_title",`
			`7: "figure_title",`
			`8: "footer",`
			`9: "footer_image",`
			`10: "footnote",`
			`11: "formula_number",`
			`12: "header",`
			`13: "header_image",`
			`14: "image",`
			`15: "inline_formula",`
			`16: "number",`
			`17: "paragraph_title",`
			`18: "reference",`
			`19: "reference_content",`
			`20: "seal",`
			`21: "table",`
			`22: "text",`
			`23: "vertical_text",`
			`24: "vision_footnote",`
init repo 2025-12-29 17:34:58 +08:00			`}`

fix: refact logic 2025-12-31 17:38:32 +08:00			`# Mapping from raw labels to normalized region types`
			`LABEL_TO_TYPE: dict[str, str] = {`
			`# Text types`
			`"abstract": "text",`
			`"algorithm": "text",`
			`"aside_text": "text",`
			`"content": "text",`
			`"doc_title": "text",`
			`"footer": "text",`
			`"footnote": "text",`
			`"header": "text",`
			`"number": "text",`
			`"paragraph_title": "text",`
			`"reference": "text",`
			`"reference_content": "text",`
			`"text": "text",`
			`"vertical_text": "text",`
			`"vision_footnote": "text",`
			`# Formula types`
			`"display_formula": "formula",`
			`"inline_formula": "formula",`
			`"formula_number": "formula",`
			`# Table types`
			`"table": "table",`
			`# Figure types`
			`"chart": "figure",`
			`"figure_title": "figure",`
			`"footer_image": "figure",`
			`"header_image": "figure",`
			`"image": "figure",`
			`"seal": "figure",`
			`}`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`def __init__(self):`
			`"""Initialize layout detector.`
init repo 2025-12-29 17:34:58 +08:00
			`Args:`
			`"""`
fix: refact logic 2025-12-31 17:38:32 +08:00			`_ = self._get_layout_detector()`

			`def _get_layout_detector(self):`
			`"""Get or create LayoutDetection instance."""`
			`if LayoutDetector._layout_detector is None:`
			`LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")`
			`return LayoutDetector._layout_detector`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`def detect(self, image: np.ndarray) -> LayoutInfo:`
			`"""Detect layout of the image using PP-DocLayoutV2.`

			`Args:`
			`image: Input image as numpy array.`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`Returns:`
			`LayoutInfo with detected regions and flags.`
init repo 2025-12-29 17:34:58 +08:00			`"""`
fix: refact logic 2025-12-31 17:38:32 +08:00			`layout_detector = self._get_layout_detector()`
			`result = layout_detector.predict(image)`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`# Parse the result`
			`regions: list[LayoutRegion] = []`
			`mixed_recognition = False`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`# Handle result format: [{'input_path': ..., 'page_index': None, 'boxes': [...]}]`
			`if isinstance(result, list) and len(result) > 0:`
			`first_result = result[0]`
			`if isinstance(first_result, dict) and "boxes" in first_result:`
			`boxes = first_result.get("boxes", [])`
			`else:`
			`boxes = []`
			`else:`
			`boxes = []`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`for box in boxes:`
			`cls_id = box.get("cls_id")`
			`label = box.get("label") or self.CLS_ID_TO_LABEL.get(cls_id, "other")`
			`score = box.get("score", 0.0)`
			`coordinate = box.get("coordinate", [0, 0, 0, 0])`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`# Normalize label to region type`
			`region_type = self.LABEL_TO_TYPE.get(label, "text")`
init repo 2025-12-29 17:34:58 +08:00
fix: refact logic 2025-12-31 17:38:32 +08:00			`regions.append(LayoutRegion(`
			`type=region_type,`
			`bbox=coordinate,`
			`confidence=score,`
			`score=score,`
			`))`
init repo 2025-12-29 17:34:58 +08:00

fix: refact logic 2025-12-31 17:38:32 +08:00			`mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions)`

			`return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition)`


			`if __name__ == "__main__":`
			`import cv2`
fix: image alpha error 2026-01-01 23:38:52 +08:00			`from app.core.config import get_settings`
fix: refact logic 2025-12-31 17:38:32 +08:00			`from app.services.image_processor import ImageProcessor`
fix: image alpha error 2026-01-01 23:38:52 +08:00			`from app.services.converter import Converter`
			`from app.services.ocr_service import OCRService`

			`settings = get_settings()`

			`# Initialize dependencies`
fix: refact logic 2025-12-31 17:38:32 +08:00			`layout_detector = LayoutDetector()`
fix: image alpha error 2026-01-01 23:38:52 +08:00			`image_processor = ImageProcessor(padding_ratio=settings.image_padding_ratio)`
			`converter = Converter()`

			`# Initialize OCR service`
			`ocr_service = OCRService(`
			`vl_server_url=settings.paddleocr_vl_url,`
			`layout_detector=layout_detector,`
			`image_processor=image_processor,`
			`converter=converter,`
			`)`

			`# Load test image`
			`image_path = "test/complex_formula.png"`
fix: refact logic 2025-12-31 17:38:32 +08:00			`image = cv2.imread(image_path)`
fix: image alpha error 2026-01-01 23:38:52 +08:00
			`if image is None:`
			`print(f"Failed to load image: {image_path}")`
			`else:`
			`print(f"Image loaded: {image.shape}")`

			`# Run OCR recognition`
			`result = ocr_service.recognize(image)`

			`print("\n=== OCR Result ===")`
			`print(f"Markdown:\n{result['markdown']}")`
			`print(f"\nLaTeX:\n{result['latex']}")`
			`print(f"\nMathML:\n{result['mathml']}")`