# doc_processer/app/services/layout_detector.py

"""PP-DocLayoutV2 wrapper for document layout detection."""

import numpy as np

from app.schemas.image import LayoutInfo, LayoutRegion
from app.core.config import get_settings
from paddleocr import LayoutDetection
from typing import Optional

# Application settings, fetched once at import time via get_settings().
# NOTE(review): `settings` is not referenced in the visible part of this
# module — confirm it is still needed here.
settings = get_settings()


class LayoutDetector:
    """Wrapper around PaddleOCR's ``LayoutDetection`` with the PP-DocLayoutV2 model.

    The underlying ``LayoutDetection`` instance is stored on the class, so it
    is created at most once and shared by every ``LayoutDetector`` instance.
    """

    # Shared model instance; populated by the first _get_layout_detector() call.
    _layout_detector: Optional[LayoutDetection] = None

    # A region of normalized type "text" must score strictly above this value
    # for detect() to set the MixedRecognition flag.
    MIXED_RECOGNITION_SCORE_THRESHOLD: float = 0.85

    # PP-DocLayoutV2 class ID to label mapping
    CLS_ID_TO_LABEL: dict[int, str] = {
        0: "abstract",
        1: "algorithm",
        2: "aside_text",
        3: "chart",
        4: "content",
        5: "display_formula",
        6: "doc_title",
        7: "figure_title",
        8: "footer",
        9: "footer_image",
        10: "footnote",
        11: "formula_number",
        12: "header",
        13: "header_image",
        14: "image",
        15: "inline_formula",
        16: "number",
        17: "paragraph_title",
        18: "reference",
        19: "reference_content",
        20: "seal",
        21: "table",
        22: "text",
        23: "vertical_text",
        24: "vision_footnote",
    }

    # Mapping from raw labels to normalized region types
    LABEL_TO_TYPE: dict[str, str] = {
        # Text types
        "abstract": "text",
        "algorithm": "text",
        "aside_text": "text",
        "content": "text",
        "doc_title": "text",
        "footer": "text",
        "footnote": "text",
        "header": "text",
        "number": "text",
        "paragraph_title": "text",
        "reference": "text",
        "reference_content": "text",
        "text": "text",
        "vertical_text": "text",
        "vision_footnote": "text",
        # Formula types
        "display_formula": "formula",
        "inline_formula": "formula",
        "formula_number": "formula",
        # Table types
        "table": "table",
        # Figure types
        "chart": "figure",
        "figure_title": "figure",
        "footer_image": "figure",
        "header_image": "figure",
        "image": "figure",
        "seal": "figure",
    }

    def __init__(self) -> None:
        """Create the shared model instance up front if it does not exist yet.

        Doing this here means the first ``detect`` call does not have to
        construct the ``LayoutDetection`` object itself.
        """
        self._get_layout_detector()

    def _get_layout_detector(self) -> LayoutDetection:
        """Return the shared ``LayoutDetection`` instance, creating it on first use."""
        if LayoutDetector._layout_detector is None:
            LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")
        return LayoutDetector._layout_detector

    @staticmethod
    def _extract_boxes(result: object) -> list:
        """Pull the list of detected boxes out of a raw ``predict`` result.

        Expected format:
        ``[{'input_path': ..., 'page_index': None, 'boxes': [...]}]``.
        Only the first entry of ``result`` is inspected. Anything that does
        not match this shape (non-list, empty list, non-dict first entry,
        missing or ``None`` ``"boxes"``) yields an empty list.
        """
        if not isinstance(result, list) or not result:
            return []

        first_result = result[0]
        if not isinstance(first_result, dict):
            return []

        boxes = first_result.get("boxes")
        # Explicit None check (rather than `or []`) so a present-but-None value
        # cannot break the iteration in detect(), and no truthiness test is
        # applied to whatever container type the library returns.
        return [] if boxes is None else boxes

    def detect(self, image: np.ndarray) -> LayoutInfo:
        """Detect layout of the image using PP-DocLayoutV2.

        Args:
            image: Input image as numpy array.

        Returns:
            LayoutInfo holding one LayoutRegion per detected box, plus a
            ``MixedRecognition`` flag that is True when at least one region
            of normalized type ``"text"`` has a score strictly greater than
            ``MIXED_RECOGNITION_SCORE_THRESHOLD``.
        """
        result = self._get_layout_detector().predict(image)

        regions: list[LayoutRegion] = []
        for box in self._extract_boxes(result):
            cls_id = box.get("cls_id")
            # Prefer the label supplied by the model; fall back to the class-ID
            # table, and to "other" when the ID is unknown as well.
            label = box.get("label") or self.CLS_ID_TO_LABEL.get(cls_id, "other")
            score = box.get("score", 0.0)

            regions.append(
                LayoutRegion(
                    # Labels without a mapping (including "other") are
                    # treated as plain text.
                    type=self.LABEL_TO_TYPE.get(label, "text"),
                    bbox=box.get("coordinate", [0, 0, 0, 0]),
                    confidence=score,
                    score=score,
                )
            )

        threshold = self.MIXED_RECOGNITION_SCORE_THRESHOLD
        mixed_recognition = any(
            region.type == "text" and region.score > threshold
            for region in regions
        )

        return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition)


if __name__ == "__main__":
    # Manual smoke test: run layout detection on one padded sample image.
    import cv2
    from app.services.image_processor import ImageProcessor

    layout_detector = LayoutDetector()
    image_path = "test/timeout.png"

    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising, so
    # fail here with a clear message rather than deeper in the pipeline.
    if image is None:
        raise SystemExit(f"Could not read image: {image_path}")

    image_processor = ImageProcessor(padding_ratio=0.15)
    image = image_processor.add_padding(image)

    # Save the padded image for debugging
    cv2.imwrite("debug_padded_image.png", image)

    layout_info = layout_detector.detect(image)
    print(layout_info)