Files
doc_processer/app/services/layout_detector.py

178 lines
5.2 KiB
Python
Raw Normal View History

2025-12-31 17:38:32 +08:00
"""PP-DocLayoutV2 wrapper for document layout detection."""
2025-12-29 17:34:58 +08:00
import numpy as np
from app.schemas.image import LayoutInfo, LayoutRegion
2025-12-29 20:02:07 +08:00
from app.core.config import get_settings
2025-12-31 17:38:32 +08:00
from paddleocr import LayoutDetection
from typing import Optional
2025-12-29 20:02:07 +08:00
settings = get_settings()
2025-12-29 17:34:58 +08:00
class LayoutDetector:
    """Layout detector wrapping PaddleOCR's PP-DocLayoutV2 model.

    The underlying ``LayoutDetection`` object is created on first use and
    cached on the class, so all ``LayoutDetector`` instances share it.
    """

    # Shared, lazily created model instance (see ``_get_layout_detector``).
    _layout_detector: Optional[LayoutDetection] = None

    # A region normalized to "text" must score strictly above this value for
    # ``detect`` to set the ``MixedRecognition`` flag.
    MIXED_RECOGNITION_THRESHOLD: float = 0.85

    # PP-DocLayoutV2 class ID to label mapping
    CLS_ID_TO_LABEL: dict[int, str] = {
        0: "abstract",
        1: "algorithm",
        2: "aside_text",
        3: "chart",
        4: "content",
        5: "display_formula",
        6: "doc_title",
        7: "figure_title",
        8: "footer",
        9: "footer_image",
        10: "footnote",
        11: "formula_number",
        12: "header",
        13: "header_image",
        14: "image",
        15: "inline_formula",
        16: "number",
        17: "paragraph_title",
        18: "reference",
        19: "reference_content",
        20: "seal",
        21: "table",
        22: "text",
        23: "vertical_text",
        24: "vision_footnote",
    }

    # Mapping from raw labels to normalized region types
    LABEL_TO_TYPE: dict[str, str] = {
        # Text types
        "abstract": "text",
        "algorithm": "text",
        "aside_text": "text",
        "content": "text",
        "doc_title": "text",
        "footer": "text",
        "footnote": "text",
        "header": "text",
        "number": "text",
        "paragraph_title": "text",
        "reference": "text",
        "reference_content": "text",
        "text": "text",
        "vertical_text": "text",
        "vision_footnote": "text",
        # Formula types
        "display_formula": "formula",
        "inline_formula": "formula",
        "formula_number": "formula",
        # Table types
        "table": "table",
        # Figure types
        "chart": "figure",
        "figure_title": "figure",
        "footer_image": "figure",
        "header_image": "figure",
        "image": "figure",
        "seal": "figure",
    }

    def __init__(self):
        """Initialize the detector, creating the shared model if needed."""
        # Called for its side effect only: populate the class-level cache now
        # rather than on the first ``detect`` call.
        self._get_layout_detector()

    def _get_layout_detector(self):
        """Get or create the shared LayoutDetection instance."""
        if LayoutDetector._layout_detector is None:
            LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")
        return LayoutDetector._layout_detector

    @staticmethod
    def _extract_boxes(result) -> list:
        """Return the box list of the first prediction, or an empty list.

        Expected result format:
        ``[{'input_path': ..., 'page_index': None, 'boxes': [...]}]``.
        Anything else (non-list, empty list, non-dict first element, missing
        or ``None`` ``"boxes"``) yields ``[]``.
        """
        if not isinstance(result, list) or not result:
            return []
        first_result = result[0]
        if not isinstance(first_result, dict):
            return []
        boxes = first_result.get("boxes")
        # A present-but-None "boxes" entry is treated like a missing one so
        # the caller can always iterate the return value.
        return [] if boxes is None else boxes

    def _to_region(self, box: dict) -> LayoutRegion:
        """Convert one raw detection box dict into a LayoutRegion."""
        # Prefer the label reported with the box; fall back to the class-ID
        # table, and to "other" when the ID is unknown.
        label = box.get("label") or self.CLS_ID_TO_LABEL.get(box.get("cls_id"), "other")
        score = box.get("score", 0.0)
        return LayoutRegion(
            # Labels without a mapping (including "other") are treated as text.
            type=self.LABEL_TO_TYPE.get(label, "text"),
            bbox=box.get("coordinate", [0, 0, 0, 0]),
            confidence=score,
            score=score,
        )

    def detect(self, image: np.ndarray) -> LayoutInfo:
        """Detect layout of the image using PP-DocLayoutV2.

        Args:
            image: Input image as numpy array.

        Returns:
            LayoutInfo with one region per detected box. ``MixedRecognition``
            is True when at least one region of type "text" has a score
            strictly greater than ``MIXED_RECOGNITION_THRESHOLD``.
        """
        result = self._get_layout_detector().predict(image)

        regions: list[LayoutRegion] = [
            self._to_region(box) for box in self._extract_boxes(result)
        ]

        threshold = self.MIXED_RECOGNITION_THRESHOLD
        mixed_recognition = any(
            region.type == "text" and region.score > threshold
            for region in regions
        )
        return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition)
if __name__ == "__main__":
    # Manual smoke test: build the OCR pipeline around a LayoutDetector and
    # print the recognition output for one sample image.
    import cv2

    from app.core.config import get_settings
    from app.services.image_processor import ImageProcessor
    from app.services.converter import Converter
    from app.services.ocr_service import OCRService

    settings = get_settings()

    # Build the collaborators first, in the same order as before, so the
    # layout model is loaded before anything else is constructed.
    detector = LayoutDetector()
    processor = ImageProcessor(padding_ratio=settings.image_padding_ratio)
    markup_converter = Converter()

    # Wire everything into the OCR service.
    service = OCRService(
        vl_server_url=settings.paddleocr_vl_url,
        layout_detector=detector,
        image_processor=processor,
        converter=markup_converter,
    )

    # Read the sample image; the code below treats a None result as a
    # failed load.
    image_path = "test/complex_formula.png"
    image = cv2.imread(image_path)

    if image is not None:
        print(f"Image loaded: {image.shape}")

        # Run recognition and dump each output format.
        result = service.recognize(image)

        print("\n=== OCR Result ===")
        print(f"Markdown:\n{result['markdown']}")
        print(f"\nLaTeX:\n{result['latex']}")
        print(f"\nMathML:\n{result['mathml']}")
    else:
        print(f"Failed to load image: {image_path}")