feat: add optional image padding before layout detection and OCR

2026-02-07 16:53:09 +08:00
parent d86107976a
commit f514f98142
3 changed files with 50 additions and 21 deletions
--- a/app/api/v1/endpoints/image.py
+++ b/app/api/v1/endpoints/image.py
@@ -2,6 +2,8 @@

 import time
 import uuid
+import cv2
+from io import BytesIO

 from fastapi import APIRouter, Depends, HTTPException, Request, Response

@@ -12,12 +14,15 @@ from app.core.dependencies import (
    get_mineru_ocr_service,
    get_glmocr_service,
 )
+from app.core.config import get_settings
 from app.core.logging_config import get_logger, RequestIDAdapter
 from app.schemas.image import ImageOCRRequest, ImageOCRResponse
 from app.services.image_processor import ImageProcessor
 from app.services.layout_detector import LayoutDetector
 from app.services.ocr_service import OCRService, MineruOCRService, GLMOCRService

+settings = get_settings()
+
 router = APIRouter()
 logger = get_logger()

@@ -63,12 +68,18 @@ async def process_image_ocr(
            image_url=request.image_url,
            image_base64=request.image_base64,
        )
+
+        # Apply padding if enabled (before layout detection)
+        processed_image = image
+        if image_processor and settings.is_padding:
+            processed_image = image_processor.add_padding(image)
+
        preprocess_time = time.time() - preprocess_start
        log.debug(f"Image preprocessing completed in {preprocess_time:.3f}s")

-        # Layout detection
+        # Layout detection (using padded image if padding is enabled)
        layout_start = time.time()
-        layout_info = layout_detector.detect(image)
+        layout_info = layout_detector.detect(processed_image)
        layout_time = time.time() - layout_start
        log.info(f"Layout detection completed in {layout_time:.3f}s")

@@ -77,11 +88,19 @@ async def process_image_ocr(
        if layout_info.MixedRecognition:
            recognition_method = "MixedRecognition (MinerU)"
            log.info(f"Using {recognition_method}")
-            ocr_result = mineru_service.recognize(image)
+
+            # Convert numpy array to image bytes (image already padded)
+            success, encoded_image = cv2.imencode(".png", processed_image)
+            if not success:
+                raise RuntimeError("Failed to encode image")
+
+            image_bytes = BytesIO(encoded_image.tobytes())
+            image_bytes.seek(0)  # Ensure position is at the beginning
+            ocr_result = mineru_service.recognize(image_bytes)
        else:
            recognition_method = "FormulaOnly (GLMOCR)"
            log.info(f"Using {recognition_method}")
-            ocr_result = glmocr_service.recognize(image)
+            ocr_result = glmocr_service.recognize(processed_image)
        ocr_time = time.time() - ocr_start

        total_time = time.time() - preprocess_start