feat: add padding
This commit is contained in:
@@ -2,6 +2,8 @@
|
||||
|
||||
import time
|
||||
import uuid
|
||||
import cv2
|
||||
from io import BytesIO
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request, Response
|
||||
|
||||
@@ -12,12 +14,15 @@ from app.core.dependencies import (
|
||||
get_mineru_ocr_service,
|
||||
get_glmocr_service,
|
||||
)
|
||||
from app.core.config import get_settings
|
||||
from app.core.logging_config import get_logger, RequestIDAdapter
|
||||
from app.schemas.image import ImageOCRRequest, ImageOCRResponse
|
||||
from app.services.image_processor import ImageProcessor
|
||||
from app.services.layout_detector import LayoutDetector
|
||||
from app.services.ocr_service import OCRService, MineruOCRService, GLMOCRService
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
router = APIRouter()
|
||||
logger = get_logger()
|
||||
|
||||
@@ -63,12 +68,18 @@ async def process_image_ocr(
|
||||
image_url=request.image_url,
|
||||
image_base64=request.image_base64,
|
||||
)
|
||||
|
||||
# Apply padding if enabled (before layout detection)
|
||||
processed_image = image
|
||||
if image_processor and settings.is_padding:
|
||||
processed_image = image_processor.add_padding(image)
|
||||
|
||||
preprocess_time = time.time() - preprocess_start
|
||||
log.debug(f"Image preprocessing completed in {preprocess_time:.3f}s")
|
||||
|
||||
# Layout detection
|
||||
# Layout detection (using padded image if padding is enabled)
|
||||
layout_start = time.time()
|
||||
layout_info = layout_detector.detect(image)
|
||||
layout_info = layout_detector.detect(processed_image)
|
||||
layout_time = time.time() - layout_start
|
||||
log.info(f"Layout detection completed in {layout_time:.3f}s")
|
||||
|
||||
@@ -77,11 +88,19 @@ async def process_image_ocr(
|
||||
if layout_info.MixedRecognition:
|
||||
recognition_method = "MixedRecognition (MinerU)"
|
||||
log.info(f"Using {recognition_method}")
|
||||
ocr_result = mineru_service.recognize(image)
|
||||
|
||||
# Encode the numpy array as PNG bytes (padding, if enabled, was already applied above)
|
||||
success, encoded_image = cv2.imencode(".png", processed_image)
|
||||
if not success:
|
||||
raise RuntimeError("Failed to encode image")
|
||||
|
||||
image_bytes = BytesIO(encoded_image.tobytes())
|
||||
image_bytes.seek(0) # Ensure position is at the beginning
|
||||
ocr_result = mineru_service.recognize(image_bytes)
|
||||
else:
|
||||
recognition_method = "FormulaOnly (GLMOCR)"
|
||||
log.info(f"Using {recognition_method}")
|
||||
ocr_result = glmocr_service.recognize(image)
|
||||
ocr_result = glmocr_service.recognize(processed_image)
|
||||
ocr_time = time.time() - ocr_start
|
||||
|
||||
total_time = time.time() - preprocess_start
|
||||
|
||||
@@ -21,17 +21,31 @@ class Settings(BaseSettings):
|
||||
api_prefix: str = "/doc_process/v1"
|
||||
debug: bool = False
|
||||
|
||||
# Base Host Settings (can be overridden via .env file)
|
||||
# Default: 127.0.0.1 (production)
|
||||
# Dev: Set BASE_HOST=100.115.184.74 in .env file
|
||||
base_host: str = "127.0.0.1"
|
||||
|
||||
# PaddleOCR-VL Settings
|
||||
paddleocr_vl_url: str = "http://127.0.0.1:8001/v1"
|
||||
@property
|
||||
def paddleocr_vl_url(self) -> str:
|
||||
"""Get PaddleOCR-VL URL based on base_host."""
|
||||
return f"http://{self.base_host}:8001/v1"
|
||||
|
||||
# MinerOCR Settings
|
||||
miner_ocr_api_url: str = "http://127.0.0.1:8000/file_parse"
|
||||
@property
|
||||
def miner_ocr_api_url(self) -> str:
|
||||
"""Get MinerOCR API URL based on base_host."""
|
||||
return f"http://{self.base_host}:8000/file_parse"
|
||||
|
||||
# GLM OCR Settings
|
||||
glm_ocr_url: str = "http://127.0.0.1:8002/v1"
|
||||
@property
|
||||
def glm_ocr_url(self) -> str:
|
||||
"""Get GLM OCR URL based on base_host."""
|
||||
return f"http://{self.base_host}:8002/v1"
|
||||
|
||||
# Padding settings: enable flag and padding ratio
|
||||
is_padding: bool = False
|
||||
is_padding: bool = True
|
||||
padding_ratio: float = 0.15
|
||||
|
||||
# Model Paths
|
||||
|
||||
@@ -650,26 +650,16 @@ class MineruOCRService(OCRServiceBase):
|
||||
|
||||
return formula_text
|
||||
|
||||
def recognize(self, image: np.ndarray) -> dict:
|
||||
def recognize(self, image_bytes: BytesIO) -> dict:
|
||||
"""Recognize content using local file_parse API.
|
||||
|
||||
Args:
|
||||
image: Input image as numpy array in BGR format.
|
||||
image_bytes: Input image as BytesIO object (already encoded as PNG).
|
||||
|
||||
Returns:
|
||||
Dict with 'markdown', 'latex', 'mathml' keys.
|
||||
"""
|
||||
try:
|
||||
if self.image_processor and settings.is_padding:
|
||||
image = self.image_processor.add_padding(image)
|
||||
|
||||
# Convert numpy array to image bytes
|
||||
success, encoded_image = cv2.imencode(".png", image)
|
||||
if not success:
|
||||
raise RuntimeError("Failed to encode image")
|
||||
|
||||
image_bytes = BytesIO(encoded_image.tobytes())
|
||||
|
||||
# Prepare multipart form data
|
||||
files = {"files": ("image.png", image_bytes, "image/png")}
|
||||
|
||||
@@ -731,5 +721,11 @@ if __name__ == "__main__":
|
||||
mineru_service = MineruOCRService()
|
||||
image = cv2.imread("test/formula2.jpg")
|
||||
image_numpy = np.array(image)
|
||||
ocr_result = mineru_service.recognize(image_numpy)
|
||||
# Encode image to bytes (as done in API layer)
|
||||
success, encoded_image = cv2.imencode(".png", image_numpy)
|
||||
if not success:
|
||||
raise RuntimeError("Failed to encode image")
|
||||
image_bytes = BytesIO(encoded_image.tobytes())
|
||||
image_bytes.seek(0)
|
||||
ocr_result = mineru_service.recognize(image_bytes)
|
||||
print(ocr_result)
|
||||
|
||||
Reference in New Issue
Block a user