diff --git a/app/api/v1/endpoints/image.py b/app/api/v1/endpoints/image.py index 7911a48..0074cf2 100644 --- a/app/api/v1/endpoints/image.py +++ b/app/api/v1/endpoints/image.py @@ -2,6 +2,8 @@ import time import uuid +import cv2 +from io import BytesIO from fastapi import APIRouter, Depends, HTTPException, Request, Response @@ -12,12 +14,15 @@ from app.core.dependencies import ( get_mineru_ocr_service, get_glmocr_service, ) +from app.core.config import get_settings from app.core.logging_config import get_logger, RequestIDAdapter from app.schemas.image import ImageOCRRequest, ImageOCRResponse from app.services.image_processor import ImageProcessor from app.services.layout_detector import LayoutDetector from app.services.ocr_service import OCRService, MineruOCRService, GLMOCRService +settings = get_settings() + router = APIRouter() logger = get_logger() @@ -63,12 +68,18 @@ async def process_image_ocr( image_url=request.image_url, image_base64=request.image_base64, ) + + # Apply padding if enabled (before layout detection) + processed_image = image + if image_processor and settings.is_padding: + processed_image = image_processor.add_padding(image) + preprocess_time = time.time() - preprocess_start log.debug(f"Image preprocessing completed in {preprocess_time:.3f}s") - # Layout detection + # Layout detection (using padded image if padding is enabled) layout_start = time.time() - layout_info = layout_detector.detect(image) + layout_info = layout_detector.detect(processed_image) layout_time = time.time() - layout_start log.info(f"Layout detection completed in {layout_time:.3f}s") @@ -77,11 +88,19 @@ async def process_image_ocr( if layout_info.MixedRecognition: recognition_method = "MixedRecognition (MinerU)" log.info(f"Using {recognition_method}") - ocr_result = mineru_service.recognize(image) + + # Convert numpy array to image bytes (image already padded) + success, encoded_image = cv2.imencode(".png", processed_image) + if not success: + raise RuntimeError("Failed to encode image") + + image_bytes = BytesIO(encoded_image.tobytes()) + image_bytes.seek(0) # Ensure position is at the beginning + ocr_result = mineru_service.recognize(image_bytes) else: recognition_method = "FormulaOnly (GLMOCR)" log.info(f"Using {recognition_method}") - ocr_result = glmocr_service.recognize(image) + ocr_result = glmocr_service.recognize(processed_image) ocr_time = time.time() - ocr_start total_time = time.time() - preprocess_start diff --git a/app/core/config.py b/app/core/config.py index c11760b..07bb04a 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -21,17 +21,31 @@ class Settings(BaseSettings): api_prefix: str = "/doc_process/v1" debug: bool = False + # Base Host Settings (can be overridden via .env file) + # Default: 127.0.0.1 (production) + # Dev: Set BASE_HOST=100.115.184.74 in .env file + base_host: str = "127.0.0.1" + # PaddleOCR-VL Settings - paddleocr_vl_url: str = "http://127.0.0.1:8001/v1" + @property + def paddleocr_vl_url(self) -> str: + """Get PaddleOCR-VL URL based on base_host.""" + return f"http://{self.base_host}:8001/v1" # MinerOCR Settings - miner_ocr_api_url: str = "http://127.0.0.1:8000/file_parse" + @property + def miner_ocr_api_url(self) -> str: + """Get MinerOCR API URL based on base_host.""" + return f"http://{self.base_host}:8000/file_parse" # GLM OCR Settings - glm_ocr_url: str = "http://127.0.0.1:8002/v1" + @property + def glm_ocr_url(self) -> str: + """Get GLM OCR URL based on base_host.""" + return f"http://{self.base_host}:8002/v1" # padding ratio - is_padding: bool = False + is_padding: bool = True padding_ratio: float = 0.15 # Model Paths diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 78e38e6..2981052 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -650,26 +650,16 @@ class MineruOCRService(OCRServiceBase): return formula_text - def recognize(self, image: np.ndarray) -> dict: + def recognize(self, image_bytes: BytesIO) -> dict: """Recognize content using local file_parse API. Args: - image: Input image as numpy array in BGR format. + image_bytes: Input image as BytesIO object (already encoded as PNG). Returns: Dict with 'markdown', 'latex', 'mathml' keys. """ try: - if self.image_processor and settings.is_padding: - image = self.image_processor.add_padding(image) - - # Convert numpy array to image bytes - success, encoded_image = cv2.imencode(".png", image) - if not success: - raise RuntimeError("Failed to encode image") - - image_bytes = BytesIO(encoded_image.tobytes()) - # Prepare multipart form data files = {"files": ("image.png", image_bytes, "image/png")} @@ -731,5 +721,11 @@ if __name__ == "__main__": mineru_service = MineruOCRService() image = cv2.imread("test/formula2.jpg") image_numpy = np.array(image) - ocr_result = mineru_service.recognize(image_numpy) + # Encode image to bytes (as done in API layer) + success, encoded_image = cv2.imencode(".png", image_numpy) + if not success: + raise RuntimeError("Failed to encode image") + image_bytes = BytesIO(encoded_image.tobytes()) + image_bytes.seek(0) + ocr_result = mineru_service.recognize(image_bytes) print(ocr_result)