init repo

2025-12-29 17:34:58 +08:00
commit 874fd383cc
36 changed files with 2641 additions and 0 deletions
--- a/app/api/v1/endpoints/image.py
+++ b/app/api/v1/endpoints/image.py
@@ -0,0 +1,59 @@
+"""Image OCR endpoint."""
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service
+from app.schemas.image import ImageOCRRequest, ImageOCRResponse
+from app.services.image_processor import ImageProcessor
+from app.services.layout_detector import LayoutDetector
+from app.services.ocr_service import OCRService
+
+router = APIRouter()
+
+
+@router.post("/ocr", response_model=ImageOCRResponse)
+async def process_image_ocr(
+    request: ImageOCRRequest,
+    image_processor: ImageProcessor = Depends(get_image_processor),
+    layout_detector: LayoutDetector = Depends(get_layout_detector),
+    ocr_service: OCRService = Depends(get_ocr_service),
+) -> ImageOCRResponse:
+    """Process an image and extract content as LaTeX, Markdown, and MathML.
+
+    The processing pipeline:
+    1. Load and preprocess image (add 30% whitespace padding)
+    2. Detect layout using DocLayout-YOLO
+    3. Based on layout:
+       - If plain text exists: use PP-DocLayoutV2 for mixed recognition
+       - Otherwise: use PaddleOCR-VL with formula prompt
+    4. Convert output to LaTeX, Markdown, and MathML formats
+    """
+    try:
+        # 1. Load and preprocess image
+        image = image_processor.preprocess(
+            image_url=request.image_url,
+            image_base64=request.image_base64,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    try:
+        # 2. Detect layout
+        layout_info = layout_detector.detect(image)
+    except RuntimeError as e:
+        raise HTTPException(status_code=500, detail=f"Layout detection failed: {e}")
+
+    try:
+        # 3. Perform OCR based on layout
+        ocr_result = ocr_service.recognize(image, layout_info)
+    except RuntimeError as e:
+        raise HTTPException(status_code=503, detail=str(e))
+
+    # 4. Return response
+    return ImageOCRResponse(
+        latex=ocr_result.get("latex", ""),
+        markdown=ocr_result.get("markdown", ""),
+        mathml=ocr_result.get("mathml", ""),
+        layout_info=layout_info,
+        recognition_mode=ocr_result.get("recognition_mode", ""),
+    )