From f8173f7c0a0d2701ac2fd1aaeae466bb05dfbafb Mon Sep 17 00:00:00 2001
From: liuyuanchuang <yuanchuang_liu@qingsongchou.com>
Date: Tue, 10 Mar 2026 09:54:54 +0800
Subject: [PATCH] feat: optimize padding and formula fallback

---
 app/services/image_processor.py |  7 +++++--
 app/services/ocr_service.py     | 10 ++++++++--
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/app/services/image_processor.py b/app/services/image_processor.py
index b57dff6..baf6f8a 100644
--- a/app/services/image_processor.py
+++ b/app/services/image_processor.py
@@ -104,7 +104,8 @@ class ImageProcessor:
         """Add whitespace padding around the image.
 
         Adds padding equal to padding_ratio * max(height, width) on each side.
-        This expands the image by approximately 30% total (15% on each side).
+        Small images (height < 80 or width < 500) use a fixed ratio of 0.2 instead of padding_ratio.
+        This expands the image by approximately 30% total (15% on each side) for normal images.
 
         Args:
             image: Input image as numpy array in BGR format.
@@ -113,7 +114,9 @@ class ImageProcessor:
             Padded image as numpy array.
         """
         height, width = image.shape[:2]
-        padding = int(max(height, width) * self.padding_ratio)
+        # Small images use a fixed 0.2 ratio instead of self.padding_ratio to preserve detail
+        padding_ratio = 0.2 if height < 80 or width < 500 else self.padding_ratio
+        padding = int(max(height, width) * padding_ratio)
 
         # Add white padding on all sides
         padded_image = cv2.copyMakeBorder(
diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py
index 321a483..3ccfb53 100644
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -890,8 +890,14 @@ class GLMOCREndToEndService(OCRServiceBase):
 
         # 3. OCR: per-region (parallel) or full-image fallback
         if not layout_info.regions:
-            raw_content = self._call_vllm(padded, _DEFAULT_PROMPT)
-            markdown_content = self._formatter._clean_content(raw_content)
+            # No layout detected → assume it's a formula, use formula recognition
+            logger.info("No layout regions detected, treating image as formula")
+            raw_content = self._call_vllm(padded, _TASK_PROMPTS["formula"])
+            # Format as display formula markdown
+            formatted_content = raw_content.strip()
+            if not (formatted_content.startswith("$$") and formatted_content.endswith("$$")):
+                formatted_content = f"$$\n{formatted_content}\n$$"
+            markdown_content = formatted_content
         else:
             # Build task list for non-figure regions
             tasks = []