fix: remove padding from GLMOCREndToEndService and clean up ruff violations

- Drop image padding in GLMOCREndToEndService.recognize(); use the raw image directly
- Fix F821: undefined `padded` references replaced with `image`
- Fix F601: duplicate dict key "≠" in converter
- Fix F841: unused `image_cls_ids` variable in layout_postprocess
- Fix E702: semicolon-separated statements in layout_postprocess
- Fix UP031: percent-format replaced with f-string in logging_config
- Auto-fix 44 additional ruff violations (import order, UP035/UP045/UP006, F401, F541)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 19:52:22 +08:00
parent f8173f7c0a
commit 30d2c2f45b
16 changed files with 162 additions and 140 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -878,12 +878,9 @@ class GLMOCREndToEndService(OCRServiceBase):
        Returns:
            Dict with 'markdown', 'latex', 'mathml', 'mml' keys.
        """
-        # 1. Padding
-        padded = self.image_processor.add_padding(image)
-        img_h, img_w = padded.shape[:2]
-
-        # 2. Layout detection
-        layout_info = self.layout_detector.detect(padded)
+        # 1. Layout detection
+        img_h, img_w = image.shape[:2]
+        layout_info = self.layout_detector.detect(image)

        # Sort regions in reading order: top-to-bottom, left-to-right
        layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0]))
@@ -892,7 +889,7 @@ class GLMOCREndToEndService(OCRServiceBase):
        if not layout_info.regions:
            # No layout detected → assume it's a formula, use formula recognition
            logger.info("No layout regions detected, treating image as formula")
-            raw_content = self._call_vllm(padded, _TASK_PROMPTS["formula"])
+            raw_content = self._call_vllm(image, _TASK_PROMPTS["formula"])
            # Format as display formula markdown
            formatted_content = raw_content.strip()
            if not (formatted_content.startswith("$$") and formatted_content.endswith("$$")):
@@ -905,7 +902,7 @@ class GLMOCREndToEndService(OCRServiceBase):
                if region.type == "figure":
                    continue
                x1, y1, x2, y2 = (int(c) for c in region.bbox)
-                cropped = padded[y1:y2, x1:x2]
+                cropped = image[y1:y2, x1:x2]
                if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10:
                    logger.warning(
                        "Skipping region idx=%d (label=%s): crop too small %s",
@@ -918,7 +915,7 @@ class GLMOCREndToEndService(OCRServiceBase):
                tasks.append((idx, region, cropped, prompt))

            if not tasks:
-                raw_content = self._call_vllm(padded, _DEFAULT_PROMPT)
+                raw_content = self._call_vllm(image, _DEFAULT_PROMPT)
                markdown_content = self._formatter._clean_content(raw_content)
            else:
                # Parallel OCR calls
@@ -965,17 +962,3 @@ class GLMOCREndToEndService(OCRServiceBase):
                logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e)

        return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}
-
-
-if __name__ == "__main__":
-    mineru_service = MineruOCRService()
-    image = cv2.imread("test/formula2.jpg")
-    image_numpy = np.array(image)
-    # Encode image to bytes (as done in API layer)
-    success, encoded_image = cv2.imencode(".png", image_numpy)
-    if not success:
-        raise RuntimeError("Failed to encode image")
-    image_bytes = BytesIO(encoded_image.tobytes())
-    image_bytes.seek(0)
-    ocr_result = mineru_service.recognize(image_bytes)
-    print(ocr_result)