fix: remove padding from GLMOCREndToEndService and clean up ruff violations

- Drop image padding in GLMOCREndToEndService.recognize(); use raw image directly
- Fix F821: replace undefined `padded` references with `image`
- Fix F601 duplicate dict key "≠" in converter
- Fix F841 unused `image_cls_ids` variable in layout_postprocess
- Fix E702 semicolon-separated statements in layout_postprocess
- Fix UP031: replace percent-format with an f-string in logging_config
- Auto-fix 44 additional ruff violations (import order, UP035/UP045/UP006, F401, F541)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
liuyuanchuang
2026-03-10 19:52:22 +08:00
parent f8173f7c0a
commit 30d2c2f45b
16 changed files with 162 additions and 140 deletions

View File

@@ -878,12 +878,9 @@ class GLMOCREndToEndService(OCRServiceBase):
Returns:
Dict with 'markdown', 'latex', 'mathml', 'mml' keys.
"""
# 1. Padding
padded = self.image_processor.add_padding(image)
img_h, img_w = padded.shape[:2]
# 2. Layout detection
layout_info = self.layout_detector.detect(padded)
# 1. Layout detection
img_h, img_w = image.shape[:2]
layout_info = self.layout_detector.detect(image)
# Sort regions in reading order: top-to-bottom, left-to-right
layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0]))
@@ -892,7 +889,7 @@ class GLMOCREndToEndService(OCRServiceBase):
if not layout_info.regions:
# No layout detected → assume it's a formula, use formula recognition
logger.info("No layout regions detected, treating image as formula")
raw_content = self._call_vllm(padded, _TASK_PROMPTS["formula"])
raw_content = self._call_vllm(image, _TASK_PROMPTS["formula"])
# Format as display formula markdown
formatted_content = raw_content.strip()
if not (formatted_content.startswith("$$") and formatted_content.endswith("$$")):
@@ -905,7 +902,7 @@ class GLMOCREndToEndService(OCRServiceBase):
if region.type == "figure":
continue
x1, y1, x2, y2 = (int(c) for c in region.bbox)
cropped = padded[y1:y2, x1:x2]
cropped = image[y1:y2, x1:x2]
if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10:
logger.warning(
"Skipping region idx=%d (label=%s): crop too small %s",
@@ -918,7 +915,7 @@ class GLMOCREndToEndService(OCRServiceBase):
tasks.append((idx, region, cropped, prompt))
if not tasks:
raw_content = self._call_vllm(padded, _DEFAULT_PROMPT)
raw_content = self._call_vllm(image, _DEFAULT_PROMPT)
markdown_content = self._formatter._clean_content(raw_content)
else:
# Parallel OCR calls
@@ -965,17 +962,3 @@ class GLMOCREndToEndService(OCRServiceBase):
logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e)
return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}
if __name__ == "__main__":
mineru_service = MineruOCRService()
image = cv2.imread("test/formula2.jpg")
image_numpy = np.array(image)
# Encode image to bytes (as done in API layer)
success, encoded_image = cv2.imencode(".png", image_numpy)
if not success:
raise RuntimeError("Failed to encode image")
image_bytes = BytesIO(encoded_image.tobytes())
image_bytes.seek(0)
ocr_result = mineru_service.recognize(image_bytes)
print(ocr_result)