fix: layout detection & format conversion robustness

Three targeted fixes for layout processing issues:

1. formula_number type mapping (layout_detector.py)
   - Changed the formula_number region type from "formula" to "text"
   - Ensures the Text Recognition prompt is used, preventing $$-wrapped output
   - Prevents malformed \tag{$$...\n$$} in merged formulas

2. Reading order (ocr_service.py)
   - Sort layout regions by (y1, x1) after detection
   - Ensures top-to-bottom, left-to-right processing order
   - Fixes paragraph ordering issues in output

3. Formula number cleaning (glm_postprocess.py)
   - clean_formula_number() now strips $$, $, and \[...\] delimiters
   - Handles the edge case where vLLM still returns math-mode wrapped content
   - Prevents delimiter leakage into \tag{} placeholders

Also adds logging:
   - Warning when empty formula content is skipped
   - Warning when a region crop is too small (< 10×10 px)
   - Warning when a parallel vLLM call fails
   - Warning when format conversion fails

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-03-09 17:57:05 +08:00
parent bd1c118cb2
commit cff14904bf
3 changed files with 39 additions and 6 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -1,6 +1,7 @@
 """PaddleOCR-VL client service for text and formula recognition."""

 import base64
+import logging
 import re
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -20,6 +21,7 @@ from app.services.image_processor import ImageProcessor
 from app.services.layout_detector import LayoutDetector

 settings = get_settings()
+logger = logging.getLogger(__name__)

 _COMMANDS_NEED_SPACE = {
    # operators / calculus
@@ -883,6 +885,9 @@ class GLMOCREndToEndService(OCRServiceBase):
        # 2. Layout detection
        layout_info = self.layout_detector.detect(padded)

+        # Sort regions in reading order: top-to-bottom, left-to-right
+        layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0]))
+
        # 3. OCR: per-region (parallel) or full-image fallback
        if not layout_info.regions:
            raw_content = self._call_vllm(padded, _DEFAULT_PROMPT)
@@ -895,7 +900,13 @@ class GLMOCREndToEndService(OCRServiceBase):
                    continue
                x1, y1, x2, y2 = (int(c) for c in region.bbox)
                cropped = padded[y1:y2, x1:x2]
-                if cropped.size == 0:
+                if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10:
+                    logger.warning(
+                        "Skipping region idx=%d (label=%s): crop too small %s",
+                        idx,
+                        region.native_label,
+                        cropped.shape[:2],
+                    )
                    continue
                prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT)
                tasks.append((idx, region, cropped, prompt))
@@ -915,7 +926,8 @@ class GLMOCREndToEndService(OCRServiceBase):
                        idx = future_map[future]
                        try:
                            raw_results[idx] = future.result()
-                        except Exception:
+                        except Exception as e:
+                            logger.warning("vLLM call failed for region idx=%d: %s", idx, e)
                            raw_results[idx] = ""

                # Build structured region dicts for GLMResultFormatter
@@ -940,8 +952,11 @@ class GLMOCREndToEndService(OCRServiceBase):
        # 6. Format conversion
        latex, mathml, mml = "", "", ""
        if markdown_content and self.converter:
-            fmt = self.converter.convert_to_formats(markdown_content)
-            latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml
+            try:
+                fmt = self.converter.convert_to_formats(markdown_content)
+                latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml
+            except RuntimeError as e:
+                logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e)

        return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}