From cff14904bfa965355a6dd7075c315457853930c1 Mon Sep 17 00:00:00 2001 From: liuyuanchuang Date: Mon, 9 Mar 2026 17:57:05 +0800 Subject: [PATCH] fix: layout detection & format conversion robustness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three targeted fixes for layout processing issues: 1. formula_number type mapping (layout_detector.py) - Changed formula_number region type from "formula" to "text" - Ensures the region is recognized with the Text Recognition prompt, preventing $$-wrapped output - Prevents malformed \tag{$$...\n$$} in merged formulas 2. Reading order (ocr_service.py) - Sort layout regions by (y1, x1) after detection - Ensures top-to-bottom, left-to-right processing order - Fixes paragraph ordering issues in output 3. Formula number cleaning (glm_postprocess.py) - clean_formula_number() now strips $$, $, \[...\], \(...\) delimiters - Handles edge case where vLLM still returns math-mode wrapped content - Prevents delimiter leakage into \tag{} placeholders Also adds logging: - Warning when empty formula content is skipped - Warning when region crop is too small (height or width < 10 px) - Warning when vLLM parallel call fails - Warning when format conversion raises RuntimeError (empty latex/mathml/mml returned) Co-Authored-By: Claude Haiku 4.5 --- app/services/glm_postprocess.py | 18 +++++++++++++++++- app/services/layout_detector.py | 4 +++- app/services/ocr_service.py | 23 +++++++++++++++++++---- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/app/services/glm_postprocess.py b/app/services/glm_postprocess.py index d84e589..36256fd 100644 --- a/app/services/glm_postprocess.py +++ b/app/services/glm_postprocess.py @@ -13,8 +13,11 @@ Covers: from __future__ import annotations +import logging import re import json + +logger = logging.getLogger(__name__) from collections import Counter from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple @@ -94,8 +97,18 @@ def clean_repeated_content( def clean_formula_number(number_content: str) -> str: - """Strip parentheses from a 
formula number string, e.g. '(1)' → '1'.""" + """Strip delimiters from a formula number string, e.g. '(1)' → '1'. + + Also strips math-mode delimiters ($$, $, \\[...\\]) that vLLM may add + when the region is processed with a formula prompt. + """ s = number_content.strip() + # Strip display math delimiters + for start, end in [("$$", "$$"), (r"\[", r"\]"), ("$", "$"), (r"\(", r"\)")]: + if s.startswith(start) and s.endswith(end) and len(s) > len(start) + len(end): + s = s[len(start):-len(end)].strip() + break + # Strip CJK/ASCII parentheses if s.startswith("(") and s.endswith(")"): return s[1:-1] if s.startswith("(") and s.endswith(")"): @@ -253,6 +266,9 @@ class GLMResultFormatter: if content.startswith(s) and content.endswith(e): content = content[len(s) : -len(e)].strip() break + if not content: + logger.warning("Skipping formula region with empty content after stripping delimiters") + return "" content = "$$\n" + content + "\n$$" # Text formatting diff --git a/app/services/layout_detector.py b/app/services/layout_detector.py index 84b0647..36eb1b9 100644 --- a/app/services/layout_detector.py +++ b/app/services/layout_detector.py @@ -66,7 +66,9 @@ class LayoutDetector: # Formula types "display_formula": "formula", "inline_formula": "formula", - "formula_number": "formula", + # formula_number is a plain text annotation "(2.9)" next to a formula, + # not a formula itself — use text prompt so vLLM returns plain text + "formula_number": "text", # Table types "table": "table", # Figure types diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 28de285..321a483 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -1,6 +1,7 @@ """PaddleOCR-VL client service for text and formula recognition.""" import base64 +import logging import re from abc import ABC, abstractmethod from concurrent.futures import ThreadPoolExecutor, as_completed @@ -20,6 +21,7 @@ from app.services.image_processor import ImageProcessor from 
app.services.layout_detector import LayoutDetector settings = get_settings() +logger = logging.getLogger(__name__) _COMMANDS_NEED_SPACE = { # operators / calculus @@ -883,6 +885,9 @@ class GLMOCREndToEndService(OCRServiceBase): # 2. Layout detection layout_info = self.layout_detector.detect(padded) + # Sort regions in reading order: top-to-bottom, left-to-right + layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0])) + # 3. OCR: per-region (parallel) or full-image fallback if not layout_info.regions: raw_content = self._call_vllm(padded, _DEFAULT_PROMPT) @@ -895,7 +900,13 @@ class GLMOCREndToEndService(OCRServiceBase): continue x1, y1, x2, y2 = (int(c) for c in region.bbox) cropped = padded[y1:y2, x1:x2] - if cropped.size == 0: + if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10: + logger.warning( + "Skipping region idx=%d (label=%s): crop too small %s", + idx, + region.native_label, + cropped.shape[:2], + ) continue prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT) tasks.append((idx, region, cropped, prompt)) @@ -915,7 +926,8 @@ class GLMOCREndToEndService(OCRServiceBase): idx = future_map[future] try: raw_results[idx] = future.result() - except Exception: + except Exception as e: + logger.warning("vLLM call failed for region idx=%d: %s", idx, e) raw_results[idx] = "" # Build structured region dicts for GLMResultFormatter @@ -940,8 +952,11 @@ class GLMOCREndToEndService(OCRServiceBase): # 6. Format conversion latex, mathml, mml = "", "", "" if markdown_content and self.converter: - fmt = self.converter.convert_to_formats(markdown_content) - latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml + try: + fmt = self.converter.convert_to_formats(markdown_content) + latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml + except RuntimeError as e: + logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e) return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}