fix: layout detection & format conversion robustness

Three targeted fixes for layout processing issues:

1. formula_number type mapping (layout_detector.py)
   - Changed formula_number region type from "formula" to "text"
   - Ensures the Text Recognition prompt is used, preventing $$-wrapped output
   - Prevents malformed \tag{$$...\n$$} in merged formulas

2. Reading order (ocr_service.py)
   - Sort layout regions by (y1, x1) after detection
   - Ensures top-to-bottom, left-to-right processing order
   - Fixes paragraph ordering issues in output

3. Formula number cleaning (glm_postprocess.py)
   - clean_formula_number() now strips $$...$$, \[...\], $...$ and \(...\) delimiters
   - Handles edge case where vLLM still returns math-mode wrapped content
   - Prevents delimiter leakage into \tag{} placeholders

Also adds logging:
- Warning when empty formula content is skipped
- Warning when region crop is too small (height or width < 10 px)
- Warning when vLLM parallel call fails
- Warning when format conversion fails with RuntimeError (empty latex/mathml/mml are returned)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
liuyuanchuang
2026-03-09 17:57:05 +08:00
parent bd1c118cb2
commit cff14904bf
3 changed files with 39 additions and 6 deletions

View File

@@ -13,8 +13,11 @@ Covers:
from __future__ import annotations from __future__ import annotations
import logging
import re import re
import json import json
logger = logging.getLogger(__name__)
from collections import Counter from collections import Counter
from copy import deepcopy from copy import deepcopy
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
@@ -94,8 +97,18 @@ def clean_repeated_content(
def clean_formula_number(number_content: str) -> str: def clean_formula_number(number_content: str) -> str:
"""Strip parentheses from a formula number string, e.g. '(1)''1'.""" """Strip delimiters from a formula number string, e.g. '(1)''1'.
Also strips math-mode delimiters ($$, $, \\[...\\]) that vLLM may add
when the region is processed with a formula prompt.
"""
s = number_content.strip() s = number_content.strip()
# Strip display math delimiters
for start, end in [("$$", "$$"), (r"\[", r"\]"), ("$", "$"), (r"\(", r"\)")]:
if s.startswith(start) and s.endswith(end) and len(s) > len(start) + len(end):
s = s[len(start):-len(end)].strip()
break
# Strip CJK/ASCII parentheses
if s.startswith("(") and s.endswith(")"): if s.startswith("(") and s.endswith(")"):
return s[1:-1] return s[1:-1]
if s.startswith("") and s.endswith(""): if s.startswith("") and s.endswith(""):
@@ -253,6 +266,9 @@ class GLMResultFormatter:
if content.startswith(s) and content.endswith(e): if content.startswith(s) and content.endswith(e):
content = content[len(s) : -len(e)].strip() content = content[len(s) : -len(e)].strip()
break break
if not content:
logger.warning("Skipping formula region with empty content after stripping delimiters")
return ""
content = "$$\n" + content + "\n$$" content = "$$\n" + content + "\n$$"
# Text formatting # Text formatting

View File

@@ -66,7 +66,9 @@ class LayoutDetector:
# Formula types # Formula types
"display_formula": "formula", "display_formula": "formula",
"inline_formula": "formula", "inline_formula": "formula",
"formula_number": "formula", # formula_number is a plain text annotation "(2.9)" next to a formula,
# not a formula itself — use text prompt so vLLM returns plain text
"formula_number": "text",
# Table types # Table types
"table": "table", "table": "table",
# Figure types # Figure types

View File

@@ -1,6 +1,7 @@
"""PaddleOCR-VL client service for text and formula recognition.""" """PaddleOCR-VL client service for text and formula recognition."""
import base64 import base64
import logging
import re import re
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -20,6 +21,7 @@ from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector from app.services.layout_detector import LayoutDetector
settings = get_settings() settings = get_settings()
logger = logging.getLogger(__name__)
_COMMANDS_NEED_SPACE = { _COMMANDS_NEED_SPACE = {
# operators / calculus # operators / calculus
@@ -883,6 +885,9 @@ class GLMOCREndToEndService(OCRServiceBase):
# 2. Layout detection # 2. Layout detection
layout_info = self.layout_detector.detect(padded) layout_info = self.layout_detector.detect(padded)
# Sort regions in reading order: top-to-bottom, left-to-right
layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0]))
# 3. OCR: per-region (parallel) or full-image fallback # 3. OCR: per-region (parallel) or full-image fallback
if not layout_info.regions: if not layout_info.regions:
raw_content = self._call_vllm(padded, _DEFAULT_PROMPT) raw_content = self._call_vllm(padded, _DEFAULT_PROMPT)
@@ -895,7 +900,13 @@ class GLMOCREndToEndService(OCRServiceBase):
continue continue
x1, y1, x2, y2 = (int(c) for c in region.bbox) x1, y1, x2, y2 = (int(c) for c in region.bbox)
cropped = padded[y1:y2, x1:x2] cropped = padded[y1:y2, x1:x2]
if cropped.size == 0: if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10:
logger.warning(
"Skipping region idx=%d (label=%s): crop too small %s",
idx,
region.native_label,
cropped.shape[:2],
)
continue continue
prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT) prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT)
tasks.append((idx, region, cropped, prompt)) tasks.append((idx, region, cropped, prompt))
@@ -915,7 +926,8 @@ class GLMOCREndToEndService(OCRServiceBase):
idx = future_map[future] idx = future_map[future]
try: try:
raw_results[idx] = future.result() raw_results[idx] = future.result()
except Exception: except Exception as e:
logger.warning("vLLM call failed for region idx=%d: %s", idx, e)
raw_results[idx] = "" raw_results[idx] = ""
# Build structured region dicts for GLMResultFormatter # Build structured region dicts for GLMResultFormatter
@@ -940,8 +952,11 @@ class GLMOCREndToEndService(OCRServiceBase):
# 6. Format conversion # 6. Format conversion
latex, mathml, mml = "", "", "" latex, mathml, mml = "", "", ""
if markdown_content and self.converter: if markdown_content and self.converter:
fmt = self.converter.convert_to_formats(markdown_content) try:
latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml fmt = self.converter.convert_to_formats(markdown_content)
latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml
except RuntimeError as e:
logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e)
return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml} return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}