fix: layout detection & format conversion robustness
Three targeted fixes for layout processing issues:
1. formula_number type mapping (layout_detector.py)
- Changed formula_number region type from "formula" to "text"
- Ensures the Text Recognition prompt is used for these regions, preventing $$-wrapped output
- Prevents malformed \tag{$$...\n$$} in merged formulas
2. Reading order (ocr_service.py)
- Sort layout regions by (y1, x1) after detection
- Ensures top-to-bottom, left-to-right processing order
- Fixes paragraph ordering issues in output
3. Formula number cleaning (glm_postprocess.py)
- clean_formula_number() now strips $$, $, \[...\], and \(...\) delimiters
- Handles edge case where vLLM still returns math-mode wrapped content
- Prevents delimiter leakage into \tag{} placeholders
Also adds logging:
- Warning when empty formula content is skipped
- Warning when region crop is too small (height or width < 10 px)
- Warning when vLLM parallel call fails
- Warning when format conversion fails with a RuntimeError (latex/mathml/mml are then returned empty)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -13,8 +13,11 @@ Covers:
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
@@ -94,8 +97,18 @@ def clean_repeated_content(
|
|||||||
|
|
||||||
|
|
||||||
def clean_formula_number(number_content: str) -> str:
|
def clean_formula_number(number_content: str) -> str:
|
||||||
"""Strip parentheses from a formula number string, e.g. '(1)' → '1'."""
|
"""Strip delimiters from a formula number string, e.g. '(1)' → '1'.
|
||||||
|
|
||||||
|
Also strips math-mode delimiters ($$, $, \\[...\\]) that vLLM may add
|
||||||
|
when the region is processed with a formula prompt.
|
||||||
|
"""
|
||||||
s = number_content.strip()
|
s = number_content.strip()
|
||||||
|
# Strip display math delimiters
|
||||||
|
for start, end in [("$$", "$$"), (r"\[", r"\]"), ("$", "$"), (r"\(", r"\)")]:
|
||||||
|
if s.startswith(start) and s.endswith(end) and len(s) > len(start) + len(end):
|
||||||
|
s = s[len(start):-len(end)].strip()
|
||||||
|
break
|
||||||
|
# Strip CJK/ASCII parentheses
|
||||||
if s.startswith("(") and s.endswith(")"):
|
if s.startswith("(") and s.endswith(")"):
|
||||||
return s[1:-1]
|
return s[1:-1]
|
||||||
if s.startswith("(") and s.endswith(")"):
|
if s.startswith("(") and s.endswith(")"):
|
||||||
@@ -253,6 +266,9 @@ class GLMResultFormatter:
|
|||||||
if content.startswith(s) and content.endswith(e):
|
if content.startswith(s) and content.endswith(e):
|
||||||
content = content[len(s) : -len(e)].strip()
|
content = content[len(s) : -len(e)].strip()
|
||||||
break
|
break
|
||||||
|
if not content:
|
||||||
|
logger.warning("Skipping formula region with empty content after stripping delimiters")
|
||||||
|
return ""
|
||||||
content = "$$\n" + content + "\n$$"
|
content = "$$\n" + content + "\n$$"
|
||||||
|
|
||||||
# Text formatting
|
# Text formatting
|
||||||
|
|||||||
@@ -66,7 +66,9 @@ class LayoutDetector:
|
|||||||
# Formula types
|
# Formula types
|
||||||
"display_formula": "formula",
|
"display_formula": "formula",
|
||||||
"inline_formula": "formula",
|
"inline_formula": "formula",
|
||||||
"formula_number": "formula",
|
# formula_number is a plain text annotation "(2.9)" next to a formula,
|
||||||
|
# not a formula itself — use text prompt so vLLM returns plain text
|
||||||
|
"formula_number": "text",
|
||||||
# Table types
|
# Table types
|
||||||
"table": "table",
|
"table": "table",
|
||||||
# Figure types
|
# Figure types
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"""PaddleOCR-VL client service for text and formula recognition."""
|
"""PaddleOCR-VL client service for text and formula recognition."""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
@@ -20,6 +21,7 @@ from app.services.image_processor import ImageProcessor
|
|||||||
from app.services.layout_detector import LayoutDetector
|
from app.services.layout_detector import LayoutDetector
|
||||||
|
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_COMMANDS_NEED_SPACE = {
|
_COMMANDS_NEED_SPACE = {
|
||||||
# operators / calculus
|
# operators / calculus
|
||||||
@@ -883,6 +885,9 @@ class GLMOCREndToEndService(OCRServiceBase):
|
|||||||
# 2. Layout detection
|
# 2. Layout detection
|
||||||
layout_info = self.layout_detector.detect(padded)
|
layout_info = self.layout_detector.detect(padded)
|
||||||
|
|
||||||
|
# Sort regions in reading order: top-to-bottom, left-to-right
|
||||||
|
layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0]))
|
||||||
|
|
||||||
# 3. OCR: per-region (parallel) or full-image fallback
|
# 3. OCR: per-region (parallel) or full-image fallback
|
||||||
if not layout_info.regions:
|
if not layout_info.regions:
|
||||||
raw_content = self._call_vllm(padded, _DEFAULT_PROMPT)
|
raw_content = self._call_vllm(padded, _DEFAULT_PROMPT)
|
||||||
@@ -895,7 +900,13 @@ class GLMOCREndToEndService(OCRServiceBase):
|
|||||||
continue
|
continue
|
||||||
x1, y1, x2, y2 = (int(c) for c in region.bbox)
|
x1, y1, x2, y2 = (int(c) for c in region.bbox)
|
||||||
cropped = padded[y1:y2, x1:x2]
|
cropped = padded[y1:y2, x1:x2]
|
||||||
if cropped.size == 0:
|
if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10:
|
||||||
|
logger.warning(
|
||||||
|
"Skipping region idx=%d (label=%s): crop too small %s",
|
||||||
|
idx,
|
||||||
|
region.native_label,
|
||||||
|
cropped.shape[:2],
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT)
|
prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT)
|
||||||
tasks.append((idx, region, cropped, prompt))
|
tasks.append((idx, region, cropped, prompt))
|
||||||
@@ -915,7 +926,8 @@ class GLMOCREndToEndService(OCRServiceBase):
|
|||||||
idx = future_map[future]
|
idx = future_map[future]
|
||||||
try:
|
try:
|
||||||
raw_results[idx] = future.result()
|
raw_results[idx] = future.result()
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
logger.warning("vLLM call failed for region idx=%d: %s", idx, e)
|
||||||
raw_results[idx] = ""
|
raw_results[idx] = ""
|
||||||
|
|
||||||
# Build structured region dicts for GLMResultFormatter
|
# Build structured region dicts for GLMResultFormatter
|
||||||
@@ -940,8 +952,11 @@ class GLMOCREndToEndService(OCRServiceBase):
|
|||||||
# 6. Format conversion
|
# 6. Format conversion
|
||||||
latex, mathml, mml = "", "", ""
|
latex, mathml, mml = "", "", ""
|
||||||
if markdown_content and self.converter:
|
if markdown_content and self.converter:
|
||||||
fmt = self.converter.convert_to_formats(markdown_content)
|
try:
|
||||||
latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml
|
fmt = self.converter.convert_to_formats(markdown_content)
|
||||||
|
latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml
|
||||||
|
except RuntimeError as e:
|
||||||
|
logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e)
|
||||||
|
|
||||||
return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}
|
return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user