diff --git a/app/services/glm_postprocess.py b/app/services/glm_postprocess.py index d84e589..36256fd 100644 --- a/app/services/glm_postprocess.py +++ b/app/services/glm_postprocess.py @@ -13,8 +13,11 @@ Covers: from __future__ import annotations +import logging import re import json + +logger = logging.getLogger(__name__) from collections import Counter from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple @@ -94,8 +97,18 @@ def clean_repeated_content( def clean_formula_number(number_content: str) -> str: - """Strip parentheses from a formula number string, e.g. '(1)' → '1'.""" + """Strip delimiters from a formula number string, e.g. '(1)' → '1'. + + Also strips math-mode delimiters ($$, $, \\[...\\]) that vLLM may add + when the region is processed with a formula prompt. + """ s = number_content.strip() + # Strip display math delimiters + for start, end in [("$$", "$$"), (r"\[", r"\]"), ("$", "$"), (r"\(", r"\)")]: + if s.startswith(start) and s.endswith(end) and len(s) > len(start) + len(end): + s = s[len(start):-len(end)].strip() + break + # Strip CJK/ASCII parentheses if s.startswith("(") and s.endswith(")"): return s[1:-1] if s.startswith("(") and s.endswith(")"): @@ -253,6 +266,9 @@ class GLMResultFormatter: if content.startswith(s) and content.endswith(e): content = content[len(s) : -len(e)].strip() break + if not content: + logger.warning("Skipping formula region with empty content after stripping delimiters") + return "" content = "$$\n" + content + "\n$$" # Text formatting diff --git a/app/services/layout_detector.py b/app/services/layout_detector.py index 84b0647..36eb1b9 100644 --- a/app/services/layout_detector.py +++ b/app/services/layout_detector.py @@ -66,7 +66,9 @@ class LayoutDetector: # Formula types "display_formula": "formula", "inline_formula": "formula", - "formula_number": "formula", + # formula_number is a plain text annotation "(2.9)" next to a formula, + # not a formula itself — use text prompt so vLLM returns plain text + "formula_number": "text", # Table types "table": "table", # Figure types diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 28de285..321a483 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -1,6 +1,7 @@ """PaddleOCR-VL client service for text and formula recognition.""" import base64 +import logging import re from abc import ABC, abstractmethod from concurrent.futures import ThreadPoolExecutor, as_completed @@ -20,6 +21,7 @@ from app.services.image_processor import ImageProcessor from app.services.layout_detector import LayoutDetector settings = get_settings() +logger = logging.getLogger(__name__) _COMMANDS_NEED_SPACE = { # operators / calculus @@ -883,6 +885,9 @@ class GLMOCREndToEndService(OCRServiceBase): # 2. Layout detection layout_info = self.layout_detector.detect(padded) + # Sort regions in reading order: top-to-bottom, left-to-right + layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0])) + # 3. OCR: per-region (parallel) or full-image fallback if not layout_info.regions: raw_content = self._call_vllm(padded, _DEFAULT_PROMPT) @@ -895,7 +900,13 @@ class GLMOCREndToEndService(OCRServiceBase): continue x1, y1, x2, y2 = (int(c) for c in region.bbox) cropped = padded[y1:y2, x1:x2] - if cropped.size == 0: + if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10: + logger.warning( + "Skipping region idx=%d (label=%s): crop too small %s", + idx, + region.native_label, + cropped.shape[:2], + ) continue prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT) tasks.append((idx, region, cropped, prompt)) @@ -915,7 +926,8 @@ class GLMOCREndToEndService(OCRServiceBase): idx = future_map[future] try: raw_results[idx] = future.result() - except Exception: + except Exception as e: + logger.warning("vLLM call failed for region idx=%d: %s", idx, e) raw_results[idx] = "" # Build structured region dicts for GLMResultFormatter @@ -940,8 +952,11 @@ class GLMOCREndToEndService(OCRServiceBase): # 6. Format conversion latex, mathml, mml = "", "", "" if markdown_content and self.converter: - fmt = self.converter.convert_to_formats(markdown_content) - latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml + try: + fmt = self.converter.convert_to_formats(markdown_content) + latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml + except RuntimeError as e: + logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e) return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}