Files
doc_processer/app/services/ocr_service.py

950 lines
34 KiB
Python

"""PaddleOCR-VL client service for text and formula recognition."""
import base64
import logging
import re
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
import cv2
import numpy as np
import requests
from openai import OpenAI
from paddleocr import PaddleOCRVL
from PIL import Image as PILImage
from app.core.config import get_settings
from app.services.converter import Converter
from app.services.glm_postprocess import GLMResultFormatter
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector
# Application settings, resolved once when this module is imported. The
# services below read their default server URLs and the vLLM token limit
# from this object.
settings = get_settings()
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Whitelist of LaTeX command names (without the leading backslash) that OCR
# tends to glue onto the following token, e.g. "\cdotdS" or "\inX".
# _split_glued_command_token uses it to decide where a space must be inserted.
_COMMANDS_NEED_SPACE = (
    # operators / calculus
    {"cdot", "times", "div", "pm", "mp", "int", "iint", "iiint", "oint", "sum", "prod", "lim"}
    # common functions
    | {"sin", "cos", "tan", "cot", "sec", "csc", "log", "ln", "exp"}
    # set relations (often glued by OCR)
    | {"in", "notin", "subset", "supset", "subseteq", "supseteq", "cap", "cup"}
    # misc
    | {"partial", "nabla"}
)
# Math segments inside markdown: display math ($$...$$) is tried before inline
# math ($...$). Both bodies are non-greedy, and re.DOTALL lets a segment span
# several lines.
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
# A LaTeX command token: a backslash followed by a greedy run of ASCII letters.
# A glued token such as \inX is therefore captured whole; separating it into
# "\in X" is the job of _split_glued_command_token.
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
# Differential candidates ("dX" / "dx") inside math segments. The patterns are
# deliberately conservative:
#   (?<!\\)        - not preceded by a backslash (not the start of a command)
#   (?<![a-zA-Z])  - not preceded by a letter (not inside a word or command)
#   (?![a-zA-Z])   - not followed by another letter (skips "dx" in "dxyz")
# NOTE(review): nothing in the code visible here references these two patterns;
# differential normalization is disabled in _postprocess_math.
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([A-Z])(?![a-zA-Z])")
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([a-z])(?![a-zA-Z])")
# Real LaTeX commands whose names merely *begin with* a whitelisted command
# name (\infty starts with \in, \cdots with \cdot, \sinh with \sin, ...).
# They are complete tokens, not OCR-glued text, so they must never be split.
_COMMANDS_NEVER_SPLIT = frozenset(
    {
        # \cdot...
        "cdots",
        "cdotp",
        # \div...
        "divideontimes",
        # \pm...
        "pmod",
        "pmb",
        "pmatrix",
        # \in... / \int...
        "infty",
        "inf",
        "injlim",
        "intercal",
        "intertext",
        "intop",
        # \lim...
        "limits",
        "liminf",
        "limsup",
        # hyperbolic functions
        "sinh",
        "cosh",
        "tanh",
        "coth",
        "sech",
        "csch",
        # \ln...
        "lnot",
        "lneq",
        "lneqq",
        "lnsim",
        "lnapprox",
        # \subset... / \supset...
        "subsetneq",
        "subsetneqq",
        "subseteqq",
        "supsetneq",
        "supsetneqq",
        "supseteqq",
    }
)


def _split_glued_command_token(token: str) -> str:
    r"""Split an OCR-glued LaTeX command token at its longest whitelisted prefix.

    Examples:
        - \cdotdS -> \cdot dS
        - \intdx  -> \int dx
        - \inX    -> \in X
        - \infty, \cdots, \sinh, \limits -> unchanged (valid commands)

    Args:
        token: A single command token (backslash followed by letters), as
            matched by ``_COMMAND_TOKEN_PATTERN``.

    Returns:
        The token with one space inserted after the longest prefix found in
        ``_COMMANDS_NEED_SPACE``. The token is returned unchanged when it does
        not start with a backslash, is a single-letter command, is itself a
        whitelisted or protected command, or has no whitelisted prefix.
    """
    if not token.startswith("\\"):
        return token
    body = token[1:]
    if len(body) < 2:
        return token
    # A token that already is a complete known command has nothing glued to it.
    # Without the protected set, "\infty" would become "\in fty".
    if body in _COMMANDS_NEED_SPACE or body in _COMMANDS_NEVER_SPLIT:
        return token
    # Try proper prefixes from longest to shortest so that "\intdx" splits
    # after "int" rather than after "in".
    for cut in range(len(body) - 1, 0, -1):
        prefix = body[:cut]
        if prefix in _COMMANDS_NEED_SPACE:
            return f"\\{prefix} {body[cut:]}"
    return token
def _clean_latex_syntax_spaces(expr: str) -> str:
    r"""Strip whitespace that OCR inserts inside LaTeX syntax structures.

    The rewrites are applied in this fixed order:
        1. Around ``_`` and ``^``:          a _ {i}  -> a_{i}
        2. Inside ``_{...}`` / ``^{...}``:  _{i 1}   -> _{i1}
           When the group contains a backslash, only whitespace in front of a
           backslash and whitespace that follows a non-letter, non-backslash
           character is removed, so a command terminator such as the space in
           ``\in X`` survives.
        3. Around ``\frac`` arguments:      \frac { a } { b } -> \frac{a}{b}
        4. Between a backslash and letters: \ alpha  -> \alpha
        5. Between a command and its opening brace, and right after that
           brace:                           \sqrt { x -> \sqrt{x

    NOTE(review): rewrite 4 cannot tell an OCR artefact (``\ alpha``) from an
    intentional control space followed by a letter (``a\ b``); confirm this is
    acceptable for the OCR models in use.

    Args:
        expr: LaTeX math expression.

    Returns:
        Expression with the whitespace described above removed.
    """

    def _tighten_script_group(found: re.Match) -> str:
        # found.group(1) is "_" or "^"; found.group(2) is the brace content.
        marker, inner = found.group(1), found.group(2)
        if "\\" in inner:
            # Commands present: drop whitespace in front of a backslash ...
            inner = re.sub(r"\s+(?=\\)", "", inner)
            # ... and whitespace not preceded by a letter or a backslash.
            inner = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", inner)
        else:
            # No commands at all: every whitespace run can go.
            inner = re.sub(r"\s+", "", inner)
        return f"{marker}{{{inner}}}"

    def _tighten_fraction(found: re.Match) -> str:
        top = found.group(1).strip()
        bottom = found.group(2).strip()
        return "\\frac{" + top + "}{" + bottom + "}"

    rewrites = (
        # 1. whitespace around the subscript / superscript operators
        (r"\s*_\s*", "_"),
        (r"\s*\^\s*", "^"),
        # 2. whitespace inside _{...} / ^{...}
        (r"([_^])\{([^}]+)\}", _tighten_script_group),
        # 3. whitespace around the two \frac arguments
        (r"\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}", _tighten_fraction),
        # 4. whitespace between a backslash and the command name
        (r"\\\s+([a-zA-Z]+)", r"\\\1"),
        # 5. whitespace between a command and "{", and just after that "{"
        (r"(\\[a-zA-Z]+)\s*\{\s*", r"\1{"),
    )
    for pattern, replacement in rewrites:
        expr = re.sub(pattern, replacement, expr)
    return expr
def _postprocess_math(expr: str) -> str:
    r"""Repair common OCR artefacts in one math expression.

    The expression is the text found between ``$...$`` or ``$$...$$``; the
    delimiters themselves are not part of ``expr``.

    Stages, in order:
        0. Merge digits that OCR separated with spaces.
        1. Split glued LaTeX commands (\cdotdS -> \cdot dS, \inX -> \in X).
        2. Remove whitespace inside LaTeX syntax (a _ {i 1} -> a_{i1}).

    Differential normalization (dx -> d x) is intentionally NOT applied here.
    It is too aggressive for OCR output: "dx"/"dy" may be plain variable names
    or subscripts (x_{dx}), and a naive rule can also damage command names that
    contain "d". A narrower, context-aware variant exists as
    ``_normalize_differentials_contextaware`` but is not part of this pipeline.

    Args:
        expr: LaTeX math expression without delimiters.

    Returns:
        Processed LaTeX expression.
    """

    def _respace_command(found: re.Match) -> str:
        return _split_glued_command_token(found.group(0))

    digits_fixed = _fix_ocr_number_errors(expr)
    commands_split = _COMMAND_TOKEN_PATTERN.sub(_respace_command, digits_fixed)
    return _clean_latex_syntax_spaces(commands_split)
def _normalize_differentials_contextaware(expr: str) -> str:
    r"""Context-aware differential normalization (optional, not used by default).

    Rewrites ``dX`` to ``d X`` only in two contexts:
        1. After an integral command (\int, \iint, \iiint, \oint), provided no
           other command appears between the integral and the differential:
           ``\int f(x) dx`` -> ``\int f(x) d x``.
        2. In a fraction denominator: ``\frac{dy}{dx}`` -> ``\frac{dy}{d x}``.

    In both contexts the ``d`` must start a token: it may not be preceded by a
    letter, so the "de" inside "abcde" is never mistaken for a differential.
    Variable names, subscripts such as ``x_{dx}`` and command names are left
    untouched.

    Args:
        expr: LaTeX math expression.

    Returns:
        Expression with differentials normalized in the contexts above only.
    """
    # Group 2 is the text between the integral command and the differential.
    # It is either empty (glued "\intdx") or ends with a non-letter,
    # non-backslash character, which is what keeps "d" from matching mid-word.
    integral_pattern = re.compile(r"(\\i+nt|\\oint)((?:[^\\]*?[^\\a-zA-Z])?)d([a-zA-Z])(?![a-zA-Z])")

    def _space_integral_differential(found: re.Match) -> str:
        command, between, variable = found.groups()
        # Keep the original text verbatim and add a separator only when the
        # differential would otherwise be glued to what precedes it.
        separator = "" if between[-1:].isspace() else " "
        return f"{command}{between}{separator}d {variable}"

    expr = integral_pattern.sub(_space_integral_differential, expr)

    # \frac{...}{dx} -> \frac{...}{d x}; the lookbehind rejects a "d" that is
    # part of a word or of a command name inside the denominator.
    frac_pattern = re.compile(r"(\\frac\{[^}]*\}\{[^}]*?)(?<![a-zA-Z\\])d([a-zA-Z])(?![a-zA-Z])([^}]*\})")
    expr = frac_pattern.sub(r"\1d \2\3", expr)
    return expr
def _fix_ocr_number_errors(expr: str) -> str:
    """Merge digits that OCR split apart inside a LaTeX math expression.

    OCR often breaks numbers, especially decimals, into space-separated pieces:
        - "2 2. 2"  -> "22.2"
        - "2 2 . 2" -> "22.2"
        - "3 0. 4"  -> "30.4"
        - "22. 2"   -> "22.2"
        - "1 5 0"   -> "150"   (only before ",", ")" or the end of the text)

    Every substitution is zero-width or rebuilds the number from its captured
    digits, so a run of any length is merged in a single pass.

    Args:
        expr: LaTeX math expression.

    Returns:
        LaTeX expression with the split numbers merged.
    """
    # 1. Re-attach a decimal point to the digits around it: "22. 2" and
    #    "22 . 2" both become "22.2". Whitespace after the dot is required so
    #    already well-formed input is never touched.
    expr = re.sub(r"(\d)\s*\.\s+(\d)", r"\1.\2", expr)
    # 2. Remove whitespace between digit groups that lead up to a decimal
    #    point: "2 2.2" -> "22.2", "1 2 3.5" -> "123.5".
    expr = re.sub(r"(?<=\d)\s+(?=(?:\d+\s+)*\d+\s*\.)", "", expr)
    # 3. Remove whitespace inside a run of single digits that is terminated by
    #    ",", ")" or the end of the text: "1 5 0" -> "150". Restricting the
    #    run to single digits keeps "1 23" (two separate numbers) intact.
    expr = re.sub(r"(?<=\d)\s+(?=\d(?:\s+\d)*(?:\s*[,\)]|$))", "", expr)
    return expr
def _postprocess_markdown(markdown_content: str) -> str:
    """Apply LaTeX postprocessing only within $...$ / $$...$$ segments.

    Text outside math segments is left untouched. After the math segments are
    repaired, a markdown-level pass removes a false heading marker from
    single-formula content.

    Args:
        markdown_content: Markdown text, possibly containing math segments.

    Returns:
        The markdown with its math segments postprocessed. Empty or ``None``
        input is returned as-is.
    """
    if not markdown_content:
        return markdown_content

    def _fix_segment(m: re.Match) -> str:
        seg = m.group(0)
        # A real display segment is at least "$$" + "$$". A bare "$$" is what
        # the inline alternative matches for an unterminated or odd display
        # delimiter; treating it as display math would turn "$$" into "$$$$".
        if len(seg) >= 4 and seg.startswith("$$") and seg.endswith("$$"):
            return f"$${_postprocess_math(seg[2:-2])}$$"
        if seg == "$$":
            return seg
        if seg.startswith("$") and seg.endswith("$"):
            return f"${_postprocess_math(seg[1:-1])}$"
        return seg

    markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
    # Markdown-level postprocessing runs after the LaTeX repairs.
    markdown_content = _remove_false_heading_from_single_formula(markdown_content)
    return markdown_content
def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
    """Drop a bogus '#' heading marker placed in front of a lone formula.

    OCR sometimes labels a single formula as a heading. The marker is removed
    only when all of the following hold:
        1. The content holds exactly one formula line (display or inline).
        2. That formula sits on a heading line ('#' to '######').
        3. No other text is present (blank lines and bare '#' lines are ignored).

    Examples:
        "# $$E = mc^2$$"                 -> "$$E = mc^2$$"
        "# $x = y$"                      -> "$x = y$"
        "# Introduction\\n$$E = mc^2$$"   -> unchanged (real heading text)

    Args:
        markdown_content: Markdown text with a potential false heading.

    Returns:
        Markdown text with the false heading marker removed, or the input
        unchanged when the conditions above are not met.
    """
    if not markdown_content or not markdown_content.strip():
        return markdown_content

    heading_re = re.compile(r"^(#{1,6})\s+(.+)$")
    formula_re = re.compile(r"\$\$?.+\$\$?")
    bare_hashes_re = re.compile(r"^#+\s*$")

    rows = markdown_content.split("\n")
    formula_headings = []  # (row index, formula text) for headings that wrap a formula
    standalone_formulas = 0
    saw_text = False

    for pos, row in enumerate(rows):
        text = row.strip()
        if not text:
            continue
        as_heading = heading_re.match(text)
        if as_heading is not None:
            payload = as_heading.group(2)
            if formula_re.fullmatch(payload):
                formula_headings.append((pos, payload))
            else:
                # A heading that carries ordinary text is a genuine heading.
                saw_text = True
            continue
        if formula_re.fullmatch(text):
            standalone_formulas += 1
            continue
        if not bare_hashes_re.match(text):
            saw_text = True

    only_one_formula = standalone_formulas + len(formula_headings) == 1
    if only_one_formula and len(formula_headings) == 1 and not saw_text:
        pos, payload = formula_headings[0]
        rows[pos] = payload
    return "\n".join(rows)
class OCRServiceBase(ABC):
    """Common interface implemented by every OCR service in this module."""

    @abstractmethod
    def recognize(self, image: np.ndarray) -> dict:
        """Recognize the content of ``image`` and return the result as a dict."""
class OCRService(OCRServiceBase):
    """Service for OCR using PaddleOCR-VL."""

    # The PaddleOCR-VL pipeline is created lazily and stored on the class, so
    # all instances share one pipeline (built with the server URL of whichever
    # instance triggers its creation first).
    _pipeline: PaddleOCRVL | None = None
    # Not assigned or read by any code in this class.
    _layout_detector: LayoutDetector | None = None

    def __init__(
        self,
        vl_server_url: str,
        layout_detector: LayoutDetector,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Store the collaborators used by the recognition methods.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL. A falsy
                value falls back to ``settings.paddleocr_vl_url``.
            layout_detector: Layout detector instance.
            image_processor: Image processor instance.
            converter: Converter used to derive latex/mathml/mml from markdown.
        """
        self.converter = converter
        self.image_processor = image_processor
        self.layout_detector = layout_detector
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url

    def _get_pipeline(self):
        """Return the shared PaddleOCR-VL pipeline, creating it on first use.

        Returns:
            PaddleOCRVL pipeline instance.
        """
        if OCRService._pipeline is not None:
            return OCRService._pipeline
        OCRService._pipeline = PaddleOCRVL(
            vl_rec_backend="vllm-server",
            vl_rec_server_url=self.vl_server_url,
            layout_detection_model_name="PP-DocLayoutV2",
        )
        return OCRService._pipeline

    @staticmethod
    def _collect_markdown(results) -> str:
        """Concatenate the ``markdown_texts`` entry of every pipeline result."""
        return "".join(res.markdown.get("markdown_texts", "") for res in results)

    def _recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) with layout detection on.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml', 'mml' keys.

        Raises:
            RuntimeError: If any step of the recognition fails.
        """
        try:
            results = self._get_pipeline().predict(image, use_layout_detection=True)
            markdown_content = _postprocess_markdown(self._collect_markdown(results))
            converted = self.converter.convert_to_formats(markdown_content)
            return {
                "markdown": markdown_content,
                "latex": converted.latex,
                "mathml": converted.mathml,
                "mml": converted.mml,
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

    def _recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula content with layout detection off and a formula prompt.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'mathml', 'mml', 'markdown' keys.

        Raises:
            RuntimeError: If any step of the recognition fails.
        """
        try:
            results = self._get_pipeline().predict(image, use_layout_detection=False, prompt_label="formula")
            markdown_content = _postprocess_markdown(self._collect_markdown(results))
            converted = self.converter.convert_to_formats(markdown_content)
            return {
                "latex": converted.latex,
                "mathml": converted.mathml,
                "mml": converted.mml,
                "markdown": markdown_content,
            }
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using PaddleOCR-VL.

        The layout detector runs on a padded copy of the image and only decides
        which recognition mode to use; recognition itself receives the
        original, unpadded image.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
        """
        padded = self.image_processor.add_padding(image)
        layout = self.layout_detector.detect(padded)
        handler = self._recognize_mixed if layout.MixedRecognition else self._recognize_formula
        return handler(image)
class GLMOCRService(OCRServiceBase):
    """Formula OCR service that queries a "glm-ocr" model through an OpenAI-compatible server."""

    # \[ \] \( \) math delimiters. The lookbehind skips the second backslash of
    # a "\\" line break, so "\\[2pt]" or "\\(" inside a formula is not
    # mistaken for a delimiter.
    _LATEX_DELIMITER_PATTERN = re.compile(r"(?<!\\)\\[\[\]()]")

    def __init__(
        self,
        vl_server_url: str,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Initialize GLM OCR service.

        Args:
            vl_server_url: Base URL of the OpenAI-compatible server. A falsy
                value falls back to ``settings.glm_ocr_url``.
            image_processor: Image processor instance.
            converter: Converter instance for format conversion.
        """
        self.vl_server_url = vl_server_url or settings.glm_ocr_url
        self.image_processor = image_processor
        self.converter = converter
        self.openai_client = OpenAI(api_key="EMPTY", base_url=self.vl_server_url, timeout=3600)

    @classmethod
    def _to_dollar_delimiters(cls, content: str) -> str:
        r"""Normalize the model output to ``$$``-delimited markdown math.

        Surrounding whitespace is stripped first, so a leading newline cannot
        hide the delimiter the output starts with. Output that starts with
        ``\[`` or ``\(`` has its ``\[ \] \( \)`` delimiters replaced by ``$$``;
        output that does not already start with ``$`` is wrapped in ``$$``.
        """
        content = content.strip()
        if content.startswith((r"\[", r"\(")):
            return cls._LATEX_DELIMITER_PATTERN.sub("$$", content)
        if not content.startswith("$"):
            return f"$${content}$$"
        return content

    def _recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula/math content.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'mathml', 'mml', 'markdown' keys.

        Raises:
            RuntimeError: If the image cannot be encoded or the model returns
                no content. Errors raised by the API client are not caught
                here, so callers can apply their own fallback handling.
        """
        padded_image = self.image_processor.add_padding(image)

        # The image is sent inline as a base64 PNG data URL.
        success, encoded_image = cv2.imencode(".png", padded_image)
        if not success:
            raise RuntimeError("Failed to encode image")
        image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
        image_url = f"data:image/png;base64,{image_base64}"

        prompt = "Formula Recognition:"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        # Don't catch exceptions here - let them propagate for fallback handling
        response = self.openai_client.chat.completions.create(
            model="glm-ocr",
            messages=messages,
            temperature=0.0,
        )
        raw_content = response.choices[0].message.content
        if raw_content is None:
            # The API allows a message without text content. Report it as a
            # recognition failure instead of crashing on None.startswith().
            raise RuntimeError("Formula recognition returned no content")

        markdown_content = self._to_dollar_delimiters(raw_content)
        markdown_content = _postprocess_markdown(markdown_content)
        convert_result = self.converter.convert_to_formats(markdown_content)
        return {
            "latex": convert_result.latex,
            "mathml": convert_result.mathml,
            "mml": convert_result.mml,
            "markdown": markdown_content,
        }

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content; this service always treats the image as a formula.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
        """
        return self._recognize_formula(image)
class MineruOCRService(OCRServiceBase):
    """Service for OCR using local file_parse API."""

    # Markdown image references of the form ![](images/xxx.png).
    _IMAGE_REF_PATTERN = re.compile(r"!\[\]\(images/[^)]+\)")
    # \[ \] \( \) math delimiters. The lookbehind skips the second backslash of
    # a "\\" line break, so "\\[2pt]" or "\\(" inside a formula is not
    # mistaken for a delimiter.
    _LATEX_DELIMITER_PATTERN = re.compile(r"(?<!\\)\\[\[\]()]")

    def __init__(
        self,
        api_url: str = "http://127.0.0.1:8000/file_parse",
        image_processor: ImageProcessor | None = None,
        converter: Converter | None = None,
        glm_ocr_url: str = "http://localhost:8002/v1",
        layout_detector: LayoutDetector | None = None,
    ):
        """Initialize Local API service.

        Args:
            api_url: URL of the local file_parse API endpoint.
            image_processor: Optional image processor instance (stored only).
            converter: Optional converter instance for format conversion.
            glm_ocr_url: Base URL of the OpenAI-compatible server used for
                formula recognition.
            layout_detector: Optional layout detector instance. It is stored
                for callers that pass it, but no method of this class reads it.
        """
        self.api_url = api_url
        self.image_processor = image_processor
        self.converter = converter
        self.glm_ocr_url = glm_ocr_url
        # Previously accepted and silently discarded; kept on the instance.
        self.layout_detector = layout_detector
        self.openai_client = OpenAI(api_key="EMPTY", base_url=glm_ocr_url, timeout=3600)

    @classmethod
    def _to_dollar_delimiters(cls, content: str) -> str:
        r"""Normalize the model output to ``$$``-delimited markdown math.

        Surrounding whitespace is stripped first, so a leading newline cannot
        hide the delimiter the output starts with. Output that starts with
        ``\[`` or ``\(`` has its ``\[ \] \( \)`` delimiters replaced by ``$$``;
        output that does not already start with ``$`` is wrapped in ``$$``.
        """
        content = content.strip()
        if content.startswith((r"\[", r"\(")):
            return cls._LATEX_DELIMITER_PATTERN.sub("$$", content)
        if not content.startswith("$"):
            return f"$${content}$$"
        return content

    def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str:
        """Recognize a formula through the OpenAI-compatible client.

        Despite the method name, the request is sent to ``self.openai_client``
        (configured with ``glm_ocr_url``) using the model name "glm-ocr".

        Args:
            image: Input image as numpy array in BGR format.
            prompt: Recognition prompt (default: "Formula Recognition:")

        Returns:
            Recognized formula text as returned by the model.

        Raises:
            RuntimeError: If encoding, the request, or the response is unusable.
        """
        try:
            # The image is sent inline as a base64 PNG data URL.
            success, encoded_image = cv2.imencode(".png", image)
            if not success:
                raise RuntimeError("Failed to encode image")
            image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
            image_url = f"data:image/png;base64,{image_base64}"
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]
            response = self.openai_client.chat.completions.create(
                model="glm-ocr",
                messages=messages,
                temperature=0.0,
            )
            content = response.choices[0].message.content
            if content is None:
                # Honour the "-> str" contract instead of handing None to callers.
                raise RuntimeError("model returned no content")
            return content
        except Exception as e:
            raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e

    def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str:
        """Re-recognize the image as a formula when the markdown references images.

        When ``markdown_content`` contains at least one ``![](images/...)``
        reference, the whole original image is sent to formula recognition and
        the recognized formula REPLACES the markdown entirely. Otherwise the
        markdown is returned unchanged.

        Args:
            markdown_content: Markdown content with potential image references.
            original_image: Original input image.

        Returns:
            Either the untouched markdown or the ``$$``-delimited formula text.
        """
        if not self._IMAGE_REF_PATTERN.search(markdown_content):
            return markdown_content
        formula_text = self._recognize_formula_with_paddleocr_vl(original_image)
        return self._to_dollar_delimiters(formula_text)

    def recognize(self, image_bytes: BytesIO) -> dict:
        """Recognize content using local file_parse API.

        Args:
            image_bytes: Input image as BytesIO object (already encoded as PNG).

        Returns:
            Dict with 'markdown', 'latex', 'mathml', 'mml' keys. The last three
            are empty strings when no converter is configured or the markdown
            is empty.

        Raises:
            RuntimeError: If the API request or any later step fails.
        """
        try:
            # Decode a copy of the bytes up front; it is only needed when the
            # API answer contains image references.
            image_bytes.seek(0)
            image_data = np.frombuffer(image_bytes.read(), dtype=np.uint8)
            original_image = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
            # Rewind so the upload below sends the stream from the start.
            image_bytes.seek(0)

            files = {"files": ("image.png", image_bytes, "image/png")}
            data = {
                "return_middle_json": "false",
                "return_model_output": "false",
                "return_md": "true",
                "return_images": "false",
                "end_page_id": "99999",
                "start_page_id": "0",
                "lang_list": "en",
                "server_url": "string",
                "return_content_list": "false",
                "backend": "hybrid-auto-engine",
                "table_enable": "true",
                "response_format_zip": "false",
                "formula_enable": "true",
                "parse_method": "ocr",
            }
            response = requests.post(
                self.api_url,
                files=files,
                data=data,
                headers={"accept": "application/json"},
                timeout=30,
            )
            response.raise_for_status()
            result = response.json()

            markdown_content = ""
            if "results" in result and "image" in result["results"]:
                markdown_content = result["results"]["image"].get("md_content", "")

            if "![](images/" in markdown_content:
                if original_image is None:
                    # cv2.imdecode signals undecodable data with None; fail
                    # with a clear message instead of deep inside cv2.imencode.
                    raise RuntimeError("Failed to decode input image for formula recognition")
                markdown_content = self._extract_and_recognize_formulas(markdown_content, original_image)

            # Apply postprocessing to fix OCR errors
            markdown_content = _postprocess_markdown(markdown_content)

            latex = ""
            mathml = ""
            mml = ""
            if self.converter and markdown_content:
                convert_result = self.converter.convert_to_formats(markdown_content)
                latex = convert_result.latex
                mathml = convert_result.mathml
                mml = convert_result.mml
            return {
                "markdown": markdown_content,
                "latex": latex,
                "mathml": mathml,
                "mml": mml,
            }
        except requests.RequestException as e:
            raise RuntimeError(f"Local API request failed: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Recognition failed: {e}") from e
# Task-specific prompts (from GLM-OCR SDK config.yaml).
# The text prompt doubles as the default for region types without an entry of
# their own, so it is defined once and shared.
# NOTE: the spelling "ouput" is part of the prompt text sent to the model and
# is kept exactly as it was.
_DEFAULT_PROMPT = "Text Recognition. If the content is a formula, please ouput display latex code, else output text"
_TASK_PROMPTS: dict[str, str] = {
    "text": _DEFAULT_PROMPT,
    "formula": "Formula Recognition:",
    "table": "Table Recognition:",
}
class GLMOCREndToEndService(OCRServiceBase):
    """End-to-end OCR using GLM-OCR pipeline: layout detection → per-region OCR.

    Pipeline implemented by ``recognize``:
        1. Detect layout regions (LayoutDetector) and order them for reading.
        2. Crop each non-figure region and call vLLM with a task-specific
           prompt (in parallel); fall back to a single whole-image call when
           there is nothing to crop.
        3. GLMResultFormatter: turn the per-region results into markdown.
        4. _postprocess_markdown: LaTeX math error correction.
        5. Converter: markdown → latex/mathml/mml.

    NOTE(review): ``image_processor`` is stored but ``recognize`` does not call
    ``add_padding`` itself; padding is presumably applied by the caller -
    confirm.
    """

    def __init__(
        self,
        vl_server_url: str,
        image_processor: ImageProcessor,
        converter: Converter,
        layout_detector: LayoutDetector,
        max_workers: int = 8,
    ):
        """Initialize the end-to-end service.

        Args:
            vl_server_url: Base URL of the OpenAI-compatible vLLM server. A
                falsy value falls back to ``settings.glm_ocr_url``.
            image_processor: Image processor instance.
            converter: Converter instance for format conversion.
            layout_detector: Layout detector instance.
            max_workers: Upper bound on concurrent per-region OCR requests.
        """
        self.vl_server_url = vl_server_url or settings.glm_ocr_url
        self.image_processor = image_processor
        self.converter = converter
        self.layout_detector = layout_detector
        self.max_workers = max_workers
        self.openai_client = OpenAI(api_key="EMPTY", base_url=self.vl_server_url, timeout=3600)
        self._formatter = GLMResultFormatter()

    def _encode_region(self, image: np.ndarray) -> str:
        """Convert BGR numpy array to base64 JPEG string."""
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_img = PILImage.fromarray(rgb)
        buf = BytesIO()
        pil_img.save(buf, format="JPEG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    def _call_vllm(self, image: np.ndarray, prompt: str) -> str:
        """Send image + prompt to vLLM and return the stripped content string.

        Raises:
            RuntimeError: If the model answers without any text content.
        """
        img_b64 = self._encode_region(image)
        data_url = f"data:image/jpeg;base64,{img_b64}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": data_url}},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        response = self.openai_client.chat.completions.create(
            model="glm-ocr",
            messages=messages,
            temperature=0.01,
            max_tokens=settings.max_tokens,
        )
        content = response.choices[0].message.content
        if content is None:
            # The API allows a message without text content. Fail with a clear
            # error instead of an AttributeError on None.strip().
            raise RuntimeError("vLLM returned no content")
        return content.strip()

    def _normalize_bbox(self, bbox: list[float], img_w: int, img_h: int) -> list[int]:
        """Convert pixel bbox [x1,y1,x2,y2] to 0-1000 normalised coords."""
        x1, y1, x2, y2 = bbox
        return [
            int(x1 / img_w * 1000),
            int(y1 / img_h * 1000),
            int(x2 / img_w * 1000),
            int(y2 / img_h * 1000),
        ]

    @staticmethod
    def _crop_region(image: np.ndarray, bbox) -> np.ndarray:
        """Crop ``bbox`` ([x1, y1, x2, y2] in pixels) out of ``image``.

        Coordinates are clamped to the image bounds first. Without clamping, a
        slightly negative coordinate would be read by NumPy as an index
        counted from the far edge, producing a wrong or empty crop.
        """
        img_h, img_w = image.shape[:2]
        x1, y1, x2, y2 = (int(c) for c in bbox)
        x1, x2 = (min(max(v, 0), img_w) for v in (x1, x2))
        y1, y2 = (min(max(v, 0), img_h) for v in (y1, y2))
        return image[y1:y2, x1:x2]

    def recognize(self, image: np.ndarray) -> dict:
        """Full pipeline: layout → per-region OCR → postprocess → markdown.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml', 'mml' keys. The last
            three are empty strings when conversion fails with a RuntimeError
            or when there is nothing to convert.
        """
        # 1. Layout detection
        img_h, img_w = image.shape[:2]
        layout_info = self.layout_detector.detect(image)
        # Reading order: top-to-bottom, then left-to-right. A sorted copy is
        # used so the detector's result object is not modified.
        regions = sorted(layout_info.regions, key=lambda r: (r.bbox[1], r.bbox[0]))

        # 2. OCR: per-region (parallel) or full-image fallback
        if not regions or (len(regions) == 1 and not layout_info.MixedRecognition):
            # No regions, or a single region not flagged as mixed content:
            # treat the whole image as one formula.
            logger.info("No layout regions detected, treating image as formula")
            formatted_content = self._call_vllm(image, _TASK_PROMPTS["formula"])
            # Format as display formula markdown
            if not (formatted_content.startswith("$$") and formatted_content.endswith("$$")):
                formatted_content = f"$$\n{formatted_content}\n$$"
            markdown_content = formatted_content
        else:
            # Build the task list for non-figure regions
            tasks = []
            for idx, region in enumerate(regions):
                if region.type == "figure":
                    continue
                cropped = self._crop_region(image, region.bbox)
                if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10:
                    logger.warning(
                        "Skipping region idx=%d (label=%s): crop too small %s",
                        idx,
                        region.native_label,
                        cropped.shape[:2],
                    )
                    continue
                prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT)
                tasks.append((idx, region, cropped, prompt))

            if not tasks:
                # Every region was a figure or too small: OCR the whole image.
                raw_content = self._call_vllm(image, _DEFAULT_PROMPT)
                markdown_content = self._formatter._clean_content(raw_content)
            else:
                # Parallel OCR calls; a failed region contributes an empty
                # string instead of aborting the whole page.
                raw_results: dict[int, str] = {}
                with ThreadPoolExecutor(max_workers=min(self.max_workers, len(tasks))) as ex:
                    future_map = {
                        ex.submit(self._call_vllm, cropped, prompt): idx
                        for idx, _region, cropped, prompt in tasks
                    }
                    for future in as_completed(future_map):
                        idx = future_map[future]
                        try:
                            raw_results[idx] = future.result()
                        except Exception as e:
                            logger.warning("vLLM call failed for region idx=%d: %s", idx, e)
                            raw_results[idx] = ""

                # Structured region dicts for GLMResultFormatter, in reading order
                region_dicts = [
                    {
                        "index": idx,
                        "label": region.type,
                        "native_label": region.native_label,
                        "content": raw_results.get(idx, ""),
                        "bbox_2d": self._normalize_bbox(region.bbox, img_w, img_h),
                    }
                    for idx, region, _cropped, _prompt in tasks
                ]
                # 3. GLM-OCR postprocessing of the per-region results
                markdown_content = self._formatter.process(region_dicts)

        # 4. LaTeX math error correction (our existing pipeline)
        markdown_content = _postprocess_markdown(markdown_content)

        # 5. Format conversion
        latex, mathml, mml = "", "", ""
        if markdown_content and self.converter:
            try:
                fmt = self.converter.convert_to_formats(markdown_content)
                latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml
            except RuntimeError as e:
                logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e)
        return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}