2025-12-29 17:34:58 +08:00
|
|
|
"""PaddleOCR-VL client service for text and formula recognition."""
|
|
|
|
|
|
2026-03-09 16:51:06 +08:00
|
|
|
import base64
|
2026-03-09 17:57:05 +08:00
|
|
|
import logging
|
2026-01-05 21:37:51 +08:00
|
|
|
import re
|
2026-03-09 16:51:06 +08:00
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
|
from io import BytesIO
|
|
|
|
|
|
2026-01-05 17:30:54 +08:00
|
|
|
import cv2
|
2026-03-09 16:51:06 +08:00
|
|
|
import numpy as np
|
2026-01-05 17:30:54 +08:00
|
|
|
import requests
|
2026-03-09 16:51:06 +08:00
|
|
|
from openai import OpenAI
|
2025-12-31 17:38:32 +08:00
|
|
|
from paddleocr import PaddleOCRVL
|
2026-03-09 16:51:06 +08:00
|
|
|
from PIL import Image as PILImage
|
|
|
|
|
|
|
|
|
|
from app.core.config import get_settings
|
2025-12-31 17:38:32 +08:00
|
|
|
from app.services.converter import Converter
|
2026-03-09 16:51:06 +08:00
|
|
|
from app.services.glm_postprocess import GLMResultFormatter
|
|
|
|
|
from app.services.image_processor import ImageProcessor
|
|
|
|
|
from app.services.layout_detector import LayoutDetector
|
2025-12-29 17:34:58 +08:00
|
|
|
|
|
|
|
|
settings = get_settings()
|
2026-03-09 17:57:05 +08:00
|
|
|
logger = logging.getLogger(__name__)
|
2025-12-29 17:34:58 +08:00
|
|
|
|
2026-01-05 21:37:51 +08:00
|
|
|
_COMMANDS_NEED_SPACE = {
|
|
|
|
|
# operators / calculus
|
2026-02-04 12:00:06 +08:00
|
|
|
"cdot",
|
|
|
|
|
"times",
|
|
|
|
|
"div",
|
|
|
|
|
"pm",
|
|
|
|
|
"mp",
|
|
|
|
|
"int",
|
|
|
|
|
"iint",
|
|
|
|
|
"iiint",
|
|
|
|
|
"oint",
|
|
|
|
|
"sum",
|
|
|
|
|
"prod",
|
|
|
|
|
"lim",
|
2026-01-05 21:37:51 +08:00
|
|
|
# common functions
|
2026-02-04 12:00:06 +08:00
|
|
|
"sin",
|
|
|
|
|
"cos",
|
|
|
|
|
"tan",
|
|
|
|
|
"cot",
|
|
|
|
|
"sec",
|
|
|
|
|
"csc",
|
|
|
|
|
"log",
|
|
|
|
|
"ln",
|
|
|
|
|
"exp",
|
2026-02-07 21:28:46 +08:00
|
|
|
# set relations (often glued by OCR)
|
|
|
|
|
"in",
|
|
|
|
|
"notin",
|
|
|
|
|
"subset",
|
|
|
|
|
"supset",
|
|
|
|
|
"subseteq",
|
|
|
|
|
"supseteq",
|
|
|
|
|
"cap",
|
|
|
|
|
"cup",
|
2026-01-05 21:37:51 +08:00
|
|
|
# misc
|
2026-02-04 12:00:06 +08:00
|
|
|
"partial",
|
|
|
|
|
"nabla",
|
2026-01-05 21:37:51 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
2026-02-07 21:28:46 +08:00
|
|
|
# Match LaTeX commands: \command (greedy match all letters)
|
|
|
|
|
# The splitting logic in _split_glued_command_token will handle \inX -> \in X
|
2026-01-05 21:37:51 +08:00
|
|
|
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
|
|
|
|
|
|
|
|
|
|
# stage2: differentials inside math segments
|
2026-02-05 13:18:55 +08:00
|
|
|
# IMPORTANT: Very conservative pattern to avoid breaking LaTeX commands and variables
|
|
|
|
|
# Only match differentials in specific contexts (after integrals, in fractions)
|
|
|
|
|
# (?<!\\) - not preceded by backslash (not a LaTeX command)
|
|
|
|
|
# (?<![a-zA-Z]) - not preceded by any letter (not inside a word/command)
|
|
|
|
|
# (?![a-zA-Z]) - not followed by another letter (avoid matching "dx" in "dxyz")
|
|
|
|
|
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([A-Z])(?![a-zA-Z])")
|
|
|
|
|
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([a-z])(?![a-zA-Z])")
|
2026-01-05 21:37:51 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def _split_glued_command_token(token: str) -> str:
|
|
|
|
|
"""Split OCR-glued LaTeX command token by whitelist longest-prefix.
|
|
|
|
|
|
|
|
|
|
Examples:
|
|
|
|
|
- \\cdotdS -> \\cdot dS
|
|
|
|
|
- \\intdx -> \\int dx
|
2026-02-07 21:28:46 +08:00
|
|
|
- \\inX -> \\in X (stop at uppercase letter)
|
2026-01-05 21:37:51 +08:00
|
|
|
"""
|
|
|
|
|
if not token.startswith("\\"):
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
body = token[1:]
|
|
|
|
|
if len(body) < 2:
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
best = None
|
2026-02-07 21:28:46 +08:00
|
|
|
# Find longest prefix that is in whitelist
|
|
|
|
|
for i in range(1, len(body) + 1):
|
2026-01-05 21:37:51 +08:00
|
|
|
prefix = body[:i]
|
|
|
|
|
if prefix in _COMMANDS_NEED_SPACE:
|
|
|
|
|
best = prefix
|
|
|
|
|
|
|
|
|
|
if not best:
|
|
|
|
|
return token
|
|
|
|
|
|
2026-02-04 12:00:06 +08:00
|
|
|
suffix = body[len(best) :]
|
2026-01-05 21:37:51 +08:00
|
|
|
if not suffix:
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
return f"\\{best} {suffix}"
|
|
|
|
|
|
|
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
def _clean_latex_syntax_spaces(expr: str) -> str:
|
|
|
|
|
"""Clean unwanted spaces in LaTeX syntax (common OCR errors).
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
OCR often adds spaces in LaTeX syntax structures where they shouldn't be:
|
|
|
|
|
- Subscripts: a _ {i 1} -> a_{i1}
|
|
|
|
|
- Superscripts: x ^ {2 3} -> x^{23}
|
|
|
|
|
- Fractions: \\frac { a } { b } -> \\frac{a}{b}
|
|
|
|
|
- Commands: \\ alpha -> \\alpha
|
|
|
|
|
- Braces: { a b } -> {ab} (within subscripts/superscripts)
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
This is safe because these spaces are always OCR errors - LaTeX doesn't
|
|
|
|
|
need or want spaces in these positions.
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
Args:
|
|
|
|
|
expr: LaTeX math expression.
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
Returns:
|
|
|
|
|
Expression with LaTeX syntax spaces cleaned.
|
|
|
|
|
"""
|
|
|
|
|
# Pattern 1: Spaces around _ and ^ (subscript/superscript operators)
|
|
|
|
|
# a _ {i} -> a_{i}, x ^ {2} -> x^{2}
|
2026-02-05 20:32:26 +08:00
|
|
|
expr = re.sub(r"\s*_\s*", "_", expr)
|
|
|
|
|
expr = re.sub(r"\s*\^\s*", "^", expr)
|
|
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
# Pattern 2: Spaces inside braces that follow _ or ^
|
|
|
|
|
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
|
|
|
|
|
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
|
2026-02-07 21:28:46 +08:00
|
|
|
# BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them
|
|
|
|
|
# must be preserved as they serve as command terminators (\in X != \inX)
|
2026-02-05 13:32:13 +08:00
|
|
|
def clean_subscript_superscript_braces(match):
|
|
|
|
|
operator = match.group(1) # _ or ^
|
2026-02-05 20:32:26 +08:00
|
|
|
content = match.group(2) # content inside braces
|
2026-02-07 21:28:46 +08:00
|
|
|
if "\\" not in content:
|
|
|
|
|
# No LaTeX commands: safe to remove all spaces
|
|
|
|
|
cleaned = re.sub(r"\s+", "", content)
|
|
|
|
|
else:
|
|
|
|
|
# Contains LaTeX commands: remove spaces carefully
|
|
|
|
|
# Keep spaces that follow a LaTeX command (e.g., \in X must keep the space)
|
|
|
|
|
# Remove spaces everywhere else (e.g., x \in -> x\in is fine)
|
|
|
|
|
# Strategy: remove spaces before \ and between non-command chars,
|
|
|
|
|
# but preserve the space after \command when followed by a non-\ char
|
2026-02-09 22:18:30 +08:00
|
|
|
cleaned = re.sub(r"\s+(?=\\)", "", content) # remove space before \cmd
|
2026-03-10 21:36:35 +08:00
|
|
|
cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned) # remove space after non-letter non-\
|
2026-02-05 13:32:13 +08:00
|
|
|
return f"{operator}{{{cleaned}}}"
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
# Match _{ ... } or ^{ ... }
|
2026-02-05 20:32:26 +08:00
|
|
|
expr = re.sub(r"([_^])\{([^}]+)\}", clean_subscript_superscript_braces, expr)
|
|
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
# Pattern 3: Spaces inside \frac arguments
|
|
|
|
|
# \frac { a } { b } -> \frac{a}{b}
|
|
|
|
|
# \frac{ a + b }{ c } -> \frac{a+b}{c}
|
|
|
|
|
def clean_frac_braces(match):
|
|
|
|
|
numerator = match.group(1).strip()
|
|
|
|
|
denominator = match.group(2).strip()
|
|
|
|
|
return f"\\frac{{{numerator}}}{{{denominator}}}"
|
2026-02-05 20:32:26 +08:00
|
|
|
|
|
|
|
|
expr = re.sub(r"\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}", clean_frac_braces, expr)
|
|
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
# Pattern 4: Spaces after backslash in LaTeX commands
|
|
|
|
|
# \ alpha -> \alpha, \ beta -> \beta
|
2026-02-05 20:32:26 +08:00
|
|
|
expr = re.sub(r"\\\s+([a-zA-Z]+)", r"\\\1", expr)
|
|
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
# Pattern 5: Spaces before/after braces in general contexts (conservative)
|
|
|
|
|
# Only remove if the space is clearly wrong (e.g., after operators)
|
|
|
|
|
# { x } in standalone context is kept as-is to avoid breaking valid spacing
|
|
|
|
|
# But after operators like \sqrt{ x } -> \sqrt{x}
|
2026-02-05 20:32:26 +08:00
|
|
|
expr = re.sub(r"(\\[a-zA-Z]+)\s*\{\s*", r"\1{", expr) # \sqrt { -> \sqrt{
|
|
|
|
|
|
2026-02-05 13:32:13 +08:00
|
|
|
return expr
|
|
|
|
|
|
|
|
|
|
|
2026-01-05 21:37:51 +08:00
|
|
|
def _postprocess_math(expr: str) -> str:
    r"""Run the LaTeX clean-up stages on one math expression.

    The expression is the text found between ``$...$`` or ``$$...$$``
    delimiters; the delimiters themselves are not part of ``expr``.

    Stages, in execution order:
        0. Repair numbers that OCR broke up with spaces.
        1. Separate glued commands (\cdotdS -> \cdot dS, \inX -> \in X).
        2. Remove spaces inside LaTeX syntax (a _ {i 1} -> a_{i1}).

    Differential normalisation is intentionally not part of the chain.

    Args:
        expr: LaTeX math expression without delimiters.

    Returns:
        Processed LaTeX expression.
    """

    def _respace(command_match: re.Match) -> str:
        # Each matched command token is handed to the splitter on its own.
        return _split_glued_command_token(command_match.group(0))

    # stage 0: digits separated by stray spaces
    digits_repaired = _fix_ocr_number_errors(expr)

    # stage 1: glued command tokens
    commands_separated = _COMMAND_TOKEN_PATTERN.sub(_respace, digits_repaired)

    # stage 3 (normalising differentials) is disabled on purpose. According to
    # the note left with the original implementation it was too aggressive and
    # could damage:
    #   - LaTeX commands that contain the letter "d"
    #   - identifiers such as dx, dy, dz that are plain variable names
    #   - subscripts such as x_{dx}, y_{dy}
    #   - function names or custom notation
    # A context-aware alternative exists as
    # _normalize_differentials_contextaware but is not called here.

    # stage 2: spaces inside LaTeX syntax
    return _clean_latex_syntax_spaces(commands_separated)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _normalize_differentials_contextaware(expr: str) -> str:
|
|
|
|
|
"""Context-aware differential normalization (optional, not used by default).
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 13:18:55 +08:00
|
|
|
Only normalizes differentials in specific mathematical contexts:
|
|
|
|
|
1. After integral symbols: \\int dx, \\iint dA, \\oint dr
|
|
|
|
|
2. In fraction denominators: \\frac{dy}{dx}
|
|
|
|
|
3. In explicit differential notation: f(x)dx (function followed by differential)
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 13:18:55 +08:00
|
|
|
This avoids false positives like variable names, subscripts, or LaTeX commands.
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 13:18:55 +08:00
|
|
|
Args:
|
|
|
|
|
expr: LaTeX math expression.
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 13:18:55 +08:00
|
|
|
Returns:
|
|
|
|
|
Expression with differentials normalized in safe contexts only.
|
|
|
|
|
"""
|
|
|
|
|
# Pattern 1: After integral commands
|
|
|
|
|
# \int dx -> \int d x
|
2026-02-05 20:32:26 +08:00
|
|
|
integral_pattern = re.compile(r"(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])")
|
|
|
|
|
expr = integral_pattern.sub(r"\1 \2 d \3", expr)
|
|
|
|
|
|
2026-02-05 13:18:55 +08:00
|
|
|
# Pattern 2: In fraction denominators
|
|
|
|
|
# \frac{...}{dx} -> \frac{...}{d x}
|
2026-02-05 20:32:26 +08:00
|
|
|
frac_pattern = re.compile(r"(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})")
|
|
|
|
|
expr = frac_pattern.sub(r"\1d \2\3", expr)
|
|
|
|
|
|
2026-01-05 21:37:51 +08:00
|
|
|
return expr
|
|
|
|
|
|
|
|
|
|
|
2026-02-04 16:04:18 +08:00
|
|
|
def _fix_ocr_number_errors(expr: str) -> str:
|
|
|
|
|
"""Fix common OCR errors in LaTeX math expressions.
|
|
|
|
|
|
|
|
|
|
OCR often splits numbers incorrectly, especially decimals:
|
|
|
|
|
- "2 2. 2" should be "22.2"
|
|
|
|
|
- "3 0. 4" should be "30.4"
|
|
|
|
|
- "1 5 0" should be "150"
|
|
|
|
|
|
|
|
|
|
This function merges digit sequences that are separated by spaces.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
expr: LaTeX math expression.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
LaTeX expression with number errors fixed.
|
|
|
|
|
"""
|
|
|
|
|
# Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)"
|
|
|
|
|
# Example: "2 2. 2" → "22.2"
|
2026-02-05 20:32:26 +08:00
|
|
|
expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr)
|
|
|
|
|
|
2026-02-04 16:04:18 +08:00
|
|
|
# Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)"
|
|
|
|
|
# Example: "22. 2" → "22.2"
|
2026-02-05 20:32:26 +08:00
|
|
|
expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr)
|
|
|
|
|
|
2026-02-04 16:04:18 +08:00
|
|
|
# Fix pattern 3: "digit space digit" (no decimal point, within same number context)
|
|
|
|
|
# Be careful: only merge if followed by decimal point or comma/end
|
|
|
|
|
# Example: "1 5 0" → "150" when followed by comma or end
|
2026-02-05 20:32:26 +08:00
|
|
|
expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr)
|
|
|
|
|
|
2026-02-04 16:04:18 +08:00
|
|
|
# Fix pattern 4: Multiple spaces in decimal numbers
|
|
|
|
|
# Example: "2 2 . 2" → "22.2"
|
2026-02-05 20:32:26 +08:00
|
|
|
expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr)
|
|
|
|
|
|
2026-02-04 16:04:18 +08:00
|
|
|
return expr
|
|
|
|
|
|
|
|
|
|
|
2026-01-05 21:37:51 +08:00
|
|
|
def _postprocess_markdown(markdown_content: str) -> str:
    """Apply LaTeX postprocessing only within $...$ / $$...$$ segments.

    Every match of ``_MATH_SEGMENT_PATTERN`` has its inner text passed through
    ``_postprocess_math`` and is re-wrapped in the delimiter it came with.
    Text outside math segments is not modified by this step. Afterwards
    ``_remove_false_heading_from_single_formula`` is applied to the whole
    document.

    Args:
        markdown_content: Markdown text, possibly containing math segments.

    Returns:
        The processed markdown. Empty or ``None`` input is returned as is.
    """
    if not markdown_content:
        return markdown_content

    def _fix_segment(m: re.Match) -> str:
        seg = m.group(0)
        # A display segment is at least "$$" + "$$". A bare "$$" (a stray or
        # unbalanced delimiter) is matched by the inline alternative with empty
        # content; treating it as display math would slice it to "" and emit
        # "$$$$", doubling the delimiter.
        delimiter = "$$" if len(seg) >= 4 and seg.startswith("$$") else "$"
        inner = seg[len(delimiter) : -len(delimiter)]
        return f"{delimiter}{_postprocess_math(inner)}{delimiter}"

    markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)

    # Markdown-level postprocessing runs after the LaTeX processing.
    markdown_content = _remove_false_heading_from_single_formula(markdown_content)

    return markdown_content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
|
|
|
|
|
"""Remove false heading markers from single-formula content.
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix.
|
|
|
|
|
This function detects and removes the heading marker when:
|
|
|
|
|
1. The content contains only one formula (display or inline)
|
|
|
|
|
2. The formula line starts with '#' (heading marker)
|
|
|
|
|
3. No other non-formula text content exists
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
Examples:
|
|
|
|
|
Input: "# $$E = mc^2$$"
|
|
|
|
|
Output: "$$E = mc^2$$"
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
Input: "# $x = y$"
|
|
|
|
|
Output: "$x = y$"
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
Input: "# Introduction\n$$E = mc^2$$" (has text, keep heading)
|
|
|
|
|
Output: "# Introduction\n$$E = mc^2$$"
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
Args:
|
|
|
|
|
markdown_content: Markdown text with potential false headings.
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
Returns:
|
|
|
|
|
Markdown text with false heading markers removed.
|
|
|
|
|
"""
|
|
|
|
|
if not markdown_content or not markdown_content.strip():
|
|
|
|
|
return markdown_content
|
2026-02-05 20:32:26 +08:00
|
|
|
|
|
|
|
|
lines = markdown_content.split("\n")
|
|
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
# Count formulas and heading lines
|
|
|
|
|
formula_count = 0
|
|
|
|
|
heading_lines = []
|
|
|
|
|
has_non_formula_text = False
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
for i, line in enumerate(lines):
|
|
|
|
|
line_stripped = line.strip()
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
if not line_stripped:
|
|
|
|
|
continue
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
# Check if line starts with heading marker
|
2026-02-05 20:32:26 +08:00
|
|
|
heading_match = re.match(r"^(#{1,6})\s+(.+)$", line_stripped)
|
|
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
if heading_match:
|
|
|
|
|
heading_level = heading_match.group(1)
|
|
|
|
|
content = heading_match.group(2)
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
# Check if the heading content is a formula
|
2026-02-05 20:32:26 +08:00
|
|
|
if re.fullmatch(r"\$\$?.+\$\$?", content):
|
2026-02-05 17:59:54 +08:00
|
|
|
# This is a heading with a formula
|
|
|
|
|
heading_lines.append((i, heading_level, content))
|
|
|
|
|
formula_count += 1
|
|
|
|
|
else:
|
|
|
|
|
# This is a real heading with text
|
|
|
|
|
has_non_formula_text = True
|
2026-02-05 20:32:26 +08:00
|
|
|
elif re.fullmatch(r"\$\$?.+\$\$?", line_stripped):
|
2026-02-05 17:59:54 +08:00
|
|
|
# Standalone formula line (not in a heading)
|
|
|
|
|
formula_count += 1
|
2026-02-05 20:32:26 +08:00
|
|
|
elif line_stripped and not re.match(r"^#+\s*$", line_stripped):
|
2026-02-05 17:59:54 +08:00
|
|
|
# Non-empty, non-heading, non-formula line
|
|
|
|
|
has_non_formula_text = True
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-02-05 17:59:54 +08:00
|
|
|
# Only remove heading markers if:
|
|
|
|
|
# 1. There's exactly one formula
|
|
|
|
|
# 2. That formula is in a heading line
|
|
|
|
|
# 3. There's no other text content
|
|
|
|
|
if formula_count == 1 and len(heading_lines) == 1 and not has_non_formula_text:
|
|
|
|
|
# Remove the heading marker from the formula
|
|
|
|
|
line_idx, heading_level, formula_content = heading_lines[0]
|
|
|
|
|
lines[line_idx] = formula_content
|
2026-02-05 20:32:26 +08:00
|
|
|
|
|
|
|
|
return "\n".join(lines)
|
2026-01-05 21:37:51 +08:00
|
|
|
|
|
|
|
|
|
2026-01-05 17:30:54 +08:00
|
|
|
class OCRServiceBase(ABC):
    """Abstract interface implemented by the OCR services in this module."""

    @abstractmethod
    def recognize(self, image: np.ndarray) -> dict:
        """Recognize the content of ``image`` and return a result mapping.

        Subclasses must override this method; the base implementation does
        nothing and returns ``None``.
        """
|
|
|
|
|
|
2025-12-29 17:34:58 +08:00
|
|
|
|
2026-01-05 17:30:54 +08:00
|
|
|
class OCRService(OCRServiceBase):
    """Service for OCR using PaddleOCR-VL."""

    # Pipeline cache stored on the class, so it is shared by every instance:
    # the first call to _get_pipeline() builds it and later calls reuse it.
    # NOTE(review): an instance created later with a different vl_server_url
    # still receives the pipeline built for the first URL -- confirm intended.
    _pipeline: PaddleOCRVL | None = None
    # Not read or written by any method of this class.
    _layout_detector: LayoutDetector | None = None

    def __init__(
        self,
        vl_server_url: str,
        layout_detector: LayoutDetector,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Initialize OCR service.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL. A falsy
                value falls back to ``settings.paddleocr_vl_url``.
            layout_detector: Layout detector instance.
            image_processor: Image processor instance.
            converter: Converter used to produce the LaTeX / MathML outputs.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
        self.layout_detector = layout_detector
        self.image_processor = image_processor
        self.converter = converter

    def _get_pipeline(self):
        """Get or create the PaddleOCR-VL pipeline.

        Returns:
            The PaddleOCRVL pipeline instance cached on the class.
        """
        if OCRService._pipeline is None:
            OCRService._pipeline = PaddleOCRVL(
                vl_rec_backend="vllm-server",
                vl_rec_server_url=self.vl_server_url,
                layout_detection_model_name="PP-DocLayoutV2",
            )
        return OCRService._pipeline

    def _predict_markdown(self, image: np.ndarray, **predict_kwargs) -> str:
        """Run the pipeline and concatenate the markdown text of every result.

        Args:
            image: Input image, passed to ``pipeline.predict`` unchanged.
            **predict_kwargs: Extra keyword arguments for ``pipeline.predict``.

        Returns:
            The ``markdown_texts`` entries of all results joined in order;
            a result without that entry contributes an empty string.
        """
        output = self._get_pipeline().predict(image, **predict_kwargs)
        return "".join(res.markdown.get("markdown_texts", "") for res in output)

    def _recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) with layout detection on.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml' and 'mml' keys.

        Raises:
            RuntimeError: If any step fails; the original exception is chained.
        """
        try:
            markdown_content = self._predict_markdown(image, use_layout_detection=True)
            markdown_content = _postprocess_markdown(markdown_content)
            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
                "markdown": markdown_content,
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
                "mml": convert_result.mml,
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

    def _recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula content with the pipeline's "formula" prompt.

        Layout detection is switched off for this mode.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'mathml', 'mml' and 'markdown' keys.

        Raises:
            RuntimeError: If any step fails; the original exception is chained.
        """
        try:
            markdown_content = self._predict_markdown(
                image, use_layout_detection=False, prompt_label="formula"
            )
            markdown_content = _postprocess_markdown(markdown_content)
            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
                "mml": convert_result.mml,
                "markdown": markdown_content,
            }
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using PaddleOCR-VL.

        The layout detector decides between mixed and formula recognition.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml' and 'mml' keys.
        """
        # The padded copy is used for layout detection only; recognition below
        # receives the original image.
        # NOTE(review): confirm that recognising the unpadded image is intended.
        padded_image = self.image_processor.add_padding(image)
        layout_info = self.layout_detector.detect(padded_image)
        if layout_info.MixedRecognition:
            return self._recognize_mixed(image)
        return self._recognize_formula(image)
|
|
|
|
|
|
|
|
|
|
|
2026-02-06 15:06:50 +08:00
|
|
|
class GLMOCRService(OCRServiceBase):
    """OCR service that sends the image to a GLM OCR model via an OpenAI-compatible API."""

    def __init__(
        self,
        vl_server_url: str,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Initialize GLM OCR service.

        Args:
            vl_server_url: Base URL of the OpenAI-compatible server. A falsy
                value falls back to ``settings.glm_ocr_url``.
            image_processor: Image processor instance.
            converter: Converter instance for format conversion.
        """
        self.vl_server_url = vl_server_url or settings.glm_ocr_url
        self.image_processor = image_processor
        self.converter = converter
        self.openai_client = OpenAI(api_key="EMPTY", base_url=self.vl_server_url, timeout=3600)

    @staticmethod
    def _to_dollar_delimiters(text: str) -> str:
        r"""Make sure the recognised formula is wrapped in ``$`` delimiters.

        - Text starting with ``\[`` or ``\(``: every ``\[``, ``\]``, ``\(``,
          ``\)`` delimiter is replaced by ``$$``. A delimiter preceded by
          another backslash is left alone, so a row break with optional
          spacing such as ``\\[2pt]`` is not mistaken for ``\[``.
        - Text already starting with ``$``: returned unchanged.
        - Anything else: wrapped in ``$$...$$``.
        """
        if text.startswith((r"\[", r"\(")):
            return re.sub(r"(?<!\\)\\[\[\]()]", "$$", text)
        if text.startswith("$"):
            return text
        return f"$${text}$$"

    def _recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula/math content with the "glm-ocr" model.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml', 'mml' keys.

        Raises:
            RuntimeError: If the image cannot be PNG-encoded or the model
                returns no text. Exceptions raised by the API client are not
                caught here, so callers can use them for fallback handling.
        """
        padded_image = self.image_processor.add_padding(image)

        # Encode the padded image as a base64 PNG data URL.
        success, encoded_image = cv2.imencode(".png", padded_image)
        if not success:
            raise RuntimeError("Failed to encode image")

        image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
        image_url = f"data:image/png;base64,{image_base64}"

        prompt = "Formula Recognition:"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Don't catch exceptions here - let them propagate for fallback handling.
        response = self.openai_client.chat.completions.create(
            model="glm-ocr",
            messages=messages,
            temperature=0.0,
        )

        content = response.choices[0].message.content
        if content is None:
            # The content field of a chat message is optional; fail with a
            # clear error instead of an AttributeError further down.
            raise RuntimeError("GLM OCR returned no text content")

        # Surrounding whitespace would otherwise hide a leading \[ or $ and
        # cause the text to be wrapped in a second pair of delimiters.
        markdown_content = self._to_dollar_delimiters(content.strip())

        markdown_content = _postprocess_markdown(markdown_content)
        convert_result = self.converter.convert_to_formats(markdown_content)

        return {
            "latex": convert_result.latex,
            "mathml": convert_result.mathml,
            "mml": convert_result.mml,
            "markdown": markdown_content,
        }

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content; this service always uses formula recognition.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
        """
        return self._recognize_formula(image)
|
|
|
|
|
|
|
|
|
|
|
2026-01-05 17:30:54 +08:00
|
|
|
class MineruOCRService(OCRServiceBase):
|
|
|
|
|
"""Service for OCR using local file_parse API."""
|
2026-02-04 12:00:06 +08:00
|
|
|
|
2026-01-05 17:30:54 +08:00
|
|
|
def __init__(
    self,
    api_url: str = "http://127.0.0.1:8000/file_parse",
    image_processor: ImageProcessor | None = None,
    converter: Converter | None = None,
    glm_ocr_url: str = "http://localhost:8002/v1",
    layout_detector: LayoutDetector | None = None,
):
    """Initialize Local API service.

    Args:
        api_url: URL of the local file_parse API endpoint.
        image_processor: Optional image processor instance.
        converter: Optional converter instance for format conversion.
        glm_ocr_url: Base URL of the GLM-OCR server; used for the
            OpenAI-compatible client created here.
        layout_detector: Optional layout detector instance.
    """
    self.api_url = api_url
    self.image_processor = image_processor
    self.converter = converter
    self.glm_ocr_url = glm_ocr_url
    # Previously this argument was accepted and then discarded.
    # NOTE(review): none of the methods visible during review read it yet.
    self.layout_detector = layout_detector
    self.openai_client = OpenAI(api_key="EMPTY", base_url=glm_ocr_url, timeout=3600)
|
2026-02-05 20:32:26 +08:00
|
|
|
|
2026-03-10 21:36:35 +08:00
|
|
|
def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str:
    """Recognize a formula by sending the image to the OCR model server.

    Despite the method name, the request goes through ``self.openai_client``
    with ``model="glm-ocr"``.

    Args:
        image: Input image as numpy array in BGR format.
        prompt: Recognition prompt (default: "Formula Recognition:")

    Returns:
        The text returned by the model (expected to be LaTeX).

    Raises:
        RuntimeError: If encoding, the API call or the response handling
            fails, or if the model returns no text. The original exception
            is chained.
    """
    try:
        # Encode the image as a base64 PNG data URL.
        success, encoded_image = cv2.imencode(".png", image)
        if not success:
            raise RuntimeError("Failed to encode image")

        image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
        image_url = f"data:image/png;base64,{image_base64}"

        # Call the OpenAI-compatible API.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        response = self.openai_client.chat.completions.create(
            model="glm-ocr",
            messages=messages,
            temperature=0.0,
        )

        content = response.choices[0].message.content
        if content is None:
            # The content field of a chat message is optional; honour the
            # ``-> str`` contract instead of handing None to the caller.
            raise RuntimeError("model returned no text content")
        return content

    except Exception as e:
        raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e
|
|
|
|
|
|
2026-03-10 21:36:35 +08:00
|
|
|
def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str:
    """Re-run formula recognition when the markdown contains image placeholders.

    If ``markdown_content`` holds no ``![](images/...)`` reference it is returned
    unchanged. Otherwise the whole ``original_image`` is sent to
    ``_recognize_formula_with_paddleocr_vl`` and the recognized formula
    (normalised to ``$``/``$$`` delimiters) is returned *instead of* the
    markdown.

    Args:
        markdown_content: Markdown content with potential image references.
        original_image: Original input image.

    Returns:
        The untouched markdown when there is no image reference or recognition
        yields nothing; otherwise the recognized formula text.
    """
    # Pattern to match image references: ![](images/xxx.png) or ![](images/xxx.jpg)
    image_pattern = re.compile(r"!\[\]\(images/[^)]+\)")

    if not image_pattern.search(markdown_content):
        return markdown_content

    # Strip first: leading whitespace/newlines would defeat the prefix checks below.
    formula_text = (self._recognize_formula_with_paddleocr_vl(original_image) or "").strip()
    if not formula_text:
        # Nothing recognized: keep the markdown rather than emitting an empty "$$$$".
        return markdown_content

    if formula_text.startswith((r"\[", r"\(")):
        # Convert \[ \] \( \) delimiters to $$, but leave a bracket alone when it
        # directly follows another backslash so LaTeX line breaks with spacing
        # (e.g. "\\[2pt]") are not mangled.
        formula_text = re.sub(r"(?<!\\)\\[\[\]()]", "$$", formula_text)
    elif not formula_text.startswith("$"):
        # Bare LaTeX without any math delimiter: wrap as display math.
        formula_text = f"$${formula_text}$$"

    return formula_text
|
2026-02-04 12:00:06 +08:00
|
|
|
|
2026-02-07 16:53:09 +08:00
|
|
|
def recognize(self, image_bytes: BytesIO) -> dict:
    """Recognize content using local file_parse API.

    Posts the image as multipart form data to ``self.api_url``, reads the
    markdown from ``results.image.md_content``, falls back to formula
    recognition when the markdown contains an image placeholder, applies
    ``_postprocess_markdown`` and finally converts to the other formats.

    Args:
        image_bytes: Input image as BytesIO object (already encoded as PNG).

    Returns:
        Dict with 'markdown', 'latex', 'mathml', 'mml' keys.

    Raises:
        RuntimeError: If the HTTP request fails or any later step raises.
    """
    try:
        # Keep a copy of the raw bytes for the optional formula fallback, then
        # rewind so the upload below streams the image from the start.
        image_bytes.seek(0)
        raw_image = image_bytes.read()
        image_bytes.seek(0)

        # Prepare multipart form data
        files = {"files": ("image.png", image_bytes, "image/png")}

        data = {
            "return_middle_json": "false",
            "return_model_output": "false",
            "return_md": "true",
            "return_images": "false",
            "end_page_id": "99999",
            "start_page_id": "0",
            "lang_list": "en",
            "server_url": "string",
            "return_content_list": "false",
            "backend": "hybrid-auto-engine",
            "table_enable": "true",
            "response_format_zip": "false",
            "formula_enable": "true",
            "parse_method": "ocr",
        }

        # Make API request
        response = requests.post(
            self.api_url,
            files=files,
            data=data,
            headers={"accept": "application/json"},
            timeout=30,
        )
        response.raise_for_status()

        result = response.json()

        # Extract markdown content from response
        markdown_content = ""
        if "results" in result and "image" in result["results"]:
            # ``or ""`` guards against an explicit null md_content.
            markdown_content = result["results"]["image"].get("md_content", "") or ""

        # NOTE(review): this guard was reconstructed from a truncated source line;
        # confirm the exact literal against version control. The helper re-checks
        # for image references itself, so the guard only avoids needless work.
        if "![](images/" in markdown_content:
            # Decode lazily: the pixel data is only needed for this fallback.
            original_image = cv2.imdecode(np.frombuffer(raw_image, dtype=np.uint8), cv2.IMREAD_COLOR)
            if original_image is None:
                raise RuntimeError("Failed to decode input image")
            markdown_content = self._extract_and_recognize_formulas(markdown_content, original_image)

        # Apply postprocessing to fix OCR errors
        markdown_content = _postprocess_markdown(markdown_content)

        # Convert to other formats if converter is available
        latex = ""
        mathml = ""
        mml = ""
        if self.converter and markdown_content:
            convert_result = self.converter.convert_to_formats(markdown_content)
            latex = convert_result.latex
            mathml = convert_result.mathml
            mml = convert_result.mml

        return {
            "markdown": markdown_content,
            "latex": latex,
            "mathml": mathml,
            "mml": mml,
        }

    except requests.RequestException as e:
        raise RuntimeError(f"Local API request failed: {e}") from e
    except Exception as e:
        raise RuntimeError(f"Recognition failed: {e}") from e
|
|
|
|
|
|
|
|
|
|
|
2026-03-09 16:51:06 +08:00
|
|
|
# Prompt sent to the model for each layout region type (from GLM-OCR SDK config.yaml).
# NOTE(review): the "ouput" misspelling in the text prompt is part of the string
# sent to the model and is intentionally left untouched here.
_TASK_PROMPTS: dict[str, str] = dict(
    text="Text Recognition. If the content is a formula, please ouput display latex code, else output text",
    formula="Formula Recognition:",
    table="Table Recognition:",
)

# Region types without a dedicated entry fall back to the text prompt.
_DEFAULT_PROMPT = _TASK_PROMPTS["text"]
|
2026-03-09 16:51:06 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class GLMOCREndToEndService(OCRServiceBase):
    """End-to-end OCR using GLM-OCR pipeline: layout detection → per-region OCR.

    Pipeline (as implemented by ``recognize``):
        1. Detect layout regions (LayoutDetector → PP-DocLayoutV3) and sort them
           in reading order
        2. Crop each non-figure region and call vLLM with a task-specific prompt (parallel)
        3. GLMResultFormatter: clean, format titles/bullets/formulas, merge tags
        4. _postprocess_markdown: LaTeX math error correction
        5. Converter: markdown → latex/mathml/mml

    The ``image_processor`` is stored on the instance but ``recognize`` does not
    invoke it, so no padding is applied inside this class.

    This replaces both GLMOCRService (formula-only) and MineruOCRService (mixed).
    """

    def __init__(
        self,
        vl_server_url: str,
        image_processor: ImageProcessor,
        converter: Converter,
        layout_detector: LayoutDetector,
        max_workers: int = 8,
    ):
        """Store collaborators and create the OpenAI-compatible client.

        Args:
            vl_server_url: Base URL of the vLLM server; a falsy value falls back
                to ``settings.glm_ocr_url``.
            image_processor: Image processor kept on the instance.
            converter: Converter used for markdown → latex/mathml/mml.
            layout_detector: Detector used to find layout regions.
            max_workers: Upper bound on concurrent per-region OCR calls.
        """
        self.vl_server_url = vl_server_url or settings.glm_ocr_url
        self.image_processor = image_processor
        self.converter = converter
        self.layout_detector = layout_detector
        self.max_workers = max_workers
        self.openai_client = OpenAI(api_key="EMPTY", base_url=self.vl_server_url, timeout=3600)
        self._formatter = GLMResultFormatter()

    def _encode_region(self, image: np.ndarray) -> str:
        """Convert BGR numpy array to base64 JPEG string."""
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_img = PILImage.fromarray(rgb)
        buf = BytesIO()
        pil_img.save(buf, format="JPEG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    def _call_vllm(self, image: np.ndarray, prompt: str) -> str:
        """Send image + prompt to vLLM and return the stripped content string.

        Returns an empty string when the model answers without text content.
        """
        img_b64 = self._encode_region(image)
        data_url = f"data:image/jpeg;base64,{img_b64}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": data_url}},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        response = self.openai_client.chat.completions.create(
            model="glm-ocr",
            messages=messages,
            temperature=0.01,
            max_tokens=settings.max_tokens,
        )
        # ``content`` is optional in the chat-completions schema; treat a missing
        # value as "nothing recognized" instead of failing on ``None.strip()``.
        return (response.choices[0].message.content or "").strip()

    def _normalize_bbox(self, bbox: list[float], img_w: int, img_h: int) -> list[int]:
        """Convert pixel bbox [x1,y1,x2,y2] to 0-1000 normalised coords."""
        x1, y1, x2, y2 = bbox
        return [
            int(x1 / img_w * 1000),
            int(y1 / img_h * 1000),
            int(x2 / img_w * 1000),
            int(y2 / img_h * 1000),
        ]

    def _crop_region(self, image: np.ndarray, bbox: list[float]) -> np.ndarray:
        """Crop ``bbox`` out of ``image`` with coordinates clamped to the image bounds."""
        img_h, img_w = image.shape[:2]
        x1, y1, x2, y2 = (int(c) for c in bbox)
        # Clamp before slicing: a negative index would otherwise be interpreted
        # by numpy as "count from the end" and yield a wrong or empty crop.
        x1, x2 = (min(max(v, 0), img_w) for v in (x1, x2))
        y1, y2 = (min(max(v, 0), img_h) for v in (y1, y2))
        return image[y1:y2, x1:x2]

    def recognize(self, image: np.ndarray) -> dict:
        """Full pipeline: layout → per-region OCR → postprocess → markdown.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml', 'mml' keys.
        """
        # 1. Layout detection
        img_h, img_w = image.shape[:2]
        layout_info = self.layout_detector.detect(image)

        # Sort regions in reading order: top-to-bottom, left-to-right
        layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0]))

        # 2. OCR: per-region (parallel) or full-image fallback
        if not layout_info.regions or (len(layout_info.regions) == 1 and not layout_info.MixedRecognition):
            # No regions, or a single region without mixed recognition:
            # treat the whole image as one formula.
            logger.info("No layout regions detected, treating image as formula")
            raw_content = self._call_vllm(image, _TASK_PROMPTS["formula"])
            # Format as display formula markdown
            formatted_content = raw_content.strip()
            if not (formatted_content.startswith("$$") and formatted_content.endswith("$$")):
                formatted_content = f"$$\n{formatted_content}\n$$"
            markdown_content = formatted_content
        else:
            # Build task list for non-figure regions
            tasks = []
            for idx, region in enumerate(layout_info.regions):
                if region.type == "figure":
                    continue
                cropped = self._crop_region(image, region.bbox)
                if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10:
                    logger.warning(
                        "Skipping region idx=%d (label=%s): crop too small %s",
                        idx,
                        region.native_label,
                        cropped.shape[:2],
                    )
                    continue
                prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT)
                tasks.append((idx, region, cropped, prompt))

            if not tasks:
                # Every region was a figure or too small: OCR the full image instead.
                raw_content = self._call_vllm(image, _DEFAULT_PROMPT)
                markdown_content = self._formatter._clean_content(raw_content)
            else:
                # Parallel OCR calls
                raw_results: dict[int, str] = {}
                with ThreadPoolExecutor(max_workers=min(self.max_workers, len(tasks))) as ex:
                    future_map = {
                        ex.submit(self._call_vllm, cropped, prompt): idx for idx, _region, cropped, prompt in tasks
                    }
                    for future in as_completed(future_map):
                        idx = future_map[future]
                        try:
                            raw_results[idx] = future.result()
                        except Exception as e:
                            # Best effort: one failed region must not sink the page.
                            logger.warning("vLLM call failed for region idx=%d: %s", idx, e)
                            raw_results[idx] = ""

                # Build structured region dicts for GLMResultFormatter
                region_dicts = [
                    {
                        "index": idx,
                        "label": region.type,
                        "native_label": region.native_label,
                        "content": raw_results.get(idx, ""),
                        "bbox_2d": self._normalize_bbox(region.bbox, img_w, img_h),
                    }
                    for idx, region, _cropped, _prompt in tasks
                ]

                # 3. GLM-OCR postprocessing: clean, format, merge, bullets
                markdown_content = self._formatter.process(region_dicts)

        # 4. LaTeX math error correction (our existing pipeline)
        markdown_content = _postprocess_markdown(markdown_content)

        # 5. Format conversion
        latex, mathml, mml = "", "", ""
        if markdown_content and self.converter:
            try:
                fmt = self.converter.convert_to_formats(markdown_content)
                latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml
            except RuntimeError as e:
                logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e)

        return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}
|