feat: add paddleocr-vl
This commit is contained in:
@@ -5,6 +5,7 @@ import numpy as np
|
||||
import cv2
|
||||
import requests
|
||||
from io import BytesIO
|
||||
import base64
|
||||
from app.core.config import get_settings
|
||||
from paddleocr import PaddleOCRVL
|
||||
from typing import Optional
|
||||
@@ -12,6 +13,7 @@ from app.services.layout_detector import LayoutDetector
|
||||
from app.services.image_processor import ImageProcessor
|
||||
from app.services.converter import Converter
|
||||
from abc import ABC, abstractmethod
|
||||
from openai import OpenAI
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
@@ -90,42 +92,42 @@ def _split_glued_command_token(token: str) -> str:
|
||||
|
||||
def _clean_latex_syntax_spaces(expr: str) -> str:
|
||||
"""Clean unwanted spaces in LaTeX syntax (common OCR errors).
|
||||
|
||||
|
||||
OCR often adds spaces in LaTeX syntax structures where they shouldn't be:
|
||||
- Subscripts: a _ {i 1} -> a_{i1}
|
||||
- Superscripts: x ^ {2 3} -> x^{23}
|
||||
- Fractions: \\frac { a } { b } -> \\frac{a}{b}
|
||||
- Commands: \\ alpha -> \\alpha
|
||||
- Braces: { a b } -> {ab} (within subscripts/superscripts)
|
||||
|
||||
|
||||
This is safe because these spaces are always OCR errors - LaTeX doesn't
|
||||
need or want spaces in these positions.
|
||||
|
||||
|
||||
Args:
|
||||
expr: LaTeX math expression.
|
||||
|
||||
|
||||
Returns:
|
||||
Expression with LaTeX syntax spaces cleaned.
|
||||
"""
|
||||
# Pattern 1: Spaces around _ and ^ (subscript/superscript operators)
|
||||
# a _ {i} -> a_{i}, x ^ {2} -> x^{2}
|
||||
expr = re.sub(r'\s*_\s*', '_', expr)
|
||||
expr = re.sub(r'\s*\^\s*', '^', expr)
|
||||
|
||||
expr = re.sub(r"\s*_\s*", "_", expr)
|
||||
expr = re.sub(r"\s*\^\s*", "^", expr)
|
||||
|
||||
# Pattern 2: Spaces inside braces that follow _ or ^
|
||||
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
|
||||
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
|
||||
def clean_subscript_superscript_braces(match):
|
||||
operator = match.group(1) # _ or ^
|
||||
content = match.group(2) # content inside braces
|
||||
content = match.group(2) # content inside braces
|
||||
# Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
|
||||
# Only remove spaces between non-backslash characters
|
||||
cleaned = re.sub(r'(?<!\\)\s+(?!\\)', '', content)
|
||||
cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content)
|
||||
return f"{operator}{{{cleaned}}}"
|
||||
|
||||
|
||||
# Match _{ ... } or ^{ ... }
|
||||
expr = re.sub(r'([_^])\{([^}]+)\}', clean_subscript_superscript_braces, expr)
|
||||
|
||||
expr = re.sub(r"([_^])\{([^}]+)\}", clean_subscript_superscript_braces, expr)
|
||||
|
||||
# Pattern 3: Spaces inside \frac arguments
|
||||
# \frac { a } { b } -> \frac{a}{b}
|
||||
# \frac{ a + b }{ c } -> \frac{a+b}{c}
|
||||
@@ -133,47 +135,46 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
|
||||
numerator = match.group(1).strip()
|
||||
denominator = match.group(2).strip()
|
||||
return f"\\frac{{{numerator}}}{{{denominator}}}"
|
||||
|
||||
expr = re.sub(r'\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}',
|
||||
clean_frac_braces, expr)
|
||||
|
||||
|
||||
expr = re.sub(r"\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}", clean_frac_braces, expr)
|
||||
|
||||
# Pattern 4: Spaces after backslash in LaTeX commands
|
||||
# \ alpha -> \alpha, \ beta -> \beta
|
||||
expr = re.sub(r'\\\s+([a-zA-Z]+)', r'\\\1', expr)
|
||||
|
||||
expr = re.sub(r"\\\s+([a-zA-Z]+)", r"\\\1", expr)
|
||||
|
||||
# Pattern 5: Spaces before/after braces in general contexts (conservative)
|
||||
# Only remove if the space is clearly wrong (e.g., after operators)
|
||||
# { x } in standalone context is kept as-is to avoid breaking valid spacing
|
||||
# But after operators like \sqrt{ x } -> \sqrt{x}
|
||||
expr = re.sub(r'(\\[a-zA-Z]+)\s*\{\s*', r'\1{', expr) # \sqrt { -> \sqrt{
|
||||
|
||||
expr = re.sub(r"(\\[a-zA-Z]+)\s*\{\s*", r"\1{", expr) # \sqrt { -> \sqrt{
|
||||
|
||||
return expr
|
||||
|
||||
|
||||
def _postprocess_math(expr: str) -> str:
|
||||
"""Postprocess a *math* expression (already inside $...$ or $$...$$).
|
||||
|
||||
|
||||
Processing stages:
|
||||
0. Fix OCR number errors (spaces in numbers)
|
||||
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
|
||||
2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
|
||||
3. Normalize differentials (DISABLED by default to avoid breaking variables)
|
||||
|
||||
|
||||
Args:
|
||||
expr: LaTeX math expression without delimiters.
|
||||
|
||||
|
||||
Returns:
|
||||
Processed LaTeX expression.
|
||||
"""
|
||||
# stage0: fix OCR number errors (digits with spaces)
|
||||
expr = _fix_ocr_number_errors(expr)
|
||||
|
||||
|
||||
# stage1: split glued command tokens (e.g. \cdotdS)
|
||||
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
|
||||
|
||||
|
||||
# stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)
|
||||
expr = _clean_latex_syntax_spaces(expr)
|
||||
|
||||
|
||||
# stage3: normalize differentials - DISABLED
|
||||
# This feature is disabled because it's too aggressive and can break:
|
||||
# - LaTeX commands containing 'd': \vdots, \lambda (via subscripts), \delta, etc.
|
||||
@@ -186,40 +187,36 @@ def _postprocess_math(expr: str) -> str:
|
||||
#
|
||||
# If differential normalization is needed, implement a context-aware version:
|
||||
# expr = _normalize_differentials_contextaware(expr)
|
||||
|
||||
|
||||
return expr
|
||||
|
||||
|
||||
def _normalize_differentials_contextaware(expr: str) -> str:
|
||||
"""Context-aware differential normalization (optional, not used by default).
|
||||
|
||||
|
||||
Only normalizes differentials in specific mathematical contexts:
|
||||
1. After integral symbols: \\int dx, \\iint dA, \\oint dr
|
||||
2. In fraction denominators: \\frac{dy}{dx}
|
||||
3. In explicit differential notation: f(x)dx (function followed by differential)
|
||||
|
||||
|
||||
This avoids false positives like variable names, subscripts, or LaTeX commands.
|
||||
|
||||
|
||||
Args:
|
||||
expr: LaTeX math expression.
|
||||
|
||||
|
||||
Returns:
|
||||
Expression with differentials normalized in safe contexts only.
|
||||
"""
|
||||
# Pattern 1: After integral commands
|
||||
# \int dx -> \int d x
|
||||
integral_pattern = re.compile(
|
||||
r'(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])'
|
||||
)
|
||||
expr = integral_pattern.sub(r'\1 \2 d \3', expr)
|
||||
|
||||
integral_pattern = re.compile(r"(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])")
|
||||
expr = integral_pattern.sub(r"\1 \2 d \3", expr)
|
||||
|
||||
# Pattern 2: In fraction denominators
|
||||
# \frac{...}{dx} -> \frac{...}{d x}
|
||||
frac_pattern = re.compile(
|
||||
r'(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})'
|
||||
)
|
||||
expr = frac_pattern.sub(r'\1d \2\3', expr)
|
||||
|
||||
frac_pattern = re.compile(r"(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})")
|
||||
expr = frac_pattern.sub(r"\1d \2\3", expr)
|
||||
|
||||
return expr
|
||||
|
||||
|
||||
@@ -241,21 +238,21 @@ def _fix_ocr_number_errors(expr: str) -> str:
|
||||
"""
|
||||
# Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)"
|
||||
# Example: "2 2. 2" → "22.2"
|
||||
expr = re.sub(r'(\d)\s+(\d+)\.\s*(\d+)', r'\1\2.\3', expr)
|
||||
|
||||
expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr)
|
||||
|
||||
# Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)"
|
||||
# Example: "22. 2" → "22.2"
|
||||
expr = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', expr)
|
||||
|
||||
expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr)
|
||||
|
||||
# Fix pattern 3: "digit space digit" (no decimal point, within same number context)
|
||||
# Be careful: only merge if followed by decimal point or comma/end
|
||||
# Example: "1 5 0" → "150" when followed by comma or end
|
||||
expr = re.sub(r'(\d)\s+(\d)(?=\s*[,\)]|$)', r'\1\2', expr)
|
||||
|
||||
expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr)
|
||||
|
||||
# Fix pattern 4: Multiple spaces in decimal numbers
|
||||
# Example: "2 2 . 2" → "22.2"
|
||||
expr = re.sub(r'(\d)\s+(\d)(?=\s*\.)', r'\1\2', expr)
|
||||
|
||||
expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr)
|
||||
|
||||
return expr
|
||||
|
||||
|
||||
@@ -273,76 +270,76 @@ def _postprocess_markdown(markdown_content: str) -> str:
|
||||
return seg
|
||||
|
||||
markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
|
||||
|
||||
|
||||
# Apply markdown-level postprocessing (after LaTeX processing)
|
||||
markdown_content = _remove_false_heading_from_single_formula(markdown_content)
|
||||
|
||||
|
||||
return markdown_content
|
||||
|
||||
|
||||
def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
|
||||
"""Remove false heading markers from single-formula content.
|
||||
|
||||
|
||||
OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix.
|
||||
This function detects and removes the heading marker when:
|
||||
1. The content contains only one formula (display or inline)
|
||||
2. The formula line starts with '#' (heading marker)
|
||||
3. No other non-formula text content exists
|
||||
|
||||
|
||||
Examples:
|
||||
Input: "# $$E = mc^2$$"
|
||||
Output: "$$E = mc^2$$"
|
||||
|
||||
|
||||
Input: "# $x = y$"
|
||||
Output: "$x = y$"
|
||||
|
||||
|
||||
Input: "# Introduction\n$$E = mc^2$$" (has text, keep heading)
|
||||
Output: "# Introduction\n$$E = mc^2$$"
|
||||
|
||||
|
||||
Args:
|
||||
markdown_content: Markdown text with potential false headings.
|
||||
|
||||
|
||||
Returns:
|
||||
Markdown text with false heading markers removed.
|
||||
"""
|
||||
if not markdown_content or not markdown_content.strip():
|
||||
return markdown_content
|
||||
|
||||
lines = markdown_content.split('\n')
|
||||
|
||||
|
||||
lines = markdown_content.split("\n")
|
||||
|
||||
# Count formulas and heading lines
|
||||
formula_count = 0
|
||||
heading_lines = []
|
||||
has_non_formula_text = False
|
||||
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
line_stripped = line.strip()
|
||||
|
||||
|
||||
if not line_stripped:
|
||||
continue
|
||||
|
||||
|
||||
# Check if line starts with heading marker
|
||||
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped)
|
||||
|
||||
heading_match = re.match(r"^(#{1,6})\s+(.+)$", line_stripped)
|
||||
|
||||
if heading_match:
|
||||
heading_level = heading_match.group(1)
|
||||
content = heading_match.group(2)
|
||||
|
||||
|
||||
# Check if the heading content is a formula
|
||||
if re.fullmatch(r'\$\$?.+\$\$?', content):
|
||||
if re.fullmatch(r"\$\$?.+\$\$?", content):
|
||||
# This is a heading with a formula
|
||||
heading_lines.append((i, heading_level, content))
|
||||
formula_count += 1
|
||||
else:
|
||||
# This is a real heading with text
|
||||
has_non_formula_text = True
|
||||
elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped):
|
||||
elif re.fullmatch(r"\$\$?.+\$\$?", line_stripped):
|
||||
# Standalone formula line (not in a heading)
|
||||
formula_count += 1
|
||||
elif line_stripped and not re.match(r'^#+\s*$', line_stripped):
|
||||
elif line_stripped and not re.match(r"^#+\s*$", line_stripped):
|
||||
# Non-empty, non-heading, non-formula line
|
||||
has_non_formula_text = True
|
||||
|
||||
|
||||
# Only remove heading markers if:
|
||||
# 1. There's exactly one formula
|
||||
# 2. That formula is in a heading line
|
||||
@@ -351,8 +348,8 @@ def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
|
||||
# Remove the heading marker from the formula
|
||||
line_idx, heading_level, formula_content = heading_lines[0]
|
||||
lines[line_idx] = formula_content
|
||||
|
||||
return '\n'.join(lines)
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
class OCRServiceBase(ABC):
|
||||
@@ -492,16 +489,87 @@ class MineruOCRService(OCRServiceBase):
|
||||
api_url: str = "http://127.0.0.1:8000/file_parse",
|
||||
image_processor: Optional[ImageProcessor] = None,
|
||||
converter: Optional[Converter] = None,
|
||||
paddleocr_vl_url: str = "http://localhost:8000/v1",
|
||||
):
|
||||
"""Initialize Local API service.
|
||||
|
||||
Args:
|
||||
api_url: URL of the local file_parse API endpoint.
|
||||
converter: Optional converter instance for format conversion.
|
||||
paddleocr_vl_url: URL of the PaddleOCR-VL vLLM server.
|
||||
"""
|
||||
self.api_url = api_url
|
||||
self.image_processor = image_processor
|
||||
self.converter = converter
|
||||
self.paddleocr_vl_url = paddleocr_vl_url
|
||||
self.openai_client = OpenAI(api_key="EMPTY", base_url=paddleocr_vl_url, timeout=3600)
|
||||
|
||||
def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str:
    """Send an image to the PaddleOCR-VL endpoint and return the model's reply.

    The image is PNG-encoded with OpenCV, embedded as a base64 data URL and
    posted together with ``prompt`` as a single user message through
    ``self.openai_client`` (an OpenAI-compatible chat-completions client).

    Args:
        image: Input image as numpy array in BGR format.
        prompt: Recognition prompt (default: "Formula Recognition:")

    Returns:
        The message content of the first completion choice
        (expected to be the formula in LaTeX format).

    Raises:
        RuntimeError: If encoding the image or the API call fails; the
            original exception is chained as the cause.
    """
    try:
        # PNG-encode in memory; imencode signals failure via its first return value.
        ok, png_buffer = cv2.imencode(".png", image)
        if not ok:
            raise RuntimeError("Failed to encode image")

        # Inline the image as a data URL so no file hosting is required.
        payload = base64.b64encode(png_buffer.tobytes()).decode("utf-8")
        image_part = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{payload}"}}
        text_part = {"type": "text", "text": prompt}

        # Call OpenAI-compatible API with the image first, then the prompt.
        completion = self.openai_client.chat.completions.create(
            model="PaddlePaddle/PaddleOCR-VL",
            messages=[{"role": "user", "content": [image_part, text_part]}],
            temperature=0.0,
        )
        return completion.choices[0].message.content

    except Exception as e:
        raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e
|
||||
|
||||
def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str:
|
||||
"""Extract image references from markdown and recognize formulas.
|
||||
|
||||
Args:
|
||||
markdown_content: Markdown content with potential image references.
|
||||
original_image: Original input image.
|
||||
|
||||
Returns:
|
||||
Markdown content with formulas recognized by PaddleOCR-VL.
|
||||
"""
|
||||
# Pattern to match image references: 
|
||||
image_pattern = re.compile(r"!\[\]\(images/[^)]+\)")
|
||||
|
||||
if not image_pattern.search(markdown_content):
|
||||
return markdown_content
|
||||
|
||||
try:
|
||||
# For now, use the entire image for formula recognition
|
||||
# TODO: Extract specific regions if image paths contain coordinates
|
||||
formula_text = self._recognize_formula_with_paddleocr_vl(original_image)
|
||||
|
||||
# Replace image references with recognized formulas
|
||||
# Wrap in display math delimiters if not already wrapped
|
||||
if not formula_text.startswith("$$"):
|
||||
formula_text = f"$${formula_text}$$"
|
||||
|
||||
markdown_content = image_pattern.sub(formula_text, markdown_content)
|
||||
|
||||
except Exception as e:
|
||||
# If formula recognition fails, keep original content
|
||||
print(f"Warning: Formula recognition failed: {e}")
|
||||
|
||||
return markdown_content
|
||||
|
||||
def recognize(self, image: np.ndarray) -> dict:
|
||||
"""Recognize content using local file_parse API.
|
||||
@@ -554,6 +622,11 @@ class MineruOCRService(OCRServiceBase):
|
||||
if "results" in result and "image" in result["results"]:
|
||||
markdown_content = result["results"]["image"].get("md_content", "")
|
||||
|
||||
# Check if markdown contains formula image references
|
||||
if "
|
||||
|
||||
# Apply postprocessing to fix OCR errors
|
||||
markdown_content = _postprocess_markdown(markdown_content)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user