304 lines
9.8 KiB
Python
304 lines
9.8 KiB
Python
|
|
"""PaddleOCR-VL client service for text and formula recognition."""
|
||
|
|
|
||
|
|
import io
|
||
|
|
import tempfile
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import cv2
|
||
|
|
import numpy as np
|
||
|
|
|
||
|
|
from app.core.config import get_settings
|
||
|
|
from app.schemas.image import LayoutInfo
|
||
|
|
|
||
|
|
# Application settings, fetched once when this module is imported. OCRService
# reads its default vLLM server URL and PP-DocLayoutV2 model directory from here.
settings = get_settings()
|
||
|
|
|
||
|
|
|
||
|
|
class OCRService:
|
||
|
|
"""Service for OCR using PaddleOCR-VL."""
|
||
|
|
|
||
|
|
FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format."
|
||
|
|
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
vl_server_url: str | None = None,
|
||
|
|
pp_doclayout_model_dir: str | None = None,
|
||
|
|
):
|
||
|
|
"""Initialize OCR service.
|
||
|
|
|
||
|
|
Args:
|
||
|
|
vl_server_url: URL of the vLLM server for PaddleOCR-VL.
|
||
|
|
pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory.
|
||
|
|
"""
|
||
|
|
self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
|
||
|
|
self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir
|
||
|
|
self._pipeline = None
|
||
|
|
|
||
|
|
def _get_pipeline(self):
|
||
|
|
"""Get or create PaddleOCR-VL pipeline.
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
PaddleOCRVL pipeline instance.
|
||
|
|
"""
|
||
|
|
if self._pipeline is None:
|
||
|
|
from paddleocr import PaddleOCRVL
|
||
|
|
|
||
|
|
self._pipeline = PaddleOCRVL(
|
||
|
|
vl_rec_backend="vllm-server",
|
||
|
|
vl_rec_server_url=self.vl_server_url,
|
||
|
|
layout_detection_model_name="PP-DocLayoutV2",
|
||
|
|
layout_detection_model_dir=self.pp_doclayout_model_dir,
|
||
|
|
)
|
||
|
|
return self._pipeline
|
||
|
|
|
||
|
|
def _save_temp_image(self, image: np.ndarray) -> str:
|
||
|
|
"""Save image to a temporary file.
|
||
|
|
|
||
|
|
Args:
|
||
|
|
image: Image as numpy array in BGR format.
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Path to temporary file.
|
||
|
|
"""
|
||
|
|
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
|
||
|
|
cv2.imwrite(f.name, image)
|
||
|
|
return f.name
|
||
|
|
|
||
|
|
def recognize_mixed(self, image: np.ndarray) -> dict:
|
||
|
|
"""Recognize mixed content (text + formulas) using PP-DocLayoutV2.
|
||
|
|
|
||
|
|
This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
|
||
|
|
recognition of mixed content.
|
||
|
|
|
||
|
|
Args:
|
||
|
|
image: Input image as numpy array in BGR format.
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Dict with 'markdown', 'latex', 'mathml' keys.
|
||
|
|
"""
|
||
|
|
try:
|
||
|
|
pipeline = self._get_pipeline()
|
||
|
|
temp_path = self._save_temp_image(image)
|
||
|
|
|
||
|
|
try:
|
||
|
|
results = list(pipeline.predict(temp_path))
|
||
|
|
|
||
|
|
markdown_content = ""
|
||
|
|
for result in results:
|
||
|
|
# PaddleOCR-VL results can be saved to markdown
|
||
|
|
md_buffer = io.StringIO()
|
||
|
|
result.save_to_markdown(save_path=md_buffer)
|
||
|
|
markdown_content += md_buffer.getvalue()
|
||
|
|
|
||
|
|
# Convert markdown to other formats
|
||
|
|
latex = self._markdown_to_latex(markdown_content)
|
||
|
|
mathml = self._extract_mathml(markdown_content)
|
||
|
|
|
||
|
|
return {
|
||
|
|
"markdown": markdown_content,
|
||
|
|
"latex": latex,
|
||
|
|
"mathml": mathml,
|
||
|
|
}
|
||
|
|
finally:
|
||
|
|
Path(temp_path).unlink(missing_ok=True)
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
raise RuntimeError(f"Mixed recognition failed: {e}") from e
|
||
|
|
|
||
|
|
def recognize_formula(self, image: np.ndarray) -> dict:
|
||
|
|
"""Recognize formula/math content using PaddleOCR-VL with prompt.
|
||
|
|
|
||
|
|
This mode uses PaddleOCR-VL directly with a formula recognition prompt.
|
||
|
|
|
||
|
|
Args:
|
||
|
|
image: Input image as numpy array in BGR format.
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Dict with 'latex', 'markdown', 'mathml' keys.
|
||
|
|
"""
|
||
|
|
try:
|
||
|
|
import httpx
|
||
|
|
|
||
|
|
temp_path = self._save_temp_image(image)
|
||
|
|
|
||
|
|
try:
|
||
|
|
# Use vLLM API directly for formula recognition
|
||
|
|
import base64
|
||
|
|
|
||
|
|
with open(temp_path, "rb") as f:
|
||
|
|
image_base64 = base64.b64encode(f.read()).decode("utf-8")
|
||
|
|
|
||
|
|
# Call vLLM server with formula prompt
|
||
|
|
response = httpx.post(
|
||
|
|
f"{self.vl_server_url}/chat/completions",
|
||
|
|
json={
|
||
|
|
"model": "paddleocr-vl",
|
||
|
|
"messages": [
|
||
|
|
{
|
||
|
|
"role": "user",
|
||
|
|
"content": [
|
||
|
|
{"type": "text", "text": self.FORMULA_PROMPT},
|
||
|
|
{
|
||
|
|
"type": "image_url",
|
||
|
|
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
||
|
|
},
|
||
|
|
],
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"max_tokens": 1024,
|
||
|
|
},
|
||
|
|
timeout=60.0,
|
||
|
|
)
|
||
|
|
response.raise_for_status()
|
||
|
|
result = response.json()
|
||
|
|
|
||
|
|
latex = result["choices"][0]["message"]["content"].strip()
|
||
|
|
|
||
|
|
# Convert latex to other formats
|
||
|
|
markdown = self._latex_to_markdown(latex)
|
||
|
|
mathml = self._latex_to_mathml(latex)
|
||
|
|
|
||
|
|
return {
|
||
|
|
"latex": latex,
|
||
|
|
"markdown": markdown,
|
||
|
|
"mathml": mathml,
|
||
|
|
}
|
||
|
|
finally:
|
||
|
|
Path(temp_path).unlink(missing_ok=True)
|
||
|
|
|
||
|
|
except httpx.HTTPStatusError as e:
|
||
|
|
raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e
|
||
|
|
except Exception as e:
|
||
|
|
raise RuntimeError(f"Formula recognition failed: {e}") from e
|
||
|
|
|
||
|
|
def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict:
|
||
|
|
"""Recognize content based on layout detection results.
|
||
|
|
|
||
|
|
Args:
|
||
|
|
image: Input image as numpy array in BGR format.
|
||
|
|
layout_info: Layout detection results.
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Dict with recognition results including mode used.
|
||
|
|
"""
|
||
|
|
# Decision logic:
|
||
|
|
# - If plain text exists -> use mixed_recognition (PP-DocLayoutV2)
|
||
|
|
# - Otherwise -> use formula_recognition (VL with prompt)
|
||
|
|
if layout_info.has_plain_text:
|
||
|
|
result = self.recognize_mixed(image)
|
||
|
|
result["recognition_mode"] = "mixed_recognition"
|
||
|
|
else:
|
||
|
|
result = self.recognize_formula(image)
|
||
|
|
result["recognition_mode"] = "formula_recognition"
|
||
|
|
|
||
|
|
return result
|
||
|
|
|
||
|
|
def _markdown_to_latex(self, markdown: str) -> str:
|
||
|
|
"""Convert markdown to LaTeX.
|
||
|
|
|
||
|
|
Simple conversion - wraps content in LaTeX document structure.
|
||
|
|
|
||
|
|
Args:
|
||
|
|
markdown: Markdown content.
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
LaTeX representation.
|
||
|
|
"""
|
||
|
|
# Basic conversion: preserve math blocks, convert structure
|
||
|
|
lines = []
|
||
|
|
in_code_block = False
|
||
|
|
|
||
|
|
for line in markdown.split("\n"):
|
||
|
|
if line.startswith("```"):
|
||
|
|
in_code_block = not in_code_block
|
||
|
|
if in_code_block:
|
||
|
|
lines.append("\\begin{verbatim}")
|
||
|
|
else:
|
||
|
|
lines.append("\\end{verbatim}")
|
||
|
|
elif in_code_block:
|
||
|
|
lines.append(line)
|
||
|
|
elif line.startswith("# "):
|
||
|
|
lines.append(f"\\section{{{line[2:]}}}")
|
||
|
|
elif line.startswith("## "):
|
||
|
|
lines.append(f"\\subsection{{{line[3:]}}}")
|
||
|
|
elif line.startswith("### "):
|
||
|
|
lines.append(f"\\subsubsection{{{line[4:]}}}")
|
||
|
|
elif line.startswith("- "):
|
||
|
|
lines.append(f"\\item {line[2:]}")
|
||
|
|
elif line.startswith("$$"):
|
||
|
|
lines.append(line.replace("$$", "\\[").replace("$$", "\\]"))
|
||
|
|
elif "$" in line:
|
||
|
|
# Keep inline math as-is
|
||
|
|
lines.append(line)
|
||
|
|
else:
|
||
|
|
lines.append(line)
|
||
|
|
|
||
|
|
return "\n".join(lines)
|
||
|
|
|
||
|
|
def _latex_to_markdown(self, latex: str) -> str:
|
||
|
|
"""Convert LaTeX to markdown.
|
||
|
|
|
||
|
|
Args:
|
||
|
|
latex: LaTeX content.
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Markdown representation.
|
||
|
|
"""
|
||
|
|
# Wrap LaTeX in markdown math block
|
||
|
|
if latex.strip():
|
||
|
|
return f"$$\n{latex}\n$$"
|
||
|
|
return ""
|
||
|
|
|
||
|
|
def _latex_to_mathml(self, latex: str) -> str:
|
||
|
|
"""Convert LaTeX to MathML.
|
||
|
|
|
||
|
|
Args:
|
||
|
|
latex: LaTeX content.
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
MathML representation.
|
||
|
|
"""
|
||
|
|
# Basic LaTeX to MathML conversion
|
||
|
|
# For production, consider using latex2mathml library
|
||
|
|
if not latex.strip():
|
||
|
|
return ""
|
||
|
|
|
||
|
|
try:
|
||
|
|
# Try to use latex2mathml if available
|
||
|
|
from latex2mathml.converter import convert
|
||
|
|
|
||
|
|
return convert(latex)
|
||
|
|
except ImportError:
|
||
|
|
# Fallback: wrap in basic MathML structure
|
||
|
|
return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
|
||
|
|
except Exception:
|
||
|
|
return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
|
||
|
|
|
||
|
|
def _extract_mathml(self, markdown: str) -> str:
|
||
|
|
"""Extract and convert math from markdown to MathML.
|
||
|
|
|
||
|
|
Args:
|
||
|
|
markdown: Markdown content.
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
MathML for any math content found.
|
||
|
|
"""
|
||
|
|
import re
|
||
|
|
|
||
|
|
# Find all math blocks
|
||
|
|
math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL)
|
||
|
|
inline_math = re.findall(r"\$([^$]+)\$", markdown)
|
||
|
|
|
||
|
|
all_math = math_blocks + inline_math
|
||
|
|
|
||
|
|
if not all_math:
|
||
|
|
return ""
|
||
|
|
|
||
|
|
# Convert each to MathML and combine
|
||
|
|
mathml_parts = []
|
||
|
|
for latex in all_math:
|
||
|
|
mathml = self._latex_to_mathml(latex.strip())
|
||
|
|
if mathml:
|
||
|
|
mathml_parts.append(mathml)
|
||
|
|
|
||
|
|
return "\n".join(mathml_parts)
|