fix: refactor logic

This commit is contained in:
2025-12-31 17:38:32 +08:00
parent 6ac50f7d2f
commit 35928c2484
17 changed files with 678 additions and 738 deletions

View File

@@ -1,14 +1,12 @@
"""PaddleOCR-VL client service for text and formula recognition."""
import io
import tempfile
from pathlib import Path
import cv2
import numpy as np
from app.core.config import get_settings
from app.schemas.image import LayoutInfo
from paddleocr import PaddleOCRVL
from typing import Optional
from app.services.layout_detector import LayoutDetector
from app.services.image_processor import ImageProcessor
from app.services.converter import Converter
settings = get_settings()
@@ -16,52 +14,40 @@ settings = get_settings()
class OCRService:
"""Service for OCR using PaddleOCR-VL."""
FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format."
_pipeline: Optional[PaddleOCRVL] = None
_layout_detector: Optional[LayoutDetector] = None
def __init__(
self,
vl_server_url: str | None = None,
pp_doclayout_model_dir: str | None = None,
vl_server_url: str,
layout_detector: LayoutDetector,
image_processor: ImageProcessor,
converter: Converter,
):
"""Initialize OCR service.
Args:
vl_server_url: URL of the vLLM server for PaddleOCR-VL.
pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory.
layout_detector: Layout detector instance.
image_processor: Image processor instance.
"""
self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir
self._pipeline = None
def _get_pipeline(self):
self.layout_detector = layout_detector
self.image_processor = image_processor
self.converter = converter
def _get_pipeline(self):
"""Get or create PaddleOCR-VL pipeline.
Returns:
PaddleOCRVL pipeline instance.
"""
if self._pipeline is None:
from paddleocr import PaddleOCRVL
self._pipeline = PaddleOCRVL(
if OCRService._pipeline is None:
OCRService._pipeline = PaddleOCRVL(
vl_rec_backend="vllm-server",
vl_rec_server_url=self.vl_server_url,
layout_detection_model_name="PP-DocLayoutV2",
layout_detection_model_dir=self.pp_doclayout_model_dir,
)
return self._pipeline
def _save_temp_image(self, image: np.ndarray) -> str:
    """Write an image to a new temporary PNG file and return its path.

    The file is created with ``delete=False``, so it is not removed
    automatically; the caller is responsible for deleting it.

    Args:
        image: Image as numpy array in BGR format.

    Returns:
        Path to temporary file.

    Raises:
        RuntimeError: If OpenCV reports that the image could not be written.
    """
    # Reserve a unique path, then close the handle before OpenCV writes to it:
    # reopening a still-open named temporary file is platform-dependent.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
        temp_path = f.name
    try:
        written = cv2.imwrite(temp_path, image)
    except Exception:
        # Do not leave the reserved file behind when the write blows up.
        Path(temp_path).unlink(missing_ok=True)
        raise
    if not written:
        # cv2.imwrite signals failure through its return value.
        Path(temp_path).unlink(missing_ok=True)
        raise RuntimeError(f"Failed to write temporary image: {temp_path}")
    return temp_path
return OCRService._pipeline
def recognize_mixed(self, image: np.ndarray) -> dict:
"""Recognize mixed content (text + formulas) using PP-DocLayoutV2.
@@ -77,30 +63,21 @@ class OCRService:
"""
try:
pipeline = self._get_pipeline()
temp_path = self._save_temp_image(image)
try:
results = list(pipeline.predict(temp_path))
output = pipeline.predict(image, use_layout_detection=True)
markdown_content = ""
for result in results:
# PaddleOCR-VL results can be saved to markdown
md_buffer = io.StringIO()
result.save_to_markdown(save_path=md_buffer)
markdown_content += md_buffer.getvalue()
markdown_content = ""
# Convert markdown to other formats
latex = self._markdown_to_latex(markdown_content)
mathml = self._extract_mathml(markdown_content)
for res in output:
markdown_content += res.markdown.get("markdown_texts", "")
return {
"markdown": markdown_content,
"latex": latex,
"mathml": mathml,
}
finally:
Path(temp_path).unlink(missing_ok=True)
convert_result = self.converter.convert_to_formats(markdown_content)
return {
"markdown": markdown_content,
"latex": convert_result.latex,
"mathml": convert_result.mathml,
}
except Exception as e:
raise RuntimeError(f"Mixed recognition failed: {e}") from e
@@ -116,188 +93,49 @@ class OCRService:
Dict with 'latex', 'markdown', 'mathml' keys.
"""
try:
import httpx
pipeline = self._get_pipeline()
temp_path = self._save_temp_image(image)
output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula")
try:
# Use vLLM API directly for formula recognition
import base64
markdown_content = ""
with open(temp_path, "rb") as f:
image_base64 = base64.b64encode(f.read()).decode("utf-8")
for res in output:
markdown_content += res.markdown.get("markdown_texts", "")
# Call vLLM server with formula prompt
response = httpx.post(
f"{self.vl_server_url}/chat/completions",
json={
"model": "paddleocr-vl",
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": self.FORMULA_PROMPT},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
},
],
}
],
"max_tokens": 1024,
},
timeout=60.0,
)
response.raise_for_status()
result = response.json()
convert_result = self.converter.convert_to_formats(markdown_content)
latex = result["choices"][0]["message"]["content"].strip()
# Convert latex to other formats
markdown = self._latex_to_markdown(latex)
mathml = self._latex_to_mathml(latex)
return {
"latex": latex,
"markdown": markdown,
"mathml": mathml,
}
finally:
Path(temp_path).unlink(missing_ok=True)
except httpx.HTTPStatusError as e:
raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e
return {
"latex": convert_result.latex,
"mathml": convert_result.mathml,
"markdown": markdown_content,
}
except Exception as e:
raise RuntimeError(f"Formula recognition failed: {e}") from e
def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict:
"""Recognize content based on layout detection results.
def recognize(self, image: np.ndarray) -> dict:
"""Recognize content using PaddleOCR-VL.
Args:
image: Input image as numpy array in BGR format.
layout_info: Layout detection results.
Returns:
Dict with recognition results including mode used.
Dict with 'latex', 'markdown', 'mathml' keys.
"""
# Decision logic:
# - If plain text exists -> use mixed_recognition (PP-DocLayoutV2)
# - Otherwise -> use formula_recognition (VL with prompt)
if layout_info.has_plain_text:
result = self.recognize_mixed(image)
result["recognition_mode"] = "mixed_recognition"
padded_image = self.image_processor.add_padding(image)
layout_info = self.layout_detector.detect(padded_image)
if layout_info.MixedRecognition:
return self.recognize_mixed(image)
else:
result = self.recognize_formula(image)
result["recognition_mode"] = "formula_recognition"
return self.recognize_formula(image)
return result
def _markdown_to_latex(self, markdown: str) -> str:
    """Convert markdown to LaTeX, one line at a time.

    Handles fenced code blocks (``verbatim``), ``#``/``##``/``###`` headings,
    ``- `` list items and ``$$`` display-math delimiters. Every other line,
    including lines with inline ``$...$`` math, is copied through unchanged.
    No document preamble or list environment is emitted.

    Args:
        markdown: Markdown content.

    Returns:
        LaTeX representation.
    """
    lines = []
    in_code_block = False
    # Tracks whether the next "$$" opens or closes a display-math block, so
    # delimiters alternate between "\[" and "\]" even across several lines.
    in_display_math = False
    for line in markdown.split("\n"):
        if line.startswith("```"):
            in_code_block = not in_code_block
            lines.append("\\begin{verbatim}" if in_code_block else "\\end{verbatim}")
        elif in_code_block:
            # Code is copied verbatim; no markdown handling inside a fence.
            lines.append(line)
        elif line.startswith("# "):
            lines.append(f"\\section{{{line[2:]}}}")
        elif line.startswith("## "):
            lines.append(f"\\subsection{{{line[3:]}}}")
        elif line.startswith("### "):
            lines.append(f"\\subsubsection{{{line[4:]}}}")
        elif line.startswith("- "):
            lines.append(f"\\item {line[2:]}")
        elif line.startswith("$$") or (in_display_math and "$$" in line):
            # Replace each "$$" in turn: an opener becomes "\[", the matching
            # closer becomes "\]".
            pieces = line.split("$$")
            converted = [pieces[0]]
            for piece in pieces[1:]:
                converted.append("\\]" if in_display_math else "\\[")
                in_display_math = not in_display_math
                converted.append(piece)
            lines.append("".join(converted))
        else:
            # Plain text and inline math are kept as-is.
            lines.append(line)
    return "\n".join(lines)
def _latex_to_markdown(self, latex: str) -> str:
    """Wrap LaTeX source in a markdown display-math block.

    Args:
        latex: LaTeX content.

    Returns:
        The LaTeX placed between ``$$`` lines, or an empty string when the
        input contains only whitespace.
    """
    # Whitespace-only input produces no math block at all.
    is_blank = not latex.strip()
    if is_blank:
        return ""
    return "$$\n" + latex + "\n$$"
def _latex_to_mathml(self, latex: str) -> str:
    """Convert LaTeX to MathML.

    Uses ``latex2mathml`` when it can be imported. If the import or the
    conversion fails, falls back to a minimal ``<math>`` element carrying
    the LaTeX source as XML-escaped text.

    Args:
        latex: LaTeX content.

    Returns:
        MathML representation, or an empty string for whitespace-only input.
    """
    if not latex.strip():
        return ""
    try:
        from latex2mathml.converter import convert

        return convert(latex)
    except Exception:
        # Best-effort fallback; ImportError (library missing) is covered here
        # as well. The source must be escaped: LaTeX routinely contains
        # "<", ">" and "&", which would otherwise produce malformed XML.
        from xml.sax.saxutils import escape

        return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{escape(latex)}</mtext></math>'
def _extract_mathml(self, markdown: str) -> str:
    """Extract math from markdown and convert it to MathML.

    Display math (``$$...$$``) is collected first, then inline math
    (``$...$``). Each formula is converted with ``self._latex_to_mathml``
    and the non-empty results are joined with newlines.

    Args:
        markdown: Markdown content.

    Returns:
        MathML for any math content found, or an empty string if none.
    """
    import re

    block_pattern = r"\$\$(.*?)\$\$"
    math_blocks = re.findall(block_pattern, markdown, re.DOTALL)
    # Remove display blocks before looking for inline math; otherwise the
    # inline pattern also matches inside "$$...$$" and each display formula
    # would be reported twice.
    without_blocks = re.sub(block_pattern, " ", markdown, flags=re.DOTALL)
    inline_math = re.findall(r"\$([^$]+)\$", without_blocks)

    all_math = math_blocks + inline_math
    if not all_math:
        return ""

    # Convert each formula and drop the ones that yield no output.
    converted = (self._latex_to_mathml(latex.strip()) for latex in all_math)
    return "\n".join(mathml for mathml in converted if mathml)
if __name__ == "__main__":
    # Manual smoke test: run the recognition flow on a sample image.
    import cv2
    from app.services.image_processor import ImageProcessor
    from app.services.layout_detector import LayoutDetector

    image_processor = ImageProcessor(padding_ratio=0.15)
    layout_detector = LayoutDetector()
    # OCRService.__init__ declares vl_server_url and converter without
    # defaults, so they have to be supplied as well.
    # NOTE(review): assumes Converter() needs no constructor arguments — confirm.
    ocr_service = OCRService(
        vl_server_url=settings.paddleocr_vl_url,
        layout_detector=layout_detector,
        image_processor=image_processor,
        converter=Converter(),
    )
    image = cv2.imread("test/image.png")
    if image is None:
        # cv2.imread returns None instead of raising when the file cannot be read.
        raise SystemExit("Could not read test/image.png")
    ocr_result = ocr_service.recognize(image)
    print(ocr_result)