"""PaddleOCR-VL client service for text and formula recognition.""" import io import tempfile from pathlib import Path import cv2 import numpy as np from app.core.config import get_settings from app.schemas.image import LayoutInfo settings = get_settings() class OCRService: """Service for OCR using PaddleOCR-VL.""" FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format." def __init__( self, vl_server_url: str | None = None, pp_doclayout_model_dir: str | None = None, ): """Initialize OCR service. Args: vl_server_url: URL of the vLLM server for PaddleOCR-VL. pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory. """ self.vl_server_url = vl_server_url or settings.paddleocr_vl_url self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir self._pipeline = None def _get_pipeline(self): """Get or create PaddleOCR-VL pipeline. Returns: PaddleOCRVL pipeline instance. """ if self._pipeline is None: from paddleocr import PaddleOCRVL self._pipeline = PaddleOCRVL( vl_rec_backend="vllm-server", vl_rec_server_url=self.vl_server_url, layout_detection_model_name="PP-DocLayoutV2", layout_detection_model_dir=self.pp_doclayout_model_dir, ) return self._pipeline def _save_temp_image(self, image: np.ndarray) -> str: """Save image to a temporary file. Args: image: Image as numpy array in BGR format. Returns: Path to temporary file. """ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: cv2.imwrite(f.name, image) return f.name def recognize_mixed(self, image: np.ndarray) -> dict: """Recognize mixed content (text + formulas) using PP-DocLayoutV2. This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware recognition of mixed content. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'markdown', 'latex', 'mathml' keys. """ try: pipeline = self._get_pipeline() temp_path = self._save_temp_image(image) try: results = list(pipeline.predict(temp_path)) markdown_content = "" for result in results: # PaddleOCR-VL results can be saved to markdown md_buffer = io.StringIO() result.save_to_markdown(save_path=md_buffer) markdown_content += md_buffer.getvalue() # Convert markdown to other formats latex = self._markdown_to_latex(markdown_content) mathml = self._extract_mathml(markdown_content) return { "markdown": markdown_content, "latex": latex, "mathml": mathml, } finally: Path(temp_path).unlink(missing_ok=True) except Exception as e: raise RuntimeError(f"Mixed recognition failed: {e}") from e def recognize_formula(self, image: np.ndarray) -> dict: """Recognize formula/math content using PaddleOCR-VL with prompt. This mode uses PaddleOCR-VL directly with a formula recognition prompt. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'latex', 'markdown', 'mathml' keys. """ try: import httpx temp_path = self._save_temp_image(image) try: # Use vLLM API directly for formula recognition import base64 with open(temp_path, "rb") as f: image_base64 = base64.b64encode(f.read()).decode("utf-8") # Call vLLM server with formula prompt response = httpx.post( f"{self.vl_server_url}/chat/completions", json={ "model": "paddleocr-vl", "messages": [ { "role": "user", "content": [ {"type": "text", "text": self.FORMULA_PROMPT}, { "type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}, }, ], } ], "max_tokens": 1024, }, timeout=60.0, ) response.raise_for_status() result = response.json() latex = result["choices"][0]["message"]["content"].strip() # Convert latex to other formats markdown = self._latex_to_markdown(latex) mathml = self._latex_to_mathml(latex) return { "latex": latex, "markdown": markdown, "mathml": mathml, } finally: Path(temp_path).unlink(missing_ok=True) except httpx.HTTPStatusError as e: raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e except Exception as e: raise RuntimeError(f"Formula recognition failed: {e}") from e def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict: """Recognize content based on layout detection results. Args: image: Input image as numpy array in BGR format. layout_info: Layout detection results. Returns: Dict with recognition results including mode used. """ # Decision logic: # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2) # - Otherwise -> use formula_recognition (VL with prompt) if layout_info.has_plain_text: result = self.recognize_mixed(image) result["recognition_mode"] = "mixed_recognition" else: result = self.recognize_formula(image) result["recognition_mode"] = "formula_recognition" return result def _markdown_to_latex(self, markdown: str) -> str: """Convert markdown to LaTeX. Simple conversion - wraps content in LaTeX document structure. Args: markdown: Markdown content. Returns: LaTeX representation. """ # Basic conversion: preserve math blocks, convert structure lines = [] in_code_block = False for line in markdown.split("\n"): if line.startswith("```"): in_code_block = not in_code_block if in_code_block: lines.append("\\begin{verbatim}") else: lines.append("\\end{verbatim}") elif in_code_block: lines.append(line) elif line.startswith("# "): lines.append(f"\\section{{{line[2:]}}}") elif line.startswith("## "): lines.append(f"\\subsection{{{line[3:]}}}") elif line.startswith("### "): lines.append(f"\\subsubsection{{{line[4:]}}}") elif line.startswith("- "): lines.append(f"\\item {line[2:]}") elif line.startswith("$$"): lines.append(line.replace("$$", "\\[").replace("$$", "\\]")) elif "$" in line: # Keep inline math as-is lines.append(line) else: lines.append(line) return "\n".join(lines) def _latex_to_markdown(self, latex: str) -> str: """Convert LaTeX to markdown. Args: latex: LaTeX content. Returns: Markdown representation. """ # Wrap LaTeX in markdown math block if latex.strip(): return f"$$\n{latex}\n$$" return "" def _latex_to_mathml(self, latex: str) -> str: """Convert LaTeX to MathML. Args: latex: LaTeX content. Returns: MathML representation. """ # Basic LaTeX to MathML conversion # For production, consider using latex2mathml library if not latex.strip(): return "" try: # Try to use latex2mathml if available from latex2mathml.converter import convert return convert(latex) except ImportError: # Fallback: wrap in basic MathML structure return f'{latex}' except Exception: return f'{latex}' def _extract_mathml(self, markdown: str) -> str: """Extract and convert math from markdown to MathML. Args: markdown: Markdown content. Returns: MathML for any math content found. """ import re # Find all math blocks math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL) inline_math = re.findall(r"\$([^$]+)\$", markdown) all_math = math_blocks + inline_math if not all_math: return "" # Convert each to MathML and combine mathml_parts = [] for latex in all_math: mathml = self._latex_to_mathml(latex.strip()) if mathml: mathml_parts.append(mathml) return "\n".join(mathml_parts)