app/services/ocr_service.py

"""PaddleOCR-VL client service for text and formula recognition."""

import io
import tempfile
from pathlib import Path

import cv2
import numpy as np

from app.core.config import get_settings
from app.schemas.image import LayoutInfo

# Application settings, resolved once at import time. OCRService falls back to
# these values when no explicit server URL / model directory is passed in.
settings = get_settings()


class OCRService:
    """Service for OCR using PaddleOCR-VL.

    Images that contain plain text are processed through the PaddleOCRVL
    pipeline (layout-aware, mixed content); all other images are sent to the
    VL server's chat-completions endpoint together with ``FORMULA_PROMPT``.
    Every recognition result is returned as a dict carrying ``markdown``,
    ``latex`` and ``mathml`` renderings.
    """

    # Instruction sent alongside the image when requesting formula recognition.
    FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format."

    def __init__(
        self,
        vl_server_url: str | None = None,
        pp_doclayout_model_dir: str | None = None,
    ):
        """Initialize OCR service.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL.
            pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
        self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir
        self._pipeline = None

    def _get_pipeline(self):
        """Get or create PaddleOCR-VL pipeline.

        Returns:
            PaddleOCRVL pipeline instance.
        """
        if self._pipeline is None:
            from paddleocr import PaddleOCRVL

            self._pipeline = PaddleOCRVL(
                vl_rec_backend="vllm-server",
                vl_rec_server_url=self.vl_server_url,
                layout_detection_model_name="PP-DocLayoutV2",
                layout_detection_model_dir=self.pp_doclayout_model_dir,
            )
        return self._pipeline

    def _save_temp_image(self, image: np.ndarray) -> str:
        """Save image to a temporary PNG file.

        The caller owns the returned file and is responsible for deleting it.

        Args:
            image: Image as numpy array in BGR format.

        Returns:
            Path to temporary file.

        Raises:
            OSError: If OpenCV reports that the image could not be written.
        """
        # Only reserve a unique path here. The handle is closed before OpenCV
        # writes, because an open NamedTemporaryFile cannot be reopened by
        # name on every platform (notably Windows).
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
            temp_path = f.name

        try:
            written = cv2.imwrite(temp_path, image)
        except Exception:
            # delete=False means nothing else will clean this file up.
            Path(temp_path).unlink(missing_ok=True)
            raise

        # cv2.imwrite signals some failures through its return value only.
        if not written:
            Path(temp_path).unlink(missing_ok=True)
            raise OSError(f"Failed to write temporary image: {temp_path}")

        return temp_path

    def recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.

        The image is written to a temporary file, run through the PaddleOCR-VL
        pipeline, and the Markdown produced for each result is concatenated.
        LaTeX and MathML renderings are then derived from that Markdown.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml' keys.

        Raises:
            RuntimeError: If any step of the recognition fails; the original
                exception is attached as the cause.
        """
        try:
            pipeline = self._get_pipeline()
            temp_path = self._save_temp_image(image)

            try:
                markdown_parts = []
                for result in pipeline.predict(temp_path):
                    # save_to_markdown() is documented to take a filesystem
                    # path, not a file-like object, so each result is written
                    # into its own scratch directory and read back. A separate
                    # directory per result keeps same-named outputs from
                    # overwriting each other.
                    with tempfile.TemporaryDirectory() as out_dir:
                        result.save_to_markdown(save_path=out_dir)
                        markdown_parts.extend(
                            md_file.read_text(encoding="utf-8")
                            for md_file in sorted(Path(out_dir).rglob("*.md"))
                        )

                markdown_content = "".join(markdown_parts)

                # Derive the other output formats from the Markdown text.
                latex = self._markdown_to_latex(markdown_content)
                mathml = self._extract_mathml(markdown_content)

                return {
                    "markdown": markdown_content,
                    "latex": latex,
                    "mathml": mathml,
                }
            finally:
                # The temp image is ours to clean up (created with delete=False).
                Path(temp_path).unlink(missing_ok=True)

        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

    def recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula/math content using PaddleOCR-VL with prompt.

        This mode uses PaddleOCR-VL directly with a formula recognition prompt.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml' keys.
        """
        try:
            import httpx

            temp_path = self._save_temp_image(image)

            try:
                # Use vLLM API directly for formula recognition
                import base64

                with open(temp_path, "rb") as f:
                    image_base64 = base64.b64encode(f.read()).decode("utf-8")

                # Call vLLM server with formula prompt
                response = httpx.post(
                    f"{self.vl_server_url}/chat/completions",
                    json={
                        "model": "paddleocr-vl",
                        "messages": [
                            {
                                "role": "user",
                                "content": [
                                    {"type": "text", "text": self.FORMULA_PROMPT},
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                                    },
                                ],
                            }
                        ],
                        "max_tokens": 1024,
                    },
                    timeout=60.0,
                )
                response.raise_for_status()
                result = response.json()

                latex = result["choices"][0]["message"]["content"].strip()

                # Convert latex to other formats
                markdown = self._latex_to_markdown(latex)
                mathml = self._latex_to_mathml(latex)

                return {
                    "latex": latex,
                    "markdown": markdown,
                    "mathml": mathml,
                }
            finally:
                Path(temp_path).unlink(missing_ok=True)

        except httpx.HTTPStatusError as e:
            raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

    def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict:
        """Recognize content based on layout detection results.

        Args:
            image: Input image as numpy array in BGR format.
            layout_info: Layout detection results.

        Returns:
            Dict with recognition results including mode used.
        """
        # Decision logic:
        # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2)
        # - Otherwise -> use formula_recognition (VL with prompt)
        if layout_info.has_plain_text:
            result = self.recognize_mixed(image)
            result["recognition_mode"] = "mixed_recognition"
        else:
            result = self.recognize_formula(image)
            result["recognition_mode"] = "formula_recognition"

        return result

    def _markdown_to_latex(self, markdown: str) -> str:
        """Convert markdown to LaTeX.

        Simple conversion - wraps content in LaTeX document structure.

        Args:
            markdown: Markdown content.

        Returns:
            LaTeX representation.
        """
        # Basic conversion: preserve math blocks, convert structure
        lines = []
        in_code_block = False

        for line in markdown.split("\n"):
            if line.startswith("```"):
                in_code_block = not in_code_block
                if in_code_block:
                    lines.append("\\begin{verbatim}")
                else:
                    lines.append("\\end{verbatim}")
            elif in_code_block:
                lines.append(line)
            elif line.startswith("# "):
                lines.append(f"\\section{{{line[2:]}}}")
            elif line.startswith("## "):
                lines.append(f"\\subsection{{{line[3:]}}}")
            elif line.startswith("### "):
                lines.append(f"\\subsubsection{{{line[4:]}}}")
            elif line.startswith("- "):
                lines.append(f"\\item {line[2:]}")
            elif line.startswith("$$"):
                lines.append(line.replace("$$", "\\[").replace("$$", "\\]"))
            elif "$" in line:
                # Keep inline math as-is
                lines.append(line)
            else:
                lines.append(line)

        return "\n".join(lines)

    def _latex_to_markdown(self, latex: str) -> str:
        """Convert LaTeX to markdown.

        Args:
            latex: LaTeX content.

        Returns:
            Markdown representation.
        """
        # Wrap LaTeX in markdown math block
        if latex.strip():
            return f"$$\n{latex}\n$$"
        return ""

    def _latex_to_mathml(self, latex: str) -> str:
        """Convert LaTeX to MathML.

        Args:
            latex: LaTeX content.

        Returns:
            MathML representation.
        """
        # Basic LaTeX to MathML conversion
        # For production, consider using latex2mathml library
        if not latex.strip():
            return ""

        try:
            # Try to use latex2mathml if available
            from latex2mathml.converter import convert

            return convert(latex)
        except ImportError:
            # Fallback: wrap in basic MathML structure
            return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
        except Exception:
            return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'

    def _extract_mathml(self, markdown: str) -> str:
        """Extract and convert math from markdown to MathML.

        Args:
            markdown: Markdown content.

        Returns:
            MathML for any math content found.
        """
        import re

        # Find all math blocks
        math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL)
        inline_math = re.findall(r"\$([^$]+)\$", markdown)

        all_math = math_blocks + inline_math

        if not all_math:
            return ""

        # Convert each to MathML and combine
        mathml_parts = []
        for latex in all_math:
            mathml = self._latex_to_mathml(latex.strip())
            if mathml:
                mathml_parts.append(mathml)

        return "\n".join(mathml_parts)
init repo 2025-12-29 17:34:58 +08:00			`"""PaddleOCR-VL client service for text and formula recognition."""`

			`import io`
			`import tempfile`
			`from pathlib import Path`

			`import cv2`
			`import numpy as np`

			`from app.core.config import get_settings`
			`from app.schemas.image import LayoutInfo`

			`settings = get_settings()`


			`class OCRService:`
			`"""Service for OCR using PaddleOCR-VL."""`

			`FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format."`

			`def __init__(`
			`self,`
			`vl_server_url: str \| None = None,`
			`pp_doclayout_model_dir: str \| None = None,`
			`):`
			`"""Initialize OCR service.`

			`Args:`
			`vl_server_url: URL of the vLLM server for PaddleOCR-VL.`
			`pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory.`
			`"""`
			`self.vl_server_url = vl_server_url or settings.paddleocr_vl_url`
			`self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir`
			`self._pipeline = None`

			`def _get_pipeline(self):`
			`"""Get or create PaddleOCR-VL pipeline.`

			`Returns:`
			`PaddleOCRVL pipeline instance.`
			`"""`
			`if self._pipeline is None:`
			`from paddleocr import PaddleOCRVL`

			`self._pipeline = PaddleOCRVL(`
			`vl_rec_backend="vllm-server",`
			`vl_rec_server_url=self.vl_server_url,`
			`layout_detection_model_name="PP-DocLayoutV2",`
			`layout_detection_model_dir=self.pp_doclayout_model_dir,`
			`)`
			`return self._pipeline`

			`def _save_temp_image(self, image: np.ndarray) -> str:`
			`"""Save image to a temporary file.`

			`Args:`
			`image: Image as numpy array in BGR format.`

			`Returns:`
			`Path to temporary file.`
			`"""`
			`with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:`
			`cv2.imwrite(f.name, image)`
			`return f.name`

			`def recognize_mixed(self, image: np.ndarray) -> dict:`
			`"""Recognize mixed content (text + formulas) using PP-DocLayoutV2.`

			`This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware`
			`recognition of mixed content.`

			`Args:`
			`image: Input image as numpy array in BGR format.`

			`Returns:`
			`Dict with 'markdown', 'latex', 'mathml' keys.`
			`"""`
			`try:`
			`pipeline = self._get_pipeline()`
			`temp_path = self._save_temp_image(image)`

			`try:`
			`results = list(pipeline.predict(temp_path))`

			`markdown_content = ""`
			`for result in results:`
			`# PaddleOCR-VL results can be saved to markdown`
			`md_buffer = io.StringIO()`
			`result.save_to_markdown(save_path=md_buffer)`
			`markdown_content += md_buffer.getvalue()`

			`# Convert markdown to other formats`
			`latex = self._markdown_to_latex(markdown_content)`
			`mathml = self._extract_mathml(markdown_content)`

			`return {`
			`"markdown": markdown_content,`
			`"latex": latex,`
			`"mathml": mathml,`
			`}`
			`finally:`
			`Path(temp_path).unlink(missing_ok=True)`

			`except Exception as e:`
			`raise RuntimeError(f"Mixed recognition failed: {e}") from e`

			`def recognize_formula(self, image: np.ndarray) -> dict:`
			`"""Recognize formula/math content using PaddleOCR-VL with prompt.`

			`This mode uses PaddleOCR-VL directly with a formula recognition prompt.`

			`Args:`
			`image: Input image as numpy array in BGR format.`

			`Returns:`
			`Dict with 'latex', 'markdown', 'mathml' keys.`
			`"""`
			`try:`
			`import httpx`

			`temp_path = self._save_temp_image(image)`

			`try:`
			`# Use vLLM API directly for formula recognition`
			`import base64`

			`with open(temp_path, "rb") as f:`
			`image_base64 = base64.b64encode(f.read()).decode("utf-8")`

			`# Call vLLM server with formula prompt`
			`response = httpx.post(`
			`f"{self.vl_server_url}/chat/completions",`
			`json={`
			`"model": "paddleocr-vl",`
			`"messages": [`
			`{`
			`"role": "user",`
			`"content": [`
			`{"type": "text", "text": self.FORMULA_PROMPT},`
			`{`
			`"type": "image_url",`
			`"image_url": {"url": f"data:image/png;base64,{image_base64}"},`
			`},`
			`],`
			`}`
			`],`
			`"max_tokens": 1024,`
			`},`
			`timeout=60.0,`
			`)`
			`response.raise_for_status()`
			`result = response.json()`

			`latex = result["choices"][0]["message"]["content"].strip()`

			`# Convert latex to other formats`
			`markdown = self._latex_to_markdown(latex)`
			`mathml = self._latex_to_mathml(latex)`

			`return {`
			`"latex": latex,`
			`"markdown": markdown,`
			`"mathml": mathml,`
			`}`
			`finally:`
			`Path(temp_path).unlink(missing_ok=True)`

			`except httpx.HTTPStatusError as e:`
			`raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e`
			`except Exception as e:`
			`raise RuntimeError(f"Formula recognition failed: {e}") from e`

			`def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict:`
			`"""Recognize content based on layout detection results.`

			`Args:`
			`image: Input image as numpy array in BGR format.`
			`layout_info: Layout detection results.`

			`Returns:`
			`Dict with recognition results including mode used.`
			`"""`
			`# Decision logic:`
			`# - If plain text exists -> use mixed_recognition (PP-DocLayoutV2)`
			`# - Otherwise -> use formula_recognition (VL with prompt)`
			`if layout_info.has_plain_text:`
			`result = self.recognize_mixed(image)`
			`result["recognition_mode"] = "mixed_recognition"`
			`else:`
			`result = self.recognize_formula(image)`
			`result["recognition_mode"] = "formula_recognition"`

			`return result`

			`def _markdown_to_latex(self, markdown: str) -> str:`
			`"""Convert markdown to LaTeX.`

			`Simple conversion - wraps content in LaTeX document structure.`

			`Args:`
			`markdown: Markdown content.`

			`Returns:`
			`LaTeX representation.`
			`"""`
			`# Basic conversion: preserve math blocks, convert structure`
			`lines = []`
			`in_code_block = False`

			`for line in markdown.split("\n"):`
			if line.startswith("```"):
			`in_code_block = not in_code_block`
			`if in_code_block:`
			`lines.append("\\begin{verbatim}")`
			`else:`
			`lines.append("\\end{verbatim}")`
			`elif in_code_block:`
			`lines.append(line)`
			`elif line.startswith("# "):`
			`lines.append(f"\\section{{{line[2:]}}}")`
			`elif line.startswith("## "):`
			`lines.append(f"\\subsection{{{line[3:]}}}")`
			`elif line.startswith("### "):`
			`lines.append(f"\\subsubsection{{{line[4:]}}}")`
			`elif line.startswith("- "):`
			`lines.append(f"\\item {line[2:]}")`
			`elif line.startswith("$$"):`
			`lines.append(line.replace("$$", "\\[").replace("$$", "\\]"))`
			`elif "$" in line:`
			`# Keep inline math as-is`
			`lines.append(line)`
			`else:`
			`lines.append(line)`

			`return "\n".join(lines)`

			`def _latex_to_markdown(self, latex: str) -> str:`
			`"""Convert LaTeX to markdown.`

			`Args:`
			`latex: LaTeX content.`

			`Returns:`
			`Markdown representation.`
			`"""`
			`# Wrap LaTeX in markdown math block`
			`if latex.strip():`
			`return f"$$\n{latex}\n$$"`
			`return ""`

			`def _latex_to_mathml(self, latex: str) -> str:`
			`"""Convert LaTeX to MathML.`

			`Args:`
			`latex: LaTeX content.`

			`Returns:`
			`MathML representation.`
			`"""`
			`# Basic LaTeX to MathML conversion`
			`# For production, consider using latex2mathml library`
			`if not latex.strip():`
			`return ""`

			`try:`
			`# Try to use latex2mathml if available`
			`from latex2mathml.converter import convert`

			`return convert(latex)`
			`except ImportError:`
			`# Fallback: wrap in basic MathML structure`
			`return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'`
			`except Exception:`
			`return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'`

			`def _extract_mathml(self, markdown: str) -> str:`
			`"""Extract and convert math from markdown to MathML.`

			`Args:`
			`markdown: Markdown content.`

			`Returns:`
			`MathML for any math content found.`
			`"""`
			`import re`

			`# Find all math blocks`
			`math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL)`
			`inline_math = re.findall(r"\$([^$]+)\$", markdown)`

			`all_math = math_blocks + inline_math`

			`if not all_math:`
			`return ""`

			`# Convert each to MathML and combine`
			`mathml_parts = []`
			`for latex in all_math:`
			`mathml = self._latex_to_mathml(latex.strip())`
			`if mathml:`
			`mathml_parts.append(mathml)`

			`return "\n".join(mathml_parts)`