fix: refactor logic

2025-12-31 17:38:32 +08:00
parent 6ac50f7d2f
commit 35928c2484
17 changed files with 678 additions and 738 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -1,14 +1,12 @@
 """PaddleOCR-VL client service for text and formula recognition."""

-import io
-import tempfile
-from pathlib import Path
-
-import cv2
 import numpy as np
-
 from app.core.config import get_settings
-from app.schemas.image import LayoutInfo
+from paddleocr import PaddleOCRVL
+from typing import Optional
+from app.services.layout_detector import LayoutDetector
+from app.services.image_processor import ImageProcessor
+from app.services.converter import Converter

 settings = get_settings()

@@ -16,52 +14,40 @@ settings = get_settings()
 class OCRService:
    """Service for OCR using PaddleOCR-VL."""

-    FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format."
+    _pipeline: Optional[PaddleOCRVL] = None
+    _layout_detector: Optional[LayoutDetector] = None

    def __init__(
        self,
-        vl_server_url: str | None = None,
-        pp_doclayout_model_dir: str | None = None,
+        vl_server_url: str,
+        layout_detector: LayoutDetector,
+        image_processor: ImageProcessor,
+        converter: Converter,
    ):
        """Initialize OCR service.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL.
-            pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory.
+            layout_detector: Layout detector instance.
+            image_processor: Image processor instance.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
-        self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir
-        self._pipeline = None
-
-    def _get_pipeline(self):
+        self.layout_detector = layout_detector 
+        self.image_processor = image_processor
+        self.converter = converter
+    def _get_pipeline(self):    
        """Get or create PaddleOCR-VL pipeline.

        Returns:
            PaddleOCRVL pipeline instance.
        """
-        if self._pipeline is None:
-            from paddleocr import PaddleOCRVL
-
-            self._pipeline = PaddleOCRVL(
+        if OCRService._pipeline is None:
+            OCRService._pipeline = PaddleOCRVL(
                vl_rec_backend="vllm-server",
                vl_rec_server_url=self.vl_server_url,
                layout_detection_model_name="PP-DocLayoutV2",
-                layout_detection_model_dir=self.pp_doclayout_model_dir,
            )
-        return self._pipeline
-
-    def _save_temp_image(self, image: np.ndarray) -> str:
-        """Save image to a temporary file.
-
-        Args:
-            image: Image as numpy array in BGR format.
-
-        Returns:
-            Path to temporary file.
-        """
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
-            cv2.imwrite(f.name, image)
-            return f.name
+        return OCRService._pipeline

    def recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.
@@ -77,30 +63,21 @@ class OCRService:
        """
        try:
            pipeline = self._get_pipeline()
-            temp_path = self._save_temp_image(image)

-            try:
-                results = list(pipeline.predict(temp_path))
+            output = pipeline.predict(image, use_layout_detection=True)

-                markdown_content = ""
-                for result in results:
-                    # PaddleOCR-VL results can be saved to markdown
-                    md_buffer = io.StringIO()
-                    result.save_to_markdown(save_path=md_buffer)
-                    markdown_content += md_buffer.getvalue()
+            markdown_content = ""

-                # Convert markdown to other formats
-                latex = self._markdown_to_latex(markdown_content)
-                mathml = self._extract_mathml(markdown_content)
+            for res in output:
+                markdown_content += res.markdown.get("markdown_texts", "")

-                return {
-                    "markdown": markdown_content,
-                    "latex": latex,
-                    "mathml": mathml,
-                }
-            finally:
-                Path(temp_path).unlink(missing_ok=True)
+            convert_result  = self.converter.convert_to_formats(markdown_content)

+            return {
+                "markdown": markdown_content,
+                "latex": convert_result.latex,
+                "mathml": convert_result.mathml,
+            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

@@ -116,188 +93,49 @@ class OCRService:
            Dict with 'latex', 'markdown', 'mathml' keys.
        """
        try:
-            import httpx
+            pipeline = self._get_pipeline()

-            temp_path = self._save_temp_image(image)
+            output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula")

-            try:
-                # Use vLLM API directly for formula recognition
-                import base64
+            markdown_content = ""

-                with open(temp_path, "rb") as f:
-                    image_base64 = base64.b64encode(f.read()).decode("utf-8")
+            for res in output:
+                markdown_content += res.markdown.get("markdown_texts", "")

-                # Call vLLM server with formula prompt
-                response = httpx.post(
-                    f"{self.vl_server_url}/chat/completions",
-                    json={
-                        "model": "paddleocr-vl",
-                        "messages": [
-                            {
-                                "role": "user",
-                                "content": [
-                                    {"type": "text", "text": self.FORMULA_PROMPT},
-                                    {
-                                        "type": "image_url",
-                                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                                    },
-                                ],
-                            }
-                        ],
-                        "max_tokens": 1024,
-                    },
-                    timeout=60.0,
-                )
-                response.raise_for_status()
-                result = response.json()
+            convert_result = self.converter.convert_to_formats(markdown_content)

-                latex = result["choices"][0]["message"]["content"].strip()
-
-                # Convert latex to other formats
-                markdown = self._latex_to_markdown(latex)
-                mathml = self._latex_to_mathml(latex)
-
-                return {
-                    "latex": latex,
-                    "markdown": markdown,
-                    "mathml": mathml,
-                }
-            finally:
-                Path(temp_path).unlink(missing_ok=True)
-
-        except httpx.HTTPStatusError as e:
-            raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e
+            return {
+                "latex": convert_result.latex,
+                "mathml": convert_result.mathml,
+                "markdown": markdown_content,
+            }
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

-    def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict:
-        """Recognize content based on layout detection results.
+    def recognize(self, image: np.ndarray) -> dict:
+        """Recognize content using PaddleOCR-VL.

        Args:
            image: Input image as numpy array in BGR format.
-            layout_info: Layout detection results.

        Returns:
-            Dict with recognition results including mode used.
+            Dict with 'latex', 'markdown', 'mathml' keys.
        """
-        # Decision logic:
-        # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2)
-        # - Otherwise -> use formula_recognition (VL with prompt)
-        if layout_info.has_plain_text:
-            result = self.recognize_mixed(image)
-            result["recognition_mode"] = "mixed_recognition"
+        padded_image = self.image_processor.add_padding(image)
+        layout_info = self.layout_detector.detect(padded_image)
+        if layout_info.MixedRecognition:
+            return self.recognize_mixed(image)
        else:
-            result = self.recognize_formula(image)
-            result["recognition_mode"] = "formula_recognition"
+            return self.recognize_formula(image)

-        return result

-    def _markdown_to_latex(self, markdown: str) -> str:
-        """Convert markdown to LaTeX.
-
-        Simple conversion - wraps content in LaTeX document structure.
-
-        Args:
-            markdown: Markdown content.
-
-        Returns:
-            LaTeX representation.
-        """
-        # Basic conversion: preserve math blocks, convert structure
-        lines = []
-        in_code_block = False
-
-        for line in markdown.split("\n"):
-            if line.startswith("```"):
-                in_code_block = not in_code_block
-                if in_code_block:
-                    lines.append("\\begin{verbatim}")
-                else:
-                    lines.append("\\end{verbatim}")
-            elif in_code_block:
-                lines.append(line)
-            elif line.startswith("# "):
-                lines.append(f"\\section{{{line[2:]}}}")
-            elif line.startswith("## "):
-                lines.append(f"\\subsection{{{line[3:]}}}")
-            elif line.startswith("### "):
-                lines.append(f"\\subsubsection{{{line[4:]}}}")
-            elif line.startswith("- "):
-                lines.append(f"\\item {line[2:]}")
-            elif line.startswith("$$"):
-                lines.append(line.replace("$$", "\\[").replace("$$", "\\]"))
-            elif "$" in line:
-                # Keep inline math as-is
-                lines.append(line)
-            else:
-                lines.append(line)
-
-        return "\n".join(lines)
-
-    def _latex_to_markdown(self, latex: str) -> str:
-        """Convert LaTeX to markdown.
-
-        Args:
-            latex: LaTeX content.
-
-        Returns:
-            Markdown representation.
-        """
-        # Wrap LaTeX in markdown math block
-        if latex.strip():
-            return f"$$\n{latex}\n$$"
-        return ""
-
-    def _latex_to_mathml(self, latex: str) -> str:
-        """Convert LaTeX to MathML.
-
-        Args:
-            latex: LaTeX content.
-
-        Returns:
-            MathML representation.
-        """
-        # Basic LaTeX to MathML conversion
-        # For production, consider using latex2mathml library
-        if not latex.strip():
-            return ""
-
-        try:
-            # Try to use latex2mathml if available
-            from latex2mathml.converter import convert
-
-            return convert(latex)
-        except ImportError:
-            # Fallback: wrap in basic MathML structure
-            return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
-        except Exception:
-            return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
-
-    def _extract_mathml(self, markdown: str) -> str:
-        """Extract and convert math from markdown to MathML.
-
-        Args:
-            markdown: Markdown content.
-
-        Returns:
-            MathML for any math content found.
-        """
-        import re
-
-        # Find all math blocks
-        math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL)
-        inline_math = re.findall(r"\$([^$]+)\$", markdown)
-
-        all_math = math_blocks + inline_math
-
-        if not all_math:
-            return ""
-
-        # Convert each to MathML and combine
-        mathml_parts = []
-        for latex in all_math:
-            mathml = self._latex_to_mathml(latex.strip())
-            if mathml:
-                mathml_parts.append(mathml)
-
-        return "\n".join(mathml_parts)
+if __name__ == "__main__":
+    import cv2
+    from app.services.image_processor import ImageProcessor
+    from app.services.layout_detector import LayoutDetector
+    image_processor = ImageProcessor(padding_ratio=0.15)
+    layout_detector = LayoutDetector()
+    ocr_service = OCRService(image_processor=image_processor, layout_detector=layout_detector)
+    image = cv2.imread("test/image.png")
+    ocr_result = ocr_service.recognize(image)
+    print(ocr_result)