feat: optimize format conversion

2026-02-04 12:00:06 +08:00
parent 10dbd59161
commit 526c1f3a0d
7 changed files with 571 additions and 187 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -17,13 +17,31 @@ settings = get_settings()

 _COMMANDS_NEED_SPACE = {
    # operators / calculus
-    "cdot", "times", "div", "pm", "mp",
-    "int", "iint", "iiint", "oint", "sum", "prod", "lim",
+    "cdot",
+    "times",
+    "div",
+    "pm",
+    "mp",
+    "int",
+    "iint",
+    "iiint",
+    "oint",
+    "sum",
+    "prod",
+    "lim",
    # common functions
-    "sin", "cos", "tan", "cot", "sec", "csc",
-    "log", "ln", "exp",
+    "sin",
+    "cos",
+    "tan",
+    "cot",
+    "sec",
+    "csc",
+    "log",
+    "ln",
+    "exp",
    # misc
-    "partial", "nabla",
+    "partial",
+    "nabla",
 }

 _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
@@ -58,7 +76,7 @@ def _split_glued_command_token(token: str) -> str:
    if not best:
        return token

-    suffix = body[len(best):]
+    suffix = body[len(best) :]
    if not suffix:
        return token

@@ -118,11 +136,11 @@ class OCRService(OCRServiceBase):
            image_processor: Image processor instance.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
-        self.layout_detector = layout_detector 
+        self.layout_detector = layout_detector
        self.image_processor = image_processor
        self.converter = converter

-    def _get_pipeline(self):    
+    def _get_pipeline(self):
        """Get or create PaddleOCR-VL pipeline.

        Returns:
@@ -159,12 +177,13 @@ class OCRService(OCRServiceBase):
                markdown_content += res.markdown.get("markdown_texts", "")

            markdown_content = _postprocess_markdown(markdown_content)
-            convert_result  = self.converter.convert_to_formats(markdown_content)
+            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
                "markdown": markdown_content,
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
+                "mml": convert_result.mml,
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e
@@ -196,6 +215,7 @@ class OCRService(OCRServiceBase):
            return {
                "latex": convert_result.latex,
                "mathml": convert_result.mathml,
+                "mml": convert_result.mml,
                "markdown": markdown_content,
            }
        except Exception as e:
@@ -220,7 +240,7 @@ class OCRService(OCRServiceBase):

 class MineruOCRService(OCRServiceBase):
    """Service for OCR using local file_parse API."""
-    
+
    def __init__(
        self,
        api_url: str = "http://127.0.0.1:8000/file_parse",
@@ -228,7 +248,7 @@ class MineruOCRService(OCRServiceBase):
        converter: Optional[Converter] = None,
    ):
        """Initialize Local API service.
-        
+
        Args:
            api_url: URL of the local file_parse API endpoint.
            converter: Optional converter instance for format conversion.
@@ -236,13 +256,13 @@ class MineruOCRService(OCRServiceBase):
        self.api_url = api_url
        self.image_processor = image_processor
        self.converter = converter
-    
+
    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using local file_parse API.
-        
+
        Args:
            image: Input image as numpy array in BGR format.
-            
+
        Returns:
            Dict with 'markdown', 'latex', 'mathml' keys.
        """
@@ -251,78 +271,71 @@ class MineruOCRService(OCRServiceBase):
                image = self.image_processor.add_padding(image)

            # Convert numpy array to image bytes
-            success, encoded_image = cv2.imencode('.png', image)
+            success, encoded_image = cv2.imencode(".png", image)
            if not success:
                raise RuntimeError("Failed to encode image")
-            
+
            image_bytes = BytesIO(encoded_image.tobytes())
-            
+
            # Prepare multipart form data
-            files = {
-                'files': ('image.png', image_bytes, 'image/png')
-            }
-            
+            files = {"files": ("image.png", image_bytes, "image/png")}
+
            data = {
-                'return_middle_json': 'false',
-                'return_model_output': 'false',
-                'return_md': 'true',
-                'return_images': 'false',
-                'end_page_id': '99999',
-                'start_page_id': '0',
-                'lang_list': 'en',
-                'server_url': 'string',
-                'return_content_list': 'false',
-                'backend': 'hybrid-auto-engine',
-                'table_enable': 'true',
-                'response_format_zip': 'false',
-                'formula_enable': 'true',
-                'parse_method': 'ocr'
+                "return_middle_json": "false",
+                "return_model_output": "false",
+                "return_md": "true",
+                "return_images": "false",
+                "end_page_id": "99999",
+                "start_page_id": "0",
+                "lang_list": "en",
+                "server_url": "string",
+                "return_content_list": "false",
+                "backend": "hybrid-auto-engine",
+                "table_enable": "true",
+                "response_format_zip": "false",
+                "formula_enable": "true",
+                "parse_method": "ocr",
            }
-            
+
            # Make API request
-            response = requests.post(
-                self.api_url,
-                files=files,
-                data=data,
-                headers={'accept': 'application/json'},
-                timeout=30
-            )
+            response = requests.post(self.api_url, files=files, data=data, headers={"accept": "application/json"}, timeout=30)
            response.raise_for_status()
-            
+
            result = response.json()
-            
+
            # Extract markdown content from response
            markdown_content = ""
-            if 'results' in result and 'image' in result['results']:
-                markdown_content = result['results']['image'].get('md_content', '')
+            if "results" in result and "image" in result["results"]:
+                markdown_content = result["results"]["image"].get("md_content", "")

            # markdown_content = _postprocess_markdown(markdown_content)
-            
+
            # Convert to other formats if converter is available
            latex = ""
            mathml = ""
+            mml = ""
            if self.converter and markdown_content:
                convert_result = self.converter.convert_to_formats(markdown_content)
                latex = convert_result.latex
                mathml = convert_result.mathml
-            
+                mml = convert_result.mml
+
            return {
                "markdown": markdown_content,
                "latex": latex,
                "mathml": mathml,
+                "mml": mml,
            }
-            
+
        except requests.RequestException as e:
            raise RuntimeError(f"Local API request failed: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Recognition failed: {e}") from e


-
-
 if __name__ == "__main__":
    mineru_service = MineruOCRService()
    image = cv2.imread("test/complex_formula.png")
    image_numpy = np.array(image)
    ocr_result = mineru_service.recognize(image_numpy)
-    print(ocr_result)
+    print(ocr_result)