fix: add image padding for mineru

2026-01-05 21:37:51 +08:00
parent 6ea37c9380
commit df2b664af4
2 changed files with 88 additions and 1 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -1,5 +1,6 @@
 """PaddleOCR-VL client service for text and formula recognition."""

+import re
 import numpy as np
 import cv2
 import requests
@@ -14,6 +15,82 @@ from abc import ABC, abstractmethod

 settings = get_settings()

+# Commands that OCR tends to glue onto the next symbol (\cdotdS -> \cdot dS).
+_COMMANDS_NEED_SPACE = {
+    "cdot", "times", "div", "pm", "mp",  # operators
+    "int", "iint", "iiint", "oint", "sum", "prod", "lim",  # calculus
+    "sin", "cos", "tan", "cot", "sec", "csc", "log", "ln", "exp",  # common functions
+    "partial", "nabla",  # misc
+}
+
+# "$$...$$" is tried before "$...$"; DOTALL lets a segment span several lines.
+_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
+_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")  # a backslash plus letters
+
+# stage2: differentials. `re` lookbehinds are fixed-width, so one is chained per offset:
+# a "d" up to 31 letters into a \command is skipped (\cdot used to become "\cd ot").
+_NOT_IN_COMMAND = "".join(rf"(?<!\\[a-zA-Z]{{{n}}})" for n in range(32))
+_DIFFERENTIAL_UPPER_PATTERN = re.compile(_NOT_IN_COMMAND + r"d([A-Z])")
+_DIFFERENTIAL_LOWER_PATTERN = re.compile(_NOT_IN_COMMAND + r"d([a-z])")
+
+
+# Genuine commands that merely start with a whitelisted name (\sinh, \limits);
+# returned as-is instead of being split into e.g. "\sin h" or "\lim its".
+_COMMANDS_KEEP_INTACT = frozenset({
+    "cdots", "cdotp", "divideontimes", "pmod", "pmb", "pmatrix",
+    "intop", "intercal", "intertext", "limits", "liminf", "limsup",
+    "sinh", "cosh", "tanh", "coth", "lnot", "lneq", "lneqq", "lnsim", "lnapprox",
+})
+
+
+def _split_glued_command_token(token: str) -> str:
+    """Split an OCR-glued LaTeX command token at its longest whitelisted prefix.
+
+    ``token`` is one ``\\name`` token. It is returned unchanged when no proper
+    prefix of the name is in ``_COMMANDS_NEED_SPACE``, or when the whole name
+    is a genuine command listed in ``_COMMANDS_KEEP_INTACT``.
+
+    Examples:
+    - \\cdotdS -> \\cdot dS
+    - \\intdx  -> \\int dx
+    - \\sinh   -> \\sinh (kept intact)
+    """
+    name = token[1:]
+    if not token.startswith("\\") or name in _COMMANDS_KEEP_INTACT:
+        return token
+    # Try the longest proper prefix first so the first whitelist hit wins.
+    for cut in range(len(name) - 1, 0, -1):
+        if name[:cut] in _COMMANDS_NEED_SPACE:
+            return f"\\{name[:cut]} {name[cut:]}"
+    return token
+
+
+def _postprocess_math(expr: str) -> str:
+    """Clean up one math expression (the text inside $...$ or $$...$$)."""
+    def _unglue(match: re.Match) -> str:  # stage1: \cdotdS -> \cdot dS
+        return _split_glued_command_token(match.group(0))
+    unglued = _COMMAND_TOKEN_PATTERN.sub(_unglue, expr)
+    # stage2: differentials -- dS -> \mathrm{d} S first, then dx -> d x
+    upright = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", unglued)
+    return _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", upright)
+
+
+def _postprocess_markdown(markdown_content: str) -> str:
+    """Apply LaTeX postprocessing only within $...$ / $$...$$ segments."""
+    # Empty (or None) input has nothing to scan; hand it back untouched.
+    if not markdown_content:
+        return markdown_content
+
+    def _fix_segment(m: re.Match) -> str:
+        seg = m.group(0)
+        # A dangling "$$" matches the inline branch as an empty "$...$"; without the
+        # length guard it was treated as display math and rebuilt as "$$$$".
+        width = 2 if len(seg) >= 4 and seg.startswith("$$") and seg.endswith("$$") else 1
+        return seg[:width] + _postprocess_math(seg[width:-width]) + seg[-width:]
+
+    return _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
+
+
 class OCRServiceBase(ABC):
    @abstractmethod
    def recognize(self, image: np.ndarray) -> dict:
@@ -81,6 +158,7 @@ class OCRService(OCRServiceBase):
            for res in output:
                markdown_content += res.markdown.get("markdown_texts", "")

+            markdown_content = _postprocess_markdown(markdown_content)
            convert_result  = self.converter.convert_to_formats(markdown_content)

            return {
@@ -112,6 +190,7 @@ class OCRService(OCRServiceBase):
            for res in output:
                markdown_content += res.markdown.get("markdown_texts", "")

+            markdown_content = _postprocess_markdown(markdown_content)
            convert_result = self.converter.convert_to_formats(markdown_content)

            return {
@@ -145,6 +224,7 @@ class MineruOCRService(OCRServiceBase):
    def __init__(
        self,
        api_url: str = "http://127.0.0.1:8000/file_parse",
+        image_processor: Optional[ImageProcessor] = None,
        converter: Optional[Converter] = None,
    ):
        """Initialize Local API service.
@@ -154,6 +234,7 @@ class MineruOCRService(OCRServiceBase):
            converter: Optional converter instance for format conversion.
        """
        self.api_url = api_url
+        self.image_processor = image_processor
        self.converter = converter
    
    def recognize(self, image: np.ndarray) -> dict:
@@ -166,6 +247,9 @@ class MineruOCRService(OCRServiceBase):
            Dict with 'markdown', 'latex', 'mathml' keys.
        """
        try:
+            if self.image_processor:
+                image = self.image_processor.add_padding(image)
+
            # Convert numpy array to image bytes
            success, encoded_image = cv2.imencode('.png', image)
            if not success:
@@ -184,7 +268,6 @@ class MineruOCRService(OCRServiceBase):
                'return_md': 'true',
                'return_images': 'false',
                'end_page_id': '99999',
-                'parse_method': 'auto',
                'start_page_id': '0',
                'lang_list': 'en',
                'server_url': 'string',
@@ -193,6 +276,7 @@ class MineruOCRService(OCRServiceBase):
                'table_enable': 'true',
                'response_format_zip': 'false',
                'formula_enable': 'true',
+                'parse_method': 'ocr'
            }
            
            # Make API request
@@ -211,6 +295,8 @@ class MineruOCRService(OCRServiceBase):
            markdown_content = ""
            if 'results' in result and 'image' in result['results']:
                markdown_content = result['results']['image'].get('md_content', '')
+
+            # markdown_content = _postprocess_markdown(markdown_content)
            
            # Convert to other formats if converter is available
            latex = ""