diff --git a/app/core/dependencies.py b/app/core/dependencies.py
index 20d5a99..7e45829 100644
--- a/app/core/dependencies.py
+++ b/app/core/dependencies.py
@@ -53,5 +53,6 @@ def get_mineru_ocr_service() -> MineruOCRService:
     return MineruOCRService(
         api_url=api_url,
         converter=get_converter(),
+        image_processor=get_image_processor(),
     )
 
diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py
index ebfbf42..aa8342a 100644
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -1,5 +1,6 @@
 """PaddleOCR-VL client service for text and formula recognition."""
 
+import re
 import numpy as np
 import cv2
 import requests
@@ -14,6 +15,91 @@ from abc import ABC, abstractmethod
 
 settings = get_settings()
 
+_COMMANDS_NEED_SPACE = {
+    # operators / calculus
+    "cdot", "times", "div", "pm", "mp",
+    "int", "iint", "iiint", "oint", "sum", "prod", "lim",
+    # common functions
+    "sin", "cos", "tan", "cot", "sec", "csc",
+    "log", "ln", "exp",
+    # misc
+    "partial", "nabla",
+}
+
+# Real LaTeX commands that merely start with a whitelisted name; splitting
+# them would corrupt valid input (e.g. \cdots must not become "\cdot s").
+_COMMANDS_KEEP_WHOLE = {
+    "cdots", "cdotp", "divide", "divideontimes", "pmod", "pmb", "pmatrix",
+    "intop", "intercal", "intertext", "limits", "liminf", "limsup",
+    "sinh", "cosh", "tanh", "coth", "sech", "csch",
+    "lnot", "lneq", "lneqq", "lnsim", "lnapprox",
+}
+
+_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
+_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
+
+# stage2: differentials inside math segments
+# NOTE(review): the bodies of these two patterns were unreadable in the copy of
+# this patch under review; they are conservative reconstructions (a standalone
+# "d" followed by exactly one letter). Confirm against the original commit.
+_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<![a-zA-Z\\])d([A-Z])(?![a-zA-Z])")
+_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<![a-zA-Z\\])d([a-z])(?![a-zA-Z])")
+
+
+def _split_glued_command_token(token: str) -> str:
+    """Split OCR-glued LaTeX command token by whitelist longest-prefix.
+
+    Commands listed in ``_COMMANDS_KEEP_WHOLE`` are returned untouched.
+
+    Examples:
+    - \\cdotdS -> \\cdot dS
+    - \\intdx -> \\int dx
+    - \\cdots -> \\cdots
+    """
+    if not token.startswith("\\"):
+        return token
+
+    body = token[1:]
+    if body in _COMMANDS_NEED_SPACE or body in _COMMANDS_KEEP_WHOLE:
+        return token
+
+    # Longest proper prefix first, so the remaining suffix is never empty.
+    for end in range(len(body) - 1, 0, -1):
+        prefix = body[:end]
+        if prefix in _COMMANDS_NEED_SPACE:
+            return f"\\{prefix} {body[end:]}"
+
+    return token
+
+
+def _postprocess_math(expr: str) -> str:
+    """Postprocess a *math* expression (already inside $...$ or $$...$$)."""
+    # stage1: split glued command tokens (e.g. \cdotdS)
+    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
+    # stage2: normalize differentials (keep conservative)
+    expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr)
+    expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr)
+    return expr
+
+
+def _postprocess_markdown(markdown_content: str) -> str:
+    """Apply LaTeX postprocessing only within $...$ / $$...$$ segments."""
+    if not markdown_content:
+        return markdown_content
+
+    def _fix_segment(m: re.Match) -> str:
+        seg = m.group(0)
+        # A bare "$$" also matches the inline alternative; require room for two
+        # delimiters so it is not expanded to "$$$$".
+        if len(seg) >= 4 and seg.startswith("$$") and seg.endswith("$$"):
+            return f"$${_postprocess_math(seg[2:-2])}$$"
+        if seg.startswith("$") and seg.endswith("$"):
+            return f"${_postprocess_math(seg[1:-1])}$"
+        return seg
+
+    return _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
+
+
 class OCRServiceBase(ABC):
     @abstractmethod
     def recognize(self, image: np.ndarray) -> dict:
@@ -81,6 +167,7 @@ class OCRService(OCRServiceBase):
             for res in output:
                 markdown_content += res.markdown.get("markdown_texts", "")
 
+            markdown_content = _postprocess_markdown(markdown_content)
             convert_result = self.converter.convert_to_formats(markdown_content)
 
             return {
@@ -112,6 +199,7 @@ class OCRService(OCRServiceBase):
             for res in output:
                 markdown_content += res.markdown.get("markdown_texts", "")
 
+            markdown_content = _postprocess_markdown(markdown_content)
             convert_result = self.converter.convert_to_formats(markdown_content)
 
             return {
@@ -145,6 +233,7 @@ class MineruOCRService(OCRServiceBase):
     def __init__(
         self,
         api_url: str = "http://127.0.0.1:8000/file_parse",
         converter: Optional[Converter] = None,
+        image_processor: Optional[ImageProcessor] = None,
     ):
         """Initialize Local API service.
@@ -154,6 +243,8 @@ class MineruOCRService(OCRServiceBase):
             converter: Optional converter instance for format conversion.
+            image_processor: Optional image processor; when set, its
+                add_padding() is applied to the image before it is encoded.
         """
         self.api_url = api_url
+        self.image_processor = image_processor
         self.converter = converter
 
     def recognize(self, image: np.ndarray) -> dict:
@@ -166,6 +257,9 @@ class MineruOCRService(OCRServiceBase):
             Dict with 'markdown', 'latex', 'mathml' keys.
         """
         try:
+            if self.image_processor:
+                image = self.image_processor.add_padding(image)
+
             # Convert numpy array to image bytes
             success, encoded_image = cv2.imencode('.png', image)
             if not success:
@@ -184,7 +278,6 @@ class MineruOCRService(OCRServiceBase):
                 'return_md': 'true',
                 'return_images': 'false',
                 'end_page_id': '99999',
-                'parse_method': 'auto',
                 'start_page_id': '0',
                 'lang_list': 'en',
                 'server_url': 'string',
@@ -193,6 +286,7 @@ class MineruOCRService(OCRServiceBase):
                 'table_enable': 'true',
                 'response_format_zip': 'false',
                 'formula_enable': 'true',
+                'parse_method': 'ocr'
             }
 
             # Make API request
@@ -211,6 +305,8 @@ class MineruOCRService(OCRServiceBase):
             markdown_content = ""
             if 'results' in result and 'image' in result['results']:
                 markdown_content = result['results']['image'].get('md_content', '')
+
+            # markdown_content = _postprocess_markdown(markdown_content)
 
             # Convert to other formats if converter is available
             latex = ""