"""PaddleOCR-VL client service for text and formula recognition.""" import base64 import logging import re from abc import ABC, abstractmethod from concurrent.futures import ThreadPoolExecutor, as_completed from io import BytesIO import cv2 import numpy as np import requests from openai import OpenAI from paddleocr import PaddleOCRVL from PIL import Image as PILImage from app.core.config import get_settings from app.services.converter import Converter from app.services.glm_postprocess import GLMResultFormatter from app.services.image_processor import ImageProcessor from app.services.layout_detector import LayoutDetector settings = get_settings() logger = logging.getLogger(__name__) _COMMANDS_NEED_SPACE = { # operators / calculus "cdot", "times", "div", "pm", "mp", "int", "iint", "iiint", "oint", "sum", "prod", "lim", # common functions "sin", "cos", "tan", "cot", "sec", "csc", "log", "ln", "exp", # set relations (often glued by OCR) "in", "notin", "subset", "supset", "subseteq", "supseteq", "cap", "cup", # misc "partial", "nabla", } _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL) # Match LaTeX commands: \command (greedy match all letters) # The splitting logic in _split_glued_command_token will handle \inX -> \in X _COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+") # stage2: differentials inside math segments # IMPORTANT: Very conservative pattern to avoid breaking LaTeX commands and variables # Only match differentials in specific contexts (after integrals, in fractions) # (? str: """Split OCR-glued LaTeX command token by whitelist longest-prefix. Examples: - \\cdotdS -> \\cdot dS - \\intdx -> \\int dx - \\inX -> \\in X (stop at uppercase letter) """ if not token.startswith("\\"): return token body = token[1:] if len(body) < 2: return token best = None # Find longest prefix that is in whitelist for i in range(1, len(body) + 1): prefix = body[:i] if prefix in _COMMANDS_NEED_SPACE: best = prefix if not best: return token suffix = body[len(best) :] if not suffix: return token return f"\\{best} {suffix}" def _clean_latex_syntax_spaces(expr: str) -> str: """Clean unwanted spaces in LaTeX syntax (common OCR errors). OCR often adds spaces in LaTeX syntax structures where they shouldn't be: - Subscripts: a _ {i 1} -> a_{i1} - Superscripts: x ^ {2 3} -> x^{23} - Fractions: \\frac { a } { b } -> \\frac{a}{b} - Commands: \\ alpha -> \\alpha - Braces: { a b } -> {ab} (within subscripts/superscripts) This is safe because these spaces are always OCR errors - LaTeX doesn't need or want spaces in these positions. Args: expr: LaTeX math expression. Returns: Expression with LaTeX syntax spaces cleaned. """ # Pattern 1: Spaces around _ and ^ (subscript/superscript operators) # a _ {i} -> a_{i}, x ^ {2} -> x^{2} expr = re.sub(r"\s*_\s*", "_", expr) expr = re.sub(r"\s*\^\s*", "^", expr) # Pattern 2: Spaces inside braces that follow _ or ^ # _{i 1} -> _{i1}, ^{2 3} -> ^{23} # This is safe because spaces inside subscript/superscript braces are usually OCR errors # BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them # must be preserved as they serve as command terminators (\in X != \inX) def clean_subscript_superscript_braces(match): operator = match.group(1) # _ or ^ content = match.group(2) # content inside braces if "\\" not in content: # No LaTeX commands: safe to remove all spaces cleaned = re.sub(r"\s+", "", content) else: # Contains LaTeX commands: remove spaces carefully # Keep spaces that follow a LaTeX command (e.g., \in X must keep the space) # Remove spaces everywhere else (e.g., x \in -> x\in is fine) # Strategy: remove spaces before \ and between non-command chars, # but preserve the space after \command when followed by a non-\ char cleaned = re.sub(r"\s+(?=\\)", "", content) # remove space before \cmd cleaned = re.sub(r"(? \frac{a}{b} # \frac{ a + b }{ c } -> \frac{a+b}{c} def clean_frac_braces(match): numerator = match.group(1).strip() denominator = match.group(2).strip() return f"\\frac{{{numerator}}}{{{denominator}}}" expr = re.sub(r"\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}", clean_frac_braces, expr) # Pattern 4: Spaces after backslash in LaTeX commands # \ alpha -> \alpha, \ beta -> \beta expr = re.sub(r"\\\s+([a-zA-Z]+)", r"\\\1", expr) # Pattern 5: Spaces before/after braces in general contexts (conservative) # Only remove if the space is clearly wrong (e.g., after operators) # { x } in standalone context is kept as-is to avoid breaking valid spacing # But after operators like \sqrt{ x } -> \sqrt{x} expr = re.sub(r"(\\[a-zA-Z]+)\s*\{\s*", r"\1{", expr) # \sqrt { -> \sqrt{ return expr def _postprocess_math(expr: str) -> str: """Postprocess a *math* expression (already inside $...$ or $$...$$). Processing stages: 0. Fix OCR number errors (spaces in numbers) 1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS, \\inX -> \\in X) 2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1}) 3. Normalize differentials (DISABLED by default to avoid breaking variables) Args: expr: LaTeX math expression without delimiters. Returns: Processed LaTeX expression. """ # stage0: fix OCR number errors (digits with spaces) expr = _fix_ocr_number_errors(expr) # stage1: split glued command tokens (e.g. \cdotdS, \inX) expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr) # stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces) expr = _clean_latex_syntax_spaces(expr) # stage3: normalize differentials - DISABLED # This feature is disabled because it's too aggressive and can break: # - LaTeX commands containing 'd': \vdots, \lambda (via subscripts), \delta, etc. # - Variable names: dx, dy, dz might be variable names, not differentials # - Subscripts: x_{dx}, y_{dy} # - Function names or custom notation # # The risk of false positives (breaking valid LaTeX) outweighs the benefit # of normalizing differentials for OCR output. # # If differential normalization is needed, implement a context-aware version: # expr = _normalize_differentials_contextaware(expr) return expr def _normalize_differentials_contextaware(expr: str) -> str: """Context-aware differential normalization (optional, not used by default). Only normalizes differentials in specific mathematical contexts: 1. After integral symbols: \\int dx, \\iint dA, \\oint dr 2. In fraction denominators: \\frac{dy}{dx} 3. In explicit differential notation: f(x)dx (function followed by differential) This avoids false positives like variable names, subscripts, or LaTeX commands. Args: expr: LaTeX math expression. Returns: Expression with differentials normalized in safe contexts only. """ # Pattern 1: After integral commands # \int dx -> \int d x integral_pattern = re.compile(r"(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])") expr = integral_pattern.sub(r"\1 \2 d \3", expr) # Pattern 2: In fraction denominators # \frac{...}{dx} -> \frac{...}{d x} frac_pattern = re.compile(r"(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})") expr = frac_pattern.sub(r"\1d \2\3", expr) return expr def _fix_ocr_number_errors(expr: str) -> str: """Fix common OCR errors in LaTeX math expressions. OCR often splits numbers incorrectly, especially decimals: - "2 2. 2" should be "22.2" - "3 0. 4" should be "30.4" - "1 5 0" should be "150" This function merges digit sequences that are separated by spaces. Args: expr: LaTeX math expression. Returns: LaTeX expression with number errors fixed. """ # Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)" # Example: "2 2. 2" → "22.2" expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr) # Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)" # Example: "22. 2" → "22.2" expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr) # Fix pattern 3: "digit space digit" (no decimal point, within same number context) # Be careful: only merge if followed by decimal point or comma/end # Example: "1 5 0" → "150" when followed by comma or end expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr) # Fix pattern 4: Multiple spaces in decimal numbers # Example: "2 2 . 2" → "22.2" expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr) return expr def _postprocess_markdown(markdown_content: str) -> str: """Apply LaTeX postprocessing only within $...$ / $$...$$ segments.""" if not markdown_content: return markdown_content def _fix_segment(m: re.Match) -> str: seg = m.group(0) if seg.startswith("$$") and seg.endswith("$$"): return f"$${_postprocess_math(seg[2:-2])}$$" if seg.startswith("$") and seg.endswith("$"): return f"${_postprocess_math(seg[1:-1])}$" return seg markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content) # Apply markdown-level postprocessing (after LaTeX processing) markdown_content = _remove_false_heading_from_single_formula(markdown_content) return markdown_content def _remove_false_heading_from_single_formula(markdown_content: str) -> str: """Remove false heading markers from single-formula content. OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix. This function detects and removes the heading marker when: 1. The content contains only one formula (display or inline) 2. The formula line starts with '#' (heading marker) 3. No other non-formula text content exists Examples: Input: "# $$E = mc^2$$" Output: "$$E = mc^2$$" Input: "# $x = y$" Output: "$x = y$" Input: "# Introduction\n$$E = mc^2$$" (has text, keep heading) Output: "# Introduction\n$$E = mc^2$$" Args: markdown_content: Markdown text with potential false headings. Returns: Markdown text with false heading markers removed. """ if not markdown_content or not markdown_content.strip(): return markdown_content lines = markdown_content.split("\n") # Count formulas and heading lines formula_count = 0 heading_lines = [] has_non_formula_text = False for i, line in enumerate(lines): line_stripped = line.strip() if not line_stripped: continue # Check if line starts with heading marker heading_match = re.match(r"^(#{1,6})\s+(.+)$", line_stripped) if heading_match: heading_level = heading_match.group(1) content = heading_match.group(2) # Check if the heading content is a formula if re.fullmatch(r"\$\$?.+\$\$?", content): # This is a heading with a formula heading_lines.append((i, heading_level, content)) formula_count += 1 else: # This is a real heading with text has_non_formula_text = True elif re.fullmatch(r"\$\$?.+\$\$?", line_stripped): # Standalone formula line (not in a heading) formula_count += 1 elif line_stripped and not re.match(r"^#+\s*$", line_stripped): # Non-empty, non-heading, non-formula line has_non_formula_text = True # Only remove heading markers if: # 1. There's exactly one formula # 2. That formula is in a heading line # 3. There's no other text content if formula_count == 1 and len(heading_lines) == 1 and not has_non_formula_text: # Remove the heading marker from the formula line_idx, heading_level, formula_content = heading_lines[0] lines[line_idx] = formula_content return "\n".join(lines) class OCRServiceBase(ABC): @abstractmethod def recognize(self, image: np.ndarray) -> dict: pass class OCRService(OCRServiceBase): """Service for OCR using PaddleOCR-VL.""" _pipeline: PaddleOCRVL | None = None _layout_detector: LayoutDetector | None = None def __init__( self, vl_server_url: str, layout_detector: LayoutDetector, image_processor: ImageProcessor, converter: Converter, ): """Initialize OCR service. Args: vl_server_url: URL of the vLLM server for PaddleOCR-VL. layout_detector: Layout detector instance. image_processor: Image processor instance. """ self.vl_server_url = vl_server_url or settings.paddleocr_vl_url self.layout_detector = layout_detector self.image_processor = image_processor self.converter = converter def _get_pipeline(self): """Get or create PaddleOCR-VL pipeline. Returns: PaddleOCRVL pipeline instance. """ if OCRService._pipeline is None: OCRService._pipeline = PaddleOCRVL( vl_rec_backend="vllm-server", vl_rec_server_url=self.vl_server_url, layout_detection_model_name="PP-DocLayoutV2", ) return OCRService._pipeline def _recognize_mixed(self, image: np.ndarray) -> dict: """Recognize mixed content (text + formulas) using PP-DocLayoutV2. This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware recognition of mixed content. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'markdown', 'latex', 'mathml' keys. """ try: pipeline = self._get_pipeline() output = pipeline.predict(image, use_layout_detection=True) markdown_content = "" for res in output: markdown_content += res.markdown.get("markdown_texts", "") markdown_content = _postprocess_markdown(markdown_content) convert_result = self.converter.convert_to_formats(markdown_content) return { "markdown": markdown_content, "latex": convert_result.latex, "mathml": convert_result.mathml, "mml": convert_result.mml, } except Exception as e: raise RuntimeError(f"Mixed recognition failed: {e}") from e def _recognize_formula(self, image: np.ndarray) -> dict: """Recognize formula/math content using PaddleOCR-VL with prompt. This mode uses PaddleOCR-VL directly with a formula recognition prompt. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'latex', 'markdown', 'mathml' keys. """ try: pipeline = self._get_pipeline() output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula") markdown_content = "" for res in output: markdown_content += res.markdown.get("markdown_texts", "") markdown_content = _postprocess_markdown(markdown_content) convert_result = self.converter.convert_to_formats(markdown_content) return { "latex": convert_result.latex, "mathml": convert_result.mathml, "mml": convert_result.mml, "markdown": markdown_content, } except Exception as e: raise RuntimeError(f"Formula recognition failed: {e}") from e def recognize(self, image: np.ndarray) -> dict: """Recognize content using PaddleOCR-VL. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'latex', 'markdown', 'mathml' keys. """ padded_image = self.image_processor.add_padding(image) layout_info = self.layout_detector.detect(padded_image) if layout_info.MixedRecognition: return self._recognize_mixed(image) else: return self._recognize_formula(image) class GLMOCRService(OCRServiceBase): """Service for OCR using GLM-4V model via vLLM.""" def __init__( self, vl_server_url: str, image_processor: ImageProcessor, converter: Converter, ): """Initialize GLM OCR service. Args: vl_server_url: URL of the vLLM server for GLM-4V (default: http://127.0.0.1:8002/v1). image_processor: Image processor instance. converter: Converter instance for format conversion. """ self.vl_server_url = vl_server_url or settings.glm_ocr_url self.image_processor = image_processor self.converter = converter self.openai_client = OpenAI(api_key="EMPTY", base_url=self.vl_server_url, timeout=3600) def _recognize_formula(self, image: np.ndarray) -> dict: """Recognize formula/math content using GLM-4V. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'latex', 'markdown', 'mathml', 'mml' keys. Raises: RuntimeError: If recognition fails (preserves original exception for fallback handling). """ # Add padding to image padded_image = self.image_processor.add_padding(image) # Encode image to base64 success, encoded_image = cv2.imencode(".png", padded_image) if not success: raise RuntimeError("Failed to encode image") image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8") image_url = f"data:image/png;base64,{image_base64}" # Call OpenAI-compatible API with formula recognition prompt prompt = "Formula Recognition:" messages = [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}, ], } ] # Don't catch exceptions here - let them propagate for fallback handling response = self.openai_client.chat.completions.create( model="glm-ocr", messages=messages, temperature=0.0, ) markdown_content = response.choices[0].message.content # Process LaTeX delimiters if markdown_content.startswith(r"\[") or markdown_content.startswith(r"\("): markdown_content = markdown_content.replace(r"\[", "$$").replace(r"\(", "$$") markdown_content = markdown_content.replace(r"\]", "$$").replace(r"\)", "$$") elif not markdown_content.startswith("$$") and not markdown_content.startswith("$"): markdown_content = f"$${markdown_content}$$" # Apply postprocessing markdown_content = _postprocess_markdown(markdown_content) convert_result = self.converter.convert_to_formats(markdown_content) return { "latex": convert_result.latex, "mathml": convert_result.mathml, "mml": convert_result.mml, "markdown": markdown_content, } def recognize(self, image: np.ndarray) -> dict: """Recognize content using GLM-4V. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'latex', 'markdown', 'mathml', 'mml' keys. """ return self._recognize_formula(image) class MineruOCRService(OCRServiceBase): """Service for OCR using local file_parse API.""" def __init__( self, api_url: str = "http://127.0.0.1:8000/file_parse", image_processor: ImageProcessor | None = None, converter: Converter | None = None, glm_ocr_url: str = "http://localhost:8002/v1", layout_detector: LayoutDetector | None = None, ): """Initialize Local API service. Args: api_url: URL of the local file_parse API endpoint. converter: Optional converter instance for format conversion. glm_ocr_url: URL of the GLM-OCR vLLM server. """ self.api_url = api_url self.image_processor = image_processor self.converter = converter self.glm_ocr_url = glm_ocr_url self.openai_client = OpenAI(api_key="EMPTY", base_url=glm_ocr_url, timeout=3600) def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str: """Recognize formula using PaddleOCR-VL API. Args: image: Input image as numpy array in BGR format. prompt: Recognition prompt (default: "Formula Recognition:") Returns: Recognized formula text (LaTeX format). """ try: # Encode image to base64 success, encoded_image = cv2.imencode(".png", image) if not success: raise RuntimeError("Failed to encode image") image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8") image_url = f"data:image/png;base64,{image_base64}" # Call OpenAI-compatible API messages = [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}, ], } ] response = self.openai_client.chat.completions.create( model="glm-ocr", messages=messages, temperature=0.0, ) return response.choices[0].message.content except Exception as e: raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str: """Extract image references from markdown and recognize formulas. Args: markdown_content: Markdown content with potential image references. original_image: Original input image. Returns: Markdown content with formulas recognized by PaddleOCR-VL. """ # Pattern to match image references: ![](images/xxx.png) or ![](images/xxx.jpg) image_pattern = re.compile(r"!\[\]\(images/[^)]+\)") if not image_pattern.search(markdown_content): return markdown_content formula_text = self._recognize_formula_with_paddleocr_vl(original_image) if formula_text.startswith(r"\[") or formula_text.startswith(r"\("): formula_text = formula_text.replace(r"\[", "$$").replace(r"\(", "$$") formula_text = formula_text.replace(r"\]", "$$").replace(r"\)", "$$") elif not formula_text.startswith("$$") and not formula_text.startswith("$"): formula_text = f"$${formula_text}$$" return formula_text def recognize(self, image_bytes: BytesIO) -> dict: """Recognize content using local file_parse API. Args: image_bytes: Input image as BytesIO object (already encoded as PNG). Returns: Dict with 'markdown', 'latex', 'mathml' keys. """ try: # Decode image_bytes to numpy array for potential formula recognition image_bytes.seek(0) image_data = np.frombuffer(image_bytes.read(), dtype=np.uint8) original_image = cv2.imdecode(image_data, cv2.IMREAD_COLOR) # Reset image_bytes for API request image_bytes.seek(0) # Prepare multipart form data files = {"files": ("image.png", image_bytes, "image/png")} data = { "return_middle_json": "false", "return_model_output": "false", "return_md": "true", "return_images": "false", "end_page_id": "99999", "start_page_id": "0", "lang_list": "en", "server_url": "string", "return_content_list": "false", "backend": "hybrid-auto-engine", "table_enable": "true", "response_format_zip": "false", "formula_enable": "true", "parse_method": "ocr", } # Make API request response = requests.post( self.api_url, files=files, data=data, headers={"accept": "application/json"}, timeout=30, ) response.raise_for_status() result = response.json() # Extract markdown content from response markdown_content = "" if "results" in result and "image" in result["results"]: markdown_content = result["results"]["image"].get("md_content", "") if "![](images/" in markdown_content: markdown_content = self._extract_and_recognize_formulas(markdown_content, original_image) # Apply postprocessing to fix OCR errors markdown_content = _postprocess_markdown(markdown_content) # Convert to other formats if converter is available latex = "" mathml = "" mml = "" if self.converter and markdown_content: convert_result = self.converter.convert_to_formats(markdown_content) latex = convert_result.latex mathml = convert_result.mathml mml = convert_result.mml return { "markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml, } except requests.RequestException as e: raise RuntimeError(f"Local API request failed: {e}") from e except Exception as e: raise RuntimeError(f"Recognition failed: {e}") from e # Task-specific prompts (from GLM-OCR SDK config.yaml) _TASK_PROMPTS: dict[str, str] = { "text": "Text Recognition. If the content is a formula, please output display latex code, else output text", "formula": "Formula Recognition:", "table": "Table Recognition:", } _DEFAULT_PROMPT = "Text Recognition. If the content is a formula, please output display latex code, else output text" class GLMOCREndToEndService(OCRServiceBase): """End-to-end OCR using GLM-OCR pipeline: layout detection → per-region OCR. Pipeline: 1. Add padding (ImageProcessor) 2. Detect layout regions (LayoutDetector → PP-DocLayoutV3) 3. Crop each region and call vLLM with a task-specific prompt (parallel) 4. GLMResultFormatter: clean, format titles/bullets/formulas, merge tags 5. _postprocess_markdown: LaTeX math error correction 6. Converter: markdown → latex/mathml/mml This replaces both GLMOCRService (formula-only) and MineruOCRService (mixed). """ def __init__( self, vl_server_url: str, image_processor: ImageProcessor, converter: Converter, layout_detector: LayoutDetector, max_workers: int = 8, ): self.vl_server_url = vl_server_url or settings.glm_ocr_url self.image_processor = image_processor self.converter = converter self.layout_detector = layout_detector self.max_workers = max_workers self.openai_client = OpenAI(api_key="EMPTY", base_url=self.vl_server_url, timeout=3600) self._formatter = GLMResultFormatter() def _encode_region(self, image: np.ndarray) -> str: """Convert BGR numpy array to base64 JPEG string.""" rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) pil_img = PILImage.fromarray(rgb) buf = BytesIO() pil_img.save(buf, format="JPEG") return base64.b64encode(buf.getvalue()).decode("utf-8") def _call_vllm(self, image: np.ndarray, prompt: str) -> str: """Send image + prompt to vLLM and return raw content string.""" img_b64 = self._encode_region(image) data_url = f"data:image/jpeg;base64,{img_b64}" messages = [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": data_url}}, {"type": "text", "text": prompt}, ], } ] response = self.openai_client.chat.completions.create( model="glm-ocr", messages=messages, temperature=0.01, max_tokens=settings.max_tokens, ) return response.choices[0].message.content.strip() def _normalize_bbox(self, bbox: list[float], img_w: int, img_h: int) -> list[int]: """Convert pixel bbox [x1,y1,x2,y2] to 0-1000 normalised coords.""" x1, y1, x2, y2 = bbox return [ int(x1 / img_w * 1000), int(y1 / img_h * 1000), int(x2 / img_w * 1000), int(y2 / img_h * 1000), ] def recognize(self, image: np.ndarray) -> dict: """Full pipeline: padding → layout → per-region OCR → postprocess → markdown. Args: image: Input image as numpy array in BGR format. Returns: Dict with 'markdown', 'latex', 'mathml', 'mml' keys. """ # 1. Layout detection img_h, img_w = image.shape[:2] padded_image = self.image_processor.add_padding(image) layout_info = self.layout_detector.detect(padded_image) # Sort regions in reading order: top-to-bottom, left-to-right layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0])) # 3. OCR: per-region (parallel) or full-image fallback if not layout_info.regions or (len(layout_info.regions) == 1 and not layout_info.MixedRecognition): # No layout detected → assume it's a formula, use formula recognition logger.info("No layout regions detected, treating image as formula") raw_content = self._call_vllm(image, _TASK_PROMPTS["formula"]) # Format as display formula markdown formatted_content = raw_content.strip() if not (formatted_content.startswith("$$") and formatted_content.endswith("$$")): formatted_content = f"$$\n{formatted_content}\n$$" markdown_content = formatted_content else: # Build task list for non-figure regions tasks = [] for idx, region in enumerate(layout_info.regions): if region.type == "figure": continue x1, y1, x2, y2 = (int(c) for c in region.bbox) cropped = padded_image[y1:y2, x1:x2] if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10: logger.warning( "Skipping region idx=%d (label=%s): crop too small %s", idx, region.native_label, cropped.shape[:2], ) continue prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT) tasks.append((idx, region, cropped, prompt)) if not tasks: raw_content = self._call_vllm(image, _DEFAULT_PROMPT) markdown_content = self._formatter._clean_content(raw_content) else: # Parallel OCR calls raw_results: dict[int, str] = {} with ThreadPoolExecutor(max_workers=min(self.max_workers, len(tasks))) as ex: future_map = {ex.submit(self._call_vllm, cropped, prompt): idx for idx, region, cropped, prompt in tasks} for future in as_completed(future_map): idx = future_map[future] try: raw_results[idx] = future.result() except Exception as e: logger.warning("vLLM call failed for region idx=%d: %s", idx, e) raw_results[idx] = "" # Build structured region dicts for GLMResultFormatter region_dicts = [] for idx, region, _cropped, _prompt in tasks: region_dicts.append( { "index": idx, "label": region.type, "native_label": region.native_label, "content": raw_results.get(idx, ""), "bbox_2d": self._normalize_bbox(region.bbox, img_w, img_h), } ) # 4. GLM-OCR postprocessing: clean, format, merge, bullets markdown_content = self._formatter.process(region_dicts) # 5. LaTeX math error correction (our existing pipeline) markdown_content = _postprocess_markdown(markdown_content) # 6. Format conversion latex, mathml, mml = "", "", "" if markdown_content and self.converter: try: fmt = self.converter.convert_to_formats(markdown_content) latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml except RuntimeError as e: logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e) return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}