# doc_processer/app/services/ocr_service.py

"""PaddleOCR-VL client service for text and formula recognition."""

import base64
import logging
import re
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO

import cv2
import numpy as np
import requests
from openai import OpenAI
from paddleocr import PaddleOCRVL
from PIL import Image as PILImage

from app.core.config import get_settings
from app.services.converter import Converter
from app.services.glm_postprocess import GLMResultFormatter
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector

# Application settings, resolved once when this module is imported.
settings = get_settings()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Whitelist of LaTeX commands that OCR output tends to glue onto the token that
# follows them (e.g. "\cdotdS"). Only these names are ever split off as a prefix.
_COMMANDS_NEED_SPACE = {
    # operators / calculus
    "cdot",
    "times",
    "div",
    "pm",
    "mp",
    "int",
    "iint",
    "iiint",
    "oint",
    "sum",
    "prod",
    "lim",
    # common functions
    "sin",
    "cos",
    "tan",
    "cot",
    "sec",
    "csc",
    "log",
    "ln",
    "exp",
    # set relations (often glued by OCR)
    "in",
    "notin",
    "subset",
    "supset",
    "subseteq",
    "supseteq",
    "cap",
    "cup",
    # misc
    "partial",
    "nabla",
}

# Genuine LaTeX commands that merely *begin with* a whitelisted name.
# Without this guard "\infty" is rewritten to "\in fty", "\cdots" to "\cdot s",
# "\limits" to "\lim its", and so on, which corrupts perfectly valid output.
_COMMANDS_NEVER_SPLIT = frozenset(
    {
        # cdot...
        "cdots",
        "cdotp",
        # div...
        "divideontimes",
        # pm...
        "pmod",
        "pmb",
        "pmatrix",
        # in... / int... / oint...
        "infty",
        "inf",
        "injlim",
        "intop",
        "intercal",
        "intertext",
        "interleave",
        "ointop",
        # lim...
        "limits",
        "liminf",
        "limsup",
        # hyperbolic functions
        "sinh",
        "cosh",
        "tanh",
        "coth",
        "sech",
        "csch",
        # ln...
        "lnot",
        "lneq",
        "lneqq",
        "lnsim",
        "lnapprox",
        # subset... / supset...
        "subsetneq",
        "subsetneqq",
        "subseteqq",
        "supsetneq",
        "supsetneqq",
        "supseteqq",
        # cup...
        "cupdot",
    }
)

# A math segment is either $$...$$ or $...$; matching is lazy and spans newlines.
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
# Match LaTeX commands: a backslash followed by a greedy run of ASCII letters.
# _split_glued_command_token decides whether such a token is really
# "<known command><glued text>" (e.g. \inX -> \in X).
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")

# Differential patterns ("dX" / "dx"). They only inspect the neighbouring
# characters; they do not check for an integral or fraction context:
# (?<!\\) - not preceded by backslash (not the start of a LaTeX command)
# (?<![a-zA-Z]) - not preceded by any letter (not inside a word/command)
# (?![a-zA-Z]) - not followed by another letter (avoid matching "dx" in "dxyz")
# NOTE(review): nothing in the code visible in this module references these two
# patterns; confirm there are no external users before removing them.
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([A-Z])(?![a-zA-Z])")
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([a-z])(?![a-zA-Z])")


def _split_glued_command_token(token: str) -> str:
    """Split an OCR-glued LaTeX command token at its longest whitelisted prefix.

    Examples:
    - \\cdotdS -> \\cdot dS
    - \\intdx  -> \\int dx
    - \\inX    -> \\in X
    - \\infty  -> \\infty (a real command, left untouched)
    - \\cdots  -> \\cdots (a real command, left untouched)

    Args:
        token: A single command token such as ``\\cdotdS``.

    Returns:
        The token with a space inserted after the longest prefix found in
        ``_COMMANDS_NEED_SPACE``, or the token unchanged when it does not start
        with a backslash, is itself a whitelisted or known command, or has no
        whitelisted prefix.
    """
    if not token.startswith("\\"):
        return token

    body = token[1:]
    if len(body) < 2:
        return token

    # The whole name is already a valid command: never split it.
    if body in _COMMANDS_NEED_SPACE or body in _COMMANDS_NEVER_SPLIT:
        return token

    # Walk from the longest proper prefix down so that e.g. "subseteq" wins
    # over "subset" and "int" wins over "in".
    for end in range(len(body) - 1, 0, -1):
        prefix = body[:end]
        if prefix in _COMMANDS_NEED_SPACE:
            return f"\\{prefix} {body[end:]}"

    return token


def _clean_latex_syntax_spaces(expr: str) -> str:
    """Strip whitespace that OCR inserts into LaTeX syntax structures.

    The rewrites are applied in this fixed order:

    1. Whitespace around ``_`` and ``^`` is removed (``a _ {i}`` -> ``a_{i}``).
    2. Whitespace inside ``_{...}`` / ``^{...}`` is removed. When the braces
       contain a backslash, only whitespace directly before a backslash and
       whitespace not preceded by a letter or backslash is removed, so the
       terminating space in ``\\in X`` survives.
    3. ``\\frac { a } { b }`` -> ``\\frac{a}{b}``. Only whitespace at the edges
       of each argument is trimmed; inner spaces such as ``a + b`` are kept.
    4. Whitespace between a backslash and following letters is removed
       (``\\ alpha`` -> ``\\alpha``).
    5. Whitespace between a command and its opening brace, and directly after
       that brace, is removed (``\\sqrt { x`` -> ``\\sqrt{x``).

    Args:
        expr: LaTeX math expression.

    Returns:
        The expression after all five rewrites.
    """

    def _tidy_script(found):
        # Rebuild "_{...}" or "^{...}" with its inner whitespace cleaned.
        inner = found.group(2)
        if "\\" in inner:
            # Commands present: drop whitespace in front of a backslash, then
            # whitespace that follows neither a letter nor a backslash.
            inner = re.sub(r"\s+(?=\\)", "", inner)
            inner = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", inner)
        else:
            # Plain content: every whitespace run goes.
            inner = re.sub(r"\s+", "", inner)
        return found.group(1) + "{" + inner + "}"

    def _tidy_frac(found):
        # Re-emit \frac with both arguments trimmed at their edges.
        top, bottom = (part.strip() for part in found.groups())
        return "\\frac{" + top + "}{" + bottom + "}"

    # (pattern, replacement) pairs; order matters and mirrors the docstring.
    rewrites = (
        (r"\s*_\s*", "_"),
        (r"\s*\^\s*", "^"),
        (r"([_^])\{([^}]+)\}", _tidy_script),
        (r"\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}", _tidy_frac),
        (r"\\\s+([a-zA-Z]+)", r"\\\1"),
        (r"(\\[a-zA-Z]+)\s*\{\s*", r"\1{"),
    )
    for pattern, replacement in rewrites:
        expr = re.sub(pattern, replacement, expr)

    return expr


def _postprocess_math(expr: str) -> str:
    """Run the math clean-up stages on the body of a ``$...$`` / ``$$...$$`` segment.

    Stages, in order:
    0. ``_fix_ocr_number_errors``: repair digits that OCR split with spaces.
    1. ``_split_glued_command_token`` on every command token
       (e.g. \\cdotdS -> \\cdot dS, \\inX -> \\in X).
    2. ``_clean_latex_syntax_spaces``: remove stray spaces in LaTeX syntax
       (e.g. a _ {i 1} -> a_{i1}).

    Differential normalization is intentionally NOT part of this pipeline: it
    is too aggressive for OCR output, because "dx"-like sequences can be
    variable names, parts of subscripts (x_{dx}), or custom notation, and the
    risk of breaking valid LaTeX outweighs the benefit. A context-aware
    variant exists as ``_normalize_differentials_contextaware`` but is not
    called here.

    Args:
        expr: LaTeX math expression without delimiters.

    Returns:
        Processed LaTeX expression.
    """

    def _split_match(found):
        # Adapter: regex match -> token string for the splitter.
        return _split_glued_command_token(found.group(0))

    digits_fixed = _fix_ocr_number_errors(expr)
    commands_split = _COMMAND_TOKEN_PATTERN.sub(_split_match, digits_fixed)
    return _clean_latex_syntax_spaces(commands_split)


def _normalize_differentials_contextaware(expr: str) -> str:
    """Insert a space after the ``d`` of a differential in two contexts only.

    1. After an integral command (``\\int``, ``\\iint``, ..., ``\\oint``) with no
       backslash in between: the match is rebuilt as
       ``<command> <integrand> d <letter>``. The rebuilt text always places a
       space on both sides of the integrand, so an empty integrand yields two
       consecutive spaces.
    2. In the second argument of ``\\frac``: ``\\frac{dy}{dx}`` ->
       ``\\frac{dy}{d x}``.

    This function is optional and is not called by ``_postprocess_math``.

    Args:
        expr: LaTeX math expression.

    Returns:
        Expression with differentials normalized in the contexts above.
    """
    # Context 1: differential that follows an integral sign.
    after_integrals = re.sub(
        r"(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])",
        r"\1 \2 d \3",
        expr,
    )

    # Context 2: differential inside a fraction denominator.
    return re.sub(
        r"(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})",
        r"\1d \2\3",
        after_integrals,
    )


def _fix_ocr_number_errors(expr: str) -> str:
    """Fix numbers that OCR has split with spaces inside a math expression.

    OCR often splits numbers incorrectly, especially decimals:
    - "2 2. 2"      -> "22.2"
    - "3 0. 4"      -> "30.4"
    - "1 5 0"       -> "150"
    - "2  2  .  2"  -> "22.2"

    Merging is deliberately limited to these situations:
    * a run of space-separated *single* digits (optionally led by a longer
      number) that is followed by ``,``, ``)``, ``.`` or the end of the string;
    * whitespace on either side of a decimal point that sits between digits;
    * a digit separated by whitespace from a complete decimal number
      ("1 25.3" -> "125.3").

    Digits followed by anything else (e.g. "1 2 + x") are left alone.

    Args:
        expr: LaTeX math expression.

    Returns:
        LaTeX expression with number errors fixed.
    """
    # Step 1: collapse a whole run of split digits in one go. Matching the run
    # as a unit is what makes "1 5 0" become "150"; a pairwise substitution
    # stops at "1 50" because re.sub never revisits text it already consumed.
    expr = re.sub(
        r"\d+(?:\s+\d(?!\d))+(?=\s*[,\).]|$)",
        lambda found: re.sub(r"\s+", "", found.group(0)),
        expr,
    )

    # Step 2: close gaps around a decimal point between digits
    # ("22. 2", "22 .2" and "22  .  2" all become "22.2").
    expr = re.sub(r"(\d)\s*\.\s*(\d)", r"\1.\2", expr)

    # Step 3: a digit split off in front of a complete decimal number.
    expr = re.sub(r"(\d)\s+(\d+\.\d)", r"\1\2", expr)

    return expr


def _postprocess_markdown(markdown_content: str) -> str:
    """Apply LaTeX postprocessing only within $...$ / $$...$$ segments.

    Text outside math segments is never touched by the LaTeX fixes. After the
    math segments are processed, a markdown-level pass removes a false heading
    marker from single-formula content.

    Args:
        markdown_content: Markdown text, possibly empty.

    Returns:
        The processed markdown; empty input is returned unchanged.
    """
    if not markdown_content:
        return markdown_content

    def _fix_segment(m: re.Match) -> str:
        seg = m.group(0)
        # A display segment needs both delimiters, i.e. at least 4 characters.
        # An unmatched "$$" is matched by the inline alternative as the
        # 2-character segment "$$"; without the length check it would be
        # treated as display math and rewritten to "$$$$".
        if len(seg) >= 4 and seg.startswith("$$") and seg.endswith("$$"):
            return f"$${_postprocess_math(seg[2:-2])}$$"
        if len(seg) >= 2 and seg.startswith("$") and seg.endswith("$"):
            return f"${_postprocess_math(seg[1:-1])}$"
        return seg

    markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)

    # Apply markdown-level postprocessing (after LaTeX processing)
    markdown_content = _remove_false_heading_from_single_formula(markdown_content)

    return markdown_content


def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
    """Drop a '#' heading prefix that OCR wrongly put in front of a lone formula.

    The marker is removed only when all of the following hold:
    1. Exactly one formula line exists in the whole content.
    2. That formula is the content of a heading line ('#' to '######').
    3. No other line carries text (blank lines and lines made only of '#'
       characters are ignored).

    A "formula" here is any text that starts with ``$`` or ``$$`` and ends with
    ``$`` or ``$$`` with at least one character in between.

    Examples:
        Input:  "# $$E = mc^2$$"
        Output: "$$E = mc^2$$"

        Input:  "# $x = y$"
        Output: "$x = y$"

        Input:  "# Introduction\n$$E = mc^2$$"  (has text, keep heading)
        Output: "# Introduction\n$$E = mc^2$$"

    Args:
        markdown_content: Markdown text with potential false headings.

    Returns:
        Markdown text with the false heading marker removed, or the input
        unchanged when the conditions are not met.
    """
    if not (markdown_content and markdown_content.strip()):
        return markdown_content

    heading_re = re.compile(r"^(#{1,6})\s+(.+)$")
    formula_re = re.compile(r"\$\$?.+\$\$?")
    bare_hashes_re = re.compile(r"^#+\s*$")

    rows = markdown_content.split("\n")
    formulas_seen = 0
    formula_headings = []  # (row index, formula text without the marker)
    saw_plain_text = False

    for row_idx, row in enumerate(rows):
        text = row.strip()
        if not text:
            continue

        heading = heading_re.match(text)
        if heading is not None:
            body = heading.group(2)
            if formula_re.fullmatch(body):
                # Heading whose entire content is a formula.
                formula_headings.append((row_idx, body))
                formulas_seen += 1
            else:
                # Genuine heading with text.
                saw_plain_text = True
            continue

        if formula_re.fullmatch(text):
            # Formula on its own line.
            formulas_seen += 1
        elif not bare_hashes_re.match(text):
            # Anything else that is not just a run of '#'.
            saw_plain_text = True

    if formulas_seen == 1 and len(formula_headings) == 1 and not saw_plain_text:
        target_idx, formula_text = formula_headings[0]
        rows[target_idx] = formula_text

    return "\n".join(rows)


class OCRServiceBase(ABC):
    """Abstract interface shared by the OCR services in this module."""

    @abstractmethod
    def recognize(self, image: np.ndarray) -> dict:
        """Recognize the content of ``image`` and return the results as a dict."""


class OCRService(OCRServiceBase):
    """OCR service backed by a PaddleOCR-VL pipeline.

    ``recognize`` asks the layout detector whether the image holds mixed
    content, then runs the pipeline either with layout detection (mixed) or
    with the formula prompt (formula only).
    """

    # Pipeline stored on the class, so it is shared by all instances; it is
    # created lazily by _get_pipeline().
    _pipeline: PaddleOCRVL | None = None
    # Not read or written by any method of this class.
    _layout_detector: LayoutDetector | None = None

    def __init__(
        self,
        vl_server_url: str,
        layout_detector: LayoutDetector,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Store the collaborators used by this service.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL; a falsy
                value falls back to ``settings.paddleocr_vl_url``.
            layout_detector: Layout detector instance.
            image_processor: Image processor instance.
            converter: Converter used to derive latex/mathml/mml output.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
        self.layout_detector = layout_detector
        self.image_processor = image_processor
        self.converter = converter

    def _get_pipeline(self):
        """Return the shared PaddleOCR-VL pipeline, creating it on first use.

        Returns:
            PaddleOCRVL pipeline instance.
        """
        if OCRService._pipeline is not None:
            return OCRService._pipeline

        # First caller builds the pipeline with its own server URL.
        OCRService._pipeline = PaddleOCRVL(
            vl_rec_backend="vllm-server",
            vl_rec_server_url=self.vl_server_url,
            layout_detection_model_name="PP-DocLayoutV2",
        )
        return OCRService._pipeline

    def _recognize_mixed(self, image: np.ndarray) -> dict:
        """Recognize mixed content (text + formulas) with layout detection on.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml', 'mml' keys.

        Raises:
            RuntimeError: If any step of the recognition fails.
        """
        try:
            results = self._get_pipeline().predict(image, use_layout_detection=True)

            # Concatenate the markdown text of every result in order.
            pieces = [res.markdown.get("markdown_texts", "") for res in results]
            markdown_content = _postprocess_markdown("".join(pieces))
            formats = self.converter.convert_to_formats(markdown_content)

            return {
                "markdown": markdown_content,
                "latex": formats.latex,
                "mathml": formats.mathml,
                "mml": formats.mml,
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

    def _recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula content with layout detection off and the formula prompt.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'mathml', 'mml', 'markdown' keys.

        Raises:
            RuntimeError: If any step of the recognition fails.
        """
        try:
            results = self._get_pipeline().predict(image, use_layout_detection=False, prompt_label="formula")

            # Concatenate the markdown text of every result in order.
            pieces = [res.markdown.get("markdown_texts", "") for res in results]
            markdown_content = _postprocess_markdown("".join(pieces))
            formats = self.converter.convert_to_formats(markdown_content)

            return {
                "latex": formats.latex,
                "mathml": formats.mathml,
                "mml": formats.mml,
                "markdown": markdown_content,
            }
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using PaddleOCR-VL.

        The padded copy of the image is used for layout detection only; the
        recognition itself receives the original image.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
        """
        layout_info = self.layout_detector.detect(self.image_processor.add_padding(image))
        if not layout_info.MixedRecognition:
            return self._recognize_formula(image)
        return self._recognize_mixed(image)


class GLMOCRService(OCRServiceBase):
    """Formula OCR through an OpenAI-compatible endpoint serving the "glm-ocr" model."""

    def __init__(
        self,
        vl_server_url: str,
        image_processor: ImageProcessor,
        converter: Converter,
    ):
        """Initialize GLM OCR service.

        Args:
            vl_server_url: Base URL of the OpenAI-compatible server; a falsy
                value falls back to ``settings.glm_ocr_url``.
            image_processor: Image processor instance.
            converter: Converter instance for format conversion.
        """
        self.vl_server_url = vl_server_url or settings.glm_ocr_url
        self.image_processor = image_processor
        self.converter = converter
        self.openai_client = OpenAI(api_key="EMPTY", base_url=self.vl_server_url, timeout=3600)

    def _recognize_formula(self, image: np.ndarray) -> dict:
        """Recognize formula/math content.

        The image is padded, PNG-encoded and sent as a data URL together with
        the prompt "Formula Recognition:". The reply is normalised to
        ``$``-style delimiters, post-processed and converted.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml', 'mml' keys.

        Raises:
            RuntimeError: If the image cannot be encoded or the server returns
                no content. Exceptions raised by the API call itself are not
                caught here, so callers can apply their own fallback.
        """
        # Add padding to image
        padded_image = self.image_processor.add_padding(image)

        # Encode image to base64
        success, encoded_image = cv2.imencode(".png", padded_image)
        if not success:
            raise RuntimeError("Failed to encode image")

        image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
        image_url = f"data:image/png;base64,{image_base64}"

        # Call OpenAI-compatible API with formula recognition prompt
        prompt = "Formula Recognition:"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Don't catch exceptions here - let them propagate for fallback handling
        response = self.openai_client.chat.completions.create(
            model="glm-ocr",
            messages=messages,
            temperature=0.0,
        )

        raw_content = response.choices[0].message.content
        if raw_content is None:
            # The message content is optional; fail with the documented
            # exception type instead of an AttributeError further down.
            raise RuntimeError("GLM OCR returned no content")

        # Strip first: leading whitespace would otherwise defeat the prefix
        # checks below and get a "\[...\]" reply wrapped in "$$" a second time.
        markdown_content = raw_content.strip()

        # Process LaTeX delimiters
        if markdown_content.startswith((r"\[", r"\(")):
            for delimiter in (r"\[", r"\(", r"\]", r"\)"):
                markdown_content = markdown_content.replace(delimiter, "$$")
        elif not markdown_content.startswith("$"):
            markdown_content = f"$${markdown_content}$$"

        # Apply postprocessing
        markdown_content = _postprocess_markdown(markdown_content)
        convert_result = self.converter.convert_to_formats(markdown_content)

        return {
            "latex": convert_result.latex,
            "mathml": convert_result.mathml,
            "mml": convert_result.mml,
            "markdown": markdown_content,
        }

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content; this service always treats the image as a formula.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
        """
        return self._recognize_formula(image)


class MineruOCRService(OCRServiceBase):
    """Service for OCR using local file_parse API.

    Unlike the other services in this module, ``recognize`` takes the encoded
    image as a ``BytesIO`` object rather than a numpy array.
    """

    def __init__(
        self,
        api_url: str = "http://127.0.0.1:8000/file_parse",
        image_processor: ImageProcessor | None = None,
        converter: Converter | None = None,
        glm_ocr_url: str = "http://localhost:8002/v1",
        layout_detector: LayoutDetector | None = None,
    ):
        """Initialize Local API service.

        Args:
            api_url: URL of the local file_parse API endpoint.
            image_processor: Optional image processor instance (stored only).
            converter: Optional converter instance for format conversion.
            glm_ocr_url: URL of the GLM-OCR vLLM server.
            layout_detector: Optional layout detector instance (stored only).
        """
        self.api_url = api_url
        self.image_processor = image_processor
        self.converter = converter
        self.glm_ocr_url = glm_ocr_url
        # Previously accepted but silently discarded; keep it on the instance.
        self.layout_detector = layout_detector
        self.openai_client = OpenAI(api_key="EMPTY", base_url=glm_ocr_url, timeout=3600)

    def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str:
        """Recognize a formula through the OpenAI-compatible client.

        Despite the method name, the request goes to ``self.openai_client``
        (configured with ``glm_ocr_url``) and uses the model name "glm-ocr".

        Args:
            image: Input image as numpy array in BGR format.
            prompt: Recognition prompt (default: "Formula Recognition:")

        Returns:
            Recognized formula text with surrounding whitespace removed.

        Raises:
            RuntimeError: If encoding, the API call or the response handling fails.
        """
        try:
            # Encode image to base64
            success, encoded_image = cv2.imencode(".png", image)
            if not success:
                raise RuntimeError("Failed to encode image")

            image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
            image_url = f"data:image/png;base64,{image_base64}"

            # Call OpenAI-compatible API
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            response = self.openai_client.chat.completions.create(
                model="glm-ocr",
                messages=messages,
                temperature=0.0,
            )

            content = response.choices[0].message.content
            if content is None:
                # Message content is optional; report it clearly instead of
                # letting a later str method call fail on None.
                raise RuntimeError("formula recognition returned no content")

            # Strip so that the delimiter prefix checks done by the caller are
            # not defeated by leading whitespace or newlines.
            return content.strip()

        except Exception as e:
            raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e

    def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str:
        """Replace markdown that contains image references with a recognized formula.

        When ``markdown_content`` holds at least one ``![](images/...)``
        reference, the whole original image is sent to formula recognition and
        the returned formula (normalised to ``$$`` delimiters) replaces the
        entire markdown content. Otherwise the markdown is returned unchanged.

        Args:
            markdown_content: Markdown content with potential image references.
            original_image: Original input image.

        Returns:
            The recognized formula text, or ``markdown_content`` unchanged.
        """
        # Pattern to match image references: ![](images/xxx.png) or ![](images/xxx.jpg)
        image_pattern = re.compile(r"!\[\]\(images/[^)]+\)")

        if not image_pattern.search(markdown_content):
            return markdown_content

        formula_text = self._recognize_formula_with_paddleocr_vl(original_image)

        if formula_text.startswith((r"\[", r"\(")):
            for delimiter in (r"\[", r"\(", r"\]", r"\)"):
                formula_text = formula_text.replace(delimiter, "$$")
        elif not formula_text.startswith("$"):
            formula_text = f"$${formula_text}$$"

        return formula_text

    def recognize(self, image_bytes: BytesIO) -> dict:
        """Recognize content using local file_parse API.

        Args:
            image_bytes: Input image as BytesIO object (already encoded as PNG).

        Returns:
            Dict with 'markdown', 'latex', 'mathml', 'mml' keys. The last three
            are empty strings when no converter is set or the markdown is empty.

        Raises:
            RuntimeError: If the HTTP request or any later step fails.
        """
        try:
            # Decode image_bytes to numpy array for potential formula recognition
            image_bytes.seek(0)
            image_data = np.frombuffer(image_bytes.read(), dtype=np.uint8)
            original_image = cv2.imdecode(image_data, cv2.IMREAD_COLOR)

            # Reset image_bytes for API request
            image_bytes.seek(0)

            # Prepare multipart form data
            files = {"files": ("image.png", image_bytes, "image/png")}

            data = {
                "return_middle_json": "false",
                "return_model_output": "false",
                "return_md": "true",
                "return_images": "false",
                "end_page_id": "99999",
                "start_page_id": "0",
                "lang_list": "en",
                "server_url": "string",
                "return_content_list": "false",
                "backend": "hybrid-auto-engine",
                "table_enable": "true",
                "response_format_zip": "false",
                "formula_enable": "true",
                "parse_method": "ocr",
            }

            # Make API request
            response = requests.post(
                self.api_url,
                files=files,
                data=data,
                headers={"accept": "application/json"},
                timeout=30,
            )
            response.raise_for_status()

            result = response.json()

            # Extract markdown content from response; the "image" key is the
            # entry this code reads for the uploaded "image.png".
            markdown_content = ""
            if "results" in result and "image" in result["results"]:
                markdown_content = result["results"]["image"].get("md_content", "")

            if "![](images/" in markdown_content:
                markdown_content = self._extract_and_recognize_formulas(markdown_content, original_image)

            # Apply postprocessing to fix OCR errors
            markdown_content = _postprocess_markdown(markdown_content)

            # Convert to other formats if converter is available
            latex = ""
            mathml = ""
            mml = ""
            if self.converter and markdown_content:
                convert_result = self.converter.convert_to_formats(markdown_content)
                latex = convert_result.latex
                mathml = convert_result.mathml
                mml = convert_result.mml

            return {
                "markdown": markdown_content,
                "latex": latex,
                "mathml": mathml,
                "mml": mml,
            }

        except requests.RequestException as e:
            raise RuntimeError(f"Local API request failed: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Recognition failed: {e}") from e


# Task-specific prompts (from GLM-OCR SDK config.yaml)
# The text prompt doubles as the fallback for region types without an entry.
# NOTE(review): the spelling "ouput" is part of the prompt sent to the model and
# is presumably copied verbatim from the SDK config; confirm before changing it.
_DEFAULT_PROMPT = "Text Recognition. If the content is a formula, please ouput display latex code, else output text"
_TASK_PROMPTS: dict[str, str] = {
    "text": _DEFAULT_PROMPT,
    "formula": "Formula Recognition:",
    "table": "Table Recognition:",
}


class GLMOCREndToEndService(OCRServiceBase):
    """End-to-end OCR using GLM-OCR pipeline: layout detection → per-region OCR.

    Pipeline implemented by ``recognize``:
      1. Detect layout regions (LayoutDetector) and sort them in reading order
      2. Crop each non-figure region and call vLLM with a task-specific prompt
         (in parallel); images without usable regions are sent as a whole
      3. GLMResultFormatter: turn the per-region results into markdown
      4. _postprocess_markdown: LaTeX math error correction
      5. Converter: markdown → latex/mathml/mml

    ``image_processor`` is stored on the instance, but ``recognize`` does not
    apply any padding itself.
    """

    def __init__(
        self,
        vl_server_url: str,
        image_processor: ImageProcessor,
        converter: Converter,
        layout_detector: LayoutDetector,
        max_workers: int = 8,
    ):
        """Store collaborators and create the OpenAI-compatible client.

        Args:
            vl_server_url: Base URL of the vLLM server; a falsy value falls
                back to ``settings.glm_ocr_url``.
            image_processor: Image processor instance.
            converter: Converter instance for format conversion.
            layout_detector: Layout detector instance.
            max_workers: Upper bound for parallel per-region OCR calls.
        """
        self.vl_server_url = vl_server_url or settings.glm_ocr_url
        self.image_processor = image_processor
        self.converter = converter
        self.layout_detector = layout_detector
        self.max_workers = max_workers
        self.openai_client = OpenAI(api_key="EMPTY", base_url=self.vl_server_url, timeout=3600)
        self._formatter = GLMResultFormatter()

    def _encode_region(self, image: np.ndarray) -> str:
        """Convert BGR numpy array to base64 JPEG string."""
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_img = PILImage.fromarray(rgb)
        buf = BytesIO()
        pil_img.save(buf, format="JPEG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    def _call_vllm(self, image: np.ndarray, prompt: str) -> str:
        """Send image + prompt to vLLM and return the stripped content string.

        A response without content is returned as an empty string, the same
        value ``recognize`` records for a region whose OCR call failed.
        """
        img_b64 = self._encode_region(image)
        data_url = f"data:image/jpeg;base64,{img_b64}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": data_url}},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        response = self.openai_client.chat.completions.create(
            model="glm-ocr",
            messages=messages,
            temperature=0.01,
            max_tokens=settings.max_tokens,
        )
        # message.content is optional; guard against None before stripping.
        content = response.choices[0].message.content
        return (content or "").strip()

    def _normalize_bbox(self, bbox: list[float], img_w: int, img_h: int) -> list[int]:
        """Convert pixel bbox [x1,y1,x2,y2] to 0-1000 normalised coords."""
        x1, y1, x2, y2 = bbox
        return [
            int(x1 / img_w * 1000),
            int(y1 / img_h * 1000),
            int(x2 / img_w * 1000),
            int(y2 / img_h * 1000),
        ]

    def _build_region_tasks(self, image: np.ndarray, regions: list) -> list[tuple]:
        """Crop every usable region and pair it with its prompt.

        Figure regions and crops smaller than 10 pixels in either dimension
        are skipped.

        Returns:
            List of ``(index, region, cropped_image, prompt)`` tuples, where
            ``index`` is the region's position in ``regions``.
        """
        img_h, img_w = image.shape[:2]
        tasks = []
        for idx, region in enumerate(regions):
            if region.type == "figure":
                continue
            x1, y1, x2, y2 = (int(c) for c in region.bbox)
            # Clamp to the image: a negative coordinate would otherwise be
            # interpreted by numpy as an index counted from the far edge and
            # produce a wrong (usually empty) crop.
            x1, x2 = min(max(x1, 0), img_w), min(max(x2, 0), img_w)
            y1, y2 = min(max(y1, 0), img_h), min(max(y2, 0), img_h)
            cropped = image[y1:y2, x1:x2]
            if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10:
                logger.warning(
                    "Skipping region idx=%d (label=%s): crop too small %s",
                    idx,
                    region.native_label,
                    cropped.shape[:2],
                )
                continue
            prompt = _TASK_PROMPTS.get(region.type, _DEFAULT_PROMPT)
            tasks.append((idx, region, cropped, prompt))
        return tasks

    def _ocr_regions(self, tasks: list[tuple]) -> dict[int, str]:
        """Run the OCR call for every task in a thread pool.

        A failed call is logged and recorded as an empty string so that one
        bad region does not abort the whole image.

        Returns:
            Mapping from region index to the raw OCR text.
        """
        raw_results: dict[int, str] = {}
        with ThreadPoolExecutor(max_workers=min(self.max_workers, len(tasks))) as ex:
            future_map = {ex.submit(self._call_vllm, cropped, prompt): idx for idx, _region, cropped, prompt in tasks}
            for future in as_completed(future_map):
                idx = future_map[future]
                try:
                    raw_results[idx] = future.result()
                except Exception as e:
                    logger.warning("vLLM call failed for region idx=%d: %s", idx, e)
                    raw_results[idx] = ""
        return raw_results

    def recognize(self, image: np.ndarray) -> dict:
        """Full pipeline: layout → per-region OCR → postprocess → markdown.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml', 'mml' keys. The last three
            are empty strings when conversion is skipped or fails with a
            RuntimeError.
        """
        # 1. Layout detection
        img_h, img_w = image.shape[:2]
        layout_info = self.layout_detector.detect(image)

        # Sort regions in reading order: top-to-bottom, left-to-right
        layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0]))

        # 2. OCR: per-region (parallel) or full-image fallback
        if not layout_info.regions or (len(layout_info.regions) == 1 and not layout_info.MixedRecognition):
            # No regions, or a single region that is not mixed content:
            # treat the whole image as one formula.
            logger.info("No layout regions detected, treating image as formula")
            formatted_content = self._call_vllm(image, _TASK_PROMPTS["formula"])
            # Format as display formula markdown
            if not (formatted_content.startswith("$$") and formatted_content.endswith("$$")):
                formatted_content = f"$$\n{formatted_content}\n$$"
            markdown_content = formatted_content
        else:
            tasks = self._build_region_tasks(image, layout_info.regions)

            if not tasks:
                # Every region was a figure or too small: OCR the full image.
                raw_content = self._call_vllm(image, _DEFAULT_PROMPT)
                markdown_content = self._formatter._clean_content(raw_content)
            else:
                raw_results = self._ocr_regions(tasks)

                # Build structured region dicts for GLMResultFormatter
                region_dicts = [
                    {
                        "index": idx,
                        "label": region.type,
                        "native_label": region.native_label,
                        "content": raw_results.get(idx, ""),
                        "bbox_2d": self._normalize_bbox(region.bbox, img_w, img_h),
                    }
                    for idx, region, _cropped, _prompt in tasks
                ]

                # 3. GLM-OCR postprocessing of the per-region results
                markdown_content = self._formatter.process(region_dicts)

        # 4. LaTeX math error correction (our existing pipeline)
        markdown_content = _postprocess_markdown(markdown_content)

        # 5. Format conversion
        latex, mathml, mml = "", "", ""
        if markdown_content and self.converter:
            try:
                fmt = self.converter.convert_to_formats(markdown_content)
                latex, mathml, mml = fmt.latex, fmt.mathml, fmt.mml
            except RuntimeError as e:
                logger.warning("Format conversion failed, returning empty latex/mathml/mml: %s", e)

        return {"markdown": markdown_content, "latex": latex, "mathml": mathml, "mml": mml}