"""PaddleOCR-VL client service for text and formula recognition."""

import base64
import html
import io
import re
import tempfile
from pathlib import Path

import cv2
import numpy as np

from app.core.config import get_settings
from app.schemas.image import LayoutInfo

settings = get_settings()


class OCRService:
    """Service for OCR using PaddleOCR-VL.

    Two recognition paths are offered:

    * ``recognize_mixed`` runs the PaddleOCR-VL pipeline (with PP-DocLayoutV2
      layout detection) and returns the page as markdown.
    * ``recognize_formula`` posts the image to the vLLM chat endpoint with a
      formula prompt and returns the answer as LaTeX.

    ``recognize`` picks between them from the layout detection result.
    """

    FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format."

    # Request parameters for the direct vLLM call in ``recognize_formula``.
    VL_MODEL_NAME = "paddleocr-vl"
    FORMULA_MAX_TOKENS = 1024
    REQUEST_TIMEOUT = 60.0

    # Display math ($$...$$, may span lines) is tried first so that its
    # content is never re-matched as inline math ($...$).
    _MATH_PATTERN = re.compile(r"\$\$(.*?)\$\$|\$([^$]+)\$", re.DOTALL)

    def __init__(
        self,
        vl_server_url: str | None = None,
        pp_doclayout_model_dir: str | None = None,
    ):
        """Initialize OCR service.

        Args:
            vl_server_url: URL of the vLLM server for PaddleOCR-VL.
                Falls back to ``settings.paddleocr_vl_url``.
            pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory.
                Falls back to ``settings.pp_doclayout_model_dir``.
        """
        self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
        self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir
        # Built on first use by _get_pipeline().
        self._pipeline = None

    def _get_pipeline(self):
        """Get or create PaddleOCR-VL pipeline.

        The pipeline is created on the first call and cached on the instance;
        ``paddleocr`` is imported only at that point.

        Returns:
            PaddleOCRVL pipeline instance.
        """
        if self._pipeline is None:
            from paddleocr import PaddleOCRVL

            self._pipeline = PaddleOCRVL(
                vl_rec_backend="vllm-server",
                vl_rec_server_url=self.vl_server_url,
                layout_detection_model_name="PP-DocLayoutV2",
                layout_detection_model_dir=self.pp_doclayout_model_dir,
            )
        return self._pipeline

    def _save_temp_image(self, image: np.ndarray) -> str:
        """Save image to a temporary PNG file.

        The caller owns the returned file and must delete it.

        Args:
            image: Image as numpy array in BGR format.

        Returns:
            Path to temporary file.

        Raises:
            RuntimeError: If OpenCV reports that the image was not written.
        """
        # Only reserve the name here. The handle must be closed before OpenCV
        # opens the path itself, otherwise the write fails on Windows.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
            temp_path = f.name

        try:
            written = cv2.imwrite(temp_path, image)
        except Exception:
            Path(temp_path).unlink(missing_ok=True)
            raise

        # cv2.imwrite signals failure through its return value.
        if not written:
            Path(temp_path).unlink(missing_ok=True)
            raise RuntimeError("Failed to write image to temporary PNG file")
        return temp_path

    @staticmethod
    def _result_to_markdown(result) -> str:
        """Return the markdown text of one pipeline result.

        Reads the result's ``markdown`` property and, when it is a dict,
        its ``markdown_texts`` entry.
        NOTE(review): attribute and key names follow the PaddleOCR pipeline
        documentation; confirm against the installed paddleocr version.
        """
        md_info = result.markdown
        if isinstance(md_info, dict):
            return md_info.get("markdown_texts") or ""
        return str(md_info or "")

    def recognize_mixed(self, image: np.ndarray) -> dict[str, str]:
        """Recognize mixed content (text + formulas) using PP-DocLayoutV2.

        This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
        recognition of mixed content.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml' keys.

        Raises:
            RuntimeError: If any step of the recognition fails.
        """
        try:
            pipeline = self._get_pipeline()
            temp_path = self._save_temp_image(image)

            try:
                pages = [self._result_to_markdown(result) for result in pipeline.predict(temp_path)]
            finally:
                Path(temp_path).unlink(missing_ok=True)

            # One result per page; separate pages with a blank line.
            markdown_content = "\n\n".join(pages)

            return {
                "markdown": markdown_content,
                "latex": self._markdown_to_latex(markdown_content),
                "mathml": self._extract_mathml(markdown_content),
            }
        except Exception as e:
            raise RuntimeError(f"Mixed recognition failed: {e}") from e

    def recognize_formula(self, image: np.ndarray) -> dict[str, str]:
        """Recognize formula/math content using PaddleOCR-VL with prompt.

        This mode calls the vLLM chat-completions endpoint directly with
        ``FORMULA_PROMPT`` and the PNG-encoded image.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'latex', 'markdown', 'mathml' keys.

        Raises:
            RuntimeError: If httpx is unavailable, the image cannot be
                encoded, the server returns an error status, or the response
                cannot be parsed.
        """
        # Import before the try block below: its except clause references
        # httpx, so the name must be bound by then.
        try:
            import httpx
        except ImportError as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

        try:
            # Encode in memory; no temporary file is needed for this path.
            encoded_ok, png = cv2.imencode(".png", image)
            if not encoded_ok:
                raise ValueError("could not encode image as PNG")
            image_base64 = base64.b64encode(png.tobytes()).decode("ascii")

            # Call vLLM server with formula prompt
            response = httpx.post(
                f"{self.vl_server_url.rstrip('/')}/chat/completions",
                json={
                    "model": self.VL_MODEL_NAME,
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": self.FORMULA_PROMPT},
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                                },
                            ],
                        }
                    ],
                    "max_tokens": self.FORMULA_MAX_TOKENS,
                },
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            latex = response.json()["choices"][0]["message"]["content"].strip()

            return {
                "latex": latex,
                "markdown": self._latex_to_markdown(latex),
                "mathml": self._latex_to_mathml(latex),
            }
        except httpx.HTTPStatusError as e:
            raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e
        except Exception as e:
            raise RuntimeError(f"Formula recognition failed: {e}") from e

    def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict[str, str]:
        """Recognize content based on layout detection results.

        Args:
            image: Input image as numpy array in BGR format.
            layout_info: Layout detection results; only ``has_plain_text``
                is read here.

        Returns:
            Dict with recognition results plus a 'recognition_mode' key
            naming the path that was used.
        """
        # Decision logic:
        # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2)
        # - Otherwise -> use formula_recognition (VL with prompt)
        if layout_info.has_plain_text:
            result = self.recognize_mixed(image)
            result["recognition_mode"] = "mixed_recognition"
        else:
            result = self.recognize_formula(image)
            result["recognition_mode"] = "formula_recognition"

        return result

    @staticmethod
    def _convert_display_math(line: str, in_math: bool) -> tuple[str, bool]:
        """Replace each ``$$`` in *line* with ``\\[`` or ``\\]`` in turn.

        Args:
            line: One line of markdown.
            in_math: Whether a display-math block is open at the line start.

        Returns:
            The converted line and the open/closed state at the line end.
        """
        head, *rest = line.split("$$")
        pieces = [head]
        for segment in rest:
            pieces.append("\\]" if in_math else "\\[")
            in_math = not in_math
            pieces.append(segment)
        return "".join(pieces), in_math

    def _markdown_to_latex(self, markdown: str) -> str:
        """Convert markdown to LaTeX.

        Simple line-based conversion of headings, list items, fenced code
        and ``$$`` display-math delimiters. No document preamble is added,
        and list items are emitted as bare ``\\item`` lines.

        Args:
            markdown: Markdown content.

        Returns:
            LaTeX representation.
        """
        lines: list[str] = []
        in_code_block = False
        in_display_math = False

        for line in markdown.split("\n"):
            if line.startswith("```"):
                in_code_block = not in_code_block
                lines.append("\\begin{verbatim}" if in_code_block else "\\end{verbatim}")
                continue
            if in_code_block:
                lines.append(line)
                continue

            inside_math_body = in_display_math
            if "$$" in line:
                line, in_display_math = self._convert_display_math(line, in_display_math)
            if inside_math_body:
                # Lines of a multi-line formula are not markdown structure
                # (a leading "- " there is a minus sign, not a list item).
                lines.append(line)
                continue

            if line.startswith("# "):
                lines.append(f"\\section{{{line[2:]}}}")
            elif line.startswith("## "):
                lines.append(f"\\subsection{{{line[3:]}}}")
            elif line.startswith("### "):
                lines.append(f"\\subsubsection{{{line[4:]}}}")
            elif line.startswith("- "):
                lines.append(f"\\item {line[2:]}")
            else:
                # Plain text, including inline $...$ math, is kept as-is.
                lines.append(line)

        return "\n".join(lines)

    def _latex_to_markdown(self, latex: str) -> str:
        """Convert LaTeX to markdown.

        Args:
            latex: LaTeX content.

        Returns:
            The LaTeX wrapped in a ``$$`` math block, or an empty string
            when *latex* is blank.
        """
        if latex.strip():
            return f"$$\n{latex}\n$$"
        return ""

    @staticmethod
    def _plain_mathml(latex: str) -> str:
        """Wrap raw LaTeX in a minimal MathML ``<mtext>`` element.

        The text is XML-escaped so that ``<`` and ``&`` in the LaTeX do not
        produce malformed markup.
        """
        escaped = html.escape(latex, quote=False)
        return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{escaped}</mtext></math>'

    def _latex_to_mathml(self, latex: str) -> str:
        """Convert LaTeX to MathML.

        Uses ``latex2mathml`` when it can be imported; otherwise, or when
        the conversion raises, falls back to the LaTeX source wrapped in
        ``<mtext>``.

        Args:
            latex: LaTeX content.

        Returns:
            MathML representation, or an empty string for blank input.
        """
        if not latex.strip():
            return ""

        try:
            from latex2mathml.converter import convert

            return convert(latex)
        except Exception:
            # Deliberate best effort: a missing library or an unparseable
            # formula must not fail the whole recognition.
            return self._plain_mathml(latex)

    def _extract_mathml(self, markdown: str) -> str:
        """Extract and convert math from markdown to MathML.

        Display (``$$...$$``) and inline (``$...$``) formulas are each
        converted once, in the order they appear in the text.

        Args:
            markdown: Markdown content.

        Returns:
            Newline-joined MathML for the math content found, or an empty
            string when there is none.
        """
        mathml_parts = []
        for match in self._MATH_PATTERN.finditer(markdown):
            display, inline = match.groups()
            latex = display if display is not None else inline
            mathml = self._latex_to_mathml(latex.strip())
            if mathml:
                mathml_parts.append(mathml)

        return "\n".join(mathml_parts)