# app/services/converter.py

"""Markdown conversion and export service using pypandoc."""

import os
import re
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from typing import Literal

import pypandoc
from latex2mathml.converter import convert as latex_to_mathml


@dataclass
class ConvertResult:
    """Result of converting a formula-only markdown input.

    The fields are only populated when the input is a single pure LaTeX
    formula. Every field is an empty string when the input is empty or
    contains mixed content (text + formula).

    Attributes:
        latex: Pure LaTeX formula code (without delimiters).
        mathml: Standard MathML format.
        mml: XML MathML with mml: namespace prefix (mml:math).
    """

    latex: str
    mathml: str
    mml: str


@dataclass
class ExportResult:
    """Result of markdown export.

    NOTE(review): the code that fills these fields is not visible here, so the
    descriptions below are inferred from the field names -- confirm against
    the export method.

    Attributes:
        file_path: Presumably the path of the exported file.
        content_type: Presumably the MIME type to serve the file with.
        download_name: Presumably the file name suggested to the client.
    """

    file_path: str
    content_type: str
    download_name: str


# Allowed export type values
ExportType = Literal["docx", "pdf"]

# XML namespace URIs: MathML, and OMML (Office Math Markup Language)
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"

# XSLT for MathML to mml: namespace conversion.
# The stylesheet matches MathML elements both with and without the MathML
# namespace and re-emits each one under the mml: prefix. Attributes are copied
# (except one whose local name is "xmlns") and text nodes are copied as-is.
MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:mml="http://www.w3.org/1998/Math/MathML"
    xmlns:m="http://www.w3.org/1998/Math/MathML"
    exclude-result-prefixes="m">

    <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>

    <!-- Match root math element -->
    <xsl:template match="m:math|math">
        <mml:math>
            <xsl:apply-templates select="@*|node()"/>
        </mml:math>
    </xsl:template>

    <!-- Match all other MathML elements -->
    <xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
        <xsl:element name="mml:{local-name()}">
            <xsl:apply-templates select="@*|node()"/>
        </xsl:element>
    </xsl:template>

    <!-- Copy attributes -->
    <xsl:template match="@*">
        <xsl:if test="local-name() != 'xmlns'">
            <xsl:copy/>
        </xsl:if>
    </xsl:template>

    <!-- Copy text nodes -->
    <xsl:template match="text()">
        <xsl:value-of select="."/>
    </xsl:template>

</xsl:stylesheet>
"""


class Converter:
    """Service for conversion and export operations.

    Conversion rules:
    - Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
    - Mixed content (text + formula) returns empty results for all formats.
    - OMML conversion is provided as a separate method due to performance overhead.

    Performance optimizations:
    - Pre-compiled regex patterns
    - XSLT-based MML conversion
    - Cached XSLT transform (built once and stored on the class)
    - LRU-cached LaTeX to MathML conversion

    Note: OMML is obtained by having Pandoc write a temporary DOCX file and
    reading the oMath elements out of its word/document.xml.
    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # Pre-compiled regex patterns for formula detection
    # (used with fullmatch on stripped input by _is_formula_only).
    # NOTE: the [\s\S]+ bodies are greedy, so with fullmatch these patterns
    # also accept input holding several delimited formulas separated by text.
    _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")  # $$...$$
    _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")  # \[...\]
    _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")  # $...$ with no inner $
    _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")  # \(...\)
    # First <math>...</math> element inside Pandoc's HTML output
    _RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")

    # Pre-compiled regex patterns for preprocessing
    # Literal \[1mm], rewritten to \vspace{1mm} by preprocess_for_export
    _RE_VSPACE = re.compile(r"\\\[1mm\]")
    # \[...\] with a non-newline character on both sides (formula embedded in a line)
    _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
    # \[...\] that starts at the beginning of a line and runs to a line end
    _RE_BLOCK_FORMULA_LINE = re.compile(
        r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL
    )
    # <span class="arithmatex"> wrapper around a formula
    _RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
    # Inline $ ... $ with spaces just inside the dollar signs
    _RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
    # Column specifier of \begin{array}{...}
    _RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
    # \left\{ followed by whitespace / whitespace followed by \right\}
    _RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
    _RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
    # Environment bodies: cases, brace-wrapped aligned, plain aligned
    _RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
    _RE_ALIGNED_BRACE = re.compile(
        r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL
    )
    _RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
    # $$ ... \tag{label} $$ display formula carrying an equation tag
    _RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
    # vmatrix (single bars) and Vmatrix (double bars) environment bodies
    _RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
    _RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)

    # Cached XSLT transform; created on first use by _get_mml_xslt_transform
    _mml_xslt_transform = None

    def __init__(self) -> None:
        """Initialize converter.

        The body is empty: no per-instance state is set up here.
        """

    @classmethod
    def _get_mml_xslt_transform(cls):
        """Get cached XSLT transform for MathML to mml: conversion."""
        if cls._mml_xslt_transform is None:
            from lxml import etree

            xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
            cls._mml_xslt_transform = etree.XSLT(xslt_doc)
        return cls._mml_xslt_transform

    def _is_formula_only(self, text: str) -> bool:
        """Check if text contains only a LaTeX formula (no mixed content).

        A text is considered formula-only if it matches one of these patterns:
        - Display math: $$...$$ or \\[...\\]
        - Inline math: $...$ or \\(...\\)

        Args:
            text: Input text to check.

        Returns:
            True if the text contains only a LaTeX formula, False otherwise.
        """
        text = text.strip()

        if not text:
            return False

        # Strict patterns: entire text must be a single formula with delimiters
        # Using pre-compiled patterns with fullmatch semantics
        if self._RE_DISPLAY_DOLLAR.fullmatch(text):
            return True
        if self._RE_DISPLAY_BRACKET.fullmatch(text):
            return True
        if self._RE_INLINE_DOLLAR.fullmatch(text):
            return True
        if self._RE_INLINE_PAREN.fullmatch(text):
            return True

        return False

    def convert_to_formats(self, md_text: str) -> ConvertResult:
        """Convert markdown to LaTeX, MathML, and MML formats.

        Only converts when input contains a pure LaTeX formula.
        Mixed content (text + formula) returns empty strings for all fields.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex, mathml, and mml fields.
            All fields are empty if input is not a pure formula.

        Raises:
            RuntimeError: If conversion fails for a valid formula.
        """
        # Empty input returns empty result
        if not md_text or not md_text.strip():
            return ConvertResult(latex="", mathml="", mml="")

        # Check if input is formula-only
        if not self._is_formula_only(md_text):
            # Mixed content: cannot convert to formula formats
            return ConvertResult(latex="", mathml="", mml="")

        try:
            # Detect if formula is display (block) or inline
            is_display = self._is_display_formula(md_text)

            # Extract the LaTeX formula content (remove delimiters)
            latex_formula = self._extract_latex_formula(md_text)

            # Preprocess formula for better conversion (fix array specifiers, etc.)
            preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula)

            # Convert to MathML (pass display flag to use correct delimiters)
            mathml = self._latex_to_mathml(preprocessed_formula, is_display=is_display)

            # Convert MathML to mml:math format (with namespace prefix)
            mml = self._mathml_to_mml(mathml)

            return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)

        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

    def convert_to_omml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to OMML (Office Math Markup Language).

        This is a separate method due to the performance overhead of OMML conversion,
        which requires creating a temporary DOCX file.

        The formula is preprocessed using the same logic as export_to_file to ensure
        proper conversion.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters like $ or $$).

        Returns:
            OMML representation as XML string.

        Raises:
            ValueError: If latex_formula is empty.
            RuntimeError: If conversion fails.
        """
        if not latex_formula or not latex_formula.strip():
            raise ValueError("LaTeX formula cannot be empty")

        # Preprocess formula using the same preprocessing as export
        preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip())

        return self._latex_to_omml(preprocessed)

    def _preprocess_formula_for_conversion(self, latex_formula: str) -> str:
        """Preprocess LaTeX formula for any conversion (MathML, OMML, etc.).

        Applies the same preprocessing steps as preprocess_for_export to ensure
        consistency across all conversion paths. This fixes common issues that
        cause Pandoc conversion to fail.

        Note: OCR errors (number errors, command spacing) are fixed earlier in the
        pipeline (in ocr_service.py), so we don't need to handle them here.

        Args:
            latex_formula: Pure LaTeX formula.

        Returns:
            Preprocessed LaTeX formula.
        """
        # 1. Convert matrix environments
        latex_formula = self._convert_matrix_environments(latex_formula)

        # 2. Fix array column specifiers (remove spaces)
        latex_formula = self._fix_array_column_specifiers(latex_formula)

        # 3. Fix brace spacing
        latex_formula = self._fix_brace_spacing(latex_formula)

        # 4. Convert special environments (cases, aligned)
        latex_formula = self._convert_special_environments(latex_formula)

        return latex_formula

    def _is_display_formula(self, text: str) -> bool:
        """Check if the formula is a display (block) formula.

        Args:
            text: Text containing LaTeX formula with delimiters.

        Returns:
            True if display formula ($$...$$ or \\[...\\]), False if inline.
        """
        text = text.strip()

        # Display math delimiters: $$...$$ or \[...\]
        if text.startswith("$$") and text.endswith("$$"):
            return True
        if text.startswith("\\[") and text.endswith("\\]"):
            return True

        # Inline math delimiters: $...$ or \(...\)
        return False

    def _extract_latex_formula(self, text: str) -> str:
        """Extract LaTeX formula from text by removing delimiters.

        Args:
            text: Text containing LaTeX formula with delimiters.

        Returns:
            Pure LaTeX formula without delimiters.
        """
        text = text.strip()

        # Remove display math delimiters: $$...$$ or \[...\]
        if text.startswith("$$") and text.endswith("$$"):
            return text[2:-2].strip()
        if text.startswith("\\[") and text.endswith("\\]"):
            return text[2:-2].strip()

        # Remove inline math delimiters: $...$ or \(...\)
        if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
            return text[1:-1].strip()
        if text.startswith("\\(") and text.endswith("\\)"):
            return text[2:-2].strip()

        # If no delimiters, return as-is
        return text.strip()

    @staticmethod
    @lru_cache(maxsize=256)
    def _latex_to_mathml_cached(latex_formula: str, is_display: bool = False) -> str:
        """Convert a LaTeX formula to MathML, with results kept in an LRU cache.

        Pandoc is tried first; its HTML output is searched for a <math>
        element. If Pandoc raises or yields no <math> element, latex2mathml is
        used instead. Either result is passed through
        _postprocess_mathml_for_word.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters).
            is_display: True if display (block) formula, False if inline.

        Returns:
            Standard MathML representation.

        Raises:
            RuntimeError: If both Pandoc and the latex2mathml fallback fail.
        """
        # Display formulas are handed to Pandoc as $$...$$, inline ones as $...$
        delimiter = "$$" if is_display else "$"
        pandoc_input = f"{delimiter}{latex_formula}{delimiter}"

        try:
            html = pypandoc.convert_text(
                pandoc_input,
                "html",
                format="markdown+tex_math_dollars",
                extra_args=["--mathml"],
            )
            found = Converter._RE_MATH_ELEMENT.search(html)
            if found is None:
                # No <math> element in the output: force the fallback below
                raise ValueError("Pandoc did not generate MathML, got HTML instead")
            return Converter._postprocess_mathml_for_word(found.group(0))

        except Exception as pandoc_error:
            # Fallback: latex2mathml (less Word-compatible)
            try:
                fallback_mathml = latex_to_mathml(latex_formula)
                return Converter._postprocess_mathml_for_word(fallback_mathml)
            except Exception as e:
                raise RuntimeError(
                    f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
                ) from e

    @staticmethod
    def _postprocess_mathml_for_word(mathml: str) -> str:
        """Post-process MathML to improve Word compatibility.

        Applies transformations to make MathML more compatible and concise:
        - Remove <semantics> and <annotation> wrappers (Word doesn't need them)
        - Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.)
        - Remove redundant single <mrow> wrappers
        - Change display="inline" to display="block" for better rendering
        - Decode Unicode entities to actual characters (Word prefers this)
        - Ensure proper namespace

        Args:
            mathml: MathML string.

        Returns:
            Simplified, Word-compatible MathML string.
        """
        import re

        # Step 1: Remove <semantics> and <annotation> wrappers
        # These often cause Word import issues
        if "<semantics>" in mathml:
            # Extract content between <semantics> and <annotation>
            match = re.search(r"<semantics>(.*?)<annotation", mathml, re.DOTALL)
            if match:
                content = match.group(1).strip()

                # Get the math element attributes
                math_attrs = ""
                math_match = re.search(r"<math([^>]*)>", mathml)
                if math_match:
                    math_attrs = math_match.group(1)

                # Rebuild without semantics
                mathml = f"<math{math_attrs}>{content}</math>"

        # Step 2: Remove unnecessary attributes that don't affect rendering
        # These are verbose and Word doesn't need them
        unnecessary_attrs = [
            r'\s+form="prefix"',
            r'\s+form="postfix"',
            r'\s+form="infix"',
            r'\s+stretchy="true"',
            r'\s+stretchy="false"',
            r'\s+fence="true"',
            r'\s+fence="false"',
            r'\s+separator="true"',
            r'\s+separator="false"',
            r'\s+columnalign="[^"]*"',
            r'\s+columnspacing="[^"]*"',
            r'\s+rowspacing="[^"]*"',
            r'\s+class="[^"]*"',
            r'\s+style="[^"]*"',
        ]

        for attr_pattern in unnecessary_attrs:
            mathml = re.sub(attr_pattern, "", mathml)

        # Step 3: Remove redundant single <mrow> wrapper at the top level
        # Pattern: <math ...><mrow>content</mrow></math>
        # Simplify to: <math ...>content</math>
        mrow_pattern = r"(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)"
        match = re.search(mrow_pattern, mathml, re.DOTALL)
        if match:
            # Check if there's only one mrow at the top level
            content = match.group(2)
            # Only remove if the content doesn't have other top-level elements
            if not re.search(r"</[^>]+>\s*<[^/]", content):
                mathml = f"{match.group(1)}{content}{match.group(3)}"

        # Step 4: Change display to block for better Word rendering
        mathml = mathml.replace('display="inline"', 'display="block"')

        # Step 5: If no display attribute, add it
        if "display=" not in mathml and "<math" in mathml:
            mathml = mathml.replace("<math", '<math display="block"', 1)

        # Step 6: Ensure xmlns is present
        if "xmlns=" not in mathml and "<math" in mathml:
            mathml = mathml.replace("<math", '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)

        # Step 7: Decode common Unicode entities to actual characters (Word prefers this)
        unicode_map = {
            # Basic operators
            "&#x0002B;": "+",
            "&#x0002D;": "-",
            "&#x0002A;": "*",
            "&#x0002F;": "/",
            "&#x0003D;": "=",
            "&#x0003C;": "<",
            "&#x0003E;": ">",
            "&#x00028;": "(",
            "&#x00029;": ")",
            "&#x0002C;": ",",
            "&#x0002E;": ".",
            "&#x0007C;": "|",
            "&#x00B0;": "°",
            "&#x00D7;": "×",  # times
            "&#x00F7;": "÷",  # div
            "&#x00B1;": "±",  # pm
            "&#x2213;": "∓",  # mp
            # Ellipsis symbols
            "&#x02026;": "…",  # ldots (horizontal)
            "&#x022EE;": "⋮",  # vdots (vertical)
            "&#x022EF;": "⋯",  # cdots (centered)
            "&#x022F0;": "⋰",  # iddots (diagonal up)
            "&#x022F1;": "⋱",  # ddots (diagonal down)
            # Greek letters (lowercase)
            "&#x03B1;": "α",  # alpha
            "&#x03B2;": "β",  # beta
            "&#x03B3;": "γ",  # gamma
            "&#x03B4;": "δ",  # delta
            "&#x03B5;": "ε",  # epsilon
            "&#x03B6;": "ζ",  # zeta
            "&#x03B7;": "η",  # eta
            "&#x03B8;": "θ",  # theta
            "&#x03B9;": "ι",  # iota
            "&#x03BA;": "κ",  # kappa
            "&#x03BB;": "λ",  # lambda
            "&#x03BC;": "μ",  # mu
            "&#x03BD;": "ν",  # nu
            "&#x03BE;": "ξ",  # xi
            "&#x03BF;": "ο",  # omicron
            "&#x03C0;": "π",  # pi
            "&#x03C1;": "ρ",  # rho
            "&#x03C2;": "ς",  # final sigma
            "&#x03C3;": "σ",  # sigma
            "&#x03C4;": "τ",  # tau
            "&#x03C5;": "υ",  # upsilon
            "&#x03C6;": "φ",  # phi
            "&#x03C7;": "χ",  # chi
            "&#x03C8;": "ψ",  # psi
            "&#x03C9;": "ω",  # omega
            "&#x03D5;": "ϕ",  # phi variant
            # Greek letters (uppercase)
            "&#x0391;": "Α",  # Alpha
            "&#x0392;": "Β",  # Beta
            "&#x0393;": "Γ",  # Gamma
            "&#x0394;": "Δ",  # Delta
            "&#x0395;": "Ε",  # Epsilon
            "&#x0396;": "Ζ",  # Zeta
            "&#x0397;": "Η",  # Eta
            "&#x0398;": "Θ",  # Theta
            "&#x0399;": "Ι",  # Iota
            "&#x039A;": "Κ",  # Kappa
            "&#x039B;": "Λ",  # Lambda
            "&#x039C;": "Μ",  # Mu
            "&#x039D;": "Ν",  # Nu
            "&#x039E;": "Ξ",  # Xi
            "&#x039F;": "Ο",  # Omicron
            "&#x03A0;": "Π",  # Pi
            "&#x03A1;": "Ρ",  # Rho
            "&#x03A3;": "Σ",  # Sigma
            "&#x03A4;": "Τ",  # Tau
            "&#x03A5;": "Υ",  # Upsilon
            "&#x03A6;": "Φ",  # Phi
            "&#x03A7;": "Χ",  # Chi
            "&#x03A8;": "Ψ",  # Psi
            "&#x03A9;": "Ω",  # Omega
            # Math symbols
            "&#x2205;": "∅",  # emptyset
            "&#x2208;": "∈",  # in
            "&#x2209;": "∉",  # notin
            "&#x220B;": "∋",  # ni
            "&#x220C;": "∌",  # nni
            "&#x2211;": "∑",  # sum
            "&#x220F;": "∏",  # prod
            "&#x221A;": "√",  # sqrt
            "&#x221B;": "∛",  # cbrt
            "&#x221C;": "∜",  # fourthroot
            "&#x221E;": "∞",  # infty
            "&#x2229;": "∩",  # cap
            "&#x222A;": "∪",  # cup
            "&#x222B;": "∫",  # int
            "&#x222C;": "∬",  # iint
            "&#x222D;": "∭",  # iiint
            "&#x222E;": "∮",  # oint
            "&#x2282;": "⊂",  # subset
            "&#x2283;": "⊃",  # supset
            "&#x2284;": "⊄",  # nsubset
            "&#x2285;": "⊅",  # nsupset
            "&#x2286;": "⊆",  # subseteq
            "&#x2287;": "⊇",  # supseteq
            "&#x2288;": "⊈",  # nsubseteq
            "&#x2289;": "⊉",  # nsupseteq
            "&#x2264;": "≤",  # leq
            "&#x2265;": "≥",  # geq
            "&#x2260;": "≠",  # neq
            "&#x2261;": "≡",  # equiv
            "&#x2248;": "≈",  # approx
            "&#x2243;": "≃",  # simeq
            "&#x2245;": "≅",  # cong
            "&#x2202;": "∂",  # partial
            "&#x2207;": "∇",  # nabla
            "&#x2200;": "∀",  # forall
            "&#x2203;": "∃",  # exists
            "&#x2204;": "∄",  # nexists
            "&#x00AC;": "¬",  # neg/lnot
            "&#x2227;": "∧",  # wedge/land
            "&#x2228;": "∨",  # vee/lor
            "&#x2192;": "→",  # to/rightarrow
            "&#x2190;": "←",  # leftarrow
            "&#x2194;": "↔",  # leftrightarrow
            "&#x21D2;": "⇒",  # Rightarrow
            "&#x21D0;": "⇐",  # Leftarrow
            "&#x21D4;": "⇔",  # Leftrightarrow
            "&#x2191;": "↑",  # uparrow
            "&#x2193;": "↓",  # downarrow
            "&#x21D1;": "⇑",  # Uparrow
            "&#x21D3;": "⇓",  # Downarrow
            "&#x2195;": "↕",  # updownarrow
            "&#x21D5;": "⇕",  # Updownarrow
            "&#x226A;": "≪",  # ll
            "&#x226B;": "≫",  # gg
            "&#x2A7D;": "⩽",  # leqslant
            "&#x2A7E;": "⩾",  # geqslant
            "&#x22A5;": "⊥",  # perp
            "&#x2225;": "∥",  # parallel
            "&#x2220;": "∠",  # angle
            "&#x25B3;": "△",  # triangle
            "&#x25A1;": "□",  # square
            "&#x25CA;": "◊",  # diamond
            "&#x2660;": "♠",  # spadesuit
            "&#x2661;": "♡",  # heartsuit
            "&#x2662;": "♢",  # diamondsuit
            "&#x2663;": "♣",  # clubsuit
            "&#x2113;": "ℓ",  # ell
            "&#x2118;": "℘",  # wp (Weierstrass p)
            "&#x211C;": "ℜ",  # Re (real part)
            "&#x2111;": "ℑ",  # Im (imaginary part)
            "&#x2135;": "ℵ",  # aleph
            "&#x2136;": "ℶ",  # beth
        }

        for entity, char in unicode_map.items():
            mathml = mathml.replace(entity, char)

        # Also handle decimal entity format (&#NNNN;) for common characters
        # Convert decimal to hex-based lookup
        decimal_patterns = [
            (r"&#955;", "λ"),  # lambda (decimal 955 = hex 03BB)
            (r"&#8942;", "⋮"),  # vdots (decimal 8942 = hex 22EE)
            (r"&#8943;", "⋯"),  # cdots (decimal 8943 = hex 22EF)
            (r"&#8230;", "…"),  # ldots (decimal 8230 = hex 2026)
            (r"&#8734;", "∞"),  # infty (decimal 8734 = hex 221E)
            (r"&#8721;", "∑"),  # sum (decimal 8721 = hex 2211)
            (r"&#8719;", "∏"),  # prod (decimal 8719 = hex 220F)
            (r"&#8730;", "√"),  # sqrt (decimal 8730 = hex 221A)
            (r"&#8712;", "∈"),  # in (decimal 8712 = hex 2208)
            (r"&#8713;", "∉"),  # notin (decimal 8713 = hex 2209)
            (r"&#8745;", "∩"),  # cap (decimal 8745 = hex 2229)
            (r"&#8746;", "∪"),  # cup (decimal 8746 = hex 222A)
            (r"&#8804;", "≤"),  # leq (decimal 8804 = hex 2264)
            (r"&#8805;", "≥"),  # geq (decimal 8805 = hex 2265)
            (r"&#8800;", "≠"),  # neq (decimal 8800 = hex 2260)
            (r"&#8776;", "≈"),  # approx (decimal 8776 = hex 2248)
            (r"&#8801;", "≡"),  # equiv (decimal 8801 = hex 2261)
        ]

        for pattern, char in decimal_patterns:
            mathml = mathml.replace(pattern, char)

        # Step 8: Clean up extra whitespace
        mathml = re.sub(r">\s+<", "><", mathml)

        return mathml

    def _latex_to_mathml(self, latex_formula: str, is_display: bool = False) -> str:
        """Convert LaTeX formula to standard MathML.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters).
            is_display: True if display (block) formula, False if inline.

        Returns:
            Standard MathML representation.
        """
        return self._latex_to_mathml_cached(latex_formula, is_display=is_display)

    def _mathml_to_mml(self, mathml: str) -> str:
        """Convert standard MathML to mml:math format with namespace prefix.

        Uses XSLT for efficient transformation. Transforms:
        - <math ...> to <mml:math xmlns:mml="..." ...>
        - All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>

        Args:
            mathml: Standard MathML string.

        Returns:
            MathML with mml: namespace prefix.
        """
        if not mathml:
            return ""

        try:
            from lxml import etree

            # Parse MathML
            root = etree.fromstring(mathml.encode("utf-8"))

            # Apply XSLT transformation (cached)
            transform = self._get_mml_xslt_transform()
            result_tree = transform(root)

            # Serialize to string
            return str(result_tree)

        except Exception:
            # Fallback: simple string replacement (less robust but no lxml dependency)
            result = mathml
            # Add namespace to root math element
            result = re.sub(
                r"<math\b",
                f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
                result,
            )
            result = re.sub(r"</math>", "</mml:math>", result)

            # Add mml: prefix to all other elements using a single regex
            # Match opening tags
            result = re.sub(
                r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
                r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
                r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
                r"maction|semantics|annotation|annotation-xml)\b",
                r"<mml:\1",
                result,
            )
            # Match closing tags
            result = re.sub(
                r"</(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
                r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
                r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
                r"maction|semantics|annotation|annotation-xml)>",
                r"</mml:\1>",
                result,
            )

            return result

    def _latex_to_omml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to OMML (Office Math Markup Language).

        Uses Pandoc to create DOCX in memory and extracts OMML from it.
        Optimized to minimize disk I/O by using in-memory zip processing.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters).

        Returns:
            OMML representation as XML string.
        """
        import io
        import zipfile

        try:
            from lxml import etree

            # Convert to DOCX bytes using Pandoc
            # We still need a temp file for input, but output goes to temp file too
            # Then we process the DOCX in memory
            with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
                f.write(f"$${latex_formula}$$\n")
                temp_md = f.name

            temp_docx = temp_md.replace(".md", ".docx")

            try:
                pypandoc.convert_file(
                    temp_md,
                    "docx",
                    format=self.INPUT_FORMAT,
                    outputfile=temp_docx,
                )

                # Read DOCX into memory and process as ZIP
                with open(temp_docx, "rb") as f:
                    docx_bytes = f.read()

                # Extract document.xml from DOCX (which is a ZIP file)
                with zipfile.ZipFile(io.BytesIO(docx_bytes), "r") as zf:
                    document_xml = zf.read("word/document.xml")

                # Parse XML and extract OMML
                root = etree.fromstring(document_xml)

                # Find all oMath elements
                omml_parts = []
                for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath"):
                    omml_parts.append(etree.tostring(math, encoding="unicode"))

                return "\n".join(omml_parts)

            finally:
                # Cleanup temp files
                if os.path.exists(temp_md):
                    os.remove(temp_md)
                if os.path.exists(temp_docx):
                    os.remove(temp_docx)

        except Exception as e:
            raise RuntimeError(f"OMML conversion failed: {e}") from e

    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.

        Uses pre-compiled regex patterns for better performance.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}
        md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas
        md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text)
        md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text)

        # Remove arithmatex span wrappers
        cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
        cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$")

        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$")

        # Remove spaces between $ and formula content
        cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)

        # Fix array environment column specifiers (remove spaces)
        cleaned_md = self._fix_array_column_specifiers(cleaned_md)

        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)

        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)

        # Handle LaTeX \tag{} commands for equation numbering
        cleaned_md = self._convert_tag_commands(cleaned_md)

        return cleaned_md

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = self._RE_VMATRIX.sub(
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = self._RE_VMATRIX_DOUBLE.sub(
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
        )

        return md_text

    def _fix_array_column_specifiers(self, md_text: str) -> str:
        """Fix array environment column specifiers by removing spaces.

        Pandoc's OMML converter doesn't accept spaces between column alignment
        specifiers in array environments. This converts patterns like
        {c c c c} to {cccc}.
        """

        def remove_spaces_in_specifier(match: re.Match) -> str:
            """Remove spaces from column specifier."""
            specifier = match.group(1)
            return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"

        return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in Word/OMML.
        """
        md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
        md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.
        """
        # Pre-compiled pattern for alignment marker removal
        _re_align_marker = re.compile(r"(^|\\\\)\s*&")

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

        md_text = self._RE_CASES.sub(convert_cases, md_text)

        def convert_aligned_to_array(match: re.Match) -> str:
            content = match.group(1)
            content = _re_align_marker.sub(r"\1", content)
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

        md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)

        def convert_standalone_aligned(match: re.Match) -> str:
            content = match.group(1)
            content = _re_align_marker.sub(r"\1", content)
            return r"\begin{array}{l}" + content + r"\end{array}"

        md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)

        return md_text

    def _convert_tag_commands(self, md_text: str) -> str:
        """Convert LaTeX \\tag{} commands to Word-compatible format.

        The \\tag{} command is not supported in Word OMML format, so we convert it to
        use simple spacing (\\quad) to push the equation number to the right side.
        """

        def convert_tag(match: re.Match) -> str:
            formula_content = match.group(1)
            tag_content = match.group(2)
            return f"$${formula_content} \\quad ({tag_content})$$"

        return self._RE_TAG.sub(convert_tag, md_text)

    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.

        """

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        # Create temp file for input
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
            f_in.write(cleaned_md.encode("utf-8"))
            md_path = f_in.name

        output_file = md_path + "." + export_type

        try:
            if export_type == "docx":
                self._export_docx(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()
            else:  # pdf
                self._export_pdf(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()

        except Exception as e:
            # Cleanup on error
            self._cleanup_files(md_path, output_file)
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # Always cleanup input file
            if os.path.exists(md_path):
                os.remove(md_path)

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Convert the file at input_path to DOCX at output_path via pypandoc.

        The source is read using ``self.INPUT_FORMAT``. Pandoc is asked for
        pygments code highlighting and to use ``app/pkg/reference.docx`` as the
        reference document; that path is relative, so it is resolved against
        the process working directory.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the DOCX file is written to.
        """
        pypandoc.convert_file(
            input_path,
            "docx",
            extra_args=[
                "--highlight-style=pygments",
                "--reference-doc=app/pkg/reference.docx",
            ],
            outputfile=output_path,
            format=self.INPUT_FORMAT,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Convert the file at input_path to PDF at output_path via pypandoc.

        The source is read using ``self.INPUT_FORMAT``. Pandoc is asked to use
        the XeLaTeX PDF engine with "Noto Sans CJK SC" as the main font and
        pygments code highlighting.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the PDF file is written to.
        """
        pypandoc.convert_file(
            input_path,
            "pdf",
            extra_args=[
                "--pdf-engine=xelatex",
                "-V",
                "mainfont=Noto Sans CJK SC",
                "--highlight-style=pygments",
            ],
            outputfile=output_path,
            format=self.INPUT_FORMAT,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist."""
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client. A path that does not
        exist (for example because it was already cleaned up) is ignored.

        Args:
            file_path: Path to the exported file.
        """
        # Attempt the removal directly instead of checking existence first, so
        # a file removed in between cannot turn cleanup into an error.
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass