# doc_processer/app/services/converter.py

"""Markdown conversion and export service using pypandoc."""

import os
import re
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from typing import Literal

import pypandoc
from latex2mathml.converter import convert as latex_to_mathml


@dataclass
class ConvertResult:
    """Result of converting a formula-only markdown input.

    The fields carry content only when the input is a single pure LaTeX
    formula. For empty input or mixed content (text + formula),
    ``Converter.convert_to_formats`` returns every field as an empty string.

    Attributes:
        latex: Pure LaTeX formula code (without delimiters).
        mathml: Standard MathML format.
        mml: XML MathML with mml: namespace prefix (mml:math).
    """

    # Formula with its math delimiters stripped.
    latex: str
    # MathML rendering of the formula.
    mathml: str
    # Same MathML, with every element carrying the ``mml:`` prefix.
    mml: str


@dataclass
class ExportResult:
    """Result of markdown export.

    NOTE(review): nothing in this module constructs this type;
    ``Converter.export_to_file`` returns raw bytes. Presumably it is built by
    a caller that serves the exported file -- confirm before relying on it.
    """

    # Presumably the on-disk location of the exported file -- TODO confirm.
    file_path: str
    # Presumably the MIME type sent with the download -- TODO confirm.
    content_type: str
    # Presumably the file name suggested to the client -- TODO confirm.
    download_name: str


# Export formats accepted by Converter.export_to_file.
ExportType = Literal["docx", "pdf"]

# XML namespaces: MathML (used for the mml: prefix) and Office Math (OMML,
# used to locate oMath elements inside a DOCX document.xml).
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"

# XSLT for MathML to mml: namespace conversion.
# The stylesheet matches both namespaced (m:*) and un-namespaced MathML
# elements and re-emits each one under the mml: prefix, copying attributes
# and text through.
MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:mml="http://www.w3.org/1998/Math/MathML"
    xmlns:m="http://www.w3.org/1998/Math/MathML"
    exclude-result-prefixes="m">

    <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>

    <!-- Match root math element -->
    <xsl:template match="m:math|math">
        <mml:math>
            <xsl:apply-templates select="@*|node()"/>
        </mml:math>
    </xsl:template>

    <!-- Match all other MathML elements -->
    <xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
        <xsl:element name="mml:{local-name()}">
            <xsl:apply-templates select="@*|node()"/>
        </xsl:element>
    </xsl:template>

    <!-- Copy attributes -->
    <xsl:template match="@*">
        <xsl:if test="local-name() != 'xmlns'">
            <xsl:copy/>
        </xsl:if>
    </xsl:template>

    <!-- Copy text nodes -->
    <xsl:template match="text()">
        <xsl:value-of select="."/>
    </xsl:template>

</xsl:stylesheet>
"""


class Converter:
    """Service for conversion and export operations.

    Conversion rules:
    - Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
    - Mixed content (text + formula) returns empty results for all formats.
    - OMML conversion is provided as a separate method due to performance overhead.

    Performance optimizations:
    - Pre-compiled regex patterns
    - XSLT-based MML conversion
    - Cached XSLT transforms
    - LRU-cached LaTeX to MathML conversion

    Note: OMML is obtained by letting Pandoc write a temporary DOCX and
    extracting the oMath elements from its word/document.xml
    (see ``_latex_to_omml``).
    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # Pre-compiled regex patterns for formula detection.
    # The display/paren patterns use a greedy [\s\S]+ body, so on their own
    # they only constrain the first and last delimiter of the text.
    _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
    _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
    _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
    _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
    # First <math>...</math> element inside Pandoc's HTML output.
    _RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")

    # Pre-compiled regex patterns for preprocessing
    # Literal "\[1mm]" spacing argument.
    _RE_VSPACE = re.compile(r"\\\[1mm\]")
    # \[...\] with non-newline characters on both sides vs. on a line of its own.
    _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
    _RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
    _RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
    # Inline $ ... $ with spaces padding the content.
    _RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
    _RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
    _RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
    _RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
    _RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
    _RE_ALIGNED_BRACE = re.compile(r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL)
    _RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
    # $$ ... \tag{n} $$ display formula carrying an equation tag.
    _RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
    _RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
    _RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)

    # Cached XSLT transform (built lazily by _get_mml_xslt_transform, shared
    # by all instances)
    _mml_xslt_transform = None

    def __init__(self) -> None:
        """Initialize converter.

        Performs no setup: the compiled patterns and the cached XSLT
        transform are class attributes, not per-instance state.
        """

    @classmethod
    def _get_mml_xslt_transform(cls):
        """Get cached XSLT transform for MathML to mml: conversion."""
        if cls._mml_xslt_transform is None:
            from lxml import etree
            xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
            cls._mml_xslt_transform = etree.XSLT(xslt_doc)
        return cls._mml_xslt_transform

    def _is_formula_only(self, text: str) -> bool:
        """Check if text contains only a LaTeX formula (no mixed content).

        A text is considered formula-only if it matches one of these patterns:
        - Display math: $$...$$ or \\[...\\]
        - Inline math: $...$ or \\(...\\)

        Args:
            text: Input text to check.

        Returns:
            True if the text contains only a LaTeX formula, False otherwise.
        """
        text = text.strip()

        if not text:
            return False

        # Strict patterns: entire text must be a single formula with delimiters
        # Using pre-compiled patterns with fullmatch semantics
        if self._RE_DISPLAY_DOLLAR.fullmatch(text):
            return True
        if self._RE_DISPLAY_BRACKET.fullmatch(text):
            return True
        if self._RE_INLINE_DOLLAR.fullmatch(text):
            return True
        if self._RE_INLINE_PAREN.fullmatch(text):
            return True

        return False

    def convert_to_formats(self, md_text: str) -> ConvertResult:
        """Convert markdown to LaTeX, MathML, and MML formats.

        Only converts when input contains a pure LaTeX formula.
        Mixed content (text + formula) returns empty strings for all fields.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex, mathml, and mml fields.
            All fields are empty if input is not a pure formula.

        Raises:
            RuntimeError: If conversion fails for a valid formula.
        """
        # Empty input returns empty result
        if not md_text or not md_text.strip():
            return ConvertResult(latex="", mathml="", mml="")

        # Check if input is formula-only
        if not self._is_formula_only(md_text):
            # Mixed content: cannot convert to formula formats
            return ConvertResult(latex="", mathml="", mml="")

        try:
            # Extract the LaTeX formula content (remove delimiters)
            latex_formula = self._extract_latex_formula(md_text)

            # Preprocess formula for better conversion (fix array specifiers, etc.)
            preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula)

            # Convert to MathML
            mathml = self._latex_to_mathml(preprocessed_formula)

            # Convert MathML to mml:math format (with namespace prefix)
            mml = self._mathml_to_mml(mathml)

            return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)

        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

    def convert_to_omml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to OMML (Office Math Markup Language).

        This is a separate method due to the performance overhead of OMML conversion,
        which requires creating a temporary DOCX file.

        The formula is preprocessed using the same logic as export_to_file to ensure
        proper conversion.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters like $ or $$).

        Returns:
            OMML representation as XML string.

        Raises:
            ValueError: If latex_formula is empty.
            RuntimeError: If conversion fails.
        """
        if not latex_formula or not latex_formula.strip():
            raise ValueError("LaTeX formula cannot be empty")

        # Preprocess formula using the same preprocessing as export
        preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip())

        return self._latex_to_omml(preprocessed)

    def _preprocess_formula_for_conversion(self, latex_formula: str) -> str:
        """Preprocess LaTeX formula for any conversion (MathML, OMML, etc.).

        Applies the same preprocessing steps as preprocess_for_export to ensure
        consistency across all conversion paths. This fixes common issues that
        cause Pandoc conversion to fail.

        Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py),
        so we don't need to handle them here.

        Args:
            latex_formula: Pure LaTeX formula.

        Returns:
            Preprocessed LaTeX formula.
        """
        # 1. Convert matrix environments
        latex_formula = self._convert_matrix_environments(latex_formula)

        # 2. Fix array column specifiers (remove spaces)
        latex_formula = self._fix_array_column_specifiers(latex_formula)

        # 3. Fix brace spacing
        latex_formula = self._fix_brace_spacing(latex_formula)

        # 4. Convert special environments (cases, aligned)
        latex_formula = self._convert_special_environments(latex_formula)

        return latex_formula

    def _extract_latex_formula(self, text: str) -> str:
        """Extract LaTeX formula from text by removing delimiters.

        Args:
            text: Text containing LaTeX formula with delimiters.

        Returns:
            Pure LaTeX formula without delimiters.
        """
        text = text.strip()

        # Remove display math delimiters: $$...$$ or \[...\]
        if text.startswith("$$") and text.endswith("$$"):
            return text[2:-2].strip()
        if text.startswith("\\[") and text.endswith("\\]"):
            return text[2:-2].strip()

        # Remove inline math delimiters: $...$ or \(...\)
        if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
            return text[1:-1].strip()
        if text.startswith("\\(") and text.endswith("\\)"):
            return text[2:-2].strip()

        # If no delimiters, return as-is
        return text.strip()

    @staticmethod
    @lru_cache(maxsize=256)
    def _latex_to_mathml_cached(latex_formula: str) -> str:
        """Convert a LaTeX formula to MathML, memoising results per formula.

        Pandoc is tried first (HTML output with ``--mathml``); the ``<math>``
        element is pulled out of the HTML and post-processed for Word. If the
        Pandoc path raises, latex2mathml is used as a fallback and its output
        is post-processed the same way.

        Results are kept in an LRU cache of up to 256 formulas.

        Raises:
            RuntimeError: If both the Pandoc path and the latex2mathml
                fallback fail.
        """
        try:
            html_output = pypandoc.convert_text(
                f"${latex_formula}$",
                "html",
                format="markdown+tex_math_dollars",
                extra_args=["--mathml"],
            )
            found = Converter._RE_MATH_ELEMENT.search(html_output)
            if found is None:
                # No <math> element in the output: hand back Pandoc's HTML,
                # minus trailing newlines.
                return html_output.rstrip("\n")

            return Converter._postprocess_mathml_for_word(found.group(0))

        except Exception as pandoc_error:
            # Fallback: try latex2mathml (less Word-compatible)
            try:
                fallback_mathml = latex_to_mathml(latex_formula)
                return Converter._postprocess_mathml_for_word(fallback_mathml)
            except Exception as e:
                raise RuntimeError(
                    f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
                ) from e

    @staticmethod
    def _postprocess_mathml_for_word(mathml: str) -> str:
        """Post-process MathML to improve Word compatibility.

        Applies transformations to make MathML more compatible and concise:
        - Remove <semantics> and <annotation> wrappers (Word doesn't need them)
        - Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.)
        - Remove redundant single <mrow> wrappers
        - Change display="inline" to display="block" for better rendering
        - Decode Unicode entities to actual characters (Word prefers this)
        - Ensure proper namespace

        Args:
            mathml: MathML string.

        Returns:
            Simplified, Word-compatible MathML string.
        """
        import re

        # Step 1: Remove <semantics> and <annotation> wrappers
        # These often cause Word import issues
        if '<semantics>' in mathml:
            # Extract content between <semantics> and <annotation>
            match = re.search(r'<semantics>(.*?)<annotation', mathml, re.DOTALL)
            if match:
                content = match.group(1).strip()

                # Get the math element attributes
                math_attrs = ""
                math_match = re.search(r'<math([^>]*)>', mathml)
                if math_match:
                    math_attrs = math_match.group(1)

                # Rebuild without semantics
                mathml = f'<math{math_attrs}>{content}</math>'

        # Step 2: Remove unnecessary attributes that don't affect rendering
        # These are verbose and Word doesn't need them
        unnecessary_attrs = [
            r'\s+form="prefix"',
            r'\s+form="postfix"',
            r'\s+form="infix"',
            r'\s+stretchy="true"',
            r'\s+stretchy="false"',
            r'\s+fence="true"',
            r'\s+fence="false"',
            r'\s+separator="true"',
            r'\s+separator="false"',
            r'\s+columnalign="[^"]*"',
            r'\s+columnspacing="[^"]*"',
            r'\s+rowspacing="[^"]*"',
            r'\s+class="[^"]*"',
            r'\s+style="[^"]*"',
        ]

        for attr_pattern in unnecessary_attrs:
            mathml = re.sub(attr_pattern, '', mathml)

        # Step 3: Remove redundant single <mrow> wrapper at the top level
        # Pattern: <math ...><mrow>content</mrow></math>
        # Simplify to: <math ...>content</math>
        mrow_pattern = r'(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)'
        match = re.search(mrow_pattern, mathml, re.DOTALL)
        if match:
            # Check if there's only one mrow at the top level
            content = match.group(2)
            # Only remove if the content doesn't have other top-level elements
            if not re.search(r'</[^>]+>\s*<[^/]', content):
                mathml = f'{match.group(1)}{content}{match.group(3)}'

        # Step 4: Change display to block for better Word rendering
        mathml = mathml.replace('display="inline"', 'display="block"')

        # Step 5: If no display attribute, add it
        if 'display=' not in mathml and '<math' in mathml:
            mathml = mathml.replace('<math', '<math display="block"', 1)

        # Step 6: Ensure xmlns is present
        if 'xmlns=' not in mathml and '<math' in mathml:
            mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)

        # Step 7: Decode common Unicode entities to actual characters (Word prefers this)
        unicode_map = {
            '&#x0002B;': '+',
            '&#x0002D;': '-',
            '&#x0002A;': '*',
            '&#x0002F;': '/',
            '&#x0003D;': '=',
            '&#x0003C;': '<',
            '&#x0003E;': '>',
            '&#x00028;': '(',
            '&#x00029;': ')',
            '&#x0002C;': ',',
            '&#x0002E;': '.',
            '&#x0007C;': '|',
            '&#x02026;': '⋯',
            '&#x022EE;': '⋮',
            '&#x022EF;': '⋯',
            '&#x00B0;': '°',
            '&#x03B3;': 'γ',
            '&#x03C6;': 'φ',
            '&#x03D5;': 'ϕ',
            '&#x03B1;': 'α',
            '&#x03B2;': 'β',
            '&#x03B4;': 'δ',
            '&#x03B5;': 'ε',
            '&#x03B8;': 'θ',
            '&#x03BB;': 'λ',
            '&#x03BC;': 'μ',
            '&#x03C0;': 'π',
            '&#x03C1;': 'ρ',
            '&#x03C3;': 'σ',
            '&#x03C4;': 'τ',
            '&#x03C9;': 'ω',
        }

        for entity, char in unicode_map.items():
            mathml = mathml.replace(entity, char)

        # Step 8: Clean up extra whitespace
        mathml = re.sub(r'>\s+<', '><', mathml)

        return mathml

    def _latex_to_mathml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to standard MathML.

        Thin instance-level wrapper around the LRU-cached static helper
        ``_latex_to_mathml_cached``, so repeated formulas are not reconverted.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters).

        Returns:
            Standard MathML representation.

        Raises:
            RuntimeError: If both the Pandoc conversion and the latex2mathml
                fallback fail (raised by the cached helper).
        """
        return self._latex_to_mathml_cached(latex_formula)

    def _mathml_to_mml(self, mathml: str) -> str:
        """Convert standard MathML to mml:math format with namespace prefix.

        Uses XSLT for efficient transformation. Transforms:
        - <math ...> to <mml:math xmlns:mml="..." ...>
        - All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>

        Args:
            mathml: Standard MathML string.

        Returns:
            MathML with mml: namespace prefix.
        """
        if not mathml:
            return ""

        try:
            from lxml import etree

            # Parse MathML
            root = etree.fromstring(mathml.encode("utf-8"))

            # Apply XSLT transformation (cached)
            transform = self._get_mml_xslt_transform()
            result_tree = transform(root)

            # Serialize to string
            return str(result_tree)

        except Exception:
            # Fallback: simple string replacement (less robust but no lxml dependency)
            result = mathml
            # Add namespace to root math element
            result = re.sub(
                r"<math\b",
                f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
                result,
            )
            result = re.sub(r"</math>", "</mml:math>", result)

            # Add mml: prefix to all other elements using a single regex
            # Match opening tags
            result = re.sub(
                r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
                r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
                r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
                r"maction|semantics|annotation|annotation-xml)\b",
                r"<mml:\1",
                result,
            )
            # Match closing tags
            result = re.sub(
                r"</(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
                r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
                r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
                r"maction|semantics|annotation|annotation-xml)>",
                r"</mml:\1>",
                result,
            )

            return result

    def _latex_to_omml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to OMML (Office Math Markup Language).

        Uses Pandoc to create DOCX in memory and extracts OMML from it.
        Optimized to minimize disk I/O by using in-memory zip processing.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters).

        Returns:
            OMML representation as XML string.
        """
        import io
        import zipfile

        try:
            from lxml import etree

            # Convert to DOCX bytes using Pandoc
            # We still need a temp file for input, but output goes to temp file too
            # Then we process the DOCX in memory
            with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
                f.write(f"$${latex_formula}$$\n")
                temp_md = f.name

            temp_docx = temp_md.replace(".md", ".docx")

            try:
                pypandoc.convert_file(
                    temp_md,
                    "docx",
                    format=self.INPUT_FORMAT,
                    outputfile=temp_docx,
                )

                # Read DOCX into memory and process as ZIP
                with open(temp_docx, "rb") as f:
                    docx_bytes = f.read()

                # Extract document.xml from DOCX (which is a ZIP file)
                with zipfile.ZipFile(io.BytesIO(docx_bytes), "r") as zf:
                    document_xml = zf.read("word/document.xml")

                # Parse XML and extract OMML
                root = etree.fromstring(document_xml)

                # Find all oMath elements
                omml_parts = []
                for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath"):
                    omml_parts.append(etree.tostring(math, encoding="unicode"))

                return "\n".join(omml_parts)

            finally:
                # Cleanup temp files
                if os.path.exists(temp_md):
                    os.remove(temp_md)
                if os.path.exists(temp_docx):
                    os.remove(temp_docx)

        except Exception as e:
            raise RuntimeError(f"OMML conversion failed: {e}") from e

    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.

        Uses pre-compiled regex patterns for better performance.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}
        md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas
        md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text)
        md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text)

        # Remove arithmatex span wrappers
        cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
        cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$")

        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$")

        # Remove spaces between $ and formula content
        cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)

        # Fix array environment column specifiers (remove spaces)
        cleaned_md = self._fix_array_column_specifiers(cleaned_md)

        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)

        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)

        # Handle LaTeX \tag{} commands for equation numbering
        cleaned_md = self._convert_tag_commands(cleaned_md)

        return cleaned_md

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = self._RE_VMATRIX.sub(
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = self._RE_VMATRIX_DOUBLE.sub(
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
        )

        return md_text

    def _fix_array_column_specifiers(self, md_text: str) -> str:
        """Fix array environment column specifiers by removing spaces.

        Pandoc's OMML converter doesn't accept spaces between column alignment
        specifiers in array environments. This converts patterns like
        {c c c c} to {cccc}.
        """

        def remove_spaces_in_specifier(match: re.Match) -> str:
            """Remove spaces from column specifier."""
            specifier = match.group(1)
            return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"

        return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in Word/OMML.
        """
        md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
        md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.
        """
        # Pre-compiled pattern for alignment marker removal
        _re_align_marker = re.compile(r"(^|\\\\)\s*&")

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

        md_text = self._RE_CASES.sub(convert_cases, md_text)

        def convert_aligned_to_array(match: re.Match) -> str:
            content = match.group(1)
            content = _re_align_marker.sub(r"\1", content)
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

        md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)

        def convert_standalone_aligned(match: re.Match) -> str:
            content = match.group(1)
            content = _re_align_marker.sub(r"\1", content)
            return r"\begin{array}{l}" + content + r"\end{array}"

        md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)

        return md_text

    def _convert_tag_commands(self, md_text: str) -> str:
        """Convert LaTeX \\tag{} commands to Word-compatible format.

        The \\tag{} command is not supported in Word OMML format, so we convert it to
        use simple spacing (\\quad) to push the equation number to the right side.
        """

        def convert_tag(match: re.Match) -> str:
            formula_content = match.group(1)
            tag_content = match.group(2)
            return f"$${formula_content} \\quad ({tag_content})$$"

        return self._RE_TAG.sub(convert_tag, md_text)

    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.

        """

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        # Create temp file for input
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
            f_in.write(cleaned_md.encode("utf-8"))
            md_path = f_in.name

        output_file = md_path + "." + export_type

        try:
            if export_type == "docx":
                self._export_docx(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()
            else:  # pdf
                self._export_pdf(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()

        except Exception as e:
            # Cleanup on error
            self._cleanup_files(md_path, output_file)
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # Always cleanup input file
            if os.path.exists(md_path):
                os.remove(md_path)

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Export to DOCX format using pypandoc."""
        extra_args = [
            "--highlight-style=pygments",
            f"--reference-doc=app/pkg/reference.docx",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Export to PDF format using pypandoc with XeLaTeX."""
        extra_args = [
            "--pdf-engine=xelatex",
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ]
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist."""
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client.

        Args:
            file_path: Path to the exported file.
        """
        if os.path.exists(file_path):
            os.remove(file_path)