"""Markdown conversion and export service using pypandoc."""

import os
import re
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from typing import Literal

import pypandoc
from latex2mathml.converter import convert as latex_to_mathml


@dataclass
class ConvertResult:
    """Result of markdown conversion.

    Only populated when input contains pure LaTeX formula.
    All fields are empty strings when input contains mixed content (text + formula).

    Attributes:
        latex: Pure LaTeX formula code (without delimiters).
        mathml: Standard MathML format.
        mml: XML MathML with mml: namespace prefix (mml:math).
    """

    latex: str
    mathml: str
    mml: str


@dataclass
class ExportResult:
    """Result of markdown export."""

    file_path: str
    content_type: str
    download_name: str


ExportType = Literal["docx", "pdf"]

# MathML namespace
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"

# XSLT for MathML to mml: namespace conversion.
# NOTE(review): the stylesheet body was missing from the file (markup stripped);
# this is a reconstruction that re-emits every MathML (or un-namespaced) element
# with the mml: prefix and copies everything else unchanged. Confirm against
# version control.
MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:m="http://www.w3.org/1998/Math/MathML"
    exclude-result-prefixes="m">
  <xsl:output method="xml" omit-xml-declaration="yes" indent="no"/>

  <xsl:template match="m:* | *[namespace-uri()='']">
    <xsl:element name="mml:{local-name()}"
                 namespace="http://www.w3.org/1998/Math/MathML">
      <xsl:apply-templates select="@*|node()"/>
    </xsl:element>
  </xsl:template>

  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>
</xsl:stylesheet>
"""


class Converter:
    """Service for conversion and export operations.

    Conversion rules:
    - Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
    - Mixed content (text + formula) returns empty results for all formats.
    - OMML conversion is provided as a separate method due to performance overhead.

    Performance optimizations:
    - Pre-compiled regex patterns
    - XSLT-based MML conversion
    - Cached XSLT transforms
    - Direct Pandoc OMML output (avoids DOCX parsing)
    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # Pre-compiled regex patterns for formula detection
    _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
    _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
    _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
    # Restored \( ... \) delimiters; the damaged pattern (r"\\$[\s\S]+\\$") put an
    # end-of-string anchor in the middle and therefore could never match.
    _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
    # Restored <math ...> ... </math>; the tags had been stripped from the pattern.
    _RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")

    # Pre-compiled regex patterns for preprocessing
    _RE_VSPACE = re.compile(r"\\\[1mm\]")
    _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
    _RE_BLOCK_FORMULA_LINE = re.compile(
        r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL
    )
    # NOTE(review): presumably the pymdownx "arithmatex" span wrapper; the tags
    # were stripped from the file, confirm the exact markup.
    _RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')

    # NOTE(review): every pattern from here to _RE_TAG was lost from the file.
    # Each one is reconstructed from how the methods below use it (group numbers
    # and replacement templates); confirm against version control.
    _RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
    _RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
    _RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)
    _RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
    # \s+ (not \s*) so re-running the preprocessing does not stack extra "\!".
    _RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
    _RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
    _RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
    _RE_ALIGNED_BRACE = re.compile(
        r"\\left\\\{\s*\\begin\{aligned\}(.*?)\\end\{aligned\}\s*\\right\.",
        re.DOTALL,
    )
    _RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
    # The formula group refuses to cross a "$$" so a tag is only paired with the
    # display formula that actually contains it.
    _RE_TAG = re.compile(
        r"\$\$((?:(?!\$\$)[\s\S])*?)\\tag\s*\{([^}]+)\}\s*\$\$"
    )

    # Lazily built lxml XSLT transform shared by all instances.
    _mml_xslt_transform = None

    @classmethod
    def _get_mml_xslt_transform(cls):
        """Return the compiled MML XSLT transform, building it on first use.

        NOTE(review): this method was missing from the file; it is reconstructed
        from its single call site in ``_mathml_to_mml``.

        Returns:
            An ``lxml.etree.XSLT`` callable compiled from ``MML_XSLT``.
        """
        if cls._mml_xslt_transform is None:
            from lxml import etree

            # Parse from bytes: lxml rejects str input that carries an XML
            # encoding declaration.
            stylesheet = etree.fromstring(MML_XSLT.encode("utf-8"))
            cls._mml_xslt_transform = etree.XSLT(stylesheet)
        return cls._mml_xslt_transform

    def _is_formula_only(self, text: str) -> bool:
        """Check if text contains only a LaTeX formula (no mixed content).

        A text is considered formula-only if it matches one of these patterns:
        - Display math: $$...$$ or \\[...\\]
        - Inline math: $...$ or \\(...\\)

        Args:
            text: Input text to check.

        Returns:
            True if the text contains only a LaTeX formula, False otherwise.
        """
        text = text.strip()

        if not text:
            return False

        # Strict patterns: entire text must be a single formula with delimiters
        # Using pre-compiled patterns with fullmatch semantics
        if self._RE_DISPLAY_DOLLAR.fullmatch(text):
            return True
        if self._RE_DISPLAY_BRACKET.fullmatch(text):
            return True
        if self._RE_INLINE_DOLLAR.fullmatch(text):
            return True
        if self._RE_INLINE_PAREN.fullmatch(text):
            return True

        return False

    def convert_to_formats(self, md_text: str) -> ConvertResult:
        """Convert markdown to LaTeX, MathML, and MML formats.

        Only converts when input contains a pure LaTeX formula.
        Mixed content (text + formula) returns empty strings for all fields.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex, mathml, and mml fields.
            All fields are empty if input is not a pure formula.

        Raises:
            RuntimeError: If conversion fails for a valid formula.
        """
        # Empty input returns empty result
        if not md_text or not md_text.strip():
            return ConvertResult(latex="", mathml="", mml="")

        # Check if input is formula-only
        if not self._is_formula_only(md_text):
            # Mixed content: cannot convert to formula formats
            return ConvertResult(latex="", mathml="", mml="")

        try:
            # Extract the LaTeX formula content (remove delimiters)
            latex_formula = self._extract_latex_formula(md_text)

            # Convert to MathML
            mathml = self._latex_to_mathml(latex_formula)

            # Convert MathML to mml:math format (with namespace prefix)
            mml = self._mathml_to_mml(mathml)

            return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)

        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

    def convert_to_omml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to OMML (Office Math Markup Language).

        This is a separate method due to the performance overhead of OMML conversion,
        which requires creating a temporary DOCX file.

        The formula is preprocessed using the same logic as export_to_file to ensure
        proper conversion.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters like $ or $$).

        Returns:
            OMML representation as XML string.

        Raises:
            ValueError: If latex_formula is empty.
            RuntimeError: If conversion fails.
        """
        if not latex_formula or not latex_formula.strip():
            raise ValueError("LaTeX formula cannot be empty")

        # Preprocess formula using the same preprocessing as export
        preprocessed = self._preprocess_formula_for_omml(latex_formula.strip())

        return self._latex_to_omml(preprocessed)

    def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
        """Preprocess LaTeX formula for OMML conversion.

        Applies the same preprocessing steps as preprocess_for_export to ensure
        consistency. This fixes common issues that cause Pandoc OMML conversion to fail.

        Args:
            latex_formula: Pure LaTeX formula.

        Returns:
            Preprocessed LaTeX formula.
        """
        # Use the same preprocessing methods as export
        # 1. Convert matrix environments
        latex_formula = self._convert_matrix_environments(latex_formula)

        # 2. Fix array column specifiers (remove spaces)
        latex_formula = self._fix_array_column_specifiers(latex_formula)

        # 3. Fix brace spacing
        latex_formula = self._fix_brace_spacing(latex_formula)

        # 4. Convert special environments (cases, aligned)
        latex_formula = self._convert_special_environments(latex_formula)

        return latex_formula

    def _extract_latex_formula(self, text: str) -> str:
        """Extract LaTeX formula from text by removing delimiters.

        Args:
            text: Text containing LaTeX formula with delimiters.

        Returns:
            Pure LaTeX formula without delimiters.
        """
        text = text.strip()

        # Remove display math delimiters: $$...$$ or \[...\]
        if text.startswith("$$") and text.endswith("$$"):
            return text[2:-2].strip()
        if text.startswith("\\[") and text.endswith("\\]"):
            return text[2:-2].strip()

        # Remove inline math delimiters: $...$ or \(...\)
        if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
            return text[1:-1].strip()
        # Restored "\\(" / "\\)"; the damaged source compared against "\$".
        if text.startswith("\\(") and text.endswith("\\)"):
            return text[2:-2].strip()

        # If no delimiters, return as-is
        return text.strip()

    @staticmethod
    @lru_cache(maxsize=256)
    def _latex_to_mathml_cached(latex_formula: str) -> str:
        """Cached conversion of LaTeX formula to MathML.

        Uses LRU cache to avoid recomputing for repeated formulas.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters).

        Returns:
            MathML produced by latex2mathml, or by the Pandoc fallback.

        Raises:
            RuntimeError: If both latex2mathml and the Pandoc fallback fail.
        """
        try:
            # Use latex2mathml library for conversion (fast, pure Python)
            return latex_to_mathml(latex_formula)
        except Exception as e:
            # Fallback: try with Pandoc (slower, but more robust)
            try:
                mathml_html = pypandoc.convert_text(
                    f"${latex_formula}$",
                    "html",
                    format="markdown+tex_math_dollars",
                    extra_args=["--mathml"],
                )
                # Extract just the <math> element from the HTML
                match = Converter._RE_MATH_ELEMENT.search(mathml_html)
                if match:
                    return match.group(0)
                return mathml_html.rstrip("\n")
            except Exception as pandoc_error:
                raise RuntimeError(
                    f"MathML conversion failed: {e}. Pandoc fallback also failed: {pandoc_error}"
                ) from e

    def _latex_to_mathml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to standard MathML.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters).

        Returns:
            Standard MathML representation.
        """
        return self._latex_to_mathml_cached(latex_formula)

    def _mathml_to_mml(self, mathml: str) -> str:
        """Convert standard MathML to mml:math format with namespace prefix.

        Uses XSLT for efficient transformation. Transforms:
        - to - All child elements like,to, Args:
            mathml: Standard MathML string.

        Returns:
            MathML with mml: namespace prefix.
        """
        if not mathml:
            return ""

        try:
            from lxml import etree

            # Parse MathML
            root = etree.fromstring(mathml.encode("utf-8"))

            # Apply XSLT transformation (cached)
            transform = self._get_mml_xslt_transform()
            result_tree = transform(root)

            # Serialize to string
            return str(result_tree)

        except Exception:
            # Fallback: simple string replacement (less robust but no lxml dependency)
            result = mathml
            # Add namespace to root math element
            result = re.sub(
                r" ", "", result)

            # Add mml: prefix to all other elements using a single regex
            # Match opening tags
            result = re.sub(
                r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
                r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
                r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
                r"maction|semantics|annotation|annotation-xml)\b",
                r" ",
                r"",
                result,
            )

            return result

    def _latex_to_omml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to OMML (Office Math Markup Language).

        Uses Pandoc to create DOCX in memory and extracts OMML from it.
        Optimized to minimize disk I/O by using in-memory zip processing.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters).

        Returns:
            OMML representation as XML string.
        """
        import io
        import zipfile

        try:
            from lxml import etree

            # Convert to DOCX bytes using Pandoc
            # We still need a temp file for input, but output goes to temp file too
            # Then we process the DOCX in memory
            with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
                f.write(f"$${latex_formula}$$\n")
                temp_md = f.name

            temp_docx = temp_md.replace(".md", ".docx")

            try:
                pypandoc.convert_file(
                    temp_md,
                    "docx",
                    format=self.INPUT_FORMAT,
                    outputfile=temp_docx,
                )

                # Read DOCX into memory and process as ZIP
                with open(temp_docx, "rb") as f:
                    docx_bytes = f.read()

                # Extract document.xml from DOCX (which is a ZIP file)
                with zipfile.ZipFile(io.BytesIO(docx_bytes), "r") as zf:
                    document_xml = zf.read("word/document.xml")

                # Parse XML and extract OMML
                root = etree.fromstring(document_xml)

                # Find all oMath elements
                omml_parts = []
                for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath"):
                    omml_parts.append(etree.tostring(math, encoding="unicode"))

                return "\n".join(omml_parts)

            finally:
                # Cleanup temp files
                if os.path.exists(temp_md):
                    os.remove(temp_md)
                if os.path.exists(temp_docx):
                    os.remove(temp_docx)

        except Exception as e:
            raise RuntimeError(f"OMML conversion failed: {e}") from e

    def preprocess_for_export(self, md_text: str) -> str:
        """Normalize markdown so Pandoc renders its formulas well in docx/pdf.

        Runs a fixed sequence of rewrites: regex-based cleanup of spacing and
        delimiters first, then the formula-specific helper passes.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # \[1mm] is a vertical-space hint, not a formula opener; rewrite it
        # before any \[ ... \] handling sees it.
        text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)

        # Surround \[...\] block formulas with blank lines.
        text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", text)
        text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", text)

        # Unwrap arithmatex spans, keeping only their content.
        text = self._RE_ARITHMATEX.sub(r"\1", text)

        # Switch to dollar delimiters: \( \) => $ $, then \[ \] => $$ $$.
        delimiter_map = (
            ("\\(", "$"),
            ("\\)", "$"),
            ("\\[", "$$"),
            ("\\]", "$$"),
        )
        for old, new in delimiter_map:
            text = text.replace(old, new)

        # Drop the padding between $ and the formula content.
        text = self._RE_INLINE_SPACE.sub(r"$\1$", text)

        # Formula-level passes, in order: matrices, array column specifiers,
        # brace spacing, cases/aligned environments, \tag{} commands.
        passes = (
            self._convert_matrix_environments,
            self._fix_array_column_specifiers,
            self._fix_brace_spacing,
            self._convert_special_environments,
            self._convert_tag_commands,
        )
        for apply_pass in passes:
            text = apply_pass(text)

        return text

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = self._RE_VMATRIX.sub(
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = self._RE_VMATRIX_DOUBLE.sub(
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
        )

        return md_text

    def _fix_array_column_specifiers(self, md_text: str) -> str:
        """Fix array environment column specifiers by removing spaces.

        Pandoc's OMML converter doesn't accept spaces between column alignment
        specifiers in array environments. This converts patterns like
        {c c c c} to {cccc}.
        """

        def remove_spaces_in_specifier(match: re.Match) -> str:
            """Remove spaces from column specifier."""
            specifier = match.group(1)
            return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"

        return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in Word/OMML.
        """
        md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
        md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.
        """
        # Pre-compiled pattern for alignment marker removal
        _re_align_marker = re.compile(r"(^|\\\\)\s*&")

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

        md_text = self._RE_CASES.sub(convert_cases, md_text)

        def convert_aligned_to_array(match: re.Match) -> str:
            content = match.group(1)
            content = _re_align_marker.sub(r"\1", content)
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

        md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)

        def convert_standalone_aligned(match: re.Match) -> str:
            content = match.group(1)
            content = _re_align_marker.sub(r"\1", content)
            return r"\begin{array}{l}" + content + r"\end{array}"

        md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)

        return md_text

    def _convert_tag_commands(self, md_text: str) -> str:
        """Convert LaTeX \\tag{} commands to Word-compatible format.

        The \\tag{} command is not supported in Word OMML format, so we convert it to
        use simple spacing (\\quad) to push the equation number to the right side.
        """

        def convert_tag(match: re.Match) -> str:
            formula_content = match.group(1)
            tag_content = match.group(2)
            return f"$${formula_content} \\quad ({tag_content})$$"

        return self._RE_TAG.sub(convert_tag, md_text)

    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.

        """

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        # Create temp file for input
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
            f_in.write(cleaned_md.encode("utf-8"))
            md_path = f_in.name

        output_file = md_path + "." + export_type

        try:
            if export_type == "docx":
                self._export_docx(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()
            else:  # pdf
                self._export_pdf(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()

        except Exception as e:
            # Cleanup on error
            self._cleanup_files(md_path, output_file)
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # Always cleanup input file
            if os.path.exists(md_path):
                os.remove(md_path)

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Convert the markdown file at input_path to DOCX at output_path.

        Runs pypandoc with pygments highlighting and the reference document
        ``app/pkg/reference.docx`` (a relative path, so it is resolved against
        the process working directory).

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path where the DOCX file is written.
        """
        pandoc_args = [
            "--highlight-style=pygments",
            "--reference-doc=app/pkg/reference.docx",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=pandoc_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Convert the markdown file at input_path to PDF at output_path.

        Runs pypandoc with the XeLaTeX engine, "Noto Sans CJK SC" as the main
        font, and pygments highlighting.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path where the PDF file is written.
        """
        pandoc_args = [
            "--pdf-engine=xelatex",
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ]
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=pandoc_args,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist."""
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client. A path that does not
        exist (or has already been removed) is silently ignored.

        Args:
            file_path: Path to the exported file.
        """
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # Already removed; treat cleanup as done.
            pass