feat: optimize format conversion

2026-02-04 12:00:06 +08:00
parent 10dbd59161
commit 526c1f3a0d
7 changed files with 571 additions and 187 deletions
--- a/app/services/converter.py
+++ b/app/services/converter.py
@@ -4,17 +4,29 @@ import os
 import re
 import tempfile
 from dataclasses import dataclass
+from functools import lru_cache
 from typing import Literal

 import pypandoc
+from latex2mathml.converter import convert as latex_to_mathml


@dataclass
 class ConvertResult:
-    """Result of markdown conversion."""
+    """Result of markdown conversion.
+
+    Only populated when the input is a pure LaTeX formula.
+    All fields are empty strings when the input contains mixed content (text + formula).
+
+    Attributes:
+        latex: Pure LaTeX formula code (without delimiters).
+        mathml: Standard MathML format.
+        mml: XML MathML with mml: namespace prefix (mml:math).
+    """

    latex: str
    mathml: str
+    mml: str


@dataclass
@@ -28,59 +40,397 @@ class ExportResult:

 ExportType = Literal["docx", "pdf"]

+# MathML namespace
+MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
+OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
+
+# XSLT for MathML to mml: namespace conversion
+MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:mml="http://www.w3.org/1998/Math/MathML"
+    xmlns:m="http://www.w3.org/1998/Math/MathML"
+    exclude-result-prefixes="m">
+
+    <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
+
+    <!-- Match root math element -->
+    <xsl:template match="m:math|math">
+        <mml:math>
+            <xsl:apply-templates select="@*|node()"/>
+        </mml:math>
+    </xsl:template>
+
+    <!-- Match all other MathML elements -->
+    <xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
+        <xsl:element name="mml:{local-name()}">
+            <xsl:apply-templates select="@*|node()"/>
+        </xsl:element>
+    </xsl:template>
+
+    <!-- Copy attributes -->
+    <xsl:template match="@*">
+        <xsl:if test="local-name() != 'xmlns'">
+            <xsl:copy/>
+        </xsl:if>
+    </xsl:template>
+
+    <!-- Copy text nodes -->
+    <xsl:template match="text()">
+        <xsl:value-of select="."/>
+    </xsl:template>
+
+</xsl:stylesheet>
+"""
+

 class Converter:
-    """Service for conversion and export operations."""
+    """Service for conversion and export operations.
+
+    Conversion rules:
+    - Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
+    - Mixed content (text + formula) returns empty results for all formats.
+    - OMML conversion is provided as a separate method due to performance overhead.
+
+    Performance optimizations:
+    - Pre-compiled regex patterns
+    - XSLT-based MML conversion
+    - Cached XSLT transforms
+    - In-memory extraction of OMML from the DOCX that Pandoc generates
+    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

+    # Pre-compiled regex patterns for formula detection
+    _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
+    _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
+    _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
+    _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
+    _RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")
+
+    # Pre-compiled regex patterns for preprocessing
+    _RE_VSPACE = re.compile(r"\\\[1mm\]")
+    _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
+    _RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
+    _RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
+    _RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
+    _RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
+    _RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
+    _RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
+    _RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
+    _RE_ALIGNED_BRACE = re.compile(r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL)
+    _RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
+    _RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
+    _RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
+    _RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)
+
+    # Cached XSLT transform
+    _mml_xslt_transform = None
+
    def __init__(self):
        """Initialize converter."""

+    @classmethod
+    def _get_mml_xslt_transform(cls):
+        """Get cached XSLT transform for MathML to mml: conversion."""
+        if cls._mml_xslt_transform is None:
+            from lxml import etree
+            xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
+            cls._mml_xslt_transform = etree.XSLT(xslt_doc)
+        return cls._mml_xslt_transform
+
+    def _is_formula_only(self, text: str) -> bool:
+        """Check if text contains only a LaTeX formula (no mixed content).
+
+        A text is considered formula-only if it matches one of these patterns:
+        - Display math: $$...$$ or \\[...\\]
+        - Inline math: $...$ or \\(...\\)
+
+        Args:
+            text: Input text to check.
+
+        Returns:
+            True if the text contains only a LaTeX formula, False otherwise.
+        """
+        text = text.strip()
+
+        if not text:
+            return False
+
+        # Strict check: the entire text must fullmatch one of the pre-compiled delimiter patterns.
+        # NOTE(review): the [\s\S]+ patterns do not reject inner delimiters (e.g. "$$a$$ b $$c$$").
+        if self._RE_DISPLAY_DOLLAR.fullmatch(text):
+            return True
+        if self._RE_DISPLAY_BRACKET.fullmatch(text):
+            return True
+        if self._RE_INLINE_DOLLAR.fullmatch(text):
+            return True
+        if self._RE_INLINE_PAREN.fullmatch(text):
+            return True
+
+        return False
+
    def convert_to_formats(self, md_text: str) -> ConvertResult:
-        """Convert markdown to LaTeX and MathML formats.
+        """Convert markdown to LaTeX, MathML, and MML formats.
+
+        Only converts when input contains a pure LaTeX formula.
+        Mixed content (text + formula) returns empty strings for all fields.

        Args:
            md_text: Markdown text to convert.

        Returns:
-            ConvertResult with latex and mathml fields.
+            ConvertResult with latex, mathml, and mml fields.
+            All fields are empty if input is not a pure formula.

        Raises:
-            ValueError: If md_text is empty.
-            RuntimeError: If conversion fails.
+            RuntimeError: If conversion fails for a valid formula.
        """
-        if md_text == "":
-            return ConvertResult(latex="", mathml="")
+        # Empty input returns empty result
+        if not md_text or not md_text.strip():
+            return ConvertResult(latex="", mathml="", mml="")
+
+        # Check if input is formula-only
+        if not self._is_formula_only(md_text):
+            # Mixed content: cannot convert to formula formats
+            return ConvertResult(latex="", mathml="", mml="")

        try:
-            # Convert to LaTeX
-            latex_output = pypandoc.convert_text(
-                md_text,
-                "latex",
-                format=self.INPUT_FORMAT,
-            ).rstrip("\n")
+            # Extract the LaTeX formula content (remove delimiters)
+            latex_formula = self._extract_latex_formula(md_text)

-            # Convert to HTML with MathML
-            mathml_output = pypandoc.convert_text(
-                md_text,
-                "html",
-                format=self.INPUT_FORMAT,
-                extra_args=["--mathml"],
-            ).rstrip("\n")
+            # Convert to MathML
+            mathml = self._latex_to_mathml(latex_formula)

-            return ConvertResult(latex=latex_output, mathml=mathml_output)
+            # Convert MathML to mml:math format (with namespace prefix)
+            mml = self._mathml_to_mml(mathml)
+
+            return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)

        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

+    def convert_to_omml(self, latex_formula: str) -> str:
+        """Convert LaTeX formula to OMML (Office Math Markup Language).
+
+        This is a separate method due to the performance overhead of OMML conversion,
+        which requires creating a temporary DOCX file.
+
+        Args:
+            latex_formula: Pure LaTeX formula (without delimiters like $ or $$).
+
+        Returns:
+            OMML representation as XML string.
+
+        Raises:
+            ValueError: If latex_formula is empty.
+            RuntimeError: If conversion fails.
+        """
+        if not latex_formula or not latex_formula.strip():
+            raise ValueError("LaTeX formula cannot be empty")
+
+        return self._latex_to_omml(latex_formula.strip())
+
+    def _extract_latex_formula(self, text: str) -> str:
+        """Extract LaTeX formula from text by removing delimiters.
+
+        Args:
+            text: Text containing LaTeX formula with delimiters.
+
+        Returns:
+            Pure LaTeX formula without delimiters.
+        """
+        text = text.strip()
+
+        # Remove display math delimiters: $$...$$ or \[...\]
+        if text.startswith("$$") and text.endswith("$$"):
+            return text[2:-2].strip()
+        if text.startswith("\\[") and text.endswith("\\]"):
+            return text[2:-2].strip()
+
+        # Remove inline math delimiters: $...$ or \(...\)
+        if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
+            return text[1:-1].strip()
+        if text.startswith("\\(") and text.endswith("\\)"):
+            return text[2:-2].strip()
+
+        # If no delimiters, return as-is
+        return text.strip()
+
+    @staticmethod
+    @lru_cache(maxsize=256)
+    def _latex_to_mathml_cached(latex_formula: str) -> str:
+        """Cached conversion of LaTeX formula to MathML.
+
+        Uses LRU cache to avoid recomputing for repeated formulas.
+        """
+        try:
+            # Use latex2mathml library for conversion (fast, pure Python)
+            return latex_to_mathml(latex_formula)
+        except Exception as e:
+            # Fallback: try with Pandoc (slower, but more robust)
+            try:
+                mathml_html = pypandoc.convert_text(
+                    f"${latex_formula}$",
+                    "html",
+                    format="markdown+tex_math_dollars",
+                    extra_args=["--mathml"],
+                )
+                # Extract just the <math> element from the HTML
+                match = Converter._RE_MATH_ELEMENT.search(mathml_html)
+                if match:
+                    return match.group(0)
+                return mathml_html.rstrip("\n")
+            except Exception as pandoc_error:
+                raise RuntimeError(
+                    f"MathML conversion failed: {e}. Pandoc fallback also failed: {pandoc_error}"
+                ) from e
+
+    def _latex_to_mathml(self, latex_formula: str) -> str:
+        """Convert LaTeX formula to standard MathML.
+
+        Args:
+            latex_formula: Pure LaTeX formula (without delimiters).
+
+        Returns:
+            Standard MathML representation.
+        """
+        return self._latex_to_mathml_cached(latex_formula)
+
+    def _mathml_to_mml(self, mathml: str) -> str:
+        """Convert standard MathML to mml:math format with namespace prefix.
+
+        Uses XSLT for efficient transformation. Transforms:
+        - <math ...> to <mml:math xmlns:mml="..." ...>
+        - All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>
+
+        Args:
+            mathml: Standard MathML string.
+
+        Returns:
+            MathML with mml: namespace prefix.
+        """
+        if not mathml:
+            return ""
+
+        try:
+            from lxml import etree
+
+            # Parse MathML
+            root = etree.fromstring(mathml.encode("utf-8"))
+
+            # Apply XSLT transformation (cached)
+            transform = self._get_mml_xslt_transform()
+            result_tree = transform(root)
+
+            # Serialize to string
+            return str(result_tree)
+
+        except Exception:
+            # Fallback: simple string replacement (less robust but no lxml dependency)
+            result = mathml
+            # Add namespace to root math element
+            result = re.sub(
+                r"<math\b",
+                f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
+                result,
+            )
+            result = re.sub(r"</math>", "</mml:math>", result)
+
+            # Add mml: prefix to all other elements (one regex for opening tags, one for closing)
+            # Match opening tags
+            result = re.sub(
+                r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
+                r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
+                r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
+                r"maction|semantics|annotation|annotation-xml)\b",
+                r"<mml:\1",
+                result,
+            )
+            # Match closing tags
+            result = re.sub(
+                r"</(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
+                r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
+                r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
+                r"maction|semantics|annotation|annotation-xml)>",
+                r"</mml:\1>",
+                result,
+            )
+
+            return result
+
+    def _latex_to_omml(self, latex_formula: str) -> str:
+        """Convert LaTeX formula to OMML (Office Math Markup Language).
+
+        Uses Pandoc to write a temporary DOCX file, then extracts OMML from it.
+        The DOCX is read back into memory and unzipped there; temp files are removed afterwards.
+
+        Args:
+            latex_formula: Pure LaTeX formula (without delimiters).
+
+        Returns:
+            OMML representation as XML string.
+        """
+        import io
+        import zipfile
+
+        try:
+            from lxml import etree
+
+            # Convert to DOCX bytes using Pandoc
+            # We still need a temp file for input, but output goes to temp file too
+            # Then we process the DOCX in memory
+            with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
+                f.write(f"$${latex_formula}$$\n")
+                temp_md = f.name
+
+            temp_docx = temp_md.replace(".md", ".docx")
+
+            try:
+                pypandoc.convert_file(
+                    temp_md,
+                    "docx",
+                    format=self.INPUT_FORMAT,
+                    outputfile=temp_docx,
+                )
+
+                # Read DOCX into memory and process as ZIP
+                with open(temp_docx, "rb") as f:
+                    docx_bytes = f.read()
+
+                # Extract document.xml from DOCX (which is a ZIP file)
+                with zipfile.ZipFile(io.BytesIO(docx_bytes), "r") as zf:
+                    document_xml = zf.read("word/document.xml")
+
+                # Parse XML and extract OMML
+                root = etree.fromstring(document_xml)
+
+                # Find all oMath elements
+                omml_parts = []
+                for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath"):
+                    omml_parts.append(etree.tostring(math, encoding="unicode"))
+
+                return "\n".join(omml_parts)
+
+            finally:
+                # Cleanup temp files
+                if os.path.exists(temp_md):
+                    os.remove(temp_md)
+                if os.path.exists(temp_docx):
+                    os.remove(temp_docx)
+
+        except Exception as e:
+            raise RuntimeError(f"OMML conversion failed: {e}") from e
+
    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.

+        Uses pre-compiled regex patterns for better performance.
+
        Args:
            md_text: Raw markdown text.

@@ -88,36 +438,23 @@ class Converter:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}
-        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)
+        md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas
-        md_text = re.sub(
-            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
-            r"\1\n\n\\[\3\\]\n\n\4",
-            md_text,
-            flags=re.DOTALL,
-        )
-        md_text = re.sub(
-            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
-            r"\n\\[\2\\]\n",
-            md_text,
-            flags=re.MULTILINE | re.DOTALL,
-        )
+        md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text)
+        md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text)

        # Remove arithmatex span wrappers
-        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)
+        cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
-        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
-        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
+        cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$")

        # Convert block formulas: \[ \] => $$ $$
-        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
-        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
+        cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$")

        # Remove spaces between $ and formula content
-        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
-        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)
+        cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)
@@ -142,19 +479,15 @@ class Converter:
        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
-        md_text = re.sub(
-            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
+        md_text = self._RE_VMATRIX.sub(
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
-            flags=re.DOTALL,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
-        md_text = re.sub(
-            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
+        md_text = self._RE_VMATRIX_DOUBLE.sub(
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
-            flags=re.DOTALL,
        )

        return md_text
@@ -165,50 +498,22 @@ class Converter:
        Pandoc's OMML converter doesn't accept spaces between column alignment
        specifiers in array environments. This converts patterns like
        {c c c c} to {cccc}.
-
-        Args:
-            md_text: Markdown text with LaTeX formulas.
-
-        Returns:
-            Markdown text with fixed array column specifiers.
        """

        def remove_spaces_in_specifier(match: re.Match) -> str:
            """Remove spaces from column specifier."""
            specifier = match.group(1)
-            # Remove all spaces from the specifier
-            specifier_no_spaces = re.sub(r"\s+", "", specifier)
-            return f"\\begin{{array}}{{{specifier_no_spaces}}}"
+            return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"

-        # Match \begin{array}{...} and remove spaces in the column specifier
-        # Pattern: \begin{array}{c c c ...} -> \begin{array}{ccc...}
-        md_text = re.sub(
-            r"\\begin\{array\}\{([^}]+)\}",
-            remove_spaces_in_specifier,
-            md_text,
-        )
-
-        return md_text
+        return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in Word/OMML.
        """
-        # Fix \left\{ spacing
-        md_text = re.sub(
-            r"\\left\\\{\s+",
-            r"\\left\\{\\!",
-            md_text,
-        )
-
-        # Fix \right\} spacing
-        md_text = re.sub(
-            r"\s+\\right\\\}",
-            r"\\!\\right\\}",
-            md_text,
-        )
-
+        md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
+        md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
@@ -216,42 +521,28 @@ class Converter:

        These environments have better rendering support in Word/OMML.
        """
+        # Alignment-marker pattern, compiled on each call and shared by both aligned converters below
+        _re_align_marker = re.compile(r"(^|\\\\)\s*&")

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

-        md_text = re.sub(
-            r"\\begin\{cases\}(.*?)\\end\{cases\}",
-            convert_cases,
-            md_text,
-            flags=re.DOTALL,
-        )
+        md_text = self._RE_CASES.sub(convert_cases, md_text)

        def convert_aligned_to_array(match: re.Match) -> str:
            content = match.group(1)
-            # Remove leading & alignment markers (not needed in array{l})
-            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
+            content = _re_align_marker.sub(r"\1", content)
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

-        md_text = re.sub(
-            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
-            convert_aligned_to_array,
-            md_text,
-            flags=re.DOTALL,
-        )
+        md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)

        def convert_standalone_aligned(match: re.Match) -> str:
            content = match.group(1)
-            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
+            content = _re_align_marker.sub(r"\1", content)
            return r"\begin{array}{l}" + content + r"\end{array}"

-        md_text = re.sub(
-            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
-            convert_standalone_aligned,
-            md_text,
-            flags=re.DOTALL,
-        )
+        md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)

        return md_text

@@ -259,36 +550,15 @@ class Converter:
        """Convert LaTeX \\tag{} commands to Word-compatible format.

        The \\tag{} command is not supported in Word OMML format, so we convert it to
-        use simple spacing (\quad) to push the equation number to the right side.
-        The tag remains inside the formula for better compatibility.
-
-        Args:
-            md_text: Markdown text containing LaTeX formulas with \\tag{}.
-
-        Returns:
-            Markdown text with \\tag{} commands converted to spacing format.
+        use simple spacing (\\quad) to push the equation number to the right side.
        """

        def convert_tag(match: re.Match) -> str:
-            """Convert a single \\tag{} command within a formula."""
            formula_content = match.group(1)
            tag_content = match.group(2)
-
-            # Replace \tag{...} with \quad (...) to push the number to the right
-            # Keep it inside the formula for better Word compatibility
            return f"$${formula_content} \\quad ({tag_content})$$"

-        # Match display formulas ($$...$$) containing \\tag{...}
-        # Pattern: $$...content...\\tag {?...}...$$
-        # Allow optional space between \tag and {
-        md_text = re.sub(
-            r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$",
-            convert_tag,
-            md_text,
-            flags=re.DOTALL,
-        )
-
-        return md_text
+        return self._RE_TAG.sub(convert_tag, md_text)

    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.
@@ -381,4 +651,3 @@ class Converter:
        """
        if os.path.exists(file_path):
            os.remove(file_path)
-