"""Markdown conversion and export service using pypandoc."""
import os
import re
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from typing import Literal
import pypandoc
from latex2mathml.converter import convert as latex_to_mathml
@dataclass
class ConvertResult:
"""Result of markdown conversion.
Only populated when input contains pure LaTeX formula.
All fields are empty strings when input contains mixed content (text + formula).
Attributes:
latex: Pure LaTeX formula code (without delimiters).
mathml: Standard MathML format.
mml: XML MathML with mml: namespace prefix (mml:math).
"""
latex: str
mathml: str
mml: str
@dataclass
class ExportResult:
"""Result of markdown export."""
file_path: str
content_type: str
download_name: str
ExportType = Literal["docx", "pdf"]
# MathML namespace
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
# XSLT for MathML to mml: namespace conversion
MML_XSLT = """
"""
class Converter:
"""Service for conversion and export operations.
Conversion rules:
- Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
- Mixed content (text + formula) returns empty results for all formats.
- OMML conversion is provided as a separate method due to performance overhead.
Performance optimizations:
- Pre-compiled regex patterns
- XSLT-based MML conversion
- Cached XSLT transforms
- Direct Pandoc OMML output (avoids DOCX parsing)
"""
# Pandoc input format with LaTeX math extensions
INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
# Pre-compiled regex patterns for formula detection
_RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
_RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
_RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
_RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
_RE_MATH_ELEMENT = re.compile(r"")
# Pre-compiled regex patterns for preprocessing
_RE_VSPACE = re.compile(r"\\\[1mm\]")
_RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
_RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
_RE_ARITHMATEX = re.compile(r'(.*?)')
_RE_INLINE_SPACE = re.compile(r"(? bool:
"""Check if text contains only a LaTeX formula (no mixed content).
A text is considered formula-only if it matches one of these patterns:
- Display math: $$...$$ or \\[...\\]
- Inline math: $...$ or \\(...\\)
Args:
text: Input text to check.
Returns:
True if the text contains only a LaTeX formula, False otherwise.
"""
text = text.strip()
if not text:
return False
# Strict patterns: entire text must be a single formula with delimiters
# Using pre-compiled patterns with fullmatch semantics
if self._RE_DISPLAY_DOLLAR.fullmatch(text):
return True
if self._RE_DISPLAY_BRACKET.fullmatch(text):
return True
if self._RE_INLINE_DOLLAR.fullmatch(text):
return True
if self._RE_INLINE_PAREN.fullmatch(text):
return True
return False
def convert_to_formats(self, md_text: str) -> ConvertResult:
"""Convert markdown to LaTeX, MathML, and MML formats.
Only converts when input contains a pure LaTeX formula.
Mixed content (text + formula) returns empty strings for all fields.
Args:
md_text: Markdown text to convert.
Returns:
ConvertResult with latex, mathml, and mml fields.
All fields are empty if input is not a pure formula.
Raises:
RuntimeError: If conversion fails for a valid formula.
"""
# Empty input returns empty result
if not md_text or not md_text.strip():
return ConvertResult(latex="", mathml="", mml="")
# Check if input is formula-only
if not self._is_formula_only(md_text):
# Mixed content: cannot convert to formula formats
return ConvertResult(latex="", mathml="", mml="")
try:
# Extract the LaTeX formula content (remove delimiters)
latex_formula = self._extract_latex_formula(md_text)
# Convert to MathML
mathml = self._latex_to_mathml(latex_formula)
# Convert MathML to mml:math format (with namespace prefix)
mml = self._mathml_to_mml(mathml)
return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)
except Exception as e:
raise RuntimeError(f"Conversion failed: {e}") from e
def convert_to_omml(self, latex_formula: str) -> str:
"""Convert LaTeX formula to OMML (Office Math Markup Language).
This is a separate method due to the performance overhead of OMML conversion,
which requires creating a temporary DOCX file.
The formula is preprocessed using the same logic as export_to_file to ensure
proper conversion.
Args:
latex_formula: Pure LaTeX formula (without delimiters like $ or $$).
Returns:
OMML representation as XML string.
Raises:
ValueError: If latex_formula is empty.
RuntimeError: If conversion fails.
"""
if not latex_formula or not latex_formula.strip():
raise ValueError("LaTeX formula cannot be empty")
# Preprocess formula using the same preprocessing as export
preprocessed = self._preprocess_formula_for_omml(latex_formula.strip())
return self._latex_to_omml(preprocessed)
def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
"""Preprocess LaTeX formula for OMML conversion.
Applies the same preprocessing steps as preprocess_for_export to ensure
consistency. This fixes common issues that cause Pandoc OMML conversion to fail.
Args:
latex_formula: Pure LaTeX formula.
Returns:
Preprocessed LaTeX formula.
"""
# Use the same preprocessing methods as export
# 1. Convert matrix environments
latex_formula = self._convert_matrix_environments(latex_formula)
# 2. Fix array column specifiers (remove spaces)
latex_formula = self._fix_array_column_specifiers(latex_formula)
# 3. Fix brace spacing
latex_formula = self._fix_brace_spacing(latex_formula)
# 4. Convert special environments (cases, aligned)
latex_formula = self._convert_special_environments(latex_formula)
return latex_formula
def _extract_latex_formula(self, text: str) -> str:
"""Extract LaTeX formula from text by removing delimiters.
Args:
text: Text containing LaTeX formula with delimiters.
Returns:
Pure LaTeX formula without delimiters.
"""
text = text.strip()
# Remove display math delimiters: $$...$$ or \[...\]
if text.startswith("$$") and text.endswith("$$"):
return text[2:-2].strip()
if text.startswith("\\[") and text.endswith("\\]"):
return text[2:-2].strip()
# Remove inline math delimiters: $...$ or \(...\)
if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
return text[1:-1].strip()
if text.startswith("\\(") and text.endswith("\\)"):
return text[2:-2].strip()
# If no delimiters, return as-is
return text.strip()
@staticmethod
@lru_cache(maxsize=256)
def _latex_to_mathml_cached(latex_formula: str) -> str:
"""Cached conversion of LaTeX formula to MathML.
Uses LRU cache to avoid recomputing for repeated formulas.
"""
try:
# Use latex2mathml library for conversion (fast, pure Python)
return latex_to_mathml(latex_formula)
except Exception as e:
# Fallback: try with Pandoc (slower, but more robust)
try:
mathml_html = pypandoc.convert_text(
f"${latex_formula}$",
"html",
format="markdown+tex_math_dollars",
extra_args=["--mathml"],
)
# Extract just the