"""Markdown conversion and export service using pypandoc."""
import os
import re
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from typing import Literal
import pypandoc
from latex2mathml.converter import convert as latex_to_mathml
@dataclass
class ConvertResult:
"""Result of markdown conversion.
Only populated when input contains pure LaTeX formula.
All fields are empty strings when input contains mixed content (text + formula).
Attributes:
latex: Pure LaTeX formula code (without delimiters).
mathml: Standard MathML format.
mml: XML MathML with mml: namespace prefix (mml:math).
"""
latex: str
mathml: str
mml: str
@dataclass
class ExportResult:
"""Result of markdown export."""
file_path: str
content_type: str
download_name: str
ExportType = Literal["docx", "pdf"]
# MathML namespace
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
# XSLT for MathML to mml: namespace conversion
MML_XSLT = """
"""
class Converter:
"""Service for conversion and export operations.
Conversion rules:
- Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
- Mixed content (text + formula) returns empty results for all formats.
- OMML conversion is provided as a separate method due to performance overhead.
Performance optimizations:
- Pre-compiled regex patterns
- XSLT-based MML conversion
- Cached XSLT transforms
- Direct Pandoc OMML output (avoids DOCX parsing)
"""
# Pandoc input format with LaTeX math extensions
INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
# Pre-compiled regex patterns for formula detection
_RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
_RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
_RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
_RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
_RE_MATH_ELEMENT = re.compile(r"")
# Pre-compiled regex patterns for preprocessing
_RE_VSPACE = re.compile(r"\\\[1mm\]")
_RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
_RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
_RE_ARITHMATEX = re.compile(r'(.*?)')
_RE_INLINE_SPACE = re.compile(r"(? bool:
"""Check if text contains only a LaTeX formula (no mixed content).
A text is considered formula-only if it matches one of these patterns:
- Display math: $$...$$ or \\[...\\]
- Inline math: $...$ or \\(...\\)
Args:
text: Input text to check.
Returns:
True if the text contains only a LaTeX formula, False otherwise.
"""
text = text.strip()
if not text:
return False
# Strict patterns: entire text must be a single formula with delimiters
# Using pre-compiled patterns with fullmatch semantics
if self._RE_DISPLAY_DOLLAR.fullmatch(text):
return True
if self._RE_DISPLAY_BRACKET.fullmatch(text):
return True
if self._RE_INLINE_DOLLAR.fullmatch(text):
return True
if self._RE_INLINE_PAREN.fullmatch(text):
return True
return False
def convert_to_formats(self, md_text: str) -> ConvertResult:
"""Convert markdown to LaTeX, MathML, and MML formats.
Only converts when input contains a pure LaTeX formula.
Mixed content (text + formula) returns empty strings for all fields.
Args:
md_text: Markdown text to convert.
Returns:
ConvertResult with latex, mathml, and mml fields.
All fields are empty if input is not a pure formula.
Raises:
RuntimeError: If conversion fails for a valid formula.
"""
# Empty input returns empty result
if not md_text or not md_text.strip():
return ConvertResult(latex="", mathml="", mml="")
# Check if input is formula-only
if not self._is_formula_only(md_text):
# Mixed content: cannot convert to formula formats
return ConvertResult(latex="", mathml="", mml="")
try:
# Extract the LaTeX formula content (remove delimiters)
latex_formula = self._extract_latex_formula(md_text)
# Convert to MathML
mathml = self._latex_to_mathml(latex_formula)
# Convert MathML to mml:math format (with namespace prefix)
mml = self._mathml_to_mml(mathml)
return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)
except Exception as e:
raise RuntimeError(f"Conversion failed: {e}") from e
def convert_to_omml(self, latex_formula: str) -> str:
"""Convert LaTeX formula to OMML (Office Math Markup Language).
This is a separate method due to the performance overhead of OMML conversion,
which requires creating a temporary DOCX file.
The formula is preprocessed using the same logic as export_to_file to ensure
proper conversion.
Args:
latex_formula: Pure LaTeX formula (without delimiters like $ or $$).
Returns:
OMML representation as XML string.
Raises:
ValueError: If latex_formula is empty.
RuntimeError: If conversion fails.
"""
if not latex_formula or not latex_formula.strip():
raise ValueError("LaTeX formula cannot be empty")
# Preprocess formula using the same preprocessing as export
preprocessed = self._preprocess_formula_for_omml(latex_formula.strip())
return self._latex_to_omml(preprocessed)
def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
"""Preprocess LaTeX formula for OMML conversion.
Applies the same preprocessing steps as preprocess_for_export to ensure
consistency. This fixes common issues that cause Pandoc OMML conversion to fail.
Args:
latex_formula: Pure LaTeX formula.
Returns:
Preprocessed LaTeX formula.
"""
# Use the same preprocessing methods as export
# 1. Convert matrix environments
latex_formula = self._convert_matrix_environments(latex_formula)
# 2. Fix array column specifiers (remove spaces)
latex_formula = self._fix_array_column_specifiers(latex_formula)
# 3. Fix brace spacing
latex_formula = self._fix_brace_spacing(latex_formula)
# 4. Convert special environments (cases, aligned)
latex_formula = self._convert_special_environments(latex_formula)
return latex_formula
def _extract_latex_formula(self, text: str) -> str:
"""Extract LaTeX formula from text by removing delimiters.
Args:
text: Text containing LaTeX formula with delimiters.
Returns:
Pure LaTeX formula without delimiters.
"""
text = text.strip()
# Remove display math delimiters: $$...$$ or \[...\]
if text.startswith("$$") and text.endswith("$$"):
return text[2:-2].strip()
if text.startswith("\\[") and text.endswith("\\]"):
return text[2:-2].strip()
# Remove inline math delimiters: $...$ or \(...\)
if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
return text[1:-1].strip()
if text.startswith("\\(") and text.endswith("\\)"):
return text[2:-2].strip()
# If no delimiters, return as-is
return text.strip()
@staticmethod
@lru_cache(maxsize=256)
def _latex_to_mathml_cached(latex_formula: str) -> str:
"""Cached conversion of LaTeX formula to MathML.
Uses Pandoc for conversion to ensure Word compatibility.
Pandoc generates standard MathML that Word can properly import.
Uses LRU cache to avoid recomputing for repeated formulas.
"""
try:
# Use Pandoc for Word-compatible MathML (primary method)
mathml_html = pypandoc.convert_text(
f"${latex_formula}$",
"html",
format="markdown+tex_math_dollars",
extra_args=["--mathml"],
)
# Extract just the