2025-12-31 17:38:32 +08:00
|
|
|
|
"""Markdown conversion and export service using pypandoc."""
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
|
import re
|
|
|
|
|
|
import tempfile
|
|
|
|
|
|
from dataclasses import dataclass
|
2026-02-04 12:00:06 +08:00
|
|
|
|
from functools import lru_cache
|
2025-12-31 17:38:32 +08:00
|
|
|
|
from typing import Literal
|
|
|
|
|
|
|
|
|
|
|
|
import pypandoc
|
2026-02-04 12:00:06 +08:00
|
|
|
|
from latex2mathml.converter import convert as latex_to_mathml
|
2025-12-31 17:38:32 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ConvertResult:
    """Result of converting a Markdown snippet into formula formats.

    The fields are only populated when the input is a single, pure LaTeX
    formula. For empty input or mixed content (text + formula) every field
    is an empty string.

    Attributes:
        latex: Pure LaTeX formula code (without delimiters).
        mathml: Standard MathML format.
        mml: XML MathML with mml: namespace prefix (mml:math).
    """

    # LaTeX source of the formula with its math delimiters removed.
    latex: str
    # MathML markup for the same formula.
    mathml: str
    # The MathML markup re-serialized with the mml: element prefix.
    mml: str
|
2025-12-31 17:38:32 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ExportResult:
    """Result of markdown export.

    Groups the three string values that describe an exported file.
    """

    # Path of the exported file.
    file_path: str
    # Content type string associated with the exported file.
    content_type: str
    # File name to present when the file is downloaded.
    download_name: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Closed set of export target formats accepted by the export API.
ExportType = Literal["docx", "pdf"]

# XML namespace URIs used when reading and writing math markup.
# MathML (W3C) namespace, used for <math> / <mml:math> elements.
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
# Office Math Markup Language namespace, used to locate oMath elements.
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
|
|
|
|
|
|
|
|
|
|
|
|
# XSLT for MathML to mml: namespace conversion.
#
# XSLT 1.0 stylesheet that rewrites a MathML tree so that every element is
# emitted with the "mml:" prefix bound to the MathML namespace:
#   - the root <math> element (namespaced or not) becomes <mml:math>;
#   - any element in the MathML namespace, plus the listed un-namespaced
#     MathML element names, becomes <mml:NAME>;
#   - attributes and text nodes are copied through unchanged.
# The output is serialized without an XML declaration and without
# re-indentation.
MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:mml="http://www.w3.org/1998/Math/MathML"
    xmlns:m="http://www.w3.org/1998/Math/MathML"
    exclude-result-prefixes="m">

  <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>

  <!-- Match root math element -->
  <xsl:template match="m:math|math">
    <mml:math>
      <xsl:apply-templates select="@*|node()"/>
    </mml:math>
  </xsl:template>

  <!-- Match all other MathML elements -->
  <xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
    <xsl:element name="mml:{local-name()}">
      <xsl:apply-templates select="@*|node()"/>
    </xsl:element>
  </xsl:template>

  <!-- Copy attributes -->
  <xsl:template match="@*">
    <xsl:if test="local-name() != 'xmlns'">
      <xsl:copy/>
    </xsl:if>
  </xsl:template>

  <!-- Copy text nodes -->
  <xsl:template match="text()">
    <xsl:value-of select="."/>
  </xsl:template>

</xsl:stylesheet>
"""
|
|
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
|
|
|
|
|
|
|
class Converter:
|
2026-02-04 12:00:06 +08:00
|
|
|
|
"""Service for conversion and export operations.
|
|
|
|
|
|
|
|
|
|
|
|
Conversion rules:
|
|
|
|
|
|
- Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
|
|
|
|
|
|
- Mixed content (text + formula) returns empty results for all formats.
|
|
|
|
|
|
- OMML conversion is provided as a separate method due to performance overhead.
|
|
|
|
|
|
|
|
|
|
|
|
Performance optimizations:
|
|
|
|
|
|
- Pre-compiled regex patterns
|
|
|
|
|
|
- XSLT-based MML conversion
|
|
|
|
|
|
- Cached XSLT transforms
|
|
|
|
|
|
- Direct Pandoc OMML output (avoids DOCX parsing)
|
|
|
|
|
|
"""
|
2025-12-31 17:38:32 +08:00
|
|
|
|
|
|
|
|
|
|
# Pandoc input format with LaTeX math extensions
# (raw TeX passthrough, $...$ / $$...$$ math, and \\(...\\) / \\[...\\] math).
INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

# Pre-compiled regex patterns for formula detection.
# NOTE: the [\s\S]+ bodies are greedy, so with fullmatch() these patterns
# also accept several formulas separated by text (e.g. "$$a$$ x $$b$$").
# "$$", one or more of any character (newlines included), "$$".
_RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
# "\[", one or more of any character (newlines included), "\]".
_RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
# A single "$" (not "$$"), one or more non-"$" characters, a single "$".
_RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
# "\(", one or more of any character (newlines included), "\)".
_RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
# A <math ...>...</math> element; non-greedy, so it stops at the first </math>.
_RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")

# Pre-compiled regex patterns for preprocessing.
# The literal text "\[1mm]".
_RE_VSPACE = re.compile(r"\\\[1mm\]")
# "\[...\]" with a non-newline character on both sides.
# Groups: (char before)(whitespace)(formula body)(char after).
_RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
# "\[...\]" that starts a line (after optional whitespace) and is followed
# only by optional whitespace up to a newline or the end of the string.
_RE_BLOCK_FORMULA_LINE = re.compile(
    r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL
)
# <span class="arithmatex">...</span>, capturing the span content
# (no DOTALL, so the content cannot span lines).
_RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
# Inline "$ ... $" whose body is padded with spaces inside the dollars;
# captures the body without the padding.
_RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
# "\begin{array}{...}", capturing the column specifier.
_RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
# "\left\{" followed by whitespace.
_RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
# Whitespace followed by "\right\}".
_RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
# "\begin{cases}...\end{cases}", capturing the body.
_RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
# "\left\{\begin{aligned}...\end{aligned}\right.", capturing the body.
_RE_ALIGNED_BRACE = re.compile(
    r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL
)
# "\begin{aligned}...\end{aligned}", capturing the body.
_RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
# "$$ ... \tag{N} $$"; groups: (formula before the tag)(tag content).
_RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
# "\begin{vmatrix}...\end{vmatrix}", capturing the body.
_RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
# "\begin{Vmatrix}...\end{Vmatrix}", capturing the body.
_RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)

# Cached XSLT transform; None until _get_mml_xslt_transform() first builds it.
_mml_xslt_transform = None
|
|
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
|
def __init__(self):
    """Initialize converter.

    No instance attributes are set here; the patterns and the cached XSLT
    transform used by the methods are class-level attributes.
    """
|
|
|
|
|
|
|
2026-02-04 12:00:06 +08:00
|
|
|
|
@classmethod
def _get_mml_xslt_transform(cls):
    """Return the compiled MathML -> mml: XSLT transform, building it once.

    The transform is compiled from MML_XSLT on first use and stored on the
    class, so later calls reuse the same object.

    Returns:
        The lxml XSLT transform object.
    """
    if cls._mml_xslt_transform is not None:
        return cls._mml_xslt_transform

    # Imported lazily so lxml is only needed when the transform is used.
    from lxml import etree

    stylesheet = etree.fromstring(MML_XSLT.encode("utf-8"))
    cls._mml_xslt_transform = etree.XSLT(stylesheet)
    return cls._mml_xslt_transform
|
|
|
|
|
|
|
|
|
|
|
|
def _is_formula_only(self, text: str) -> bool:
|
|
|
|
|
|
"""Check if text contains only a LaTeX formula (no mixed content).
|
|
|
|
|
|
|
|
|
|
|
|
A text is considered formula-only if it matches one of these patterns:
|
|
|
|
|
|
- Display math: $$...$$ or \\[...\\]
|
|
|
|
|
|
- Inline math: $...$ or \\(...\\)
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
text: Input text to check.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
True if the text contains only a LaTeX formula, False otherwise.
|
|
|
|
|
|
"""
|
|
|
|
|
|
text = text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
if not text:
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
# Strict patterns: entire text must be a single formula with delimiters
|
|
|
|
|
|
# Using pre-compiled patterns with fullmatch semantics
|
|
|
|
|
|
if self._RE_DISPLAY_DOLLAR.fullmatch(text):
|
|
|
|
|
|
return True
|
|
|
|
|
|
if self._RE_DISPLAY_BRACKET.fullmatch(text):
|
|
|
|
|
|
return True
|
|
|
|
|
|
if self._RE_INLINE_DOLLAR.fullmatch(text):
|
|
|
|
|
|
return True
|
|
|
|
|
|
if self._RE_INLINE_PAREN.fullmatch(text):
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
|
def convert_to_formats(self, md_text: str) -> ConvertResult:
    """Convert a pure-formula Markdown snippet to LaTeX, MathML and MML.

    Conversion only happens when the input is a single LaTeX formula.
    Empty input and mixed content (text + formula) both produce a result
    whose fields are all empty strings.

    Args:
        md_text: Markdown text to convert.

    Returns:
        ConvertResult with latex, mathml, and mml fields; all empty if the
        input is not a pure formula.

    Raises:
        RuntimeError: If conversion fails for a valid formula.
    """
    # Blank input is checked first so None / "" never reach the detector.
    is_blank = not md_text or not md_text.strip()
    if is_blank or not self._is_formula_only(md_text):
        return ConvertResult(latex="", mathml="", mml="")

    try:
        # Display vs inline decides which delimiters Pandoc is given.
        display = self._is_display_formula(md_text)
        # Formula body without its delimiters; this is what is returned.
        latex_formula = self._extract_latex_formula(md_text)
        # Only the converter sees the preprocessed variant.
        prepared = self._preprocess_formula_for_conversion(latex_formula)
        mathml = self._latex_to_mathml(prepared, is_display=display)
        mml = self._mathml_to_mml(mathml)
    except Exception as e:
        raise RuntimeError(f"Conversion failed: {e}") from e

    return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)
|
|
|
|
|
|
|
2026-02-04 12:00:06 +08:00
|
|
|
|
def convert_to_omml(self, latex_formula: str) -> str:
    """Convert a LaTeX formula to OMML (Office Math Markup Language).

    Kept separate from convert_to_formats because the OMML path is more
    expensive. The formula is trimmed and run through
    _preprocess_formula_for_conversion before being converted.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters like $ or $$).

    Returns:
        OMML representation as XML string.

    Raises:
        ValueError: If latex_formula is empty.
        RuntimeError: If conversion fails.
    """
    trimmed = latex_formula.strip() if latex_formula else ""
    if not trimmed:
        raise ValueError("LaTeX formula cannot be empty")

    prepared = self._preprocess_formula_for_conversion(trimmed)
    return self._latex_to_omml(prepared)
|
|
|
|
|
|
|
2026-02-04 15:52:04 +08:00
|
|
|
|
def _preprocess_formula_for_conversion(self, latex_formula: str) -> str:
|
|
|
|
|
|
"""Preprocess LaTeX formula for any conversion (MathML, OMML, etc.).
|
2026-02-04 12:45:34 +08:00
|
|
|
|
|
|
|
|
|
|
Applies the same preprocessing steps as preprocess_for_export to ensure
|
2026-02-07 21:28:46 +08:00
|
|
|
|
consistency across all conversion paths. This fixes common issues that
|
2026-02-04 15:52:04 +08:00
|
|
|
|
cause Pandoc conversion to fail.
|
2026-02-04 12:45:34 +08:00
|
|
|
|
|
2026-02-07 21:28:46 +08:00
|
|
|
|
Note: OCR errors (number errors, command spacing) are fixed earlier in the
|
|
|
|
|
|
pipeline (in ocr_service.py), so we don't need to handle them here.
|
2026-02-04 16:04:18 +08:00
|
|
|
|
|
2026-02-04 12:45:34 +08:00
|
|
|
|
Args:
|
|
|
|
|
|
latex_formula: Pure LaTeX formula.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
Preprocessed LaTeX formula.
|
|
|
|
|
|
"""
|
|
|
|
|
|
# 1. Convert matrix environments
|
|
|
|
|
|
latex_formula = self._convert_matrix_environments(latex_formula)
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 16:04:18 +08:00
|
|
|
|
# 2. Fix array column specifiers (remove spaces)
|
2026-02-04 12:45:34 +08:00
|
|
|
|
latex_formula = self._fix_array_column_specifiers(latex_formula)
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 12:45:34 +08:00
|
|
|
|
# 3. Fix brace spacing
|
|
|
|
|
|
latex_formula = self._fix_brace_spacing(latex_formula)
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 12:45:34 +08:00
|
|
|
|
# 4. Convert special environments (cases, aligned)
|
|
|
|
|
|
latex_formula = self._convert_special_environments(latex_formula)
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 12:45:34 +08:00
|
|
|
|
return latex_formula
|
2026-02-04 12:00:06 +08:00
|
|
|
|
|
2026-02-07 21:28:46 +08:00
|
|
|
|
def _is_display_formula(self, text: str) -> bool:
|
|
|
|
|
|
"""Check if the formula is a display (block) formula.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
text: Text containing LaTeX formula with delimiters.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
True if display formula ($$...$$ or \\[...\\]), False if inline.
|
|
|
|
|
|
"""
|
|
|
|
|
|
text = text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
# Display math delimiters: $$...$$ or \[...\]
|
|
|
|
|
|
if text.startswith("$$") and text.endswith("$$"):
|
|
|
|
|
|
return True
|
|
|
|
|
|
if text.startswith("\\[") and text.endswith("\\]"):
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
# Inline math delimiters: $...$ or \(...\)
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
2026-02-04 12:00:06 +08:00
|
|
|
|
def _extract_latex_formula(self, text: str) -> str:
|
|
|
|
|
|
"""Extract LaTeX formula from text by removing delimiters.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
text: Text containing LaTeX formula with delimiters.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
Pure LaTeX formula without delimiters.
|
|
|
|
|
|
"""
|
|
|
|
|
|
text = text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
# Remove display math delimiters: $$...$$ or \[...\]
|
|
|
|
|
|
if text.startswith("$$") and text.endswith("$$"):
|
|
|
|
|
|
return text[2:-2].strip()
|
|
|
|
|
|
if text.startswith("\\[") and text.endswith("\\]"):
|
|
|
|
|
|
return text[2:-2].strip()
|
|
|
|
|
|
|
|
|
|
|
|
# Remove inline math delimiters: $...$ or \(...\)
|
|
|
|
|
|
if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
|
|
|
|
|
|
return text[1:-1].strip()
|
|
|
|
|
|
if text.startswith("\\(") and text.endswith("\\)"):
|
|
|
|
|
|
return text[2:-2].strip()
|
|
|
|
|
|
|
|
|
|
|
|
# If no delimiters, return as-is
|
|
|
|
|
|
return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
@lru_cache(maxsize=256)
def _latex_to_mathml_cached(latex_formula: str, is_display: bool = False) -> str:
    """Convert a LaTeX formula to MathML, memoizing the result.

    Pandoc is tried first. If Pandoc raises, or its HTML output contains no
    <math> element, latex2mathml is used as a fallback (the fallback does
    not take the display flag into account). Either result is passed
    through _postprocess_mathml_for_word.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).
        is_display: True if display (block) formula, False if inline.

    Returns:
        Standard MathML representation.

    Raises:
        RuntimeError: If both Pandoc and the latex2mathml fallback fail.
    """
    # Display formulas are wrapped in $$...$$, inline formulas in $...$.
    pandoc_input = f"$${latex_formula}$$" if is_display else f"${latex_formula}$"

    try:
        html_output = pypandoc.convert_text(
            pandoc_input,
            "html",
            format="markdown+tex_math_dollars",
            extra_args=["--mathml"],
        )
        # Keep only the <math> element of the generated HTML.
        found = Converter._RE_MATH_ELEMENT.search(html_output)
        if found is None:
            # Raised inside the try on purpose: it routes to the fallback.
            raise ValueError("Pandoc did not generate MathML, got HTML instead")
        return Converter._postprocess_mathml_for_word(found.group(0))

    except Exception as pandoc_error:
        # Fallback: latex2mathml (less Word-compatible).
        try:
            fallback_mathml = latex_to_mathml(latex_formula)
            return Converter._postprocess_mathml_for_word(fallback_mathml)
        except Exception as e:
            raise RuntimeError(
                f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
            ) from e
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 15:49:13 +08:00
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _postprocess_mathml_for_word(mathml: str) -> str:
|
|
|
|
|
|
"""Post-process MathML to improve Word compatibility.
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 16:56:20 +08:00
|
|
|
|
Applies transformations to make MathML more compatible and concise:
|
2026-02-04 16:12:22 +08:00
|
|
|
|
- Remove <semantics> and <annotation> wrappers (Word doesn't need them)
|
2026-02-04 16:56:20 +08:00
|
|
|
|
- Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.)
|
|
|
|
|
|
- Remove redundant single <mrow> wrappers
|
2026-02-04 15:49:13 +08:00
|
|
|
|
- Change display="inline" to display="block" for better rendering
|
|
|
|
|
|
- Decode Unicode entities to actual characters (Word prefers this)
|
2026-02-04 16:12:22 +08:00
|
|
|
|
- Ensure proper namespace
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 15:49:13 +08:00
|
|
|
|
Args:
|
|
|
|
|
|
mathml: MathML string.
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 15:49:13 +08:00
|
|
|
|
Returns:
|
2026-02-04 16:56:20 +08:00
|
|
|
|
Simplified, Word-compatible MathML string.
|
2026-02-04 15:49:13 +08:00
|
|
|
|
"""
|
2026-02-04 16:12:22 +08:00
|
|
|
|
import re
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 16:12:22 +08:00
|
|
|
|
# Step 1: Remove <semantics> and <annotation> wrappers
|
|
|
|
|
|
# These often cause Word import issues
|
2026-02-07 21:28:46 +08:00
|
|
|
|
if "<semantics>" in mathml:
|
2026-02-04 16:12:22 +08:00
|
|
|
|
# Extract content between <semantics> and <annotation>
|
2026-02-07 21:28:46 +08:00
|
|
|
|
match = re.search(r"<semantics>(.*?)<annotation", mathml, re.DOTALL)
|
2026-02-04 16:12:22 +08:00
|
|
|
|
if match:
|
|
|
|
|
|
content = match.group(1).strip()
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 16:12:22 +08:00
|
|
|
|
# Get the math element attributes
|
|
|
|
|
|
math_attrs = ""
|
2026-02-07 21:28:46 +08:00
|
|
|
|
math_match = re.search(r"<math([^>]*)>", mathml)
|
2026-02-04 16:12:22 +08:00
|
|
|
|
if math_match:
|
|
|
|
|
|
math_attrs = math_match.group(1)
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 16:12:22 +08:00
|
|
|
|
# Rebuild without semantics
|
2026-02-07 21:28:46 +08:00
|
|
|
|
mathml = f"<math{math_attrs}>{content}</math>"
|
|
|
|
|
|
|
2026-02-04 16:56:20 +08:00
|
|
|
|
# Step 2: Remove unnecessary attributes that don't affect rendering
|
|
|
|
|
|
# These are verbose and Word doesn't need them
|
|
|
|
|
|
unnecessary_attrs = [
|
|
|
|
|
|
r'\s+form="prefix"',
|
|
|
|
|
|
r'\s+form="postfix"',
|
|
|
|
|
|
r'\s+form="infix"',
|
|
|
|
|
|
r'\s+stretchy="true"',
|
|
|
|
|
|
r'\s+stretchy="false"',
|
|
|
|
|
|
r'\s+fence="true"',
|
|
|
|
|
|
r'\s+fence="false"',
|
|
|
|
|
|
r'\s+separator="true"',
|
|
|
|
|
|
r'\s+separator="false"',
|
|
|
|
|
|
r'\s+columnalign="[^"]*"',
|
|
|
|
|
|
r'\s+columnspacing="[^"]*"',
|
|
|
|
|
|
r'\s+rowspacing="[^"]*"',
|
|
|
|
|
|
r'\s+class="[^"]*"',
|
|
|
|
|
|
r'\s+style="[^"]*"',
|
|
|
|
|
|
]
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 16:56:20 +08:00
|
|
|
|
for attr_pattern in unnecessary_attrs:
|
2026-02-07 21:28:46 +08:00
|
|
|
|
mathml = re.sub(attr_pattern, "", mathml)
|
|
|
|
|
|
|
2026-02-04 16:56:20 +08:00
|
|
|
|
# Step 3: Remove redundant single <mrow> wrapper at the top level
|
|
|
|
|
|
# Pattern: <math ...><mrow>content</mrow></math>
|
|
|
|
|
|
# Simplify to: <math ...>content</math>
|
2026-02-07 21:28:46 +08:00
|
|
|
|
mrow_pattern = r"(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)"
|
2026-02-04 16:56:20 +08:00
|
|
|
|
match = re.search(mrow_pattern, mathml, re.DOTALL)
|
|
|
|
|
|
if match:
|
|
|
|
|
|
# Check if there's only one mrow at the top level
|
|
|
|
|
|
content = match.group(2)
|
|
|
|
|
|
# Only remove if the content doesn't have other top-level elements
|
2026-02-07 21:28:46 +08:00
|
|
|
|
if not re.search(r"</[^>]+>\s*<[^/]", content):
|
|
|
|
|
|
mathml = f"{match.group(1)}{content}{match.group(3)}"
|
|
|
|
|
|
|
2026-02-04 16:56:20 +08:00
|
|
|
|
# Step 4: Change display to block for better Word rendering
|
2026-02-04 15:49:13 +08:00
|
|
|
|
mathml = mathml.replace('display="inline"', 'display="block"')
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 16:56:20 +08:00
|
|
|
|
# Step 5: If no display attribute, add it
|
2026-02-07 21:28:46 +08:00
|
|
|
|
if "display=" not in mathml and "<math" in mathml:
|
|
|
|
|
|
mathml = mathml.replace("<math", '<math display="block"', 1)
|
|
|
|
|
|
|
2026-02-04 16:56:20 +08:00
|
|
|
|
# Step 6: Ensure xmlns is present
|
2026-02-07 21:28:46 +08:00
|
|
|
|
if "xmlns=" not in mathml and "<math" in mathml:
|
|
|
|
|
|
mathml = mathml.replace("<math", '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
|
|
|
|
|
|
|
2026-02-04 16:56:20 +08:00
|
|
|
|
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
|
2026-02-04 15:49:13 +08:00
|
|
|
|
unicode_map = {
|
2026-02-05 13:18:55 +08:00
|
|
|
|
# Basic operators
|
2026-02-07 21:28:46 +08:00
|
|
|
|
"+": "+",
|
|
|
|
|
|
"-": "-",
|
|
|
|
|
|
"*": "*",
|
|
|
|
|
|
"/": "/",
|
|
|
|
|
|
"=": "=",
|
|
|
|
|
|
"<": "<",
|
|
|
|
|
|
">": ">",
|
|
|
|
|
|
"(": "(",
|
|
|
|
|
|
")": ")",
|
|
|
|
|
|
",": ",",
|
|
|
|
|
|
".": ".",
|
|
|
|
|
|
"|": "|",
|
|
|
|
|
|
"°": "°",
|
|
|
|
|
|
"×": "×", # times
|
|
|
|
|
|
"÷": "÷", # div
|
|
|
|
|
|
"±": "±", # pm
|
|
|
|
|
|
"∓": "∓", # mp
|
2026-02-05 13:18:55 +08:00
|
|
|
|
# Ellipsis symbols
|
2026-02-07 21:28:46 +08:00
|
|
|
|
"…": "…", # ldots (horizontal)
|
|
|
|
|
|
"⋮": "⋮", # vdots (vertical)
|
|
|
|
|
|
"⋯": "⋯", # cdots (centered)
|
|
|
|
|
|
"⋰": "⋰", # iddots (diagonal up)
|
|
|
|
|
|
"⋱": "⋱", # ddots (diagonal down)
|
2026-02-05 13:18:55 +08:00
|
|
|
|
# Greek letters (lowercase)
|
2026-02-07 21:28:46 +08:00
|
|
|
|
"α": "α", # alpha
|
|
|
|
|
|
"β": "β", # beta
|
|
|
|
|
|
"γ": "γ", # gamma
|
|
|
|
|
|
"δ": "δ", # delta
|
|
|
|
|
|
"ε": "ε", # epsilon
|
|
|
|
|
|
"ζ": "ζ", # zeta
|
|
|
|
|
|
"η": "η", # eta
|
|
|
|
|
|
"θ": "θ", # theta
|
|
|
|
|
|
"ι": "ι", # iota
|
|
|
|
|
|
"κ": "κ", # kappa
|
|
|
|
|
|
"λ": "λ", # lambda
|
|
|
|
|
|
"μ": "μ", # mu
|
|
|
|
|
|
"ν": "ν", # nu
|
|
|
|
|
|
"ξ": "ξ", # xi
|
|
|
|
|
|
"ο": "ο", # omicron
|
|
|
|
|
|
"π": "π", # pi
|
|
|
|
|
|
"ρ": "ρ", # rho
|
|
|
|
|
|
"ς": "ς", # final sigma
|
|
|
|
|
|
"σ": "σ", # sigma
|
|
|
|
|
|
"τ": "τ", # tau
|
|
|
|
|
|
"υ": "υ", # upsilon
|
|
|
|
|
|
"φ": "φ", # phi
|
|
|
|
|
|
"χ": "χ", # chi
|
|
|
|
|
|
"ψ": "ψ", # psi
|
|
|
|
|
|
"ω": "ω", # omega
|
|
|
|
|
|
"ϕ": "ϕ", # phi variant
|
2026-02-05 13:18:55 +08:00
|
|
|
|
# Greek letters (uppercase)
|
2026-02-07 21:28:46 +08:00
|
|
|
|
"Α": "Α", # Alpha
|
|
|
|
|
|
"Β": "Β", # Beta
|
|
|
|
|
|
"Γ": "Γ", # Gamma
|
|
|
|
|
|
"Δ": "Δ", # Delta
|
|
|
|
|
|
"Ε": "Ε", # Epsilon
|
|
|
|
|
|
"Ζ": "Ζ", # Zeta
|
|
|
|
|
|
"Η": "Η", # Eta
|
|
|
|
|
|
"Θ": "Θ", # Theta
|
|
|
|
|
|
"Ι": "Ι", # Iota
|
|
|
|
|
|
"Κ": "Κ", # Kappa
|
|
|
|
|
|
"Λ": "Λ", # Lambda
|
|
|
|
|
|
"Μ": "Μ", # Mu
|
|
|
|
|
|
"Ν": "Ν", # Nu
|
|
|
|
|
|
"Ξ": "Ξ", # Xi
|
|
|
|
|
|
"Ο": "Ο", # Omicron
|
|
|
|
|
|
"Π": "Π", # Pi
|
|
|
|
|
|
"Ρ": "Ρ", # Rho
|
|
|
|
|
|
"Σ": "Σ", # Sigma
|
|
|
|
|
|
"Τ": "Τ", # Tau
|
|
|
|
|
|
"Υ": "Υ", # Upsilon
|
|
|
|
|
|
"Φ": "Φ", # Phi
|
|
|
|
|
|
"Χ": "Χ", # Chi
|
|
|
|
|
|
"Ψ": "Ψ", # Psi
|
|
|
|
|
|
"Ω": "Ω", # Omega
|
2026-02-05 13:18:55 +08:00
|
|
|
|
# Math symbols
|
2026-02-07 21:28:46 +08:00
|
|
|
|
"∅": "∅", # emptyset
|
|
|
|
|
|
"∈": "∈", # in
|
|
|
|
|
|
"∉": "∉", # notin
|
|
|
|
|
|
"∋": "∋", # ni
|
|
|
|
|
|
"∌": "∌", # nni
|
|
|
|
|
|
"∑": "∑", # sum
|
|
|
|
|
|
"∏": "∏", # prod
|
|
|
|
|
|
"√": "√", # sqrt
|
|
|
|
|
|
"∛": "∛", # cbrt
|
|
|
|
|
|
"∜": "∜", # fourthroot
|
|
|
|
|
|
"∞": "∞", # infty
|
|
|
|
|
|
"∩": "∩", # cap
|
|
|
|
|
|
"∪": "∪", # cup
|
|
|
|
|
|
"∫": "∫", # int
|
|
|
|
|
|
"∬": "∬", # iint
|
|
|
|
|
|
"∭": "∭", # iiint
|
|
|
|
|
|
"∮": "∮", # oint
|
|
|
|
|
|
"⊂": "⊂", # subset
|
|
|
|
|
|
"⊃": "⊃", # supset
|
|
|
|
|
|
"⊄": "⊄", # nsubset
|
|
|
|
|
|
"⊅": "⊅", # nsupset
|
|
|
|
|
|
"⊆": "⊆", # subseteq
|
|
|
|
|
|
"⊇": "⊇", # supseteq
|
|
|
|
|
|
"⊈": "⊈", # nsubseteq
|
|
|
|
|
|
"⊉": "⊉", # nsupseteq
|
|
|
|
|
|
"≤": "≤", # leq
|
|
|
|
|
|
"≥": "≥", # geq
|
|
|
|
|
|
"≠": "≠", # neq
|
|
|
|
|
|
"≡": "≡", # equiv
|
|
|
|
|
|
"≈": "≈", # approx
|
|
|
|
|
|
"≃": "≃", # simeq
|
|
|
|
|
|
"≅": "≅", # cong
|
|
|
|
|
|
"∂": "∂", # partial
|
|
|
|
|
|
"∇": "∇", # nabla
|
|
|
|
|
|
"∀": "∀", # forall
|
|
|
|
|
|
"∃": "∃", # exists
|
|
|
|
|
|
"∄": "∄", # nexists
|
|
|
|
|
|
"¬": "¬", # neg/lnot
|
|
|
|
|
|
"∧": "∧", # wedge/land
|
|
|
|
|
|
"∨": "∨", # vee/lor
|
|
|
|
|
|
"→": "→", # to/rightarrow
|
|
|
|
|
|
"←": "←", # leftarrow
|
|
|
|
|
|
"↔": "↔", # leftrightarrow
|
|
|
|
|
|
"⇒": "⇒", # Rightarrow
|
|
|
|
|
|
"⇐": "⇐", # Leftarrow
|
|
|
|
|
|
"⇔": "⇔", # Leftrightarrow
|
|
|
|
|
|
"↑": "↑", # uparrow
|
|
|
|
|
|
"↓": "↓", # downarrow
|
|
|
|
|
|
"⇑": "⇑", # Uparrow
|
|
|
|
|
|
"⇓": "⇓", # Downarrow
|
|
|
|
|
|
"↕": "↕", # updownarrow
|
|
|
|
|
|
"⇕": "⇕", # Updownarrow
|
|
|
|
|
|
"≪": "≪", # ll
|
|
|
|
|
|
"≫": "≫", # gg
|
|
|
|
|
|
"⩽": "⩽", # leqslant
|
|
|
|
|
|
"⩾": "⩾", # geqslant
|
|
|
|
|
|
"⊥": "⊥", # perp
|
|
|
|
|
|
"∥": "∥", # parallel
|
|
|
|
|
|
"∠": "∠", # angle
|
|
|
|
|
|
"△": "△", # triangle
|
|
|
|
|
|
"□": "□", # square
|
|
|
|
|
|
"◊": "◊", # diamond
|
|
|
|
|
|
"♠": "♠", # spadesuit
|
|
|
|
|
|
"♡": "♡", # heartsuit
|
|
|
|
|
|
"♢": "♢", # diamondsuit
|
|
|
|
|
|
"♣": "♣", # clubsuit
|
|
|
|
|
|
"ℓ": "ℓ", # ell
|
|
|
|
|
|
"℘": "℘", # wp (Weierstrass p)
|
|
|
|
|
|
"ℜ": "ℜ", # Re (real part)
|
|
|
|
|
|
"ℑ": "ℑ", # Im (imaginary part)
|
|
|
|
|
|
"ℵ": "ℵ", # aleph
|
|
|
|
|
|
"ℶ": "ℶ", # beth
|
2026-02-04 15:49:13 +08:00
|
|
|
|
}
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 15:49:13 +08:00
|
|
|
|
for entity, char in unicode_map.items():
|
|
|
|
|
|
mathml = mathml.replace(entity, char)
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-05 13:18:55 +08:00
|
|
|
|
# Also handle decimal entity format (&#NNNN;) for common characters
|
|
|
|
|
|
# Convert decimal to hex-based lookup
|
|
|
|
|
|
decimal_patterns = [
|
2026-02-07 21:28:46 +08:00
|
|
|
|
(r"λ", "λ"), # lambda (decimal 955 = hex 03BB)
|
|
|
|
|
|
(r"⋮", "⋮"), # vdots (decimal 8942 = hex 22EE)
|
|
|
|
|
|
(r"⋯", "⋯"), # cdots (decimal 8943 = hex 22EF)
|
|
|
|
|
|
(r"…", "…"), # ldots (decimal 8230 = hex 2026)
|
|
|
|
|
|
(r"∞", "∞"), # infty (decimal 8734 = hex 221E)
|
|
|
|
|
|
(r"∑", "∑"), # sum (decimal 8721 = hex 2211)
|
|
|
|
|
|
(r"∏", "∏"), # prod (decimal 8719 = hex 220F)
|
|
|
|
|
|
(r"√", "√"), # sqrt (decimal 8730 = hex 221A)
|
|
|
|
|
|
(r"∈", "∈"), # in (decimal 8712 = hex 2208)
|
|
|
|
|
|
(r"∉", "∉"), # notin (decimal 8713 = hex 2209)
|
|
|
|
|
|
(r"∩", "∩"), # cap (decimal 8745 = hex 2229)
|
|
|
|
|
|
(r"∪", "∪"), # cup (decimal 8746 = hex 222A)
|
|
|
|
|
|
(r"≤", "≤"), # leq (decimal 8804 = hex 2264)
|
|
|
|
|
|
(r"≥", "≥"), # geq (decimal 8805 = hex 2265)
|
|
|
|
|
|
(r"≠", "≠"), # neq (decimal 8800 = hex 2260)
|
|
|
|
|
|
(r"≈", "≈"), # approx (decimal 8776 = hex 2248)
|
|
|
|
|
|
(r"≡", "≡"), # equiv (decimal 8801 = hex 2261)
|
2026-02-05 13:18:55 +08:00
|
|
|
|
]
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-05 13:18:55 +08:00
|
|
|
|
for pattern, char in decimal_patterns:
|
|
|
|
|
|
mathml = mathml.replace(pattern, char)
|
2026-02-07 21:28:46 +08:00
|
|
|
|
|
2026-02-04 16:56:20 +08:00
|
|
|
|
# Step 8: Clean up extra whitespace
|
2026-02-07 21:28:46 +08:00
|
|
|
|
mathml = re.sub(r">\s+<", "><", mathml)
|
|
|
|
|
|
|
2026-02-04 15:49:13 +08:00
|
|
|
|
return mathml
|
2026-02-04 12:00:06 +08:00
|
|
|
|
|
2026-02-07 21:28:46 +08:00
|
|
|
|
def _latex_to_mathml(self, latex_formula: str, is_display: bool = False) -> str:
|
2026-02-04 12:00:06 +08:00
|
|
|
|
"""Convert LaTeX formula to standard MathML.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
latex_formula: Pure LaTeX formula (without delimiters).
|
2026-02-07 21:28:46 +08:00
|
|
|
|
is_display: True if display (block) formula, False if inline.
|
2026-02-04 12:00:06 +08:00
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
Standard MathML representation.
|
|
|
|
|
|
"""
|
2026-02-07 21:28:46 +08:00
|
|
|
|
return self._latex_to_mathml_cached(latex_formula, is_display=is_display)
|
2026-02-04 12:00:06 +08:00
|
|
|
|
|
|
|
|
|
|
def _mathml_to_mml(self, mathml: str) -> str:
|
|
|
|
|
|
"""Convert standard MathML to mml:math format with namespace prefix.
|
|
|
|
|
|
|
|
|
|
|
|
Uses XSLT for efficient transformation. Transforms:
|
|
|
|
|
|
- <math ...> to <mml:math xmlns:mml="..." ...>
|
|
|
|
|
|
- All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
mathml: Standard MathML string.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
MathML with mml: namespace prefix.
|
|
|
|
|
|
"""
|
|
|
|
|
|
if not mathml:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
|
|
|
|
# Parse MathML
|
|
|
|
|
|
root = etree.fromstring(mathml.encode("utf-8"))
|
|
|
|
|
|
|
|
|
|
|
|
# Apply XSLT transformation (cached)
|
|
|
|
|
|
transform = self._get_mml_xslt_transform()
|
|
|
|
|
|
result_tree = transform(root)
|
|
|
|
|
|
|
|
|
|
|
|
# Serialize to string
|
|
|
|
|
|
return str(result_tree)
|
|
|
|
|
|
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
# Fallback: simple string replacement (less robust but no lxml dependency)
|
|
|
|
|
|
result = mathml
|
|
|
|
|
|
# Add namespace to root math element
|
|
|
|
|
|
result = re.sub(
|
|
|
|
|
|
r"<math\b",
|
|
|
|
|
|
f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
|
|
|
|
|
|
result,
|
|
|
|
|
|
)
|
|
|
|
|
|
result = re.sub(r"</math>", "</mml:math>", result)
|
|
|
|
|
|
|
|
|
|
|
|
# Add mml: prefix to all other elements using a single regex
|
|
|
|
|
|
# Match opening tags
|
|
|
|
|
|
result = re.sub(
|
|
|
|
|
|
r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
|
|
|
|
|
|
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
|
|
|
|
|
|
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
|
|
|
|
|
|
r"maction|semantics|annotation|annotation-xml)\b",
|
|
|
|
|
|
r"<mml:\1",
|
|
|
|
|
|
result,
|
|
|
|
|
|
)
|
|
|
|
|
|
# Match closing tags
|
|
|
|
|
|
result = re.sub(
|
|
|
|
|
|
r"</(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
|
|
|
|
|
|
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
|
|
|
|
|
|
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
|
|
|
|
|
|
r"maction|semantics|annotation|annotation-xml)>",
|
|
|
|
|
|
r"</mml:\1>",
|
|
|
|
|
|
result,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def _latex_to_omml(self, latex_formula: str) -> str:
    """Convert a LaTeX formula to OMML (Office Math Markup Language).

    The formula is wrapped in ``$$`` delimiters, rendered by Pandoc into a
    throw-away DOCX, and the ``oMath`` elements are then extracted from
    ``word/document.xml`` inside that archive.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).

    Returns:
        The serialized ``oMath`` elements joined by newlines. The string is
        empty when the generated document contains no ``oMath`` element.

    Raises:
        RuntimeError: If any step fails (lxml import, Pandoc conversion,
            reading the archive, or parsing the XML). The original
            exception is chained as the cause.
    """
    import zipfile

    try:
        from lxml import etree

        # Work inside a private scratch directory: it is removed as a whole
        # when the block exits (success or failure), and the output path is
        # built explicitly instead of string-replacing ".md" in the input
        # path, which could also rewrite a directory component.
        with tempfile.TemporaryDirectory() as work_dir:
            md_path = os.path.join(work_dir, "formula.md")
            docx_path = os.path.join(work_dir, "formula.docx")

            # Write UTF-8 explicitly rather than the locale default, so
            # non-ASCII formula content is encoded the same on every host.
            with open(md_path, "w", encoding="utf-8") as md_file:
                md_file.write(f"$${latex_formula}$$\n")

            pypandoc.convert_file(
                md_path,
                "docx",
                format=self.INPUT_FORMAT,
                outputfile=docx_path,
            )

            # A DOCX is a ZIP archive; read the main document part directly.
            with zipfile.ZipFile(docx_path) as archive:
                document_xml = archive.read("word/document.xml")

        root = etree.fromstring(document_xml)

        # Collect every oMath element below the document root.
        omml_parts = [
            etree.tostring(math, encoding="unicode")
            for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath")
        ]
        return "\n".join(omml_parts)

    except Exception as e:
        raise RuntimeError(f"OMML conversion failed: {e}") from e
|
|
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
|
def preprocess_for_export(self, md_text: str) -> str:
    """Normalize markdown so its formulas survive export to docx/pdf.

    The text is run through a fixed sequence of rewrites: regex
    substitutions using the patterns stored on ``self``, conversion of
    LaTeX math delimiters to dollar delimiters, and finally the
    environment-specific helper methods.

    Args:
        md_text: Raw markdown text.

    Returns:
        Preprocessed markdown text.
    """
    # Regex stages, in order:
    #   1. the vspace pattern is replaced by a literal \vspace{1mm}
    #   2./3. \[...\] block formulas get surrounding line breaks
    #   4. arithmatex wrappers are reduced to their first capture group
    regex_stages = (
        (self._RE_VSPACE, r"\\vspace{1mm}"),
        (self._RE_BLOCK_FORMULA_INLINE, r"\1\n\n\\[\3\\]\n\n\4"),
        (self._RE_BLOCK_FORMULA_LINE, r"\n\\[\2\\]\n"),
        (self._RE_ARITHMATEX, r"\1"),
    )
    text = md_text
    for pattern, template in regex_stages:
        text = pattern.sub(template, text)

    # Inline \( \) becomes $ $, block \[ \] becomes $$ $$.
    delimiter_map = (
        ("\\(", "$"),
        ("\\)", "$"),
        ("\\[", "$$"),
        ("\\]", "$$"),
    )
    for tex_delimiter, dollar_delimiter in delimiter_map:
        text = text.replace(tex_delimiter, dollar_delimiter)

    # Re-wrap whatever the inline-space pattern captured directly in $...$.
    text = self._RE_INLINE_SPACE.sub(r"$\1$", text)

    # Environment-level fixes; the order matches the documented pipeline:
    # matrices, array column specifiers, brace spacing, cases/aligned, \tag.
    helper_stages = (
        self._convert_matrix_environments,
        self._fix_array_column_specifiers,
        self._fix_brace_spacing,
        self._convert_special_environments,
        self._convert_tag_commands,
    )
    for stage in helper_stages:
        text = stage(text)

    return text
|
|
|
|
|
|
|
|
|
|
|
|
def _convert_matrix_environments(self, md_text: str) -> str:
    r"""Rewrite vmatrix/Vmatrix as a plain matrix inside \left/\right bars.

    The single-bar pattern is applied first, then the double-bar pattern.
    The rewrite exists to fix vertical line height issues in Word.
    """
    rewrites = (
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        (self._RE_VMATRIX, r"\\left| \\begin{matrix}\1\\end{matrix} \\right|"),
        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        (self._RE_VMATRIX_DOUBLE, r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|"),
    )
    for pattern, template in rewrites:
        md_text = pattern.sub(template, md_text)
    return md_text
|
|
|
|
|
|
|
2026-01-14 14:18:00 +08:00
|
|
|
|
def _fix_array_column_specifiers(self, md_text: str) -> str:
    """Strip spaces from the column specifier of array environments.

    Pandoc's OMML converter doesn't accept spaces between column alignment
    specifiers, so a specifier such as {c c c c} is rewritten as {cccc}.
    Only the space character is removed; everything else in the captured
    specifier is kept as-is.
    """

    def compact_specifier(match: re.Match) -> str:
        """Rebuild the array opening with a space-free specifier."""
        columns = "".join(match.group(1).split(" "))
        return "\\begin{array}{" + columns + "}"

    return self._RE_ARRAY_SPECIFIER.sub(compact_specifier, md_text)
|
2026-01-14 14:18:00 +08:00
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
|
def _fix_brace_spacing(self, md_text: str) -> str:
    r"""Tighten the spacing around \left\{ and \right\} in equation systems.

    Whatever the left-brace pattern matches is replaced by ``\left\{\!`` and
    whatever the right-brace pattern matches by ``\!\right\}``. The ``\!``
    negative space is added for proper alignment in Word/OMML.
    """
    opened = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
    return self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", opened)
|
|
|
|
|
|
|
|
|
|
|
|
def _convert_special_environments(self, md_text: str) -> str:
    """Rewrite cases and aligned environments as array environments.

    Three substitutions run in order: cases, brace-wrapped aligned, then
    standalone aligned. Array environments have better rendering support
    in Word/OMML.
    """
    # An "&" alignment marker at the very start of the content or right
    # after a "\\" row break; only the marker (and the whitespace before
    # it) is dropped, the row break itself is kept via group 1.
    leading_marker = re.compile(r"(^|\\\\)\s*&")

    def make_builder(opening: str, closing: str, strip_markers: bool):
        """Return a replacement callback wrapping group 1 in opening/closing."""

        def build(match: re.Match) -> str:
            body = match.group(1)
            if strip_markers:
                body = leading_marker.sub(r"\1", body)
            return opening + body + closing

        return build

    stages = (
        # cases: two left-aligned columns behind an opening brace
        (
            self._RE_CASES,
            make_builder(r"\left\{\begin{array}{ll}", r"\end{array}\right.", False),
        ),
        # brace-wrapped aligned: one left-aligned column behind a brace
        (
            self._RE_ALIGNED_BRACE,
            make_builder(r"\left\{\begin{array}{l}", r"\end{array}\right.", True),
        ),
        # standalone aligned: one left-aligned column, no brace
        (
            self._RE_ALIGNED,
            make_builder(r"\begin{array}{l}", r"\end{array}", True),
        ),
    )
    for pattern, builder in stages:
        md_text = pattern.sub(builder, md_text)

    return md_text
|
|
|
|
|
|
|
2026-01-14 14:18:00 +08:00
|
|
|
|
def _convert_tag_commands(self, md_text: str) -> str:
    """Replace LaTeX \\tag{} equation numbers with inline text.

    The \\tag{} command is not supported in Word OMML format. Each match of
    the tag pattern is rebuilt as a ``$$`` block containing the formula
    (group 1), a \\quad space, and the tag text (group 2) in parentheses.
    """

    def as_numbered_formula(match: re.Match) -> str:
        """Build the $$formula \\quad (tag)$$ replacement for one match."""
        body, label = match.group(1), match.group(2)
        return "$${} \\quad ({})$$".format(body, label)

    return self._RE_TAG.sub(as_numbered_formula, md_text)
|
2026-01-14 14:18:00 +08:00
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
|
def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
    """Export markdown to a docx or pdf file and return its contents.

    The markdown is preprocessed, written to a temporary ``.md`` file,
    converted next to it, and the converted file is read back. Both
    temporary files are removed before returning, whether or not the
    export succeeded.

    Args:
        md_text: Markdown text to export.
        export_type: Export format, either 'docx' or 'pdf'.

    Returns:
        bytes of the exported file.

    Raises:
        ValueError: If export_type is not supported.
        RuntimeError: If export fails.
    """
    # Reject unknown formats up front. Without this check any other value
    # would silently take the PDF branch and end up in the output file name.
    if export_type not in ("docx", "pdf"):
        raise ValueError(f"Unsupported export type: {export_type!r}")

    # Preprocess markdown
    cleaned_md = self.preprocess_for_export(md_text)

    # Create temp file for input; delete=False so the converter can open it
    # by path after this handle is closed.
    with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
        f_in.write(cleaned_md.encode("utf-8"))
        md_path = f_in.name

    output_file = md_path + "." + export_type

    try:
        if export_type == "docx":
            self._export_docx(md_path, output_file)
        else:  # pdf
            self._export_pdf(md_path, output_file)

        with open(output_file, "rb") as f_out:
            return f_out.read()

    except Exception as e:
        raise RuntimeError(f"Export failed: {e}") from e
    finally:
        # The caller only receives bytes and never learns the output path,
        # so the output file must be removed here as well as the input.
        self._cleanup_files(md_path, output_file)
|
|
|
|
|
|
|
|
|
|
|
|
def _export_docx(self, input_path: str, output_path: str) -> None:
    """Convert the file at ``input_path`` to DOCX at ``output_path``.

    The conversion goes through pypandoc using the instance's input format,
    pygments code highlighting, and the reference document at
    ``app/pkg/reference.docx``.
    """
    pandoc_options = {
        "format": self.INPUT_FORMAT,
        "outputfile": output_path,
        "extra_args": [
            "--highlight-style=pygments",
            # NOTE(review): relative path, presumably resolved against the
            # process working directory; confirm how the service is launched.
            "--reference-doc=app/pkg/reference.docx",
        ],
    }
    pypandoc.convert_file(input_path, "docx", **pandoc_options)
|
|
|
|
|
|
|
|
|
|
|
|
def _export_pdf(self, input_path: str, output_path: str) -> None:
    """Convert the file at ``input_path`` to PDF at ``output_path``.

    The conversion goes through pypandoc with XeLaTeX as the PDF engine,
    "Noto Sans CJK SC" as the main font, and pygments code highlighting.
    """
    pandoc_options = {
        "format": self.INPUT_FORMAT,
        "outputfile": output_path,
        "extra_args": [
            "--pdf-engine=xelatex",
            # "-V" and the following item form one variable assignment.
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ],
    }
    pypandoc.convert_file(input_path, "pdf", **pandoc_options)
|
|
|
|
|
|
|
|
|
|
|
|
def _cleanup_files(self, *paths: str) -> None:
    """Delete each of the given paths that currently exists.

    Paths that do not exist are skipped silently.
    """
    # filter() is lazy, so each existence check runs right before its remove.
    for existing_path in filter(os.path.exists, paths):
        os.remove(existing_path)
|
|
|
|
|
|
|
|
|
|
|
|
def cleanup_export_file(self, file_path: str) -> None:
    """Delete an exported file once it is no longer needed.

    Intended to be called after the file has been sent to the client.
    Does nothing when the path does not exist.

    Args:
        file_path: Path to the exported file.
    """
    if not os.path.exists(file_path):
        return
    os.remove(file_path)
|