2025-12-31 17:38:32 +08:00
|
|
|
"""Markdown conversion and export service using pypandoc."""
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
import tempfile
|
|
|
|
|
from dataclasses import dataclass
|
2026-02-04 12:00:06 +08:00
|
|
|
from functools import lru_cache
|
2025-12-31 17:38:32 +08:00
|
|
|
from typing import Literal
|
|
|
|
|
|
|
|
|
|
import pypandoc
|
2026-02-04 12:00:06 +08:00
|
|
|
from latex2mathml.converter import convert as latex_to_mathml
|
2025-12-31 17:38:32 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ConvertResult:
    """Formula representations produced by a markdown conversion.

    The fields carry content only when the converted input was a single,
    pure LaTeX formula. For mixed input (text together with a formula)
    every field is an empty string.

    Attributes:
        latex: The LaTeX formula source with its math delimiters removed.
        mathml: The formula as standard MathML.
        mml: The formula as MathML whose elements use the ``mml:``
            namespace prefix (root element ``mml:math``).
    """

    latex: str
    mathml: str
    mml: str
|
2025-12-31 17:38:32 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ExportResult:
    """Description of a file produced by a markdown export.

    Attributes:
        file_path: Location of the exported file.
        content_type: Content type associated with the exported file
            (presumably a MIME type -- confirm against the caller).
        download_name: File name to present for the download.
    """

    file_path: str
    content_type: str
    download_name: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Closed set of export formats: Word (docx) or PDF.
ExportType = Literal["docx", "pdf"]

# XML namespace URIs: W3C MathML and Office Math Markup Language (OMML).
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
|
|
|
|
|
|
|
|
|
|
# XSLT 1.0 stylesheet that rewrites MathML so that every element carries the
# "mml:" namespace prefix. Both namespaced (m:*) and un-namespaced MathML
# element names are matched; attributes and text are copied through.
MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:mml="http://www.w3.org/1998/Math/MathML"
    xmlns:m="http://www.w3.org/1998/Math/MathML"
    exclude-result-prefixes="m">

  <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>

  <!-- Match root math element -->
  <xsl:template match="m:math|math">
    <mml:math>
      <xsl:apply-templates select="@*|node()"/>
    </mml:math>
  </xsl:template>

  <!-- Match all other MathML elements -->
  <xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
    <xsl:element name="mml:{local-name()}">
      <xsl:apply-templates select="@*|node()"/>
    </xsl:element>
  </xsl:template>

  <!-- Copy attributes -->
  <xsl:template match="@*">
    <xsl:if test="local-name() != 'xmlns'">
      <xsl:copy/>
    </xsl:if>
  </xsl:template>

  <!-- Copy text nodes -->
  <xsl:template match="text()">
    <xsl:value-of select="."/>
  </xsl:template>

</xsl:stylesheet>
"""
|
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
|
|
|
|
|
class Converter:
    """Service for conversion and export operations.

    Conversion rules:
    - Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
    - Mixed content (text + formula) returns empty results for all formats.
    - OMML conversion is provided as a separate method due to performance overhead.

    Performance optimizations:
    - Pre-compiled regex patterns
    - XSLT-based MML conversion
    - Cached XSLT transforms
    - Cached LaTeX -> MathML results (LRU)
    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # Pre-compiled regex patterns for formula detection
    _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
    _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
    _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
    _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
    _RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")

    # Pre-compiled regex patterns for preprocessing
    _RE_VSPACE = re.compile(r"\\\[1mm\]")
    _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
    _RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
    _RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
    _RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
    _RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
    _RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
    _RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
    _RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
    _RE_ALIGNED_BRACE = re.compile(r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL)
    _RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
    _RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
    _RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
    _RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)
    # Leading "&" alignment marker at the start of the content or after a "\\" row break
    _RE_ALIGN_MARKER = re.compile(r"(^|\\\\)\s*&")

    # Element names rewritten by the regex fallback of _mathml_to_mml
    _MATHML_TAG_NAMES = (
        "mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
        "mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
        "munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
        "maction|semantics|annotation|annotation-xml"
    )
    _RE_MML_OPEN_TAG = re.compile(rf"<({_MATHML_TAG_NAMES})\b")
    _RE_MML_CLOSE_TAG = re.compile(rf"</({_MATHML_TAG_NAMES})>")

    # Export formats accepted by export_to_file
    _EXPORT_TYPES = ("docx", "pdf")

    # Cached XSLT transform
    _mml_xslt_transform = None

    def __init__(self):
        """Initialize converter."""

    @classmethod
    def _get_mml_xslt_transform(cls):
        """Get cached XSLT transform for MathML to mml: conversion.

        The transform is compiled from MML_XSLT on first use and stored on
        the class so later calls reuse it.
        """
        if cls._mml_xslt_transform is None:
            from lxml import etree

            xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
            cls._mml_xslt_transform = etree.XSLT(xslt_doc)
        return cls._mml_xslt_transform

    @staticmethod
    def _is_single_span(inner: str, opener: str, closer: str) -> bool:
        """Return True if ``inner`` does not close one formula and open another.

        ``inner`` is the text between the outermost delimiters. It holds more
        than one formula when a closing delimiter is later followed by an
        opening delimiter (e.g. ``a\\] text \\[b``).
        """
        close_at = inner.find(closer)
        if close_at == -1:
            return True
        return inner.find(opener, close_at + len(closer)) == -1

    def _is_formula_only(self, text: str) -> bool:
        """Check if text contains only a LaTeX formula (no mixed content).

        A text is considered formula-only if it matches one of these patterns:
        - Display math: $$...$$ or \\[...\\]
        - Inline math: $...$ or \\(...\\)

        Text that merely starts and ends with delimiters but contains several
        formulas (``$$a$$ text $$b$$``) is treated as mixed content.

        Args:
            text: Input text to check.

        Returns:
            True if the text contains only a LaTeX formula, False otherwise.
        """
        text = text.strip()
        if not text:
            return False

        # The display/paren patterns are greedy, so a full match alone does not
        # prove there is a single formula; the inner text is checked as well.
        if self._RE_DISPLAY_DOLLAR.fullmatch(text):
            return "$$" not in text[2:-2]
        if self._RE_DISPLAY_BRACKET.fullmatch(text):
            return self._is_single_span(text[2:-2], "\\[", "\\]")
        if self._RE_INLINE_DOLLAR.fullmatch(text):
            # The pattern itself forbids "$" inside the formula.
            return True
        if self._RE_INLINE_PAREN.fullmatch(text):
            return self._is_single_span(text[2:-2], "\\(", "\\)")

        return False

    def convert_to_formats(self, md_text: str) -> "ConvertResult":
        """Convert markdown to LaTeX, MathML, and MML formats.

        Only converts when input contains a pure LaTeX formula.
        Mixed content (text + formula) returns empty strings for all fields.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex, mathml, and mml fields.
            All fields are empty if input is empty or not a pure formula.

        Raises:
            RuntimeError: If conversion fails for a valid formula.
        """
        # Empty input and mixed content cannot be converted to formula formats
        if not md_text or not md_text.strip() or not self._is_formula_only(md_text):
            return ConvertResult(latex="", mathml="", mml="")

        try:
            # Extract the LaTeX formula content (remove delimiters)
            latex_formula = self._extract_latex_formula(md_text)

            # Convert to MathML
            mathml = self._latex_to_mathml(latex_formula)

            # Convert MathML to mml:math format (with namespace prefix)
            mml = self._mathml_to_mml(mathml)

            return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)

        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

    def convert_to_omml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to OMML (Office Math Markup Language).

        This is a separate method due to the performance overhead of OMML conversion,
        which requires creating a temporary DOCX file.

        The formula is preprocessed using the same logic as export_to_file to ensure
        proper conversion.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters like $ or $$).

        Returns:
            OMML representation as XML string.

        Raises:
            ValueError: If latex_formula is empty.
            RuntimeError: If conversion fails.
        """
        if not latex_formula or not latex_formula.strip():
            raise ValueError("LaTeX formula cannot be empty")

        # Preprocess formula using the same preprocessing as export
        preprocessed = self._preprocess_formula_for_omml(latex_formula.strip())

        return self._latex_to_omml(preprocessed)

    def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
        """Preprocess LaTeX formula for OMML conversion.

        Applies the formula-level steps of preprocess_for_export (matrix
        environments, array column specifiers, brace spacing, cases/aligned)
        so both code paths hand Pandoc the same LaTeX.

        Args:
            latex_formula: Pure LaTeX formula.

        Returns:
            Preprocessed LaTeX formula.
        """
        # 1. Convert matrix environments
        latex_formula = self._convert_matrix_environments(latex_formula)

        # 2. Fix array column specifiers (remove spaces)
        latex_formula = self._fix_array_column_specifiers(latex_formula)

        # 3. Fix brace spacing
        latex_formula = self._fix_brace_spacing(latex_formula)

        # 4. Convert special environments (cases, aligned)
        latex_formula = self._convert_special_environments(latex_formula)

        return latex_formula

    def _extract_latex_formula(self, text: str) -> str:
        """Extract LaTeX formula from text by removing delimiters.

        Args:
            text: Text containing LaTeX formula with delimiters.

        Returns:
            Pure LaTeX formula without delimiters.
        """
        text = text.strip()

        # Remove display math delimiters: $$...$$ or \[...\]
        if text.startswith("$$") and text.endswith("$$"):
            return text[2:-2].strip()
        if text.startswith("\\[") and text.endswith("\\]"):
            return text[2:-2].strip()

        # Remove inline math delimiters: $...$ or \(...\)
        if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
            return text[1:-1].strip()
        if text.startswith("\\(") and text.endswith("\\)"):
            return text[2:-2].strip()

        # If no delimiters, return as-is (already stripped above)
        return text

    @staticmethod
    @lru_cache(maxsize=256)
    def _latex_to_mathml_cached(latex_formula: str) -> str:
        """Cached conversion of LaTeX formula to MathML.

        Uses LRU cache to avoid recomputing for repeated formulas.

        Raises:
            RuntimeError: If both latex2mathml and the Pandoc fallback fail.
        """
        try:
            # Use latex2mathml library for conversion (fast, pure Python)
            return latex_to_mathml(latex_formula)
        except Exception as e:
            # Fallback: try with Pandoc (slower, but more robust)
            try:
                mathml_html = pypandoc.convert_text(
                    f"${latex_formula}$",
                    "html",
                    format="markdown+tex_math_dollars",
                    extra_args=["--mathml"],
                )
                # Extract just the <math> element from the HTML
                match = Converter._RE_MATH_ELEMENT.search(mathml_html)
                if match:
                    return match.group(0)
                return mathml_html.rstrip("\n")
            except Exception as pandoc_error:
                raise RuntimeError(
                    f"MathML conversion failed: {e}. Pandoc fallback also failed: {pandoc_error}"
                ) from e

    def _latex_to_mathml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to standard MathML.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters).

        Returns:
            Standard MathML representation.
        """
        return self._latex_to_mathml_cached(latex_formula)

    def _mathml_to_mml(self, mathml: str) -> str:
        """Convert standard MathML to mml:math format with namespace prefix.

        Uses XSLT for efficient transformation. Transforms:
        - <math ...> to <mml:math xmlns:mml="..." ...>
        - All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>

        Args:
            mathml: Standard MathML string.

        Returns:
            MathML with mml: namespace prefix, or "" for empty input.
        """
        if not mathml:
            return ""

        try:
            from lxml import etree

            # Parse MathML
            root = etree.fromstring(mathml.encode("utf-8"))

            # Apply XSLT transformation (cached)
            transform = self._get_mml_xslt_transform()
            result_tree = transform(root)

            # Serialize to string
            return str(result_tree)

        except Exception:
            # Deliberate best-effort fallback: if lxml is missing or the
            # parse/transform fails, rewrite the tags textually instead
            # (less robust but no lxml dependency).
            result = re.sub(
                r"<math\b",
                f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
                mathml,
            )
            result = result.replace("</math>", "</mml:math>")

            # Add mml: prefix to all other opening and closing tags
            result = self._RE_MML_OPEN_TAG.sub(r"<mml:\1", result)
            return self._RE_MML_CLOSE_TAG.sub(r"</mml:\1>", result)

    def _latex_to_omml(self, latex_formula: str) -> str:
        """Convert LaTeX formula to OMML (Office Math Markup Language).

        Writes the formula as display math to a markdown file in a private
        temporary directory, lets Pandoc convert it to DOCX, and extracts the
        oMath elements from word/document.xml. The temporary directory and
        both files are removed before returning, on success and on failure.

        Args:
            latex_formula: Pure LaTeX formula (without delimiters).

        Returns:
            OMML representation as XML string; multiple oMath elements are
            joined with newlines.

        Raises:
            RuntimeError: If any step of the conversion fails.
        """
        import zipfile

        try:
            from lxml import etree

            with tempfile.TemporaryDirectory() as workdir:
                md_path = os.path.join(workdir, "formula.md")
                docx_path = os.path.join(workdir, "formula.docx")

                # Explicit UTF-8 so non-ASCII formula content does not depend
                # on the platform's locale encoding.
                with open(md_path, "w", encoding="utf-8") as md_file:
                    md_file.write(f"$${latex_formula}$$\n")

                pypandoc.convert_file(
                    md_path,
                    "docx",
                    format=self.INPUT_FORMAT,
                    outputfile=docx_path,
                )

                # A DOCX is a ZIP archive; the body lives in word/document.xml
                with zipfile.ZipFile(docx_path) as archive:
                    document_xml = archive.read("word/document.xml")

            # Parse XML and collect all oMath elements
            root = etree.fromstring(document_xml)
            omml_parts = [
                etree.tostring(math, encoding="unicode")
                for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath")
            ]
            return "\n".join(omml_parts)

        except Exception as e:
            raise RuntimeError(f"OMML conversion failed: {e}") from e

    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.

        Uses pre-compiled regex patterns for better performance.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}
        md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas
        md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text)
        md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text)

        # Remove arithmatex span wrappers
        cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
        cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$")

        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$")

        # Remove spaces between $ and formula content
        cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)

        # Fix array environment column specifiers (remove spaces)
        cleaned_md = self._fix_array_column_specifiers(cleaned_md)

        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)

        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)

        # Handle LaTeX \tag{} commands for equation numbering
        cleaned_md = self._convert_tag_commands(cleaned_md)

        return cleaned_md

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = self._RE_VMATRIX.sub(
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = self._RE_VMATRIX_DOUBLE.sub(
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
        )

        return md_text

    def _fix_array_column_specifiers(self, md_text: str) -> str:
        """Fix array environment column specifiers by removing spaces.

        Pandoc's OMML converter doesn't accept spaces between column alignment
        specifiers in array environments. This converts patterns like
        {c c c c} to {cccc}.
        """

        def remove_spaces_in_specifier(match: re.Match) -> str:
            """Remove spaces from column specifier."""
            specifier = match.group(1)
            return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"

        return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in Word/OMML.
        """
        md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
        md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.
        Replacement callables are used (rather than template strings) so the
        LaTeX backslashes in the captured content are inserted verbatim.
        """
        strip_align_marker = self._RE_ALIGN_MARKER.sub

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

        def convert_aligned_to_array(match: re.Match) -> str:
            content = strip_align_marker(r"\1", match.group(1))
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

        def convert_standalone_aligned(match: re.Match) -> str:
            content = strip_align_marker(r"\1", match.group(1))
            return r"\begin{array}{l}" + content + r"\end{array}"

        md_text = self._RE_CASES.sub(convert_cases, md_text)
        # Brace-wrapped aligned blocks first, so the standalone rule below
        # only sees the aligned environments that remain.
        md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)
        md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)

        return md_text

    def _convert_tag_commands(self, md_text: str) -> str:
        """Convert LaTeX \\tag{} commands to Word-compatible format.

        The \\tag{} command is not supported in Word OMML format, so we convert it to
        use simple spacing (\\quad) to push the equation number to the right side.
        """

        def convert_tag(match: re.Match) -> str:
            formula_content = match.group(1)
            tag_content = match.group(2)
            return f"$${formula_content} \\quad ({tag_content})$$"

        return self._RE_TAG.sub(convert_tag, md_text)

    def export_to_file(self, md_text: str, export_type: "ExportType" = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        The conversion runs inside a private temporary directory that is
        removed before returning, so neither the intermediate markdown file
        nor the generated document is left on disk.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.
        """
        if export_type not in self._EXPORT_TYPES:
            raise ValueError(f"Unsupported export type: {export_type!r}")

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        exporter = self._export_docx if export_type == "docx" else self._export_pdf

        try:
            with tempfile.TemporaryDirectory() as workdir:
                md_path = os.path.join(workdir, "input.md")
                output_file = os.path.join(workdir, f"output.{export_type}")

                with open(md_path, "wb") as f_in:
                    f_in.write(cleaned_md.encode("utf-8"))

                exporter(md_path, output_file)

                with open(output_file, "rb") as f_out:
                    return f_out.read()

        except Exception as e:
            raise RuntimeError(f"Export failed: {e}") from e

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Export to DOCX format using pypandoc."""
        extra_args = [
            "--highlight-style=pygments",
            # NOTE(review): relative path, resolved against the process's
            # working directory -- confirm the service is always started there.
            "--reference-doc=app/pkg/reference.docx",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Export to PDF format using pypandoc with XeLaTeX."""
        extra_args = [
            "--pdf-engine=xelatex",
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ]
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist."""
        for path in paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                # Already gone: nothing to clean up.
                pass

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client.

        Args:
            file_path: Path to the exported file.
        """
        self._cleanup_files(file_path)
|