827 lines
30 KiB
Python
827 lines
30 KiB
Python
"""Markdown conversion and export service using pypandoc."""
|
||
|
||
import os
|
||
import re
|
||
import tempfile
|
||
from dataclasses import dataclass
|
||
from functools import lru_cache
|
||
from typing import Literal
|
||
|
||
import pypandoc
|
||
from latex2mathml.converter import convert as latex_to_mathml
|
||
|
||
|
||
@dataclass
class ConvertResult:
    """Formula representations produced by ``Converter.convert_to_formats``.

    The converter fills these fields only for input that is a single LaTeX
    formula; for empty or mixed (text + formula) input every field is the
    empty string.

    Attributes:
        latex: The LaTeX source of the formula with its delimiters removed.
        mathml: The formula as standard MathML.
        mml: The formula as MathML whose elements carry the ``mml:``
            namespace prefix (root element ``mml:math``).
    """

    latex: str
    mathml: str
    mml: str
|
||
|
||
|
||
@dataclass
class ExportResult:
    """Descriptor of an exported markdown document.

    Attributes:
        file_path: Location of the exported file (presumably a filesystem
            path; nothing in this module populates it -- verify with callers).
        content_type: Content type to report for the file (presumably a MIME
            type -- verify with callers).
        download_name: File name to offer when the file is downloaded.
    """

    file_path: str
    content_type: str
    download_name: str
|
||
|
||
|
||
# Export targets accepted by Converter.export_to_file.
ExportType = Literal["docx", "pdf"]

# XML namespace URIs used when emitting prefixed MathML (mml:*) and when
# locating oMath elements inside a DOCX's word/document.xml.
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
|
||
|
||
# XSLT 1.0 stylesheet that rewrites MathML into the "mml:"-prefixed form.
# - The root <math> (namespaced or not) becomes <mml:math>.
# - Every other MathML-namespace element, plus the listed un-namespaced
#   element names, is re-created as <mml:NAME>.
# - Attributes are copied unless their local name is "xmlns"; text is copied.
# NOTE(review): un-namespaced elements that are not in the explicit list fall
# through to XSLT's built-in rules (only their text survives) -- confirm the
# list covers everything the converters can emit.
MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:mml="http://www.w3.org/1998/Math/MathML"
    xmlns:m="http://www.w3.org/1998/Math/MathML"
    exclude-result-prefixes="m">

  <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>

  <!-- Match root math element -->
  <xsl:template match="m:math|math">
    <mml:math>
      <xsl:apply-templates select="@*|node()"/>
    </mml:math>
  </xsl:template>

  <!-- Match all other MathML elements -->
  <xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
    <xsl:element name="mml:{local-name()}">
      <xsl:apply-templates select="@*|node()"/>
    </xsl:element>
  </xsl:template>

  <!-- Copy attributes -->
  <xsl:template match="@*">
    <xsl:if test="local-name() != 'xmlns'">
      <xsl:copy/>
    </xsl:if>
  </xsl:template>

  <!-- Copy text nodes -->
  <xsl:template match="text()">
    <xsl:value-of select="."/>
  </xsl:template>

</xsl:stylesheet>
"""
|
||
|
||
|
||
class Converter:
    """Service for conversion and export operations.

    Conversion rules:
    - Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
    - Mixed content (text + formula) returns empty results for all formats.
    - OMML conversion is provided as a separate method due to performance overhead.

    Performance optimizations:
    - Pre-compiled regex patterns
    - XSLT-based MML conversion
    - Cached XSLT transforms
    - LRU-cached LaTeX -> MathML conversion

    Note: OMML is obtained by letting Pandoc write a temporary DOCX and
    reading ``word/document.xml`` out of it, so that path does involve DOCX
    parsing.
    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # Pre-compiled regex patterns for formula detection.
    # The display and \(...\) patterns are greedy, so on their own they also
    # full-match text holding several formulas (e.g. "$$a$$ text $$b$$").
    _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
    _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
    # Single-dollar inline math; the body may not contain '$'.
    _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
    _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
    # First <math>...</math> element inside Pandoc's HTML output.
    _RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")

    # Pre-compiled regex patterns for preprocessing
    # Literal "\[1mm]" (only this exact length is recognised).
    _RE_VSPACE = re.compile(r"\\\[1mm\]")
    # \[...\] with non-newline characters on both sides (formula inside a line).
    _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
    # \[...\] starting at the beginning of a line.
    _RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
    # <span class="arithmatex"> wrapper around a formula (single line only).
    _RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
    # Inline "$ ... $" whose content is padded with spaces.
    _RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
    # Column specifier of an array environment, e.g. {c c c}.
    _RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
    # "\left\{" followed by whitespace / whitespace followed by "\right\}".
    _RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
    _RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
    # Environments that are rewritten to array form.
    _RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
    _RE_ALIGNED_BRACE = re.compile(r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL)
    _RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
    # $$ ... \tag{N} $$ display formula with an equation tag.
    _RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
    # Determinant-style matrices (single / double vertical bars).
    _RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
    _RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)

    # Cached XSLT transform (compiled lazily by _get_mml_xslt_transform)
    _mml_xslt_transform = None
|
||
|
||
def __init__(self) -> None:
    """Initialize converter.

    The constructor sets no per-instance state; patterns and the cached
    XSLT transform live on the class.
    """
|
||
|
||
@classmethod
|
||
def _get_mml_xslt_transform(cls):
|
||
"""Get cached XSLT transform for MathML to mml: conversion."""
|
||
if cls._mml_xslt_transform is None:
|
||
from lxml import etree
|
||
xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
|
||
cls._mml_xslt_transform = etree.XSLT(xslt_doc)
|
||
return cls._mml_xslt_transform
|
||
|
||
def _is_formula_only(self, text: str) -> bool:
|
||
"""Check if text contains only a LaTeX formula (no mixed content).
|
||
|
||
A text is considered formula-only if it matches one of these patterns:
|
||
- Display math: $$...$$ or \\[...\\]
|
||
- Inline math: $...$ or \\(...\\)
|
||
|
||
Args:
|
||
text: Input text to check.
|
||
|
||
Returns:
|
||
True if the text contains only a LaTeX formula, False otherwise.
|
||
"""
|
||
text = text.strip()
|
||
|
||
if not text:
|
||
return False
|
||
|
||
# Strict patterns: entire text must be a single formula with delimiters
|
||
# Using pre-compiled patterns with fullmatch semantics
|
||
if self._RE_DISPLAY_DOLLAR.fullmatch(text):
|
||
return True
|
||
if self._RE_DISPLAY_BRACKET.fullmatch(text):
|
||
return True
|
||
if self._RE_INLINE_DOLLAR.fullmatch(text):
|
||
return True
|
||
if self._RE_INLINE_PAREN.fullmatch(text):
|
||
return True
|
||
|
||
return False
|
||
|
||
def convert_to_formats(self, md_text: str) -> ConvertResult:
    """Produce LaTeX, MathML and ``mml:`` MathML for a single formula.

    Input that is empty, whitespace-only, or not a pure formula (see
    ``_is_formula_only``) yields a result whose fields are all empty strings.

    Args:
        md_text: Markdown text to convert.

    Returns:
        ConvertResult whose ``latex`` is the delimiter-free formula as
        given, and whose ``mathml``/``mml`` are derived from the
        preprocessed formula.

    Raises:
        RuntimeError: If any conversion step fails for a pure formula.
    """
    is_blank = not md_text or not md_text.strip()
    if is_blank or not self._is_formula_only(md_text):
        return ConvertResult(latex="", mathml="", mml="")

    try:
        # Delimiters off first; the raw formula is what gets reported back.
        formula = self._extract_latex_formula(md_text)
        # Converters see the normalised formula (matrices, arrays, braces...).
        normalised = self._preprocess_formula_for_conversion(formula)
        standard_mathml = self._latex_to_mathml(normalised)
        prefixed_mathml = self._mathml_to_mml(standard_mathml)
        return ConvertResult(latex=formula, mathml=standard_mathml, mml=prefixed_mathml)
    except Exception as e:
        raise RuntimeError(f"Conversion failed: {e}") from e
|
||
|
||
def convert_to_omml(self, latex_formula: str) -> str:
|
||
"""Convert LaTeX formula to OMML (Office Math Markup Language).
|
||
|
||
This is a separate method due to the performance overhead of OMML conversion,
|
||
which requires creating a temporary DOCX file.
|
||
|
||
The formula is preprocessed using the same logic as export_to_file to ensure
|
||
proper conversion.
|
||
|
||
Args:
|
||
latex_formula: Pure LaTeX formula (without delimiters like $ or $$).
|
||
|
||
Returns:
|
||
OMML representation as XML string.
|
||
|
||
Raises:
|
||
ValueError: If latex_formula is empty.
|
||
RuntimeError: If conversion fails.
|
||
"""
|
||
if not latex_formula or not latex_formula.strip():
|
||
raise ValueError("LaTeX formula cannot be empty")
|
||
|
||
# Preprocess formula using the same preprocessing as export
|
||
preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip())
|
||
|
||
return self._latex_to_omml(preprocessed)
|
||
|
||
def _preprocess_formula_for_conversion(self, latex_formula: str) -> str:
|
||
"""Preprocess LaTeX formula for any conversion (MathML, OMML, etc.).
|
||
|
||
Applies the same preprocessing steps as preprocess_for_export to ensure
|
||
consistency across all conversion paths. This fixes common issues that
|
||
cause Pandoc conversion to fail.
|
||
|
||
Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py),
|
||
so we don't need to handle them here.
|
||
|
||
Args:
|
||
latex_formula: Pure LaTeX formula.
|
||
|
||
Returns:
|
||
Preprocessed LaTeX formula.
|
||
"""
|
||
# 1. Convert matrix environments
|
||
latex_formula = self._convert_matrix_environments(latex_formula)
|
||
|
||
# 2. Fix array column specifiers (remove spaces)
|
||
latex_formula = self._fix_array_column_specifiers(latex_formula)
|
||
|
||
# 3. Fix brace spacing
|
||
latex_formula = self._fix_brace_spacing(latex_formula)
|
||
|
||
# 4. Convert special environments (cases, aligned)
|
||
latex_formula = self._convert_special_environments(latex_formula)
|
||
|
||
return latex_formula
|
||
|
||
def _extract_latex_formula(self, text: str) -> str:
|
||
"""Extract LaTeX formula from text by removing delimiters.
|
||
|
||
Args:
|
||
text: Text containing LaTeX formula with delimiters.
|
||
|
||
Returns:
|
||
Pure LaTeX formula without delimiters.
|
||
"""
|
||
text = text.strip()
|
||
|
||
# Remove display math delimiters: $$...$$ or \[...\]
|
||
if text.startswith("$$") and text.endswith("$$"):
|
||
return text[2:-2].strip()
|
||
if text.startswith("\\[") and text.endswith("\\]"):
|
||
return text[2:-2].strip()
|
||
|
||
# Remove inline math delimiters: $...$ or \(...\)
|
||
if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
|
||
return text[1:-1].strip()
|
||
if text.startswith("\\(") and text.endswith("\\)"):
|
||
return text[2:-2].strip()
|
||
|
||
# If no delimiters, return as-is
|
||
return text.strip()
|
||
|
||
@staticmethod
@lru_cache(maxsize=256)
def _latex_to_mathml_cached(latex_formula: str) -> str:
    """Convert a LaTeX formula to MathML, memoised per formula string.

    Pandoc (``--mathml`` HTML output) is the primary converter. If the
    Pandoc path raises, ``latex2mathml`` is tried instead. Results pass
    through ``_postprocess_mathml_for_word``. Up to 256 distinct formulas
    are kept in an LRU cache.

    Returns:
        The post-processed ``<math>`` element; if Pandoc's output contains
        no ``<math>`` element, that output with trailing newlines removed.

    Raises:
        RuntimeError: If both Pandoc and the latex2mathml fallback fail.
    """
    try:
        # Primary path: Pandoc renders the inline formula as HTML + MathML.
        html_output = pypandoc.convert_text(
            f"${latex_formula}$",
            "html",
            format="markdown+tex_math_dollars",
            extra_args=["--mathml"],
        )
        math_element = Converter._RE_MATH_ELEMENT.search(html_output)
        if math_element is None:
            # Nothing to extract: hand back Pandoc's output unchanged.
            return html_output.rstrip("\n")
        return Converter._postprocess_mathml_for_word(math_element.group(0))

    except Exception as pandoc_error:
        # Fallback: latex2mathml (less Word-compatible)
        try:
            fallback_mathml = latex_to_mathml(latex_formula)
            return Converter._postprocess_mathml_for_word(fallback_mathml)
        except Exception as e:
            raise RuntimeError(
                f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
            ) from e
|
||
|
||
@staticmethod
|
||
def _postprocess_mathml_for_word(mathml: str) -> str:
|
||
"""Post-process MathML to improve Word compatibility.
|
||
|
||
Applies transformations to make MathML more compatible and concise:
|
||
- Remove <semantics> and <annotation> wrappers (Word doesn't need them)
|
||
- Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.)
|
||
- Remove redundant single <mrow> wrappers
|
||
- Change display="inline" to display="block" for better rendering
|
||
- Decode Unicode entities to actual characters (Word prefers this)
|
||
- Ensure proper namespace
|
||
|
||
Args:
|
||
mathml: MathML string.
|
||
|
||
Returns:
|
||
Simplified, Word-compatible MathML string.
|
||
"""
|
||
import re
|
||
|
||
# Step 1: Remove <semantics> and <annotation> wrappers
|
||
# These often cause Word import issues
|
||
if '<semantics>' in mathml:
|
||
# Extract content between <semantics> and <annotation>
|
||
match = re.search(r'<semantics>(.*?)<annotation', mathml, re.DOTALL)
|
||
if match:
|
||
content = match.group(1).strip()
|
||
|
||
# Get the math element attributes
|
||
math_attrs = ""
|
||
math_match = re.search(r'<math([^>]*)>', mathml)
|
||
if math_match:
|
||
math_attrs = math_match.group(1)
|
||
|
||
# Rebuild without semantics
|
||
mathml = f'<math{math_attrs}>{content}</math>'
|
||
|
||
# Step 2: Remove unnecessary attributes that don't affect rendering
|
||
# These are verbose and Word doesn't need them
|
||
unnecessary_attrs = [
|
||
r'\s+form="prefix"',
|
||
r'\s+form="postfix"',
|
||
r'\s+form="infix"',
|
||
r'\s+stretchy="true"',
|
||
r'\s+stretchy="false"',
|
||
r'\s+fence="true"',
|
||
r'\s+fence="false"',
|
||
r'\s+separator="true"',
|
||
r'\s+separator="false"',
|
||
r'\s+columnalign="[^"]*"',
|
||
r'\s+columnspacing="[^"]*"',
|
||
r'\s+rowspacing="[^"]*"',
|
||
r'\s+class="[^"]*"',
|
||
r'\s+style="[^"]*"',
|
||
]
|
||
|
||
for attr_pattern in unnecessary_attrs:
|
||
mathml = re.sub(attr_pattern, '', mathml)
|
||
|
||
# Step 3: Remove redundant single <mrow> wrapper at the top level
|
||
# Pattern: <math ...><mrow>content</mrow></math>
|
||
# Simplify to: <math ...>content</math>
|
||
mrow_pattern = r'(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)'
|
||
match = re.search(mrow_pattern, mathml, re.DOTALL)
|
||
if match:
|
||
# Check if there's only one mrow at the top level
|
||
content = match.group(2)
|
||
# Only remove if the content doesn't have other top-level elements
|
||
if not re.search(r'</[^>]+>\s*<[^/]', content):
|
||
mathml = f'{match.group(1)}{content}{match.group(3)}'
|
||
|
||
# Step 4: Change display to block for better Word rendering
|
||
mathml = mathml.replace('display="inline"', 'display="block"')
|
||
|
||
# Step 5: If no display attribute, add it
|
||
if 'display=' not in mathml and '<math' in mathml:
|
||
mathml = mathml.replace('<math', '<math display="block"', 1)
|
||
|
||
# Step 6: Ensure xmlns is present
|
||
if 'xmlns=' not in mathml and '<math' in mathml:
|
||
mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
|
||
|
||
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
|
||
unicode_map = {
|
||
'+': '+',
|
||
'-': '-',
|
||
'*': '*',
|
||
'/': '/',
|
||
'=': '=',
|
||
'<': '<',
|
||
'>': '>',
|
||
'(': '(',
|
||
')': ')',
|
||
',': ',',
|
||
'.': '.',
|
||
'|': '|',
|
||
'…': '⋯',
|
||
'⋮': '⋮',
|
||
'⋯': '⋯',
|
||
'°': '°',
|
||
'γ': 'γ',
|
||
'φ': 'φ',
|
||
'ϕ': 'ϕ',
|
||
'α': 'α',
|
||
'β': 'β',
|
||
'δ': 'δ',
|
||
'ε': 'ε',
|
||
'θ': 'θ',
|
||
'λ': 'λ',
|
||
'μ': 'μ',
|
||
'π': 'π',
|
||
'ρ': 'ρ',
|
||
'σ': 'σ',
|
||
'τ': 'τ',
|
||
'ω': 'ω',
|
||
}
|
||
|
||
for entity, char in unicode_map.items():
|
||
mathml = mathml.replace(entity, char)
|
||
|
||
# Step 8: Clean up extra whitespace
|
||
mathml = re.sub(r'>\s+<', '><', mathml)
|
||
|
||
return mathml
|
||
|
||
def _latex_to_mathml(self, latex_formula: str) -> str:
    """Convert LaTeX formula to standard MathML.

    Thin instance-level wrapper: delegates to the LRU-cached static helper
    ``_latex_to_mathml_cached``, so repeated formulas are not reconverted.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).

    Returns:
        Standard MathML representation.

    Raises:
        RuntimeError: Raised by the cached helper when both the Pandoc and
            the latex2mathml conversion fail.
    """
    return self._latex_to_mathml_cached(latex_formula)
|
||
|
||
def _mathml_to_mml(self, mathml: str) -> str:
|
||
"""Convert standard MathML to mml:math format with namespace prefix.
|
||
|
||
Uses XSLT for efficient transformation. Transforms:
|
||
- <math ...> to <mml:math xmlns:mml="..." ...>
|
||
- All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>
|
||
|
||
Args:
|
||
mathml: Standard MathML string.
|
||
|
||
Returns:
|
||
MathML with mml: namespace prefix.
|
||
"""
|
||
if not mathml:
|
||
return ""
|
||
|
||
try:
|
||
from lxml import etree
|
||
|
||
# Parse MathML
|
||
root = etree.fromstring(mathml.encode("utf-8"))
|
||
|
||
# Apply XSLT transformation (cached)
|
||
transform = self._get_mml_xslt_transform()
|
||
result_tree = transform(root)
|
||
|
||
# Serialize to string
|
||
return str(result_tree)
|
||
|
||
except Exception:
|
||
# Fallback: simple string replacement (less robust but no lxml dependency)
|
||
result = mathml
|
||
# Add namespace to root math element
|
||
result = re.sub(
|
||
r"<math\b",
|
||
f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
|
||
result,
|
||
)
|
||
result = re.sub(r"</math>", "</mml:math>", result)
|
||
|
||
# Add mml: prefix to all other elements using a single regex
|
||
# Match opening tags
|
||
result = re.sub(
|
||
r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
|
||
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
|
||
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
|
||
r"maction|semantics|annotation|annotation-xml)\b",
|
||
r"<mml:\1",
|
||
result,
|
||
)
|
||
# Match closing tags
|
||
result = re.sub(
|
||
r"</(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
|
||
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
|
||
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
|
||
r"maction|semantics|annotation|annotation-xml)>",
|
||
r"</mml:\1>",
|
||
result,
|
||
)
|
||
|
||
return result
|
||
|
||
def _latex_to_omml(self, latex_formula: str) -> str:
    """Convert LaTeX formula to OMML (Office Math Markup Language).

    Pandoc converts a one-formula markdown file to DOCX; the ``oMath``
    elements are then read from ``word/document.xml`` inside that DOCX.
    Both files live in a private temporary directory that is removed when
    the conversion finishes, whether it succeeds or fails.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).

    Returns:
        OMML representation as XML string: the serialized ``oMath``
        elements joined by newlines ("" if the document contains none).

    Raises:
        RuntimeError: If any step of the conversion fails.
    """
    import zipfile

    try:
        from lxml import etree

        with tempfile.TemporaryDirectory() as workdir:
            # Fixed names inside a private directory: no string surgery on
            # the temp path is needed to derive the output name.
            md_path = os.path.join(workdir, "formula.md")
            docx_path = os.path.join(workdir, "formula.docx")

            # Pandoc reads its input as UTF-8, so write it explicitly as such
            # instead of relying on the platform default encoding.
            with open(md_path, "w", encoding="utf-8") as md_file:
                md_file.write(f"$${latex_formula}$$\n")

            pypandoc.convert_file(
                md_path,
                "docx",
                format=self.INPUT_FORMAT,
                outputfile=docx_path,
            )

            # A DOCX is a ZIP archive; the document body is word/document.xml.
            with zipfile.ZipFile(docx_path) as archive:
                document_xml = archive.read("word/document.xml")

        root = etree.fromstring(document_xml)
        omml_parts = [
            etree.tostring(math, encoding="unicode")
            for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath")
        ]
        return "\n".join(omml_parts)

    except Exception as e:
        raise RuntimeError(f"OMML conversion failed: {e}") from e
|
||
|
||
def preprocess_for_export(self, md_text: str) -> str:
|
||
"""Preprocess markdown text for export to docx/pdf.
|
||
|
||
Handles LaTeX formula formatting, matrix environments, and
|
||
other transformations needed for proper Word/PDF rendering.
|
||
|
||
Uses pre-compiled regex patterns for better performance.
|
||
|
||
Args:
|
||
md_text: Raw markdown text.
|
||
|
||
Returns:
|
||
Preprocessed markdown text.
|
||
"""
|
||
# Replace \[1mm] => \vspace{1mm}
|
||
md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)
|
||
|
||
# Add blank lines around \[...\] block formulas
|
||
md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text)
|
||
md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text)
|
||
|
||
# Remove arithmatex span wrappers
|
||
cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text)
|
||
|
||
# Convert inline formulas: \( \) => $ $
|
||
cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$")
|
||
|
||
# Convert block formulas: \[ \] => $$ $$
|
||
cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$")
|
||
|
||
# Remove spaces between $ and formula content
|
||
cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md)
|
||
|
||
# Convert matrix environments for better Word rendering
|
||
cleaned_md = self._convert_matrix_environments(cleaned_md)
|
||
|
||
# Fix array environment column specifiers (remove spaces)
|
||
cleaned_md = self._fix_array_column_specifiers(cleaned_md)
|
||
|
||
# Fix brace spacing for equation systems
|
||
cleaned_md = self._fix_brace_spacing(cleaned_md)
|
||
|
||
# Convert cases and aligned environments
|
||
cleaned_md = self._convert_special_environments(cleaned_md)
|
||
|
||
# Handle LaTeX \tag{} commands for equation numbering
|
||
cleaned_md = self._convert_tag_commands(cleaned_md)
|
||
|
||
return cleaned_md
|
||
|
||
def _convert_matrix_environments(self, md_text: str) -> str:
|
||
"""Convert vmatrix/Vmatrix to left/right delimited forms.
|
||
|
||
This fixes the vertical line height issues in Word.
|
||
"""
|
||
# vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
|
||
md_text = self._RE_VMATRIX.sub(
|
||
r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
|
||
md_text,
|
||
)
|
||
|
||
# Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
|
||
md_text = self._RE_VMATRIX_DOUBLE.sub(
|
||
r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
|
||
md_text,
|
||
)
|
||
|
||
return md_text
|
||
|
||
def _fix_array_column_specifiers(self, md_text: str) -> str:
|
||
"""Fix array environment column specifiers by removing spaces.
|
||
|
||
Pandoc's OMML converter doesn't accept spaces between column alignment
|
||
specifiers in array environments. This converts patterns like
|
||
{c c c c} to {cccc}.
|
||
"""
|
||
|
||
def remove_spaces_in_specifier(match: re.Match) -> str:
|
||
"""Remove spaces from column specifier."""
|
||
specifier = match.group(1)
|
||
return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"
|
||
|
||
return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)
|
||
|
||
def _fix_brace_spacing(self, md_text: str) -> str:
|
||
"""Fix spacing issues with braces in equation systems.
|
||
|
||
Removes whitespace and adds negative space for proper alignment in Word/OMML.
|
||
"""
|
||
md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
|
||
md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
|
||
return md_text
|
||
|
||
def _convert_special_environments(self, md_text: str) -> str:
|
||
"""Convert cases and aligned environments to array format.
|
||
|
||
These environments have better rendering support in Word/OMML.
|
||
"""
|
||
# Pre-compiled pattern for alignment marker removal
|
||
_re_align_marker = re.compile(r"(^|\\\\)\s*&")
|
||
|
||
def convert_cases(match: re.Match) -> str:
|
||
content = match.group(1)
|
||
return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."
|
||
|
||
md_text = self._RE_CASES.sub(convert_cases, md_text)
|
||
|
||
def convert_aligned_to_array(match: re.Match) -> str:
|
||
content = match.group(1)
|
||
content = _re_align_marker.sub(r"\1", content)
|
||
return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."
|
||
|
||
md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)
|
||
|
||
def convert_standalone_aligned(match: re.Match) -> str:
|
||
content = match.group(1)
|
||
content = _re_align_marker.sub(r"\1", content)
|
||
return r"\begin{array}{l}" + content + r"\end{array}"
|
||
|
||
md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)
|
||
|
||
return md_text
|
||
|
||
def _convert_tag_commands(self, md_text: str) -> str:
|
||
"""Convert LaTeX \\tag{} commands to Word-compatible format.
|
||
|
||
The \\tag{} command is not supported in Word OMML format, so we convert it to
|
||
use simple spacing (\\quad) to push the equation number to the right side.
|
||
"""
|
||
|
||
def convert_tag(match: re.Match) -> str:
|
||
formula_content = match.group(1)
|
||
tag_content = match.group(2)
|
||
return f"$${formula_content} \\quad ({tag_content})$$"
|
||
|
||
return self._RE_TAG.sub(convert_tag, md_text)
|
||
|
||
def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
|
||
"""Export markdown to docx or pdf file.
|
||
|
||
Args:
|
||
md_text: Markdown text to export.
|
||
export_type: Export format, either 'docx' or 'pdf'.
|
||
|
||
Returns:
|
||
bytes of the exported file.
|
||
|
||
Raises:
|
||
ValueError: If export_type is not supported.
|
||
RuntimeError: If export fails.
|
||
|
||
"""
|
||
|
||
# Preprocess markdown
|
||
cleaned_md = self.preprocess_for_export(md_text)
|
||
|
||
# Create temp file for input
|
||
with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
|
||
f_in.write(cleaned_md.encode("utf-8"))
|
||
md_path = f_in.name
|
||
|
||
output_file = md_path + "." + export_type
|
||
|
||
try:
|
||
if export_type == "docx":
|
||
self._export_docx(md_path, output_file)
|
||
with open(output_file, "rb") as f:
|
||
return f.read()
|
||
else: # pdf
|
||
self._export_pdf(md_path, output_file)
|
||
with open(output_file, "rb") as f:
|
||
return f.read()
|
||
|
||
except Exception as e:
|
||
# Cleanup on error
|
||
self._cleanup_files(md_path, output_file)
|
||
raise RuntimeError(f"Export failed: {e}") from e
|
||
finally:
|
||
# Always cleanup input file
|
||
if os.path.exists(md_path):
|
||
os.remove(md_path)
|
||
|
||
def _export_docx(self, input_path: str, output_path: str) -> None:
    """Convert the markdown file at input_path to DOCX at output_path.

    Uses pygments highlighting and the reference document at
    ``app/pkg/reference.docx`` (a relative path, resolved by Pandoc against
    the current working directory).
    """
    pandoc_options = {
        "format": self.INPUT_FORMAT,
        "outputfile": output_path,
        "extra_args": [
            "--highlight-style=pygments",
            "--reference-doc=app/pkg/reference.docx",
        ],
    }
    pypandoc.convert_file(input_path, "docx", **pandoc_options)
|
||
|
||
def _export_pdf(self, input_path: str, output_path: str) -> None:
    """Convert the markdown file at input_path to PDF at output_path.

    Pandoc is run with the XeLaTeX engine, ``Noto Sans CJK SC`` as main
    font and pygments highlighting.
    """
    pandoc_options = {
        "format": self.INPUT_FORMAT,
        "outputfile": output_path,
        "extra_args": [
            "--pdf-engine=xelatex",
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ],
    }
    pypandoc.convert_file(input_path, "pdf", **pandoc_options)
|
||
|
||
def _cleanup_files(self, *paths: str) -> None:
|
||
"""Remove files if they exist."""
|
||
for path in paths:
|
||
if os.path.exists(path):
|
||
os.remove(path)
|
||
|
||
def cleanup_export_file(self, file_path: str) -> None:
|
||
"""Cleanup exported file after sending response.
|
||
|
||
Call this after sending the file to the client.
|
||
|
||
Args:
|
||
file_path: Path to the exported file.
|
||
"""
|
||
if os.path.exists(file_path):
|
||
os.remove(file_path)
|