Files
doc_processer/app/services/converter.py

1007 lines
37 KiB
Python
Raw Normal View History

2025-12-31 17:38:32 +08:00
"""Markdown conversion and export service using pypandoc."""
import os
import re
import tempfile
from dataclasses import dataclass
2026-02-04 12:00:06 +08:00
from functools import lru_cache
2025-12-31 17:38:32 +08:00
from typing import Literal
import pypandoc
2026-02-04 12:00:06 +08:00
from latex2mathml.converter import convert as latex_to_mathml
2025-12-31 17:38:32 +08:00
@dataclass
class ConvertResult:
    """Outcome of converting a markdown snippet into formula formats.

    The fields carry content only when the input was a pure LaTeX formula;
    for mixed content (text + formula) every field is an empty string.

    Attributes:
        latex: The LaTeX formula code with its delimiters removed.
        mathml: The formula as standard MathML.
        mml: The formula as MathML whose elements use the ``mml:`` prefix
            (``mml:math``).
    """

    latex: str
    mathml: str
    mml: str
2025-12-31 17:38:32 +08:00
@dataclass
class ExportResult:
    """Description of a file produced by a markdown export.

    Attributes:
        file_path: Path of the exported file.
        content_type: Content type string associated with the file.
        download_name: File name to offer when the file is downloaded.
    """

    file_path: str
    content_type: str
    download_name: str
# Export formats accepted by the export methods: Word ("docx") or PDF ("pdf").
ExportType = Literal["docx", "pdf"]
2026-02-04 12:00:06 +08:00
# XML namespace URI of MathML elements.
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
# XML namespace URI of OMML (Office Math Markup Language) elements.
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
# XSLT 1.0 stylesheet that re-emits MathML elements (whether or not the input
# declares the MathML namespace) as the same elements with an "mml:" prefix.
# Attributes are copied through and text nodes are preserved.
MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:mml="http://www.w3.org/1998/Math/MathML"
xmlns:m="http://www.w3.org/1998/Math/MathML"
exclude-result-prefixes="m">
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- Match root math element -->
<xsl:template match="m:math|math">
<mml:math>
<xsl:apply-templates select="@*|node()"/>
</mml:math>
</xsl:template>
<!-- Match all other MathML elements -->
<xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
<xsl:element name="mml:{local-name()}">
<xsl:apply-templates select="@*|node()"/>
</xsl:element>
</xsl:template>
<!-- Copy attributes -->
<xsl:template match="@*">
<xsl:if test="local-name() != 'xmlns'">
<xsl:copy/>
</xsl:if>
</xsl:template>
<!-- Copy text nodes -->
<xsl:template match="text()">
<xsl:value-of select="."/>
</xsl:template>
</xsl:stylesheet>
"""
2025-12-31 17:38:32 +08:00
class Converter:
    """Convert LaTeX formulas and export markdown documents.

    Conversion rules:
    - Only a pure LaTeX formula is converted to the latex/mathml/mml formats.
    - Mixed content (text + formula) yields empty results for every format.
    - OMML conversion lives in a separate method because of its performance
      overhead.

    Performance measures:
    - Regex patterns are compiled once, as class attributes.
    - The MathML -> mml: conversion is done with XSLT.
    - The compiled XSLT transform is cached on the class.
    """

    # Input format handed to Pandoc: markdown plus the LaTeX math extensions.
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # --- Formula detection -------------------------------------------------
    _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
    _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
    _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
    _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
    _RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")

    # --- Export preprocessing ----------------------------------------------
    _RE_VSPACE = re.compile(r"\\\[1mm\]")
    _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
    _RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
    _RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
    _RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
    _RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
    _RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
    _RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
    _RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
    _RE_ALIGNED_BRACE = re.compile(r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL)
    _RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
    _RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
    _RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
    _RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)

    # Lazily-built XSLT transform shared by all instances (None until first use).
    _mml_xslt_transform = None
2025-12-31 17:38:32 +08:00
def __init__(self) -> None:
    """Initialize converter.

    No per-instance state is set up here; the method body consists of this
    docstring only.
    """
2026-02-04 12:00:06 +08:00
@classmethod
def _get_mml_xslt_transform(cls):
    """Return the shared MathML -> ``mml:`` XSLT transform, building it once.

    The transform is compiled from ``MML_XSLT`` on the first call and stored
    on the class, so subsequent calls return the same object.
    """
    transform = cls._mml_xslt_transform
    if transform is not None:
        return transform

    # lxml is imported lazily, only when a transform actually has to be built.
    from lxml import etree

    stylesheet = etree.fromstring(MML_XSLT.encode("utf-8"))
    transform = etree.XSLT(stylesheet)
    cls._mml_xslt_transform = transform
    return transform
def _is_formula_only(self, text: str) -> bool:
"""Check if text contains only a LaTeX formula (no mixed content).
A text is considered formula-only if it matches one of these patterns:
- Display math: $$...$$ or \\[...\\]
- Inline math: $...$ or \\(...\\)
Args:
text: Input text to check.
Returns:
True if the text contains only a LaTeX formula, False otherwise.
"""
text = text.strip()
if not text:
return False
# Strict patterns: entire text must be a single formula with delimiters
# Using pre-compiled patterns with fullmatch semantics
if self._RE_DISPLAY_DOLLAR.fullmatch(text):
return True
if self._RE_DISPLAY_BRACKET.fullmatch(text):
return True
if self._RE_INLINE_DOLLAR.fullmatch(text):
return True
if self._RE_INLINE_PAREN.fullmatch(text):
return True
return False
2025-12-31 17:38:32 +08:00
def convert_to_formats(self, md_text: str) -> ConvertResult:
    """Convert a markdown snippet to LaTeX, MathML and ``mml:`` MathML.

    Conversion happens only when the input is a pure LaTeX formula. Empty
    input and mixed content (text + formula) produce a result whose fields
    are all empty strings.

    Args:
        md_text: Markdown text to convert.

    Returns:
        ConvertResult with the latex, mathml and mml fields filled in, or
        all empty when the input is not a pure formula.

    Raises:
        RuntimeError: If converting a detected formula fails.
    """
    is_blank = not md_text or not md_text.strip()
    if is_blank or not self._is_formula_only(md_text):
        # Nothing to convert: empty input or text mixed with a formula.
        return ConvertResult(latex="", mathml="", mml="")

    try:
        # Display vs. inline decides which delimiters Pandoc is given.
        display = self._is_display_formula(md_text)
        # The bare formula (delimiters removed) is what gets reported back.
        latex = self._extract_latex_formula(md_text)
        # The conversion itself runs on a preprocessed copy of the formula.
        prepared = self._preprocess_formula_for_conversion(latex)
        mathml = self._latex_to_mathml(prepared, is_display=display)
        prefixed = self._mathml_to_mml(mathml)
        return ConvertResult(latex=latex, mathml=mathml, mml=prefixed)
    except Exception as e:
        raise RuntimeError(f"Conversion failed: {e}") from e
2026-02-04 12:00:06 +08:00
def convert_to_omml(self, latex_formula: str) -> str:
    """Convert a LaTeX formula to OMML (Office Math Markup Language).

    Kept separate from the other conversions because producing OMML goes
    through a temporary DOCX file and is comparatively slow. The formula is
    trimmed and preprocessed before it is converted.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters like $ or $$).

    Returns:
        OMML representation as an XML string.

    Raises:
        ValueError: If latex_formula is empty or only whitespace.
        RuntimeError: If conversion fails.
    """
    trimmed = latex_formula.strip() if latex_formula else ""
    if not trimmed:
        raise ValueError("LaTeX formula cannot be empty")

    prepared = self._preprocess_formula_for_conversion(trimmed)
    return self._latex_to_omml(prepared)
2026-02-04 15:52:04 +08:00
def _preprocess_formula_for_conversion(self, latex_formula: str) -> str:
"""Preprocess LaTeX formula for any conversion (MathML, OMML, etc.).
2026-02-04 12:45:34 +08:00
Applies the same preprocessing steps as preprocess_for_export to ensure
2026-02-07 21:28:46 +08:00
consistency across all conversion paths. This fixes common issues that
2026-02-04 15:52:04 +08:00
cause Pandoc conversion to fail.
2026-02-04 12:45:34 +08:00
2026-02-07 21:28:46 +08:00
Note: OCR errors (number errors, command spacing) are fixed earlier in the
pipeline (in ocr_service.py), so we don't need to handle them here.
2026-02-04 16:04:18 +08:00
2026-02-04 12:45:34 +08:00
Args:
latex_formula: Pure LaTeX formula.
Returns:
Preprocessed LaTeX formula.
"""
# 1. Convert matrix environments
latex_formula = self._convert_matrix_environments(latex_formula)
2026-02-07 21:28:46 +08:00
2026-02-04 16:04:18 +08:00
# 2. Fix array column specifiers (remove spaces)
2026-02-04 12:45:34 +08:00
latex_formula = self._fix_array_column_specifiers(latex_formula)
2026-02-07 21:28:46 +08:00
2026-02-04 12:45:34 +08:00
# 3. Fix brace spacing
latex_formula = self._fix_brace_spacing(latex_formula)
2026-02-07 21:28:46 +08:00
2026-02-04 12:45:34 +08:00
# 4. Convert special environments (cases, aligned)
latex_formula = self._convert_special_environments(latex_formula)
2026-02-07 21:28:46 +08:00
2026-02-04 12:45:34 +08:00
return latex_formula
2026-02-04 12:00:06 +08:00
2026-02-07 21:28:46 +08:00
def _is_display_formula(self, text: str) -> bool:
"""Check if the formula is a display (block) formula.
Args:
text: Text containing LaTeX formula with delimiters.
Returns:
True if display formula ($$...$$ or \\[...\\]), False if inline.
"""
text = text.strip()
# Display math delimiters: $$...$$ or \[...\]
if text.startswith("$$") and text.endswith("$$"):
return True
if text.startswith("\\[") and text.endswith("\\]"):
return True
# Inline math delimiters: $...$ or \(...\)
return False
2026-02-04 12:00:06 +08:00
def _extract_latex_formula(self, text: str) -> str:
"""Extract LaTeX formula from text by removing delimiters.
Args:
text: Text containing LaTeX formula with delimiters.
Returns:
Pure LaTeX formula without delimiters.
"""
text = text.strip()
# Remove display math delimiters: $$...$$ or \[...\]
if text.startswith("$$") and text.endswith("$$"):
return text[2:-2].strip()
if text.startswith("\\[") and text.endswith("\\]"):
return text[2:-2].strip()
# Remove inline math delimiters: $...$ or \(...\)
if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
return text[1:-1].strip()
if text.startswith("\\(") and text.endswith("\\)"):
return text[2:-2].strip()
# If no delimiters, return as-is
return text.strip()
@staticmethod
@lru_cache(maxsize=256)
def _latex_to_mathml_cached(latex_formula: str, is_display: bool = False) -> str:
    """Convert a LaTeX formula to MathML, memoizing the result.

    Pandoc is tried first; if it raises or its HTML output contains no
    ``<math>`` element, ``latex2mathml`` is used as a fallback. Either result
    is passed through ``_postprocess_mathml_for_word``.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).
        is_display: True if display (block) formula, False if inline.

    Returns:
        Standard MathML representation.

    Raises:
        RuntimeError: If both Pandoc and the latex2mathml fallback fail.
    """
    # Display formulas are handed to Pandoc as $$...$$, inline ones as $...$.
    delimiter = "$$" if is_display else "$"
    pandoc_input = f"{delimiter}{latex_formula}{delimiter}"

    try:
        html = pypandoc.convert_text(
            pandoc_input,
            "html",
            format="markdown+tex_math_dollars",
            extra_args=["--mathml"],
        )
        # Only the <math> element of the generated HTML is of interest.
        found = Converter._RE_MATH_ELEMENT.search(html)
        if found is None:
            # Raised on purpose so the except branch below runs the fallback.
            raise ValueError("Pandoc did not generate MathML, got HTML instead")
        return Converter._postprocess_mathml_for_word(found.group(0))
    except Exception as pandoc_error:
        # Fallback: latex2mathml (less Word-compatible).
        try:
            fallback_mathml = latex_to_mathml(latex_formula)
            return Converter._postprocess_mathml_for_word(fallback_mathml)
        except Exception as e:
            raise RuntimeError(f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}") from e
2026-02-04 15:49:13 +08:00
@staticmethod
def _postprocess_mathml_for_word(mathml: str) -> str:
"""Post-process MathML to improve Word compatibility.
2026-02-07 21:28:46 +08:00
2026-02-04 16:56:20 +08:00
Applies transformations to make MathML more compatible and concise:
2026-02-04 16:12:22 +08:00
- Remove <semantics> and <annotation> wrappers (Word doesn't need them)
2026-02-04 16:56:20 +08:00
- Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.)
- Remove redundant single <mrow> wrappers
2026-02-04 15:49:13 +08:00
- Change display="inline" to display="block" for better rendering
- Decode Unicode entities to actual characters (Word prefers this)
2026-02-04 16:12:22 +08:00
- Ensure proper namespace
2026-02-07 21:28:46 +08:00
2026-02-04 15:49:13 +08:00
Args:
mathml: MathML string.
2026-02-07 21:28:46 +08:00
2026-02-04 15:49:13 +08:00
Returns:
2026-02-04 16:56:20 +08:00
Simplified, Word-compatible MathML string.
2026-02-04 15:49:13 +08:00
"""
2026-02-04 16:12:22 +08:00
import re
2026-02-07 21:28:46 +08:00
2026-02-04 16:12:22 +08:00
# Step 1: Remove <semantics> and <annotation> wrappers
# These often cause Word import issues
2026-02-07 21:28:46 +08:00
if "<semantics>" in mathml:
2026-02-04 16:12:22 +08:00
# Extract content between <semantics> and <annotation>
2026-02-07 21:28:46 +08:00
match = re.search(r"<semantics>(.*?)<annotation", mathml, re.DOTALL)
2026-02-04 16:12:22 +08:00
if match:
content = match.group(1).strip()
2026-02-07 21:28:46 +08:00
2026-02-04 16:12:22 +08:00
# Get the math element attributes
math_attrs = ""
2026-02-07 21:28:46 +08:00
math_match = re.search(r"<math([^>]*)>", mathml)
2026-02-04 16:12:22 +08:00
if math_match:
math_attrs = math_match.group(1)
2026-02-07 21:28:46 +08:00
2026-02-04 16:12:22 +08:00
# Rebuild without semantics
2026-02-07 21:28:46 +08:00
mathml = f"<math{math_attrs}>{content}</math>"
2026-02-04 16:56:20 +08:00
# Step 2: Remove unnecessary attributes that don't affect rendering
# These are verbose and Word doesn't need them
unnecessary_attrs = [
r'\s+form="prefix"',
r'\s+form="postfix"',
r'\s+form="infix"',
r'\s+stretchy="true"',
r'\s+stretchy="false"',
r'\s+fence="true"',
r'\s+fence="false"',
r'\s+separator="true"',
r'\s+separator="false"',
r'\s+columnalign="[^"]*"',
r'\s+columnspacing="[^"]*"',
r'\s+rowspacing="[^"]*"',
r'\s+class="[^"]*"',
r'\s+style="[^"]*"',
]
2026-02-07 21:28:46 +08:00
2026-02-04 16:56:20 +08:00
for attr_pattern in unnecessary_attrs:
2026-02-07 21:28:46 +08:00
mathml = re.sub(attr_pattern, "", mathml)
2026-02-04 16:56:20 +08:00
# Step 3: Remove redundant single <mrow> wrapper at the top level
# Pattern: <math ...><mrow>content</mrow></math>
# Simplify to: <math ...>content</math>
2026-02-07 21:28:46 +08:00
mrow_pattern = r"(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)"
2026-02-04 16:56:20 +08:00
match = re.search(mrow_pattern, mathml, re.DOTALL)
if match:
# Check if there's only one mrow at the top level
content = match.group(2)
# Only remove if the content doesn't have other top-level elements
2026-02-07 21:28:46 +08:00
if not re.search(r"</[^>]+>\s*<[^/]", content):
mathml = f"{match.group(1)}{content}{match.group(3)}"
2026-02-04 16:56:20 +08:00
# Step 4: Change display to block for better Word rendering
2026-02-04 15:49:13 +08:00
mathml = mathml.replace('display="inline"', 'display="block"')
2026-02-07 21:28:46 +08:00
2026-02-04 16:56:20 +08:00
# Step 5: If no display attribute, add it
2026-02-07 21:28:46 +08:00
if "display=" not in mathml and "<math" in mathml:
mathml = mathml.replace("<math", '<math display="block"', 1)
2026-02-04 16:56:20 +08:00
# Step 6: Ensure xmlns is present
2026-02-07 21:28:46 +08:00
if "xmlns=" not in mathml and "<math" in mathml:
mathml = mathml.replace("<math", '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
2026-02-04 16:56:20 +08:00
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
2026-02-04 15:49:13 +08:00
unicode_map = {
2026-02-05 13:18:55 +08:00
# Basic operators
2026-02-07 21:28:46 +08:00
"&#x0002B;": "+",
"&#x0002D;": "-",
"&#x0002A;": "*",
"&#x0002F;": "/",
"&#x0003D;": "=",
"&#x0003C;": "<",
"&#x0003E;": ">",
"&#x00028;": "(",
"&#x00029;": ")",
"&#x0002C;": ",",
"&#x0002E;": ".",
"&#x0007C;": "|",
"&#x00B0;": "°",
"&#x00D7;": "×", # times
"&#x00F7;": "÷", # div
"&#x00B1;": "±", # pm
"&#x2213;": "", # mp
2026-02-05 13:18:55 +08:00
# Ellipsis symbols
2026-02-07 21:28:46 +08:00
"&#x02026;": "", # ldots (horizontal)
"&#x022EE;": "", # vdots (vertical)
"&#x022EF;": "", # cdots (centered)
"&#x022F0;": "", # iddots (diagonal up)
"&#x022F1;": "", # ddots (diagonal down)
2026-02-05 13:18:55 +08:00
# Greek letters (lowercase)
2026-02-07 21:28:46 +08:00
"&#x03B1;": "α", # alpha
"&#x03B2;": "β", # beta
"&#x03B3;": "γ", # gamma
"&#x03B4;": "δ", # delta
"&#x03B5;": "ε", # epsilon
"&#x03B6;": "ζ", # zeta
"&#x03B7;": "η", # eta
"&#x03B8;": "θ", # theta
"&#x03B9;": "ι", # iota
"&#x03BA;": "κ", # kappa
"&#x03BB;": "λ", # lambda
"&#x03BC;": "μ", # mu
"&#x03BD;": "ν", # nu
"&#x03BE;": "ξ", # xi
"&#x03BF;": "ο", # omicron
"&#x03C0;": "π", # pi
"&#x03C1;": "ρ", # rho
"&#x03C2;": "ς", # final sigma
"&#x03C3;": "σ", # sigma
"&#x03C4;": "τ", # tau
"&#x03C5;": "υ", # upsilon
"&#x03C6;": "φ", # phi
"&#x03C7;": "χ", # chi
"&#x03C8;": "ψ", # psi
"&#x03C9;": "ω", # omega
"&#x03D5;": "ϕ", # phi variant
2026-02-05 13:18:55 +08:00
# Greek letters (uppercase)
2026-02-07 21:28:46 +08:00
"&#x0391;": "Α", # Alpha
"&#x0392;": "Β", # Beta
"&#x0393;": "Γ", # Gamma
"&#x0394;": "Δ", # Delta
"&#x0395;": "Ε", # Epsilon
"&#x0396;": "Ζ", # Zeta
"&#x0397;": "Η", # Eta
"&#x0398;": "Θ", # Theta
"&#x0399;": "Ι", # Iota
"&#x039A;": "Κ", # Kappa
"&#x039B;": "Λ", # Lambda
"&#x039C;": "Μ", # Mu
"&#x039D;": "Ν", # Nu
"&#x039E;": "Ξ", # Xi
"&#x039F;": "Ο", # Omicron
"&#x03A0;": "Π", # Pi
"&#x03A1;": "Ρ", # Rho
"&#x03A3;": "Σ", # Sigma
"&#x03A4;": "Τ", # Tau
"&#x03A5;": "Υ", # Upsilon
"&#x03A6;": "Φ", # Phi
"&#x03A7;": "Χ", # Chi
"&#x03A8;": "Ψ", # Psi
"&#x03A9;": "Ω", # Omega
2026-02-05 13:18:55 +08:00
# Math symbols
2026-02-07 21:28:46 +08:00
"&#x2205;": "", # emptyset
"&#x2208;": "", # in
"&#x2209;": "", # notin
"&#x220B;": "", # ni
"&#x220C;": "", # nni
"&#x2211;": "", # sum
"&#x220F;": "", # prod
"&#x221A;": "", # sqrt
"&#x221B;": "", # cbrt
"&#x221C;": "", # fourthroot
"&#x221E;": "", # infty
"&#x2229;": "", # cap
"&#x222A;": "", # cup
"&#x222B;": "", # int
"&#x222C;": "", # iint
"&#x222D;": "", # iiint
"&#x222E;": "", # oint
"&#x2282;": "", # subset
"&#x2283;": "", # supset
"&#x2284;": "", # nsubset
"&#x2285;": "", # nsupset
"&#x2286;": "", # subseteq
"&#x2287;": "", # supseteq
"&#x2288;": "", # nsubseteq
"&#x2289;": "", # nsupseteq
"&#x2264;": "", # leq
"&#x2265;": "", # geq
"&#x2260;": "", # neq
"&#x2261;": "", # equiv
"&#x2248;": "", # approx
"&#x2243;": "", # simeq
"&#x2245;": "", # cong
"&#x2202;": "", # partial
"&#x2207;": "", # nabla
"&#x2200;": "", # forall
"&#x2203;": "", # exists
"&#x2204;": "", # nexists
"&#x00AC;": "¬", # neg/lnot
"&#x2227;": "", # wedge/land
"&#x2228;": "", # vee/lor
"&#x2192;": "", # to/rightarrow
"&#x2190;": "", # leftarrow
"&#x2194;": "", # leftrightarrow
"&#x21D2;": "", # Rightarrow
"&#x21D0;": "", # Leftarrow
"&#x21D4;": "", # Leftrightarrow
"&#x2191;": "", # uparrow
"&#x2193;": "", # downarrow
"&#x21D1;": "", # Uparrow
"&#x21D3;": "", # Downarrow
"&#x2195;": "", # updownarrow
"&#x21D5;": "", # Updownarrow
"&#x2260;": "", # ne
"&#x226A;": "", # ll
"&#x226B;": "", # gg
"&#x2A7D;": "", # leqslant
"&#x2A7E;": "", # geqslant
"&#x22A5;": "", # perp
"&#x2225;": "", # parallel
"&#x2220;": "", # angle
"&#x25B3;": "", # triangle
"&#x25A1;": "", # square
"&#x25CA;": "", # diamond
"&#x2660;": "", # spadesuit
"&#x2661;": "", # heartsuit
"&#x2662;": "", # diamondsuit
"&#x2663;": "", # clubsuit
"&#x2113;": "", # ell
"&#x2118;": "", # wp (Weierstrass p)
"&#x211C;": "", # Re (real part)
"&#x2111;": "", # Im (imaginary part)
"&#x2135;": "", # aleph
"&#x2136;": "", # beth
2026-02-04 15:49:13 +08:00
}
2026-02-07 21:28:46 +08:00
2026-02-04 15:49:13 +08:00
for entity, char in unicode_map.items():
mathml = mathml.replace(entity, char)
2026-02-07 21:28:46 +08:00
2026-02-05 13:18:55 +08:00
# Also handle decimal entity format (&#NNNN;) for common characters
# Convert decimal to hex-based lookup
decimal_patterns = [
2026-02-07 21:28:46 +08:00
(r"&#955;", "λ"), # lambda (decimal 955 = hex 03BB)
(r"&#8942;", ""), # vdots (decimal 8942 = hex 22EE)
(r"&#8943;", ""), # cdots (decimal 8943 = hex 22EF)
(r"&#8230;", ""), # ldots (decimal 8230 = hex 2026)
(r"&#8734;", ""), # infty (decimal 8734 = hex 221E)
(r"&#8721;", ""), # sum (decimal 8721 = hex 2211)
(r"&#8719;", ""), # prod (decimal 8719 = hex 220F)
(r"&#8730;", ""), # sqrt (decimal 8730 = hex 221A)
(r"&#8712;", ""), # in (decimal 8712 = hex 2208)
(r"&#8713;", ""), # notin (decimal 8713 = hex 2209)
(r"&#8745;", ""), # cap (decimal 8745 = hex 2229)
(r"&#8746;", ""), # cup (decimal 8746 = hex 222A)
(r"&#8804;", ""), # leq (decimal 8804 = hex 2264)
(r"&#8805;", ""), # geq (decimal 8805 = hex 2265)
(r"&#8800;", ""), # neq (decimal 8800 = hex 2260)
(r"&#8776;", ""), # approx (decimal 8776 = hex 2248)
(r"&#8801;", ""), # equiv (decimal 8801 = hex 2261)
2026-02-05 13:18:55 +08:00
]
2026-02-07 21:28:46 +08:00
2026-02-05 13:18:55 +08:00
for pattern, char in decimal_patterns:
mathml = mathml.replace(pattern, char)
2026-02-07 21:28:46 +08:00
2026-02-04 16:56:20 +08:00
# Step 8: Clean up extra whitespace
2026-02-07 21:28:46 +08:00
mathml = re.sub(r">\s+<", "><", mathml)
2026-02-04 15:49:13 +08:00
return mathml
2026-02-04 12:00:06 +08:00
2026-02-07 21:28:46 +08:00
def _latex_to_mathml(self, latex_formula: str, is_display: bool = False) -> str:
    """Convert a LaTeX formula to standard MathML.

    Thin instance-level entry point that delegates to the memoized
    ``_latex_to_mathml_cached``.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).
        is_display: True if display (block) formula, False if inline.

    Returns:
        Standard MathML representation.
    """
    cached_convert = self._latex_to_mathml_cached
    return cached_convert(latex_formula, is_display=is_display)
2026-02-04 12:00:06 +08:00
def _mathml_to_mml(self, mathml: str) -> str:
"""Convert standard MathML to mml:math format with namespace prefix.
Uses XSLT for efficient transformation. Transforms:
- <math ...> to <mml:math xmlns:mml="..." ...>
- All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>
Args:
mathml: Standard MathML string.
Returns:
MathML with mml: namespace prefix.
"""
if not mathml:
return ""
try:
from lxml import etree
# Parse MathML
root = etree.fromstring(mathml.encode("utf-8"))
# Apply XSLT transformation (cached)
transform = self._get_mml_xslt_transform()
result_tree = transform(root)
# Serialize to string
return str(result_tree)
except Exception:
# Fallback: simple string replacement (less robust but no lxml dependency)
result = mathml
# Add namespace to root math element
result = re.sub(
r"<math\b",
f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
result,
)
result = re.sub(r"</math>", "</mml:math>", result)
# Add mml: prefix to all other elements using a single regex
# Match opening tags
result = re.sub(
r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
r"maction|semantics|annotation|annotation-xml)\b",
r"<mml:\1",
result,
)
# Match closing tags
result = re.sub(
r"</(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
r"maction|semantics|annotation|annotation-xml)>",
r"</mml:\1>",
result,
)
return result
def _latex_to_omml(self, latex_formula: str) -> str:
    """Convert a LaTeX formula to OMML (Office Math Markup Language).

    Writes the formula (wrapped in ``$$``) to a markdown file inside a
    private temporary directory, lets Pandoc convert it to DOCX, and extracts
    every ``oMath`` element from the DOCX's ``word/document.xml``. The
    temporary directory and both files are removed on every exit path.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).

    Returns:
        The serialized ``oMath`` elements joined by newlines; an empty
        string if the generated document contains none.

    Raises:
        RuntimeError: If any step of the conversion fails.
    """
    import zipfile

    try:
        from lxml import etree

        # A dedicated directory gives unpredictable, collision-free paths for
        # both files and guarantees cleanup even if writing or Pandoc fails.
        with tempfile.TemporaryDirectory() as work_dir:
            md_path = os.path.join(work_dir, "formula.md")
            docx_path = os.path.join(work_dir, "formula.docx")

            # Explicit UTF-8 so non-ASCII formula content does not depend on
            # the platform's default encoding.
            with open(md_path, "w", encoding="utf-8") as md_file:
                md_file.write(f"$${latex_formula}$$\n")

            pypandoc.convert_file(
                md_path,
                "docx",
                format=self.INPUT_FORMAT,
                outputfile=docx_path,
            )

            # A DOCX is a ZIP archive; the body lives in word/document.xml.
            with zipfile.ZipFile(docx_path, "r") as archive:
                document_xml = archive.read("word/document.xml")

        root = etree.fromstring(document_xml)
        omml_parts = [
            etree.tostring(math, encoding="unicode")
            for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath")
        ]
        return "\n".join(omml_parts)
    except Exception as e:
        raise RuntimeError(f"OMML conversion failed: {e}") from e
2025-12-31 17:38:32 +08:00
def preprocess_for_export(self, md_text: str) -> str:
"""Preprocess markdown text for export to docx/pdf.
Handles LaTeX formula formatting, matrix environments, and
other transformations needed for proper Word/PDF rendering.
2026-02-04 12:00:06 +08:00
Uses pre-compiled regex patterns for better performance.
2025-12-31 17:38:32 +08:00
Args:
md_text: Raw markdown text.
Returns:
Preprocessed markdown text.
"""
# Replace \[1mm] => \vspace{1mm}
2026-02-04 12:00:06 +08:00
md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)
2025-12-31 17:38:32 +08:00
# Add blank lines around \[...\] block formulas
2026-02-04 12:00:06 +08:00
md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text)
md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text)
2025-12-31 17:38:32 +08:00
# Remove arithmatex span wrappers
2026-02-04 12:00:06 +08:00
cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text)
2025-12-31 17:38:32 +08:00
# Convert inline formulas: \( \) => $ $
2026-02-04 12:00:06 +08:00
cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$")
2025-12-31 17:38:32 +08:00
# Convert block formulas: \[ \] => $$ $$
2026-02-04 12:00:06 +08:00
cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$")
2025-12-31 17:38:32 +08:00
# Remove spaces between $ and formula content
2026-02-04 12:00:06 +08:00
cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md)
2025-12-31 17:38:32 +08:00
# Convert matrix environments for better Word rendering
cleaned_md = self._convert_matrix_environments(cleaned_md)
2026-01-14 14:18:00 +08:00
# Fix array environment column specifiers (remove spaces)
cleaned_md = self._fix_array_column_specifiers(cleaned_md)
2025-12-31 17:38:32 +08:00
# Fix brace spacing for equation systems
cleaned_md = self._fix_brace_spacing(cleaned_md)
# Convert cases and aligned environments
cleaned_md = self._convert_special_environments(cleaned_md)
2026-01-14 14:18:00 +08:00
# Handle LaTeX \tag{} commands for equation numbering
cleaned_md = self._convert_tag_commands(cleaned_md)
2025-12-31 17:38:32 +08:00
return cleaned_md
def _convert_matrix_environments(self, md_text: str) -> str:
"""Convert vmatrix/Vmatrix to left/right delimited forms.
This fixes the vertical line height issues in Word.
"""
# vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
2026-02-04 12:00:06 +08:00
md_text = self._RE_VMATRIX.sub(
2025-12-31 17:38:32 +08:00
r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
md_text,
)
# Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
2026-02-04 12:00:06 +08:00
md_text = self._RE_VMATRIX_DOUBLE.sub(
2025-12-31 17:38:32 +08:00
r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
md_text,
)
return md_text
2026-01-14 14:18:00 +08:00
def _fix_array_column_specifiers(self, md_text: str) -> str:
"""Fix array environment column specifiers by removing spaces.
Pandoc's OMML converter doesn't accept spaces between column alignment
specifiers in array environments. This converts patterns like
{c c c c} to {cccc}.
"""
def remove_spaces_in_specifier(match: re.Match) -> str:
"""Remove spaces from column specifier."""
specifier = match.group(1)
2026-02-04 12:00:06 +08:00
return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"
2026-01-14 14:18:00 +08:00
2026-02-04 12:00:06 +08:00
return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)
2026-01-14 14:18:00 +08:00
2025-12-31 17:38:32 +08:00
def _fix_brace_spacing(self, md_text: str) -> str:
"""Fix spacing issues with braces in equation systems.
Removes whitespace and adds negative space for proper alignment in Word/OMML.
"""
2026-02-04 12:00:06 +08:00
md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
2025-12-31 17:38:32 +08:00
return md_text
def _convert_special_environments(self, md_text: str) -> str:
"""Convert cases and aligned environments to array format.
These environments have better rendering support in Word/OMML.
"""
2026-02-04 12:00:06 +08:00
# Pre-compiled pattern for alignment marker removal
_re_align_marker = re.compile(r"(^|\\\\)\s*&")
2025-12-31 17:38:32 +08:00
def convert_cases(match: re.Match) -> str:
content = match.group(1)
return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."
2026-02-04 12:00:06 +08:00
md_text = self._RE_CASES.sub(convert_cases, md_text)
2025-12-31 17:38:32 +08:00
def convert_aligned_to_array(match: re.Match) -> str:
content = match.group(1)
2026-02-04 12:00:06 +08:00
content = _re_align_marker.sub(r"\1", content)
2025-12-31 17:38:32 +08:00
return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."
2026-02-04 12:00:06 +08:00
md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)
2025-12-31 17:38:32 +08:00
def convert_standalone_aligned(match: re.Match) -> str:
content = match.group(1)
2026-02-04 12:00:06 +08:00
content = _re_align_marker.sub(r"\1", content)
2025-12-31 17:38:32 +08:00
return r"\begin{array}{l}" + content + r"\end{array}"
2026-02-04 12:00:06 +08:00
md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)
2025-12-31 17:38:32 +08:00
return md_text
2026-01-14 14:18:00 +08:00
def _convert_tag_commands(self, md_text: str) -> str:
"""Convert LaTeX \\tag{} commands to Word-compatible format.
The \\tag{} command is not supported in Word OMML format, so we convert it to
2026-02-04 12:00:06 +08:00
use simple spacing (\\quad) to push the equation number to the right side.
2026-01-14 14:18:00 +08:00
"""
def convert_tag(match: re.Match) -> str:
formula_content = match.group(1)
tag_content = match.group(2)
return f"$${formula_content} \\quad ({tag_content})$$"
2026-02-04 12:00:06 +08:00
return self._RE_TAG.sub(convert_tag, md_text)
2026-01-14 14:18:00 +08:00
2025-12-31 17:38:32 +08:00
def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
"""Export markdown to docx or pdf file.
Args:
md_text: Markdown text to export.
export_type: Export format, either 'docx' or 'pdf'.
Returns:
bytes of the exported file.
Raises:
ValueError: If export_type is not supported.
RuntimeError: If export fails.
"""
# Preprocess markdown
cleaned_md = self.preprocess_for_export(md_text)
# Create temp file for input
with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
f_in.write(cleaned_md.encode("utf-8"))
md_path = f_in.name
output_file = md_path + "." + export_type
try:
if export_type == "docx":
self._export_docx(md_path, output_file)
with open(output_file, "rb") as f:
return f.read()
else: # pdf
self._export_pdf(md_path, output_file)
with open(output_file, "rb") as f:
return f.read()
except Exception as e:
# Cleanup on error
self._cleanup_files(md_path, output_file)
raise RuntimeError(f"Export failed: {e}") from e
finally:
# Always cleanup input file
if os.path.exists(md_path):
os.remove(md_path)
def _export_docx(self, input_path: str, output_path: str) -> None:
    """Convert the markdown file at input_path to a DOCX at output_path.

    Pandoc is run through pypandoc with pygments highlighting and the
    reference document ``app/pkg/reference.docx``.
    """
    pandoc_options = ["--highlight-style=pygments"]
    pandoc_options.append("--reference-doc=app/pkg/reference.docx")
    pypandoc.convert_file(
        input_path,
        "docx",
        format=self.INPUT_FORMAT,
        outputfile=output_path,
        extra_args=pandoc_options,
    )
def _export_pdf(self, input_path: str, output_path: str) -> None:
    """Convert the markdown file at input_path to a PDF at output_path.

    Pandoc is run through pypandoc with the XeLaTeX engine, the
    "Noto Sans CJK SC" main font and pygments highlighting.
    """
    pandoc_options = ["--pdf-engine=xelatex"]
    pandoc_options += ["-V", "mainfont=Noto Sans CJK SC"]
    pandoc_options.append("--highlight-style=pygments")
    pypandoc.convert_file(
        input_path,
        "pdf",
        format=self.INPUT_FORMAT,
        outputfile=output_path,
        extra_args=pandoc_options,
    )
def _cleanup_files(self, *paths: str) -> None:
"""Remove files if they exist."""
for path in paths:
if os.path.exists(path):
os.remove(path)
def cleanup_export_file(self, file_path: str) -> None:
    """Delete an exported file once it is no longer needed.

    Call this after sending the file to the client. A path that does not
    exist is ignored.

    Args:
        file_path: Path to the exported file.
    """
    if not os.path.exists(file_path):
        return
    os.remove(file_path)