"""Markdown conversion and export service using pypandoc.""" import os import re import tempfile from dataclasses import dataclass from functools import lru_cache from typing import Literal import pypandoc from latex2mathml.converter import convert as latex_to_mathml @dataclass class ConvertResult: """Result of markdown conversion. Only populated when input contains pure LaTeX formula. All fields are empty strings when input contains mixed content (text + formula). Attributes: latex: Pure LaTeX formula code (without delimiters). mathml: Standard MathML format. mml: XML MathML with mml: namespace prefix (mml:math). """ latex: str mathml: str mml: str @dataclass class ExportResult: """Result of markdown export.""" file_path: str content_type: str download_name: str ExportType = Literal["docx", "pdf"] # MathML namespace MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML" OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math" # XSLT for MathML to mml: namespace conversion MML_XSLT = """ """ class Converter: """Service for conversion and export operations. Conversion rules: - Only pure LaTeX formulas can be converted to latex/mathml/mml formats. - Mixed content (text + formula) returns empty results for all formats. - OMML conversion is provided as a separate method due to performance overhead. Performance optimizations: - Pre-compiled regex patterns - XSLT-based MML conversion - Cached XSLT transforms - Direct Pandoc OMML output (avoids DOCX parsing) """ # Pandoc input format with LaTeX math extensions INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash" # Pre-compiled regex patterns for formula detection _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$") _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]") _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)") _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)") _RE_MATH_ELEMENT = re.compile(r"]*>[\s\S]*?") # Pre-compiled regex patterns for preprocessing _RE_VSPACE = re.compile(r"\\\[1mm\]") _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL) _RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL) _RE_ARITHMATEX = re.compile(r'(.*?)') _RE_INLINE_SPACE = re.compile(r"(? bool: """Check if text contains only a LaTeX formula (no mixed content). A text is considered formula-only if it matches one of these patterns: - Display math: $$...$$ or \\[...\\] - Inline math: $...$ or \\(...\\) Args: text: Input text to check. Returns: True if the text contains only a LaTeX formula, False otherwise. """ text = text.strip() if not text: return False # Strict patterns: entire text must be a single formula with delimiters # Using pre-compiled patterns with fullmatch semantics if self._RE_DISPLAY_DOLLAR.fullmatch(text): return True if self._RE_DISPLAY_BRACKET.fullmatch(text): return True if self._RE_INLINE_DOLLAR.fullmatch(text): return True if self._RE_INLINE_PAREN.fullmatch(text): return True return False def convert_to_formats(self, md_text: str) -> ConvertResult: """Convert markdown to LaTeX, MathML, and MML formats. Only converts when input contains a pure LaTeX formula. Mixed content (text + formula) returns empty strings for all fields. Args: md_text: Markdown text to convert. Returns: ConvertResult with latex, mathml, and mml fields. All fields are empty if input is not a pure formula. Raises: RuntimeError: If conversion fails for a valid formula. """ # Empty input returns empty result if not md_text or not md_text.strip(): return ConvertResult(latex="", mathml="", mml="") # Check if input is formula-only if not self._is_formula_only(md_text): # Mixed content: cannot convert to formula formats return ConvertResult(latex="", mathml="", mml="") try: # Detect if formula is display (block) or inline is_display = self._is_display_formula(md_text) # Extract the LaTeX formula content (remove delimiters) latex_formula = self._extract_latex_formula(md_text) # Preprocess formula for better conversion (fix array specifiers, etc.) preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula) # Convert to MathML (pass display flag to use correct delimiters) mathml = self._latex_to_mathml(preprocessed_formula, is_display=is_display) # Convert MathML to mml:math format (with namespace prefix) mml = self._mathml_to_mml(mathml) return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml) except Exception as e: raise RuntimeError(f"Conversion failed: {e}") from e def convert_to_omml(self, latex_formula: str) -> str: """Convert LaTeX formula to OMML (Office Math Markup Language). This is a separate method due to the performance overhead of OMML conversion, which requires creating a temporary DOCX file. The formula is preprocessed using the same logic as export_to_file to ensure proper conversion. Args: latex_formula: Pure LaTeX formula (without delimiters like $ or $$). Returns: OMML representation as XML string. Raises: ValueError: If latex_formula is empty. RuntimeError: If conversion fails. """ if not latex_formula or not latex_formula.strip(): raise ValueError("LaTeX formula cannot be empty") # Preprocess formula using the same preprocessing as export preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip()) return self._latex_to_omml(preprocessed) def _preprocess_formula_for_conversion(self, latex_formula: str) -> str: """Preprocess LaTeX formula for any conversion (MathML, OMML, etc.). Applies the same preprocessing steps as preprocess_for_export to ensure consistency across all conversion paths. This fixes common issues that cause Pandoc conversion to fail. Note: OCR errors (number errors, command spacing) are fixed earlier in the pipeline (in ocr_service.py), so we don't need to handle them here. Args: latex_formula: Pure LaTeX formula. Returns: Preprocessed LaTeX formula. """ # 1. Convert matrix environments latex_formula = self._convert_matrix_environments(latex_formula) # 2. Fix array column specifiers (remove spaces) latex_formula = self._fix_array_column_specifiers(latex_formula) # 3. Fix brace spacing latex_formula = self._fix_brace_spacing(latex_formula) # 4. Convert special environments (cases, aligned) latex_formula = self._convert_special_environments(latex_formula) return latex_formula def _is_display_formula(self, text: str) -> bool: """Check if the formula is a display (block) formula. Args: text: Text containing LaTeX formula with delimiters. Returns: True if display formula ($$...$$ or \\[...\\]), False if inline. """ text = text.strip() # Display math delimiters: $$...$$ or \[...\] if text.startswith("$$") and text.endswith("$$"): return True if text.startswith("\\[") and text.endswith("\\]"): return True # Inline math delimiters: $...$ or \(...\) return False def _extract_latex_formula(self, text: str) -> str: """Extract LaTeX formula from text by removing delimiters. Args: text: Text containing LaTeX formula with delimiters. Returns: Pure LaTeX formula without delimiters. """ text = text.strip() # Remove display math delimiters: $$...$$ or \[...\] if text.startswith("$$") and text.endswith("$$"): return text[2:-2].strip() if text.startswith("\\[") and text.endswith("\\]"): return text[2:-2].strip() # Remove inline math delimiters: $...$ or \(...\) if text.startswith("$") and text.endswith("$") and not text.startswith("$$"): return text[1:-1].strip() if text.startswith("\\(") and text.endswith("\\)"): return text[2:-2].strip() # If no delimiters, return as-is return text.strip() @staticmethod @lru_cache(maxsize=256) def _latex_to_mathml_cached(latex_formula: str, is_display: bool = False) -> str: """Cached conversion of LaTeX formula to MathML. Uses Pandoc for conversion to ensure Word compatibility. Pandoc generates standard MathML that Word can properly import. Args: latex_formula: Pure LaTeX formula (without delimiters). is_display: True if display (block) formula, False if inline. Returns: Standard MathML representation. """ # Use appropriate delimiters based on formula type # Display formulas use $$...$$, inline formulas use $...$ if is_display: pandoc_input = f"$${latex_formula}$$" else: pandoc_input = f"${latex_formula}$" try: # Use Pandoc for Word-compatible MathML (primary method) mathml_html = pypandoc.convert_text( pandoc_input, "html", format="markdown+tex_math_dollars", extra_args=["--mathml"], ) # Extract just the element from the HTML match = Converter._RE_MATH_ELEMENT.search(mathml_html) if match: mathml = match.group(0) # Post-process for Word compatibility return Converter._postprocess_mathml_for_word(mathml) # If Pandoc didn't generate MathML (returned HTML instead), use fallback # This happens when Pandoc's mathml output format is not available or fails raise ValueError("Pandoc did not generate MathML, got HTML instead") except Exception as pandoc_error: # Fallback: try latex2mathml (less Word-compatible) try: mathml = latex_to_mathml(latex_formula) return Converter._postprocess_mathml_for_word(mathml) except Exception as e: raise RuntimeError(f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}") from e @staticmethod def _postprocess_mathml_for_word(mathml: str) -> str: """Post-process MathML to improve Word compatibility. Applies transformations to make MathML more compatible and concise: - Remove and wrappers (Word doesn't need them) - Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.) - Remove redundant single wrappers - Change display="inline" to display="block" for better rendering - Decode Unicode entities to actual characters (Word prefers this) - Ensure proper namespace Args: mathml: MathML string. Returns: Simplified, Word-compatible MathML string. """ import re # Step 1: Remove and wrappers # These often cause Word import issues if "" in mathml: # Extract content between and match = re.search(r"(.*?)]*)>", mathml) if math_match: math_attrs = math_match.group(1) # Rebuild without semantics mathml = f"{content}" # Step 2: Remove unnecessary attributes that don't affect rendering # These are verbose and Word doesn't need them unnecessary_attrs = [ r'\s+form="prefix"', r'\s+form="postfix"', r'\s+form="infix"', r'\s+stretchy="true"', r'\s+stretchy="false"', r'\s+fence="true"', r'\s+fence="false"', r'\s+separator="true"', r'\s+separator="false"', r'\s+columnalign="[^"]*"', r'\s+columnspacing="[^"]*"', r'\s+rowspacing="[^"]*"', r'\s+class="[^"]*"', r'\s+style="[^"]*"', ] for attr_pattern in unnecessary_attrs: mathml = re.sub(attr_pattern, "", mathml) # Step 3: Remove redundant single wrapper at the top level # Pattern: content # Simplify to: content mrow_pattern = r"(]*>)\s*(.*?)\s*()" match = re.search(mrow_pattern, mathml, re.DOTALL) if match: # Check if there's only one mrow at the top level content = match.group(2) # Only remove if the content doesn't have other top-level elements if not re.search(r"]+>\s*<[^/]", content): mathml = f"{match.group(1)}{content}{match.group(3)}" # Step 4: Change display to block for better Word rendering mathml = mathml.replace('display="inline"', 'display="block"') # Step 5: If no display attribute, add it if "display=" not in mathml and "", "(": "(", ")": ")", ",": ",", ".": ".", "|": "|", "°": "°", "×": "×", # times "÷": "÷", # div "±": "±", # pm "∓": "∓", # mp # Ellipsis symbols "…": "…", # ldots (horizontal) "⋮": "⋮", # vdots (vertical) "⋯": "⋯", # cdots (centered) "⋰": "⋰", # iddots (diagonal up) "⋱": "⋱", # ddots (diagonal down) # Greek letters (lowercase) "α": "α", # alpha "β": "β", # beta "γ": "γ", # gamma "δ": "δ", # delta "ε": "ε", # epsilon "ζ": "ζ", # zeta "η": "η", # eta "θ": "θ", # theta "ι": "ι", # iota "κ": "κ", # kappa "λ": "λ", # lambda "μ": "μ", # mu "ν": "ν", # nu "ξ": "ξ", # xi "ο": "ο", # omicron "π": "π", # pi "ρ": "ρ", # rho "ς": "ς", # final sigma "σ": "σ", # sigma "τ": "τ", # tau "υ": "υ", # upsilon "φ": "φ", # phi "χ": "χ", # chi "ψ": "ψ", # psi "ω": "ω", # omega "ϕ": "ϕ", # phi variant # Greek letters (uppercase) "Α": "Α", # Alpha "Β": "Β", # Beta "Γ": "Γ", # Gamma "Δ": "Δ", # Delta "Ε": "Ε", # Epsilon "Ζ": "Ζ", # Zeta "Η": "Η", # Eta "Θ": "Θ", # Theta "Ι": "Ι", # Iota "Κ": "Κ", # Kappa "Λ": "Λ", # Lambda "Μ": "Μ", # Mu "Ν": "Ν", # Nu "Ξ": "Ξ", # Xi "Ο": "Ο", # Omicron "Π": "Π", # Pi "Ρ": "Ρ", # Rho "Σ": "Σ", # Sigma "Τ": "Τ", # Tau "Υ": "Υ", # Upsilon "Φ": "Φ", # Phi "Χ": "Χ", # Chi "Ψ": "Ψ", # Psi "Ω": "Ω", # Omega # Math symbols "∅": "∅", # emptyset "∈": "∈", # in "∉": "∉", # notin "∋": "∋", # ni "∌": "∌", # nni "∑": "∑", # sum "∏": "∏", # prod "√": "√", # sqrt "∛": "∛", # cbrt "∜": "∜", # fourthroot "∞": "∞", # infty "∩": "∩", # cap "∪": "∪", # cup "∫": "∫", # int "∬": "∬", # iint "∭": "∭", # iiint "∮": "∮", # oint "⊂": "⊂", # subset "⊃": "⊃", # supset "⊄": "⊄", # nsubset "⊅": "⊅", # nsupset "⊆": "⊆", # subseteq "⊇": "⊇", # supseteq "⊈": "⊈", # nsubseteq "⊉": "⊉", # nsupseteq "≤": "≤", # leq "≥": "≥", # geq "≠": "≠", # neq "≡": "≡", # equiv "≈": "≈", # approx "≃": "≃", # simeq "≅": "≅", # cong "∂": "∂", # partial "∇": "∇", # nabla "∀": "∀", # forall "∃": "∃", # exists "∄": "∄", # nexists "¬": "¬", # neg/lnot "∧": "∧", # wedge/land "∨": "∨", # vee/lor "→": "→", # to/rightarrow "←": "←", # leftarrow "↔": "↔", # leftrightarrow "⇒": "⇒", # Rightarrow "⇐": "⇐", # Leftarrow "⇔": "⇔", # Leftrightarrow "↑": "↑", # uparrow "↓": "↓", # downarrow "⇑": "⇑", # Uparrow "⇓": "⇓", # Downarrow "↕": "↕", # updownarrow "⇕": "⇕", # Updownarrow "≠": "≠", # ne "≪": "≪", # ll "≫": "≫", # gg "⩽": "⩽", # leqslant "⩾": "⩾", # geqslant "⊥": "⊥", # perp "∥": "∥", # parallel "∠": "∠", # angle "△": "△", # triangle "□": "□", # square "◊": "◊", # diamond "♠": "♠", # spadesuit "♡": "♡", # heartsuit "♢": "♢", # diamondsuit "♣": "♣", # clubsuit "ℓ": "ℓ", # ell "℘": "℘", # wp (Weierstrass p) "ℜ": "ℜ", # Re (real part) "ℑ": "ℑ", # Im (imaginary part) "ℵ": "ℵ", # aleph "ℶ": "ℶ", # beth } for entity, char in unicode_map.items(): mathml = mathml.replace(entity, char) # Also handle decimal entity format (&#NNNN;) for common characters # Convert decimal to hex-based lookup decimal_patterns = [ (r"λ", "λ"), # lambda (decimal 955 = hex 03BB) (r"⋮", "⋮"), # vdots (decimal 8942 = hex 22EE) (r"⋯", "⋯"), # cdots (decimal 8943 = hex 22EF) (r"…", "…"), # ldots (decimal 8230 = hex 2026) (r"∞", "∞"), # infty (decimal 8734 = hex 221E) (r"∑", "∑"), # sum (decimal 8721 = hex 2211) (r"∏", "∏"), # prod (decimal 8719 = hex 220F) (r"√", "√"), # sqrt (decimal 8730 = hex 221A) (r"∈", "∈"), # in (decimal 8712 = hex 2208) (r"∉", "∉"), # notin (decimal 8713 = hex 2209) (r"∩", "∩"), # cap (decimal 8745 = hex 2229) (r"∪", "∪"), # cup (decimal 8746 = hex 222A) (r"≤", "≤"), # leq (decimal 8804 = hex 2264) (r"≥", "≥"), # geq (decimal 8805 = hex 2265) (r"≠", "≠"), # neq (decimal 8800 = hex 2260) (r"≈", "≈"), # approx (decimal 8776 = hex 2248) (r"≡", "≡"), # equiv (decimal 8801 = hex 2261) ] for pattern, char in decimal_patterns: mathml = mathml.replace(pattern, char) # Step 8: Clean up extra whitespace mathml = re.sub(r">\s+<", "><", mathml) return mathml def _latex_to_mathml(self, latex_formula: str, is_display: bool = False) -> str: """Convert LaTeX formula to standard MathML. Args: latex_formula: Pure LaTeX formula (without delimiters). is_display: True if display (block) formula, False if inline. Returns: Standard MathML representation. """ return self._latex_to_mathml_cached(latex_formula, is_display=is_display) def _mathml_to_mml(self, mathml: str) -> str: """Convert standard MathML to mml:math format with namespace prefix. Uses XSLT for efficient transformation. Transforms: - to - All child elements like , to , Args: mathml: Standard MathML string. Returns: MathML with mml: namespace prefix. """ if not mathml: return "" try: from lxml import etree # Parse MathML root = etree.fromstring(mathml.encode("utf-8")) # Apply XSLT transformation (cached) transform = self._get_mml_xslt_transform() result_tree = transform(root) # Serialize to string return str(result_tree) except Exception: # Fallback: simple string replacement (less robust but no lxml dependency) result = mathml # Add namespace to root math element result = re.sub( r"", "", result) # Add mml: prefix to all other elements using a single regex # Match opening tags result = re.sub( r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|" r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|" r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|" r"maction|semantics|annotation|annotation-xml)\b", r"", r"", result, ) return result def _latex_to_omml(self, latex_formula: str) -> str: """Convert LaTeX formula to OMML (Office Math Markup Language). Uses Pandoc to create DOCX in memory and extracts OMML from it. Optimized to minimize disk I/O by using in-memory zip processing. Args: latex_formula: Pure LaTeX formula (without delimiters). Returns: OMML representation as XML string. """ import io import zipfile try: from lxml import etree # Convert to DOCX bytes using Pandoc # We still need a temp file for input, but output goes to temp file too # Then we process the DOCX in memory with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: f.write(f"$${latex_formula}$$\n") temp_md = f.name temp_docx = temp_md.replace(".md", ".docx") try: pypandoc.convert_file( temp_md, "docx", format=self.INPUT_FORMAT, outputfile=temp_docx, ) # Read DOCX into memory and process as ZIP with open(temp_docx, "rb") as f: docx_bytes = f.read() # Extract document.xml from DOCX (which is a ZIP file) with zipfile.ZipFile(io.BytesIO(docx_bytes), "r") as zf: document_xml = zf.read("word/document.xml") # Parse XML and extract OMML root = etree.fromstring(document_xml) # Find all oMath elements omml_parts = [] for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath"): omml_parts.append(etree.tostring(math, encoding="unicode")) return "\n".join(omml_parts) finally: # Cleanup temp files if os.path.exists(temp_md): os.remove(temp_md) if os.path.exists(temp_docx): os.remove(temp_docx) except Exception as e: raise RuntimeError(f"OMML conversion failed: {e}") from e def preprocess_for_export(self, md_text: str) -> str: """Preprocess markdown text for export to docx/pdf. Handles LaTeX formula formatting, matrix environments, and other transformations needed for proper Word/PDF rendering. Uses pre-compiled regex patterns for better performance. Args: md_text: Raw markdown text. Returns: Preprocessed markdown text. """ # Replace \[1mm] => \vspace{1mm} md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text) # Add blank lines around \[...\] block formulas md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text) md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text) # Remove arithmatex span wrappers cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text) # Convert inline formulas: \( \) => $ $ cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$") # Convert block formulas: \[ \] => $$ $$ cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$") # Remove spaces between $ and formula content cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md) # Convert matrix environments for better Word rendering cleaned_md = self._convert_matrix_environments(cleaned_md) # Fix array environment column specifiers (remove spaces) cleaned_md = self._fix_array_column_specifiers(cleaned_md) # Fix brace spacing for equation systems cleaned_md = self._fix_brace_spacing(cleaned_md) # Convert cases and aligned environments cleaned_md = self._convert_special_environments(cleaned_md) # Handle LaTeX \tag{} commands for equation numbering cleaned_md = self._convert_tag_commands(cleaned_md) return cleaned_md def _convert_matrix_environments(self, md_text: str) -> str: """Convert vmatrix/Vmatrix to left/right delimited forms. This fixes the vertical line height issues in Word. """ # vmatrix -> \left| \begin{matrix}...\end{matrix} \right| md_text = self._RE_VMATRIX.sub( r"\\left| \\begin{matrix}\1\\end{matrix} \\right|", md_text, ) # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\| md_text = self._RE_VMATRIX_DOUBLE.sub( r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|", md_text, ) return md_text def _fix_array_column_specifiers(self, md_text: str) -> str: """Fix array environment column specifiers by removing spaces. Pandoc's OMML converter doesn't accept spaces between column alignment specifiers in array environments. This converts patterns like {c c c c} to {cccc}. """ def remove_spaces_in_specifier(match: re.Match) -> str: """Remove spaces from column specifier.""" specifier = match.group(1) return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}" return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text) def _fix_brace_spacing(self, md_text: str) -> str: """Fix spacing issues with braces in equation systems. Removes whitespace and adds negative space for proper alignment in Word/OMML. """ md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text) md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text) return md_text def _convert_special_environments(self, md_text: str) -> str: """Convert cases and aligned environments to array format. These environments have better rendering support in Word/OMML. """ # Pre-compiled pattern for alignment marker removal _re_align_marker = re.compile(r"(^|\\\\)\s*&") def convert_cases(match: re.Match) -> str: content = match.group(1) return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right." md_text = self._RE_CASES.sub(convert_cases, md_text) def convert_aligned_to_array(match: re.Match) -> str: content = match.group(1) content = _re_align_marker.sub(r"\1", content) return r"\left\{\begin{array}{l}" + content + r"\end{array}\right." md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text) def convert_standalone_aligned(match: re.Match) -> str: content = match.group(1) content = _re_align_marker.sub(r"\1", content) return r"\begin{array}{l}" + content + r"\end{array}" md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text) return md_text def _convert_tag_commands(self, md_text: str) -> str: """Convert LaTeX \\tag{} commands to Word-compatible format. The \\tag{} command is not supported in Word OMML format, so we convert it to use simple spacing (\\quad) to push the equation number to the right side. """ def convert_tag(match: re.Match) -> str: formula_content = match.group(1) tag_content = match.group(2) return f"$${formula_content} \\quad ({tag_content})$$" return self._RE_TAG.sub(convert_tag, md_text) def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes: """Export markdown to docx or pdf file. Args: md_text: Markdown text to export. export_type: Export format, either 'docx' or 'pdf'. Returns: bytes of the exported file. Raises: ValueError: If export_type is not supported. RuntimeError: If export fails. """ # Preprocess markdown cleaned_md = self.preprocess_for_export(md_text) # Create temp file for input with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in: f_in.write(cleaned_md.encode("utf-8")) md_path = f_in.name output_file = md_path + "." + export_type try: if export_type == "docx": self._export_docx(md_path, output_file) with open(output_file, "rb") as f: return f.read() else: # pdf self._export_pdf(md_path, output_file) with open(output_file, "rb") as f: return f.read() except Exception as e: # Cleanup on error self._cleanup_files(md_path, output_file) raise RuntimeError(f"Export failed: {e}") from e finally: # Always cleanup input file if os.path.exists(md_path): os.remove(md_path) def _export_docx(self, input_path: str, output_path: str) -> None: """Export to DOCX format using pypandoc.""" extra_args = [ "--highlight-style=pygments", f"--reference-doc=app/pkg/reference.docx", ] pypandoc.convert_file( input_path, "docx", format=self.INPUT_FORMAT, outputfile=output_path, extra_args=extra_args, ) def _export_pdf(self, input_path: str, output_path: str) -> None: """Export to PDF format using pypandoc with XeLaTeX.""" extra_args = [ "--pdf-engine=xelatex", "-V", "mainfont=Noto Sans CJK SC", "--highlight-style=pygments", ] pypandoc.convert_file( input_path, "pdf", format=self.INPUT_FORMAT, outputfile=output_path, extra_args=extra_args, ) def _cleanup_files(self, *paths: str) -> None: """Remove files if they exist.""" for path in paths: if os.path.exists(path): os.remove(path) def cleanup_export_file(self, file_path: str) -> None: """Cleanup exported file after sending response. Call this after sending the file to the client. Args: file_path: Path to the exported file. """ if os.path.exists(file_path): os.remove(file_path)