"""Markdown conversion and export service using pypandoc.""" import os import re import tempfile from dataclasses import dataclass from functools import lru_cache from typing import Literal import pypandoc from latex2mathml.converter import convert as latex_to_mathml @dataclass class ConvertResult: """Result of markdown conversion. Only populated when input contains pure LaTeX formula. All fields are empty strings when input contains mixed content (text + formula). Attributes: latex: Pure LaTeX formula code (without delimiters). mathml: Standard MathML format. mml: XML MathML with mml: namespace prefix (mml:math). """ latex: str mathml: str mml: str @dataclass class ExportResult: """Result of markdown export.""" file_path: str content_type: str download_name: str ExportType = Literal["docx", "pdf"] # MathML namespace MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML" OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math" # XSLT for MathML to mml: namespace conversion MML_XSLT = """ """ class Converter: """Service for conversion and export operations. Conversion rules: - Only pure LaTeX formulas can be converted to latex/mathml/mml formats. - Mixed content (text + formula) returns empty results for all formats. - OMML conversion is provided as a separate method due to performance overhead. Performance optimizations: - Pre-compiled regex patterns - XSLT-based MML conversion - Cached XSLT transforms - Direct Pandoc OMML output (avoids DOCX parsing) """ # Pandoc input format with LaTeX math extensions INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash" # Pre-compiled regex patterns for formula detection _RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$") _RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]") _RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)") _RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)") _RE_MATH_ELEMENT = re.compile(r"]*>[\s\S]*?") # Pre-compiled regex patterns for preprocessing _RE_VSPACE = re.compile(r"\\\[1mm\]") _RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL) _RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL) _RE_ARITHMATEX = re.compile(r'(.*?)') _RE_INLINE_SPACE = re.compile(r"(? bool: """Check if text contains only a LaTeX formula (no mixed content). A text is considered formula-only if it matches one of these patterns: - Display math: $$...$$ or \\[...\\] - Inline math: $...$ or \\(...\\) Args: text: Input text to check. Returns: True if the text contains only a LaTeX formula, False otherwise. """ text = text.strip() if not text: return False # Strict patterns: entire text must be a single formula with delimiters # Using pre-compiled patterns with fullmatch semantics if self._RE_DISPLAY_DOLLAR.fullmatch(text): return True if self._RE_DISPLAY_BRACKET.fullmatch(text): return True if self._RE_INLINE_DOLLAR.fullmatch(text): return True if self._RE_INLINE_PAREN.fullmatch(text): return True return False def convert_to_formats(self, md_text: str) -> ConvertResult: """Convert markdown to LaTeX, MathML, and MML formats. Only converts when input contains a pure LaTeX formula. Mixed content (text + formula) returns empty strings for all fields. Args: md_text: Markdown text to convert. Returns: ConvertResult with latex, mathml, and mml fields. All fields are empty if input is not a pure formula. Raises: RuntimeError: If conversion fails for a valid formula. """ # Empty input returns empty result if not md_text or not md_text.strip(): return ConvertResult(latex="", mathml="", mml="") # Check if input is formula-only if not self._is_formula_only(md_text): # Mixed content: cannot convert to formula formats return ConvertResult(latex="", mathml="", mml="") try: # Extract the LaTeX formula content (remove delimiters) latex_formula = self._extract_latex_formula(md_text) # Preprocess formula for better conversion (fix array specifiers, etc.) preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula) # Convert to MathML mathml = self._latex_to_mathml(preprocessed_formula) # Convert MathML to mml:math format (with namespace prefix) mml = self._mathml_to_mml(mathml) return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml) except Exception as e: raise RuntimeError(f"Conversion failed: {e}") from e def convert_to_omml(self, latex_formula: str) -> str: """Convert LaTeX formula to OMML (Office Math Markup Language). This is a separate method due to the performance overhead of OMML conversion, which requires creating a temporary DOCX file. The formula is preprocessed using the same logic as export_to_file to ensure proper conversion. Args: latex_formula: Pure LaTeX formula (without delimiters like $ or $$). Returns: OMML representation as XML string. Raises: ValueError: If latex_formula is empty. RuntimeError: If conversion fails. """ if not latex_formula or not latex_formula.strip(): raise ValueError("LaTeX formula cannot be empty") # Preprocess formula using the same preprocessing as export preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip()) return self._latex_to_omml(preprocessed) def _preprocess_formula_for_conversion(self, latex_formula: str) -> str: """Preprocess LaTeX formula for any conversion (MathML, OMML, etc.). Applies the same preprocessing steps as preprocess_for_export to ensure consistency across all conversion paths. This fixes common issues that cause Pandoc conversion to fail. Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py), so we don't need to handle them here. Args: latex_formula: Pure LaTeX formula. Returns: Preprocessed LaTeX formula. """ # 1. Convert matrix environments latex_formula = self._convert_matrix_environments(latex_formula) # 2. Fix array column specifiers (remove spaces) latex_formula = self._fix_array_column_specifiers(latex_formula) # 3. Fix brace spacing latex_formula = self._fix_brace_spacing(latex_formula) # 4. Convert special environments (cases, aligned) latex_formula = self._convert_special_environments(latex_formula) return latex_formula def _extract_latex_formula(self, text: str) -> str: """Extract LaTeX formula from text by removing delimiters. Args: text: Text containing LaTeX formula with delimiters. Returns: Pure LaTeX formula without delimiters. """ text = text.strip() # Remove display math delimiters: $$...$$ or \[...\] if text.startswith("$$") and text.endswith("$$"): return text[2:-2].strip() if text.startswith("\\[") and text.endswith("\\]"): return text[2:-2].strip() # Remove inline math delimiters: $...$ or \(...\) if text.startswith("$") and text.endswith("$") and not text.startswith("$$"): return text[1:-1].strip() if text.startswith("\\(") and text.endswith("\\)"): return text[2:-2].strip() # If no delimiters, return as-is return text.strip() @staticmethod @lru_cache(maxsize=256) def _latex_to_mathml_cached(latex_formula: str) -> str: """Cached conversion of LaTeX formula to MathML. Uses Pandoc for conversion to ensure Word compatibility. Pandoc generates standard MathML that Word can properly import. Uses LRU cache to avoid recomputing for repeated formulas. """ try: # Use Pandoc for Word-compatible MathML (primary method) mathml_html = pypandoc.convert_text( f"${latex_formula}$", "html", format="markdown+tex_math_dollars", extra_args=["--mathml"], ) # Extract just the element from the HTML match = Converter._RE_MATH_ELEMENT.search(mathml_html) if match: mathml = match.group(0) # Post-process for Word compatibility return Converter._postprocess_mathml_for_word(mathml) # If no match, return as-is return mathml_html.rstrip("\n") except Exception as pandoc_error: # Fallback: try latex2mathml (less Word-compatible) try: mathml = latex_to_mathml(latex_formula) return Converter._postprocess_mathml_for_word(mathml) except Exception as e: raise RuntimeError( f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}" ) from e @staticmethod def _postprocess_mathml_for_word(mathml: str) -> str: """Post-process MathML to improve Word compatibility. Applies transformations to make MathML more compatible and concise: - Remove and wrappers (Word doesn't need them) - Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.) - Remove redundant single wrappers - Change display="inline" to display="block" for better rendering - Decode Unicode entities to actual characters (Word prefers this) - Ensure proper namespace Args: mathml: MathML string. Returns: Simplified, Word-compatible MathML string. """ import re # Step 1: Remove and wrappers # These often cause Word import issues if '' in mathml: # Extract content between and match = re.search(r'(.*?)]*)>', mathml) if math_match: math_attrs = math_match.group(1) # Rebuild without semantics mathml = f'{content}' # Step 2: Remove unnecessary attributes that don't affect rendering # These are verbose and Word doesn't need them unnecessary_attrs = [ r'\s+form="prefix"', r'\s+form="postfix"', r'\s+form="infix"', r'\s+stretchy="true"', r'\s+stretchy="false"', r'\s+fence="true"', r'\s+fence="false"', r'\s+separator="true"', r'\s+separator="false"', r'\s+columnalign="[^"]*"', r'\s+columnspacing="[^"]*"', r'\s+rowspacing="[^"]*"', r'\s+class="[^"]*"', r'\s+style="[^"]*"', ] for attr_pattern in unnecessary_attrs: mathml = re.sub(attr_pattern, '', mathml) # Step 3: Remove redundant single wrapper at the top level # Pattern: content # Simplify to: content mrow_pattern = r'(]*>)\s*(.*?)\s*()' match = re.search(mrow_pattern, mathml, re.DOTALL) if match: # Check if there's only one mrow at the top level content = match.group(2) # Only remove if the content doesn't have other top-level elements if not re.search(r']+>\s*<[^/]', content): mathml = f'{match.group(1)}{content}{match.group(3)}' # Step 4: Change display to block for better Word rendering mathml = mathml.replace('display="inline"', 'display="block"') # Step 5: If no display attribute, add it if 'display=' not in mathml and '', '(': '(', ')': ')', ',': ',', '.': '.', '|': '|', '°': '°', '×': '×', # times '÷': '÷', # div '±': '±', # pm '∓': '∓', # mp # Ellipsis symbols '…': '…', # ldots (horizontal) '⋮': '⋮', # vdots (vertical) '⋯': '⋯', # cdots (centered) '⋰': '⋰', # iddots (diagonal up) '⋱': '⋱', # ddots (diagonal down) # Greek letters (lowercase) 'α': 'α', # alpha 'β': 'β', # beta 'γ': 'γ', # gamma 'δ': 'δ', # delta 'ε': 'ε', # epsilon 'ζ': 'ζ', # zeta 'η': 'η', # eta 'θ': 'θ', # theta 'ι': 'ι', # iota 'κ': 'κ', # kappa 'λ': 'λ', # lambda 'μ': 'μ', # mu 'ν': 'ν', # nu 'ξ': 'ξ', # xi 'ο': 'ο', # omicron 'π': 'π', # pi 'ρ': 'ρ', # rho 'ς': 'ς', # final sigma 'σ': 'σ', # sigma 'τ': 'τ', # tau 'υ': 'υ', # upsilon 'φ': 'φ', # phi 'χ': 'χ', # chi 'ψ': 'ψ', # psi 'ω': 'ω', # omega 'ϕ': 'ϕ', # phi variant # Greek letters (uppercase) 'Α': 'Α', # Alpha 'Β': 'Β', # Beta 'Γ': 'Γ', # Gamma 'Δ': 'Δ', # Delta 'Ε': 'Ε', # Epsilon 'Ζ': 'Ζ', # Zeta 'Η': 'Η', # Eta 'Θ': 'Θ', # Theta 'Ι': 'Ι', # Iota 'Κ': 'Κ', # Kappa 'Λ': 'Λ', # Lambda 'Μ': 'Μ', # Mu 'Ν': 'Ν', # Nu 'Ξ': 'Ξ', # Xi 'Ο': 'Ο', # Omicron 'Π': 'Π', # Pi 'Ρ': 'Ρ', # Rho 'Σ': 'Σ', # Sigma 'Τ': 'Τ', # Tau 'Υ': 'Υ', # Upsilon 'Φ': 'Φ', # Phi 'Χ': 'Χ', # Chi 'Ψ': 'Ψ', # Psi 'Ω': 'Ω', # Omega # Math symbols '∅': '∅', # emptyset '∈': '∈', # in '∉': '∉', # notin '∋': '∋', # ni '∌': '∌', # nni '∑': '∑', # sum '∏': '∏', # prod '√': '√', # sqrt '∛': '∛', # cbrt '∜': '∜', # fourthroot '∞': '∞', # infty '∩': '∩', # cap '∪': '∪', # cup '∫': '∫', # int '∬': '∬', # iint '∭': '∭', # iiint '∮': '∮', # oint '⊂': '⊂', # subset '⊃': '⊃', # supset '⊄': '⊄', # nsubset '⊅': '⊅', # nsupset '⊆': '⊆', # subseteq '⊇': '⊇', # supseteq '⊈': '⊈', # nsubseteq '⊉': '⊉', # nsupseteq '≤': '≤', # leq '≥': '≥', # geq '≠': '≠', # neq '≡': '≡', # equiv '≈': '≈', # approx '≃': '≃', # simeq '≅': '≅', # cong '∂': '∂', # partial '∇': '∇', # nabla '∀': '∀', # forall '∃': '∃', # exists '∄': '∄', # nexists '¬': '¬', # neg/lnot '∧': '∧', # wedge/land '∨': '∨', # vee/lor '→': '→', # to/rightarrow '←': '←', # leftarrow '↔': '↔', # leftrightarrow '⇒': '⇒', # Rightarrow '⇐': '⇐', # Leftarrow '⇔': '⇔', # Leftrightarrow '↑': '↑', # uparrow '↓': '↓', # downarrow '⇑': '⇑', # Uparrow '⇓': '⇓', # Downarrow '↕': '↕', # updownarrow '⇕': '⇕', # Updownarrow '≠': '≠', # ne '≪': '≪', # ll '≫': '≫', # gg '⩽': '⩽', # leqslant '⩾': '⩾', # geqslant '⊥': '⊥', # perp '∥': '∥', # parallel '∠': '∠', # angle '△': '△', # triangle '□': '□', # square '◊': '◊', # diamond '♠': '♠', # spadesuit '♡': '♡', # heartsuit '♢': '♢', # diamondsuit '♣': '♣', # clubsuit 'ℓ': 'ℓ', # ell '℘': '℘', # wp (Weierstrass p) 'ℜ': 'ℜ', # Re (real part) 'ℑ': 'ℑ', # Im (imaginary part) 'ℵ': 'ℵ', # aleph 'ℶ': 'ℶ', # beth } for entity, char in unicode_map.items(): mathml = mathml.replace(entity, char) # Also handle decimal entity format (&#NNNN;) for common characters # Convert decimal to hex-based lookup decimal_patterns = [ (r'λ', 'λ'), # lambda (decimal 955 = hex 03BB) (r'⋮', '⋮'), # vdots (decimal 8942 = hex 22EE) (r'⋯', '⋯'), # cdots (decimal 8943 = hex 22EF) (r'…', '…'), # ldots (decimal 8230 = hex 2026) (r'∞', '∞'), # infty (decimal 8734 = hex 221E) (r'∑', '∑'), # sum (decimal 8721 = hex 2211) (r'∏', '∏'), # prod (decimal 8719 = hex 220F) (r'√', '√'), # sqrt (decimal 8730 = hex 221A) (r'∈', '∈'), # in (decimal 8712 = hex 2208) (r'∉', '∉'), # notin (decimal 8713 = hex 2209) (r'∩', '∩'), # cap (decimal 8745 = hex 2229) (r'∪', '∪'), # cup (decimal 8746 = hex 222A) (r'≤', '≤'), # leq (decimal 8804 = hex 2264) (r'≥', '≥'), # geq (decimal 8805 = hex 2265) (r'≠', '≠'), # neq (decimal 8800 = hex 2260) (r'≈', '≈'), # approx (decimal 8776 = hex 2248) (r'≡', '≡'), # equiv (decimal 8801 = hex 2261) ] for pattern, char in decimal_patterns: mathml = mathml.replace(pattern, char) # Step 8: Clean up extra whitespace mathml = re.sub(r'>\s+<', '><', mathml) return mathml def _latex_to_mathml(self, latex_formula: str) -> str: """Convert LaTeX formula to standard MathML. Args: latex_formula: Pure LaTeX formula (without delimiters). Returns: Standard MathML representation. """ return self._latex_to_mathml_cached(latex_formula) def _mathml_to_mml(self, mathml: str) -> str: """Convert standard MathML to mml:math format with namespace prefix. Uses XSLT for efficient transformation. Transforms: - to - All child elements like , to , Args: mathml: Standard MathML string. Returns: MathML with mml: namespace prefix. """ if not mathml: return "" try: from lxml import etree # Parse MathML root = etree.fromstring(mathml.encode("utf-8")) # Apply XSLT transformation (cached) transform = self._get_mml_xslt_transform() result_tree = transform(root) # Serialize to string return str(result_tree) except Exception: # Fallback: simple string replacement (less robust but no lxml dependency) result = mathml # Add namespace to root math element result = re.sub( r"", "", result) # Add mml: prefix to all other elements using a single regex # Match opening tags result = re.sub( r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|" r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|" r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|" r"maction|semantics|annotation|annotation-xml)\b", r"", r"", result, ) return result def _latex_to_omml(self, latex_formula: str) -> str: """Convert LaTeX formula to OMML (Office Math Markup Language). Uses Pandoc to create DOCX in memory and extracts OMML from it. Optimized to minimize disk I/O by using in-memory zip processing. Args: latex_formula: Pure LaTeX formula (without delimiters). Returns: OMML representation as XML string. """ import io import zipfile try: from lxml import etree # Convert to DOCX bytes using Pandoc # We still need a temp file for input, but output goes to temp file too # Then we process the DOCX in memory with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f: f.write(f"$${latex_formula}$$\n") temp_md = f.name temp_docx = temp_md.replace(".md", ".docx") try: pypandoc.convert_file( temp_md, "docx", format=self.INPUT_FORMAT, outputfile=temp_docx, ) # Read DOCX into memory and process as ZIP with open(temp_docx, "rb") as f: docx_bytes = f.read() # Extract document.xml from DOCX (which is a ZIP file) with zipfile.ZipFile(io.BytesIO(docx_bytes), "r") as zf: document_xml = zf.read("word/document.xml") # Parse XML and extract OMML root = etree.fromstring(document_xml) # Find all oMath elements omml_parts = [] for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath"): omml_parts.append(etree.tostring(math, encoding="unicode")) return "\n".join(omml_parts) finally: # Cleanup temp files if os.path.exists(temp_md): os.remove(temp_md) if os.path.exists(temp_docx): os.remove(temp_docx) except Exception as e: raise RuntimeError(f"OMML conversion failed: {e}") from e def preprocess_for_export(self, md_text: str) -> str: """Preprocess markdown text for export to docx/pdf. Handles LaTeX formula formatting, matrix environments, and other transformations needed for proper Word/PDF rendering. Uses pre-compiled regex patterns for better performance. Args: md_text: Raw markdown text. Returns: Preprocessed markdown text. """ # Replace \[1mm] => \vspace{1mm} md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text) # Add blank lines around \[...\] block formulas md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text) md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text) # Remove arithmatex span wrappers cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text) # Convert inline formulas: \( \) => $ $ cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$") # Convert block formulas: \[ \] => $$ $$ cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$") # Remove spaces between $ and formula content cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md) # Convert matrix environments for better Word rendering cleaned_md = self._convert_matrix_environments(cleaned_md) # Fix array environment column specifiers (remove spaces) cleaned_md = self._fix_array_column_specifiers(cleaned_md) # Fix brace spacing for equation systems cleaned_md = self._fix_brace_spacing(cleaned_md) # Convert cases and aligned environments cleaned_md = self._convert_special_environments(cleaned_md) # Handle LaTeX \tag{} commands for equation numbering cleaned_md = self._convert_tag_commands(cleaned_md) return cleaned_md def _convert_matrix_environments(self, md_text: str) -> str: """Convert vmatrix/Vmatrix to left/right delimited forms. This fixes the vertical line height issues in Word. """ # vmatrix -> \left| \begin{matrix}...\end{matrix} \right| md_text = self._RE_VMATRIX.sub( r"\\left| \\begin{matrix}\1\\end{matrix} \\right|", md_text, ) # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\| md_text = self._RE_VMATRIX_DOUBLE.sub( r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|", md_text, ) return md_text def _fix_array_column_specifiers(self, md_text: str) -> str: """Fix array environment column specifiers by removing spaces. Pandoc's OMML converter doesn't accept spaces between column alignment specifiers in array environments. This converts patterns like {c c c c} to {cccc}. """ def remove_spaces_in_specifier(match: re.Match) -> str: """Remove spaces from column specifier.""" specifier = match.group(1) return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}" return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text) def _fix_brace_spacing(self, md_text: str) -> str: """Fix spacing issues with braces in equation systems. Removes whitespace and adds negative space for proper alignment in Word/OMML. """ md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text) md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text) return md_text def _convert_special_environments(self, md_text: str) -> str: """Convert cases and aligned environments to array format. These environments have better rendering support in Word/OMML. """ # Pre-compiled pattern for alignment marker removal _re_align_marker = re.compile(r"(^|\\\\)\s*&") def convert_cases(match: re.Match) -> str: content = match.group(1) return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right." md_text = self._RE_CASES.sub(convert_cases, md_text) def convert_aligned_to_array(match: re.Match) -> str: content = match.group(1) content = _re_align_marker.sub(r"\1", content) return r"\left\{\begin{array}{l}" + content + r"\end{array}\right." md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text) def convert_standalone_aligned(match: re.Match) -> str: content = match.group(1) content = _re_align_marker.sub(r"\1", content) return r"\begin{array}{l}" + content + r"\end{array}" md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text) return md_text def _convert_tag_commands(self, md_text: str) -> str: """Convert LaTeX \\tag{} commands to Word-compatible format. The \\tag{} command is not supported in Word OMML format, so we convert it to use simple spacing (\\quad) to push the equation number to the right side. """ def convert_tag(match: re.Match) -> str: formula_content = match.group(1) tag_content = match.group(2) return f"$${formula_content} \\quad ({tag_content})$$" return self._RE_TAG.sub(convert_tag, md_text) def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes: """Export markdown to docx or pdf file. Args: md_text: Markdown text to export. export_type: Export format, either 'docx' or 'pdf'. Returns: bytes of the exported file. Raises: ValueError: If export_type is not supported. RuntimeError: If export fails. """ # Preprocess markdown cleaned_md = self.preprocess_for_export(md_text) # Create temp file for input with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in: f_in.write(cleaned_md.encode("utf-8")) md_path = f_in.name output_file = md_path + "." + export_type try: if export_type == "docx": self._export_docx(md_path, output_file) with open(output_file, "rb") as f: return f.read() else: # pdf self._export_pdf(md_path, output_file) with open(output_file, "rb") as f: return f.read() except Exception as e: # Cleanup on error self._cleanup_files(md_path, output_file) raise RuntimeError(f"Export failed: {e}") from e finally: # Always cleanup input file if os.path.exists(md_path): os.remove(md_path) def _export_docx(self, input_path: str, output_path: str) -> None: """Export to DOCX format using pypandoc.""" extra_args = [ "--highlight-style=pygments", f"--reference-doc=app/pkg/reference.docx", ] pypandoc.convert_file( input_path, "docx", format=self.INPUT_FORMAT, outputfile=output_path, extra_args=extra_args, ) def _export_pdf(self, input_path: str, output_path: str) -> None: """Export to PDF format using pypandoc with XeLaTeX.""" extra_args = [ "--pdf-engine=xelatex", "-V", "mainfont=Noto Sans CJK SC", "--highlight-style=pygments", ] pypandoc.convert_file( input_path, "pdf", format=self.INPUT_FORMAT, outputfile=output_path, extra_args=extra_args, ) def _cleanup_files(self, *paths: str) -> None: """Remove files if they exist.""" for path in paths: if os.path.exists(path): os.remove(path) def cleanup_export_file(self, file_path: str) -> None: """Cleanup exported file after sending response. Call this after sending the file to the client. Args: file_path: Path to the exported file. """ if os.path.exists(file_path): os.remove(file_path)