"""Markdown conversion and export service using pypandoc.""" import os import re import tempfile from dataclasses import dataclass from typing import Literal import pypandoc @dataclass class ConvertResult: """Result of markdown conversion.""" latex: str mathml: str @dataclass class ExportResult: """Result of markdown export.""" file_path: str content_type: str download_name: str ExportType = Literal["docx", "pdf"] class Converter: """Service for conversion and export operations.""" # Pandoc input format with LaTeX math extensions INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash" def __init__(self): """Initialize converter.""" def convert_to_formats(self, md_text: str) -> ConvertResult: """Convert markdown to LaTeX and MathML formats. Args: md_text: Markdown text to convert. Returns: ConvertResult with latex and mathml fields. Raises: ValueError: If md_text is empty. RuntimeError: If conversion fails. """ if md_text == "": return ConvertResult(latex="", mathml="") try: # Convert to LaTeX latex_output = pypandoc.convert_text( md_text, "latex", format=self.INPUT_FORMAT, ).rstrip("\n") # Convert to HTML with MathML mathml_output = pypandoc.convert_text( md_text, "html", format=self.INPUT_FORMAT, extra_args=["--mathml"], ).rstrip("\n") return ConvertResult(latex=latex_output, mathml=mathml_output) except Exception as e: raise RuntimeError(f"Conversion failed: {e}") from e def preprocess_for_export(self, md_text: str) -> str: """Preprocess markdown text for export to docx/pdf. Handles LaTeX formula formatting, matrix environments, and other transformations needed for proper Word/PDF rendering. Args: md_text: Raw markdown text. Returns: Preprocessed markdown text. """ # Replace \[1mm] => \vspace{1mm} md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text) # Add blank lines around \[...\] block formulas md_text = re.sub( r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", r"\1\n\n\\[\3\\]\n\n\4", md_text, flags=re.DOTALL, ) md_text = re.sub( r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", r"\n\\[\2\\]\n", md_text, flags=re.MULTILINE | re.DOTALL, ) # Remove arithmatex span wrappers cleaned_md = re.sub(r'(.*?)', r"\1", md_text) # Convert inline formulas: \( \) => $ $ cleaned_md = re.sub(r"\\\(", r"$", cleaned_md) cleaned_md = re.sub(r"\\\)", r"$", cleaned_md) # Convert block formulas: \[ \] => $$ $$ cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md) cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md) # Remove spaces between $ and formula content # Use negative lookahead/lookbehind to avoid matching $$ block formulas cleaned_md = re.sub(r"(? str: """Convert vmatrix/Vmatrix to left/right delimited forms. This fixes the vertical line height issues in Word. """ # vmatrix -> \left| \begin{matrix}...\end{matrix} \right| md_text = re.sub( r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", r"\\left| \\begin{matrix}\1\\end{matrix} \\right|", md_text, flags=re.DOTALL, ) # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\| md_text = re.sub( r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|", md_text, flags=re.DOTALL, ) return md_text def _fix_brace_spacing(self, md_text: str) -> str: """Fix spacing issues with braces in equation systems. Removes whitespace and adds negative space for proper alignment in Word/OMML. """ # Fix \left\{ spacing md_text = re.sub( r"\\left\\\{\s+", r"\\left\\{\\!", md_text, ) # Fix \right\} spacing md_text = re.sub( r"\s+\\right\\\}", r"\\!\\right\\}", md_text, ) return md_text def _convert_special_environments(self, md_text: str) -> str: """Convert cases and aligned environments to array format. These environments have better rendering support in Word/OMML. """ def convert_cases(match: re.Match) -> str: content = match.group(1) return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right." md_text = re.sub( r"\\begin\{cases\}(.*?)\\end\{cases\}", convert_cases, md_text, flags=re.DOTALL, ) def convert_aligned_to_array(match: re.Match) -> str: content = match.group(1) # Remove leading & alignment markers (not needed in array{l}) content = re.sub(r"(^|\\\\)\s*&", r"\1", content) return r"\left\{\begin{array}{l}" + content + r"\end{array}\right." md_text = re.sub( r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", convert_aligned_to_array, md_text, flags=re.DOTALL, ) def convert_standalone_aligned(match: re.Match) -> str: content = match.group(1) content = re.sub(r"(^|\\\\)\s*&", r"\1", content) return r"\begin{array}{l}" + content + r"\end{array}" md_text = re.sub( r"\\begin\{aligned\}(.*?)\\end\{aligned\}", convert_standalone_aligned, md_text, flags=re.DOTALL, ) return md_text def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes: """Export markdown to docx or pdf file. Args: md_text: Markdown text to export. export_type: Export format, either 'docx' or 'pdf'. Returns: bytes of the exported file. Raises: ValueError: If export_type is not supported. RuntimeError: If export fails. """ # Preprocess markdown cleaned_md = self.preprocess_for_export(md_text) # Create temp file for input with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in: f_in.write(cleaned_md.encode("utf-8")) md_path = f_in.name output_file = md_path + "." + export_type try: if export_type == "docx": self._export_docx(md_path, output_file) with open(output_file, "rb") as f: return f.read() else: # pdf self._export_pdf(md_path, output_file) with open(output_file, "rb") as f: return f.read() except Exception as e: # Cleanup on error self._cleanup_files(md_path, output_file) raise RuntimeError(f"Export failed: {e}") from e finally: # Always cleanup input file if os.path.exists(md_path): os.remove(md_path) def _export_docx(self, input_path: str, output_path: str) -> None: """Export to DOCX format using pypandoc.""" extra_args = [ "--highlight-style=pygments", f"--reference-doc=app/pkg/reference.docx", ] pypandoc.convert_file( input_path, "docx", format=self.INPUT_FORMAT, outputfile=output_path, extra_args=extra_args, ) def _export_pdf(self, input_path: str, output_path: str) -> None: """Export to PDF format using pypandoc with XeLaTeX.""" extra_args = [ "--pdf-engine=xelatex", "-V", "mainfont=Noto Sans CJK SC", "--highlight-style=pygments", ] pypandoc.convert_file( input_path, "pdf", format=self.INPUT_FORMAT, outputfile=output_path, extra_args=extra_args, ) def _cleanup_files(self, *paths: str) -> None: """Remove files if they exist.""" for path in paths: if os.path.exists(path): os.remove(path) def cleanup_export_file(self, file_path: str) -> None: """Cleanup exported file after sending response. Call this after sending the file to the client. Args: file_path: Path to the exported file. """ if os.path.exists(file_path): os.remove(file_path)