"""Markdown conversion and export service using pypandoc.""" import os import re import tempfile from dataclasses import dataclass from typing import Literal import pypandoc @dataclass class ConvertResult: """Result of markdown conversion.""" latex: str mathml: str @dataclass class ExportResult: """Result of markdown export.""" file_path: str content_type: str download_name: str ExportType = Literal["docx", "pdf"] class Converter: """Service for conversion and export operations.""" # Pandoc input format with LaTeX math extensions INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash" def __init__(self): """Initialize converter.""" def convert_to_formats(self, md_text: str) -> ConvertResult: """Convert markdown to LaTeX and MathML formats. Args: md_text: Markdown text to convert. Returns: ConvertResult with latex and mathml fields. Raises: ValueError: If md_text is empty. RuntimeError: If conversion fails. """ if md_text == "": return ConvertResult(latex="", mathml="") try: # Convert to LaTeX latex_output = pypandoc.convert_text( md_text, "latex", format=self.INPUT_FORMAT, ).rstrip("\n") # Convert to HTML with MathML mathml_output = pypandoc.convert_text( md_text, "html", format=self.INPUT_FORMAT, extra_args=["--mathml"], ).rstrip("\n") return ConvertResult(latex=latex_output, mathml=mathml_output) except Exception as e: raise RuntimeError(f"Conversion failed: {e}") from e def preprocess_for_export(self, md_text: str) -> str: """Preprocess markdown text for export to docx/pdf. Handles LaTeX formula formatting, matrix environments, and other transformations needed for proper Word/PDF rendering. Args: md_text: Raw markdown text. Returns: Preprocessed markdown text. """ # Replace \[1mm] => \vspace{1mm} md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text) # Add blank lines around \[...\] block formulas md_text = re.sub( r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", r"\1\n\n\\[\3\\]\n\n\4", md_text, flags=re.DOTALL, ) md_text = re.sub( r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", r"\n\\[\2\\]\n", md_text, flags=re.MULTILINE | re.DOTALL, ) # Remove arithmatex span wrappers cleaned_md = re.sub(r'(.*?)', r"\1", md_text) # Convert inline formulas: \( \) => $ $ cleaned_md = re.sub(r"\\\(", r"$", cleaned_md) cleaned_md = re.sub(r"\\\)", r"$", cleaned_md) # Convert block formulas: \[ \] => $$ $$ cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md) cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md) # Remove spaces between $ and formula content # Use negative lookahead/lookbehind to avoid matching $$ block formulas cleaned_md = re.sub(r"(? str: """Convert vmatrix/Vmatrix to left/right delimited forms. This fixes the vertical line height issues in Word. """ # vmatrix -> \left| \begin{matrix}...\end{matrix} \right| md_text = re.sub( r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", r"\\left| \\begin{matrix}\1\\end{matrix} \\right|", md_text, flags=re.DOTALL, ) # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\| md_text = re.sub( r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|", md_text, flags=re.DOTALL, ) return md_text def _fix_array_column_specifiers(self, md_text: str) -> str: """Fix array environment column specifiers by removing spaces. Pandoc's OMML converter doesn't accept spaces between column alignment specifiers in array environments. This converts patterns like {c c c c} to {cccc}. Args: md_text: Markdown text with LaTeX formulas. Returns: Markdown text with fixed array column specifiers. """ def remove_spaces_in_specifier(match: re.Match) -> str: """Remove spaces from column specifier.""" specifier = match.group(1) # Remove all spaces from the specifier specifier_no_spaces = re.sub(r"\s+", "", specifier) return f"\\begin{{array}}{{{specifier_no_spaces}}}" # Match \begin{array}{...} and remove spaces in the column specifier # Pattern: \begin{array}{c c c ...} -> \begin{array}{ccc...} md_text = re.sub( r"\\begin\{array\}\{([^}]+)\}", remove_spaces_in_specifier, md_text, ) return md_text def _fix_brace_spacing(self, md_text: str) -> str: """Fix spacing issues with braces in equation systems. Removes whitespace and adds negative space for proper alignment in Word/OMML. """ # Fix \left\{ spacing md_text = re.sub( r"\\left\\\{\s+", r"\\left\\{\\!", md_text, ) # Fix \right\} spacing md_text = re.sub( r"\s+\\right\\\}", r"\\!\\right\\}", md_text, ) return md_text def _convert_special_environments(self, md_text: str) -> str: """Convert cases and aligned environments to array format. These environments have better rendering support in Word/OMML. """ def convert_cases(match: re.Match) -> str: content = match.group(1) return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right." md_text = re.sub( r"\\begin\{cases\}(.*?)\\end\{cases\}", convert_cases, md_text, flags=re.DOTALL, ) def convert_aligned_to_array(match: re.Match) -> str: content = match.group(1) # Remove leading & alignment markers (not needed in array{l}) content = re.sub(r"(^|\\\\)\s*&", r"\1", content) return r"\left\{\begin{array}{l}" + content + r"\end{array}\right." md_text = re.sub( r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", convert_aligned_to_array, md_text, flags=re.DOTALL, ) def convert_standalone_aligned(match: re.Match) -> str: content = match.group(1) content = re.sub(r"(^|\\\\)\s*&", r"\1", content) return r"\begin{array}{l}" + content + r"\end{array}" md_text = re.sub( r"\\begin\{aligned\}(.*?)\\end\{aligned\}", convert_standalone_aligned, md_text, flags=re.DOTALL, ) return md_text def _convert_tag_commands(self, md_text: str) -> str: """Convert LaTeX \\tag{} commands to Word-compatible format. The \\tag{} command is not supported in Word OMML format, so we convert it to use simple spacing (\quad) to push the equation number to the right side. The tag remains inside the formula for better compatibility. Args: md_text: Markdown text containing LaTeX formulas with \\tag{}. Returns: Markdown text with \\tag{} commands converted to spacing format. """ def convert_tag(match: re.Match) -> str: """Convert a single \\tag{} command within a formula.""" formula_content = match.group(1) tag_content = match.group(2) # Replace \tag{...} with \quad (...) to push the number to the right # Keep it inside the formula for better Word compatibility return f"$${formula_content} \\quad ({tag_content})$$" # Match display formulas ($$...$$) containing \\tag{...} # Pattern: $$...content...\\tag {?...}...$$ # Allow optional space between \tag and { md_text = re.sub( r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", convert_tag, md_text, flags=re.DOTALL, ) return md_text def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes: """Export markdown to docx or pdf file. Args: md_text: Markdown text to export. export_type: Export format, either 'docx' or 'pdf'. Returns: bytes of the exported file. Raises: ValueError: If export_type is not supported. RuntimeError: If export fails. """ # Preprocess markdown cleaned_md = self.preprocess_for_export(md_text) # Create temp file for input with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in: f_in.write(cleaned_md.encode("utf-8")) md_path = f_in.name output_file = md_path + "." + export_type try: if export_type == "docx": self._export_docx(md_path, output_file) with open(output_file, "rb") as f: return f.read() else: # pdf self._export_pdf(md_path, output_file) with open(output_file, "rb") as f: return f.read() except Exception as e: # Cleanup on error self._cleanup_files(md_path, output_file) raise RuntimeError(f"Export failed: {e}") from e finally: # Always cleanup input file if os.path.exists(md_path): os.remove(md_path) def _export_docx(self, input_path: str, output_path: str) -> None: """Export to DOCX format using pypandoc.""" extra_args = [ "--highlight-style=pygments", f"--reference-doc=app/pkg/reference.docx", ] pypandoc.convert_file( input_path, "docx", format=self.INPUT_FORMAT, outputfile=output_path, extra_args=extra_args, ) def _export_pdf(self, input_path: str, output_path: str) -> None: """Export to PDF format using pypandoc with XeLaTeX.""" extra_args = [ "--pdf-engine=xelatex", "-V", "mainfont=Noto Sans CJK SC", "--highlight-style=pygments", ] pypandoc.convert_file( input_path, "pdf", format=self.INPUT_FORMAT, outputfile=output_path, extra_args=extra_args, ) def _cleanup_files(self, *paths: str) -> None: """Remove files if they exist.""" for path in paths: if os.path.exists(path): os.remove(path) def cleanup_export_file(self, file_path: str) -> None: """Cleanup exported file after sending response. Call this after sending the file to the client. Args: file_path: Path to the exported file. """ if os.path.exists(file_path): os.remove(file_path)