2025-12-31 17:38:32 +08:00
|
|
|
"""Markdown conversion and export service using pypandoc."""
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
import tempfile
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
from typing import Literal
|
|
|
|
|
|
|
|
|
|
import pypandoc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ConvertResult:
    """Pair of outputs produced by a markdown conversion.

    Attributes:
        latex: The markdown rendered as LaTeX source.
        mathml: The markdown rendered as HTML with MathML formulas.
    """

    latex: str
    mathml: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ExportResult:
    """Description of a file produced by a markdown export.

    Attributes:
        file_path: Location of the exported file.
        content_type: Content type string associated with the file.
        download_name: File name to present when the file is downloaded.
    """

    file_path: str
    content_type: str
    download_name: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Export formats accepted by Converter.export_to_file's ``export_type`` parameter.
ExportType = Literal["docx", "pdf"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Converter:
|
|
|
|
|
"""Service for conversion and export operations."""
|
|
|
|
|
|
|
|
|
|
# Pandoc input format with LaTeX math extensions
|
|
|
|
|
INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
|
|
|
|
|
|
|
|
|
|
def __init__(self):
|
|
|
|
|
"""Initialize converter."""
|
|
|
|
|
|
|
|
|
|
def convert_to_formats(self, md_text: str) -> ConvertResult:
|
|
|
|
|
"""Convert markdown to LaTeX and MathML formats.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
md_text: Markdown text to convert.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
ConvertResult with latex and mathml fields.
|
|
|
|
|
|
|
|
|
|
Raises:
|
|
|
|
|
ValueError: If md_text is empty.
|
|
|
|
|
RuntimeError: If conversion fails.
|
|
|
|
|
"""
|
|
|
|
|
if md_text == "":
|
|
|
|
|
return ConvertResult(latex="", mathml="")
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# Convert to LaTeX
|
|
|
|
|
latex_output = pypandoc.convert_text(
|
|
|
|
|
md_text,
|
|
|
|
|
"latex",
|
|
|
|
|
format=self.INPUT_FORMAT,
|
|
|
|
|
).rstrip("\n")
|
|
|
|
|
|
|
|
|
|
# Convert to HTML with MathML
|
|
|
|
|
mathml_output = pypandoc.convert_text(
|
|
|
|
|
md_text,
|
|
|
|
|
"html",
|
|
|
|
|
format=self.INPUT_FORMAT,
|
|
|
|
|
extra_args=["--mathml"],
|
|
|
|
|
).rstrip("\n")
|
|
|
|
|
|
|
|
|
|
return ConvertResult(latex=latex_output, mathml=mathml_output)
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
raise RuntimeError(f"Conversion failed: {e}") from e
|
|
|
|
|
|
|
|
|
|
def preprocess_for_export(self, md_text: str) -> str:
|
|
|
|
|
"""Preprocess markdown text for export to docx/pdf.
|
|
|
|
|
|
|
|
|
|
Handles LaTeX formula formatting, matrix environments, and
|
|
|
|
|
other transformations needed for proper Word/PDF rendering.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
md_text: Raw markdown text.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
Preprocessed markdown text.
|
|
|
|
|
"""
|
|
|
|
|
# Replace \[1mm] => \vspace{1mm}
|
|
|
|
|
md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)
|
|
|
|
|
|
|
|
|
|
# Add blank lines around \[...\] block formulas
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
|
|
|
|
|
r"\1\n\n\\[\3\\]\n\n\4",
|
|
|
|
|
md_text,
|
|
|
|
|
flags=re.DOTALL,
|
|
|
|
|
)
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
|
|
|
|
|
r"\n\\[\2\\]\n",
|
|
|
|
|
md_text,
|
|
|
|
|
flags=re.MULTILINE | re.DOTALL,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Remove arithmatex span wrappers
|
|
|
|
|
cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)
|
|
|
|
|
|
|
|
|
|
# Convert inline formulas: \( \) => $ $
|
|
|
|
|
cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
|
|
|
|
|
cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
|
|
|
|
|
|
|
|
|
|
# Convert block formulas: \[ \] => $$ $$
|
|
|
|
|
cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
|
|
|
|
|
cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
|
|
|
|
|
|
|
|
|
|
# Remove spaces between $ and formula content
|
|
|
|
|
# Use negative lookahead/lookbehind to avoid matching $$ block formulas
|
|
|
|
|
cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)
|
|
|
|
|
|
|
|
|
|
# Convert matrix environments for better Word rendering
|
|
|
|
|
cleaned_md = self._convert_matrix_environments(cleaned_md)
|
|
|
|
|
|
2026-01-14 14:18:00 +08:00
|
|
|
# Fix array environment column specifiers (remove spaces)
|
|
|
|
|
cleaned_md = self._fix_array_column_specifiers(cleaned_md)
|
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
# Fix brace spacing for equation systems
|
|
|
|
|
cleaned_md = self._fix_brace_spacing(cleaned_md)
|
|
|
|
|
|
|
|
|
|
# Convert cases and aligned environments
|
|
|
|
|
cleaned_md = self._convert_special_environments(cleaned_md)
|
|
|
|
|
|
2026-01-14 14:18:00 +08:00
|
|
|
# Handle LaTeX \tag{} commands for equation numbering
|
|
|
|
|
cleaned_md = self._convert_tag_commands(cleaned_md)
|
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
return cleaned_md
|
|
|
|
|
|
|
|
|
|
def _convert_matrix_environments(self, md_text: str) -> str:
|
|
|
|
|
"""Convert vmatrix/Vmatrix to left/right delimited forms.
|
|
|
|
|
|
|
|
|
|
This fixes the vertical line height issues in Word.
|
|
|
|
|
"""
|
|
|
|
|
# vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
|
|
|
|
|
r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
|
|
|
|
|
md_text,
|
|
|
|
|
flags=re.DOTALL,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
|
|
|
|
|
r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
|
|
|
|
|
md_text,
|
|
|
|
|
flags=re.DOTALL,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return md_text
|
|
|
|
|
|
2026-01-14 14:18:00 +08:00
|
|
|
def _fix_array_column_specifiers(self, md_text: str) -> str:
|
|
|
|
|
"""Fix array environment column specifiers by removing spaces.
|
|
|
|
|
|
|
|
|
|
Pandoc's OMML converter doesn't accept spaces between column alignment
|
|
|
|
|
specifiers in array environments. This converts patterns like
|
|
|
|
|
{c c c c} to {cccc}.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
md_text: Markdown text with LaTeX formulas.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
Markdown text with fixed array column specifiers.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def remove_spaces_in_specifier(match: re.Match) -> str:
|
|
|
|
|
"""Remove spaces from column specifier."""
|
|
|
|
|
specifier = match.group(1)
|
|
|
|
|
# Remove all spaces from the specifier
|
|
|
|
|
specifier_no_spaces = re.sub(r"\s+", "", specifier)
|
|
|
|
|
return f"\\begin{{array}}{{{specifier_no_spaces}}}"
|
|
|
|
|
|
|
|
|
|
# Match \begin{array}{...} and remove spaces in the column specifier
|
|
|
|
|
# Pattern: \begin{array}{c c c ...} -> \begin{array}{ccc...}
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"\\begin\{array\}\{([^}]+)\}",
|
|
|
|
|
remove_spaces_in_specifier,
|
|
|
|
|
md_text,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return md_text
|
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
def _fix_brace_spacing(self, md_text: str) -> str:
|
|
|
|
|
"""Fix spacing issues with braces in equation systems.
|
|
|
|
|
|
|
|
|
|
Removes whitespace and adds negative space for proper alignment in Word/OMML.
|
|
|
|
|
"""
|
|
|
|
|
# Fix \left\{ spacing
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"\\left\\\{\s+",
|
|
|
|
|
r"\\left\\{\\!",
|
|
|
|
|
md_text,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Fix \right\} spacing
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"\s+\\right\\\}",
|
|
|
|
|
r"\\!\\right\\}",
|
|
|
|
|
md_text,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return md_text
|
|
|
|
|
|
|
|
|
|
def _convert_special_environments(self, md_text: str) -> str:
|
|
|
|
|
"""Convert cases and aligned environments to array format.
|
|
|
|
|
|
|
|
|
|
These environments have better rendering support in Word/OMML.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def convert_cases(match: re.Match) -> str:
|
|
|
|
|
content = match.group(1)
|
|
|
|
|
return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."
|
|
|
|
|
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"\\begin\{cases\}(.*?)\\end\{cases\}",
|
|
|
|
|
convert_cases,
|
|
|
|
|
md_text,
|
|
|
|
|
flags=re.DOTALL,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def convert_aligned_to_array(match: re.Match) -> str:
|
|
|
|
|
content = match.group(1)
|
|
|
|
|
# Remove leading & alignment markers (not needed in array{l})
|
|
|
|
|
content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
|
|
|
|
|
return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."
|
|
|
|
|
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
|
|
|
|
|
convert_aligned_to_array,
|
|
|
|
|
md_text,
|
|
|
|
|
flags=re.DOTALL,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def convert_standalone_aligned(match: re.Match) -> str:
|
|
|
|
|
content = match.group(1)
|
|
|
|
|
content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
|
|
|
|
|
return r"\begin{array}{l}" + content + r"\end{array}"
|
|
|
|
|
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
|
|
|
|
|
convert_standalone_aligned,
|
|
|
|
|
md_text,
|
|
|
|
|
flags=re.DOTALL,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return md_text
|
|
|
|
|
|
2026-01-14 14:18:00 +08:00
|
|
|
def _convert_tag_commands(self, md_text: str) -> str:
|
|
|
|
|
"""Convert LaTeX \\tag{} commands to Word-compatible format.
|
|
|
|
|
|
|
|
|
|
The \\tag{} command is not supported in Word OMML format, so we convert it to
|
|
|
|
|
use simple spacing (\quad) to push the equation number to the right side.
|
|
|
|
|
The tag remains inside the formula for better compatibility.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
md_text: Markdown text containing LaTeX formulas with \\tag{}.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
Markdown text with \\tag{} commands converted to spacing format.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def convert_tag(match: re.Match) -> str:
|
|
|
|
|
"""Convert a single \\tag{} command within a formula."""
|
|
|
|
|
formula_content = match.group(1)
|
|
|
|
|
tag_content = match.group(2)
|
|
|
|
|
|
|
|
|
|
# Replace \tag{...} with \quad (...) to push the number to the right
|
|
|
|
|
# Keep it inside the formula for better Word compatibility
|
|
|
|
|
return f"$${formula_content} \\quad ({tag_content})$$"
|
|
|
|
|
|
|
|
|
|
# Match display formulas ($$...$$) containing \\tag{...}
|
|
|
|
|
# Pattern: $$...content...\\tag {?...}...$$
|
|
|
|
|
# Allow optional space between \tag and {
|
|
|
|
|
md_text = re.sub(
|
|
|
|
|
r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$",
|
|
|
|
|
convert_tag,
|
|
|
|
|
md_text,
|
|
|
|
|
flags=re.DOTALL,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return md_text
|
|
|
|
|
|
2025-12-31 17:38:32 +08:00
|
|
|
def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
|
|
|
|
|
"""Export markdown to docx or pdf file.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
md_text: Markdown text to export.
|
|
|
|
|
export_type: Export format, either 'docx' or 'pdf'.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
bytes of the exported file.
|
|
|
|
|
|
|
|
|
|
Raises:
|
|
|
|
|
ValueError: If export_type is not supported.
|
|
|
|
|
RuntimeError: If export fails.
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# Preprocess markdown
|
|
|
|
|
cleaned_md = self.preprocess_for_export(md_text)
|
|
|
|
|
|
|
|
|
|
# Create temp file for input
|
|
|
|
|
with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
|
|
|
|
|
f_in.write(cleaned_md.encode("utf-8"))
|
|
|
|
|
md_path = f_in.name
|
|
|
|
|
|
|
|
|
|
output_file = md_path + "." + export_type
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
if export_type == "docx":
|
|
|
|
|
self._export_docx(md_path, output_file)
|
|
|
|
|
with open(output_file, "rb") as f:
|
|
|
|
|
return f.read()
|
|
|
|
|
else: # pdf
|
|
|
|
|
self._export_pdf(md_path, output_file)
|
|
|
|
|
with open(output_file, "rb") as f:
|
|
|
|
|
return f.read()
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
# Cleanup on error
|
|
|
|
|
self._cleanup_files(md_path, output_file)
|
|
|
|
|
raise RuntimeError(f"Export failed: {e}") from e
|
|
|
|
|
finally:
|
|
|
|
|
# Always cleanup input file
|
|
|
|
|
if os.path.exists(md_path):
|
|
|
|
|
os.remove(md_path)
|
|
|
|
|
|
|
|
|
|
def _export_docx(self, input_path: str, output_path: str) -> None:
|
|
|
|
|
"""Export to DOCX format using pypandoc."""
|
|
|
|
|
extra_args = [
|
|
|
|
|
"--highlight-style=pygments",
|
|
|
|
|
f"--reference-doc=app/pkg/reference.docx",
|
|
|
|
|
]
|
|
|
|
|
pypandoc.convert_file(
|
|
|
|
|
input_path,
|
|
|
|
|
"docx",
|
|
|
|
|
format=self.INPUT_FORMAT,
|
|
|
|
|
outputfile=output_path,
|
|
|
|
|
extra_args=extra_args,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def _export_pdf(self, input_path: str, output_path: str) -> None:
|
|
|
|
|
"""Export to PDF format using pypandoc with XeLaTeX."""
|
|
|
|
|
extra_args = [
|
|
|
|
|
"--pdf-engine=xelatex",
|
|
|
|
|
"-V",
|
|
|
|
|
"mainfont=Noto Sans CJK SC",
|
|
|
|
|
"--highlight-style=pygments",
|
|
|
|
|
]
|
|
|
|
|
pypandoc.convert_file(
|
|
|
|
|
input_path,
|
|
|
|
|
"pdf",
|
|
|
|
|
format=self.INPUT_FORMAT,
|
|
|
|
|
outputfile=output_path,
|
|
|
|
|
extra_args=extra_args,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def _cleanup_files(self, *paths: str) -> None:
|
|
|
|
|
"""Remove files if they exist."""
|
|
|
|
|
for path in paths:
|
|
|
|
|
if os.path.exists(path):
|
|
|
|
|
os.remove(path)
|
|
|
|
|
|
|
|
|
|
def cleanup_export_file(self, file_path: str) -> None:
|
|
|
|
|
"""Cleanup exported file after sending response.
|
|
|
|
|
|
|
|
|
|
Call this after sending the file to the client.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
file_path: Path to the exported file.
|
|
|
|
|
"""
|
|
|
|
|
if os.path.exists(file_path):
|
|
|
|
|
os.remove(file_path)
|
|
|
|
|
|