"""Markdown conversion and export service using pypandoc."""

import os
import re
import tempfile
from dataclasses import dataclass
from typing import Literal

import pypandoc


@dataclass
class ConvertResult:
    """Result of markdown conversion.

    Attributes:
        latex: Pandoc's LaTeX rendering of the markdown input.
        mathml: Pandoc's HTML rendering of the input, produced with the
            ``--mathml`` option.
    """

    latex: str
    mathml: str


@dataclass
class ExportResult:
    """Result of markdown export.

    NOTE(review): nothing in this module constructs this type; the field
    descriptions below are inferred from the field names -- confirm against
    the caller.

    Attributes:
        file_path: Presumably the path of the exported file on disk.
        content_type: Presumably the MIME type to send with the file.
        download_name: Presumably the file name suggested to the client.
    """

    file_path: str
    content_type: str
    download_name: str


# Export formats accepted by Converter.export_to_file.
ExportType = Literal["docx", "pdf"]


class Converter:
    """Service for markdown conversion and export operations.

    Wraps pypandoc to (a) render markdown as LaTeX and as HTML with MathML,
    and (b) export markdown to DOCX or PDF after normalising the LaTeX math
    it contains so that it renders well in Word/PDF.
    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # Word template passed to pandoc via --reference-doc. The path is
    # relative, so it is resolved against the process working directory.
    REFERENCE_DOC = "app/pkg/reference.docx"

    # Export formats accepted by export_to_file.
    SUPPORTED_EXPORT_TYPES = ("docx", "pdf")

    # One inline formula: a single-dollar opener, its content, and the next
    # single-dollar closer. The lookarounds skip the $$ of block formulas.
    _INLINE_MATH_RE = re.compile(r"(?<!\$)\$(?!\$)(.+?)(?<!\$)\$(?!\$)")

    def __init__(self):
        """Initialize converter."""

    def convert_to_formats(self, md_text: str) -> "ConvertResult":
        """Convert markdown to LaTeX and MathML formats.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex and mathml fields. An empty ``md_text``
            yields a result whose fields are both empty strings; pandoc is
            not invoked in that case.

        Raises:
            RuntimeError: If conversion fails.
        """
        if md_text == "":
            return ConvertResult(latex="", mathml="")

        try:
            # Convert to LaTeX
            latex_output = pypandoc.convert_text(
                md_text,
                "latex",
                format=self.INPUT_FORMAT,
            ).rstrip("\n")

            # Convert to HTML with MathML
            mathml_output = pypandoc.convert_text(
                md_text,
                "html",
                format=self.INPUT_FORMAT,
                extra_args=["--mathml"],
            ).rstrip("\n")
        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

        return ConvertResult(latex=latex_output, mathml=mathml_output)

    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}. This must run before the block
        # formula handling below, which would otherwise read "\[" as the
        # start of a display formula.
        text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas that have text on
        # the same line before and after them.
        text = re.sub(
            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
            r"\1\n\n\\[\3\\]\n\n\4",
            text,
            flags=re.DOTALL,
        )
        # Same for block formulas that start at the beginning of a line.
        text = re.sub(
            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
            r"\n\\[\2\\]\n",
            text,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Remove arithmatex span wrappers
        text = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", text)

        # Convert inline formulas: \( \) => $ $
        text = re.sub(r"\\\(", r"$", text)
        text = re.sub(r"\\\)", r"$", text)

        # Convert block formulas: \[ \] => $$ $$
        text = re.sub(r"\\\[", r"$$", text)
        text = re.sub(r"\\\]", r"$$", text)

        # Remove spaces between $ and formula content ("$ x $" => "$x$").
        # Delimiters are consumed in opener/closer pairs so the closing $ of
        # one formula is never taken as the opener of the next; otherwise
        # "$a$ and $b$" would collapse to "$a$and$b$".
        text = self._INLINE_MATH_RE.sub(self._strip_inline_math_padding, text)

        # Convert matrix environments for better Word rendering
        text = self._convert_matrix_environments(text)

        # Fix brace spacing for equation systems
        text = self._fix_brace_spacing(text)

        # Convert cases and aligned environments
        return self._convert_special_environments(text)

    @staticmethod
    def _strip_inline_math_padding(match: "re.Match") -> str:
        """Return one ``$...$`` formula with its space padding removed.

        Only formulas padded with spaces on both sides are rewritten;
        anything else (including all-space content) is returned unchanged.
        """
        inner = match.group(1)
        stripped = inner.strip(" ")
        if stripped and inner.startswith(" ") and inner.endswith(" "):
            return f"${stripped}$"
        return match.group(0)

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.

        Args:
            md_text: Markdown text that may contain matrix environments.

        Returns:
            The text with every vmatrix/Vmatrix environment rewritten as a
            plain matrix wrapped in explicit left/right delimiters.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = re.sub(
            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
            flags=re.DOTALL,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = re.sub(
            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in
        Word/OMML.

        Args:
            md_text: Markdown text that may contain left/right braces.

        Returns:
            The text with whitespace after a left brace and before a right
            brace replaced by a negative thin space.
        """
        # Fix \left\{ spacing: whitespace after the brace => \!
        md_text = re.sub(r"\\left\\\{\s+", r"\\left\\{\\!", md_text)

        # Fix \right\} spacing: whitespace before the brace => \!
        md_text = re.sub(r"\s+\\right\\\}", r"\\!\\right\\}", md_text)

        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.

        Args:
            md_text: Markdown text that may contain cases/aligned blocks.

        Returns:
            The text with those environments rewritten as array
            environments.
        """

        def strip_leading_alignment(content: str) -> str:
            # Remove the & at the start of the content or right after a
            # row break (not needed in array{l}).
            return re.sub(r"(^|\\\\)\s*&", r"\1", content)

        # Callables are used as replacements so the backslashes in the
        # captured content are inserted verbatim.
        def convert_cases(match: re.Match) -> str:
            return (
                r"\left\{\begin{array}{ll}"
                + match.group(1)
                + r"\end{array}\right."
            )

        def convert_braced_aligned(match: re.Match) -> str:
            return (
                r"\left\{\begin{array}{l}"
                + strip_leading_alignment(match.group(1))
                + r"\end{array}\right."
            )

        def convert_standalone_aligned(match: re.Match) -> str:
            return (
                r"\begin{array}{l}"
                + strip_leading_alignment(match.group(1))
                + r"\end{array}"
            )

        md_text = re.sub(
            r"\\begin\{cases\}(.*?)\\end\{cases\}",
            convert_cases,
            md_text,
            flags=re.DOTALL,
        )

        # The brace-wrapped form must be handled before the standalone one,
        # which would otherwise match the inner environment first.
        md_text = re.sub(
            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
            convert_braced_aligned,
            md_text,
            flags=re.DOTALL,
        )

        md_text = re.sub(
            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
            convert_standalone_aligned,
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def export_to_file(
        self, md_text: str, export_type: "ExportType" = "docx"
    ) -> bytes:
        """Export markdown to docx or pdf and return the file contents.

        The markdown is preprocessed, written to a temporary ``.md`` file and
        converted by pandoc into a sibling temporary output file. Both
        temporary files are removed before returning, whether or not the
        export succeeds.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.
        """
        # Validate first: export_type is spliced into a file name below and
        # anything other than "docx" would otherwise silently run the PDF
        # path.
        if export_type not in self.SUPPORTED_EXPORT_TYPES:
            raise ValueError(f"Unsupported export type: {export_type!r}")

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        # delete=False because pandoc has to reopen the file by name.
        tmp_input = tempfile.NamedTemporaryFile(suffix=".md", delete=False)
        md_path = tmp_input.name
        output_file = md_path + "." + export_type

        try:
            with tmp_input:
                tmp_input.write(cleaned_md.encode("utf-8"))

            if export_type == "docx":
                self._export_docx(md_path, output_file)
            else:  # pdf
                self._export_pdf(md_path, output_file)

            with open(output_file, "rb") as f:
                return f.read()
        except Exception as e:
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # The result is returned as bytes, so neither temporary file is
            # needed afterwards.
            self._cleanup_files(md_path, output_file)

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Export to DOCX format using pypandoc.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the DOCX file is written to.
        """
        extra_args = [
            "--highlight-style=pygments",
            f"--reference-doc={self.REFERENCE_DOC}",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Export to PDF format using pypandoc with XeLaTeX.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the PDF file is written to.
        """
        extra_args = [
            "--pdf-engine=xelatex",
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ]
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist.

        Args:
            *paths: Paths to delete; paths that do not exist are skipped.
        """
        for path in paths:
            # Attempt the removal directly instead of checking existence
            # first, which would leave a window for the file to vanish.
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client.

        Args:
            file_path: Path to the exported file.
        """
        self._cleanup_files(file_path)