fix: refactor logic
This commit is contained in:
312
app/services/converter.py
Normal file
312
app/services/converter.py
Normal file
@@ -0,0 +1,312 @@
|
||||
"""Markdown conversion and export service using pypandoc."""
|
||||
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
import pypandoc
|
||||
|
||||
|
||||
@dataclass
class ConvertResult:
    """Result of markdown conversion.

    Attributes:
        latex: LaTeX rendering of the input, as produced by pandoc with
            trailing newlines stripped.
        mathml: HTML rendering of the input with formulas emitted as MathML
            (pandoc ``--mathml``), trailing newlines stripped.
    """

    latex: str
    mathml: str
|
||||
|
||||
|
||||
@dataclass
class ExportResult:
    """Result of markdown export.

    Attributes:
        file_path: Path of the exported file on disk (presumably the path
            later handed to ``Converter.cleanup_export_file`` -- confirm
            against callers).
        content_type: Content type of the exported file (presumably a MIME
            type used for the HTTP response -- confirm against callers).
        download_name: File name offered to the client for the download
            (assumption based on the field name -- confirm against callers).
    """

    file_path: str
    content_type: str
    download_name: str
|
||||
|
||||
|
||||
# Export formats accepted by Converter.export_to_file (its export_type parameter).
ExportType = Literal["docx", "pdf"]
|
||||
|
||||
|
||||
class Converter:
    """Service for conversion and export operations.

    Uses pypandoc to turn Markdown containing LaTeX math into LaTeX / MathML
    strings, and to export it as DOCX or PDF bytes.
    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # Export formats accepted by export_to_file (mirrors ExportType).
    SUPPORTED_EXPORT_TYPES = ("docx", "pdf")

    # Word reference document handed to pandoc. The path is relative, so it
    # is resolved against the process working directory.
    REFERENCE_DOC = "app/pkg/reference.docx"

    def __init__(self):
        """Initialize converter."""

    def convert_to_formats(self, md_text: str) -> "ConvertResult":
        """Convert markdown to LaTeX and MathML formats.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex and mathml fields. Both fields are empty
            strings when md_text is empty (pandoc is not invoked).

        Raises:
            RuntimeError: If conversion fails.
        """
        if not md_text:
            return ConvertResult(latex="", mathml="")

        try:
            # Convert to LaTeX
            latex_output = pypandoc.convert_text(
                md_text,
                "latex",
                format=self.INPUT_FORMAT,
            ).rstrip("\n")

            # Convert to HTML with MathML
            mathml_output = pypandoc.convert_text(
                md_text,
                "html",
                format=self.INPUT_FORMAT,
                extra_args=["--mathml"],
            ).rstrip("\n")
        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

        return ConvertResult(latex=latex_output, mathml=mathml_output)

    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.
        The steps are order-dependent: delimiters are normalised to
        ``$`` / ``$$`` before the environment rewrites run.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}. Must run before the \[...\] block
        # handling below so the spacing argument is not mistaken for a
        # block-formula opener.
        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas that sit inline
        # with surrounding text.
        md_text = re.sub(
            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
            r"\1\n\n\\[\3\\]\n\n\4",
            md_text,
            flags=re.DOTALL,
        )
        # Normalise whitespace around \[...\] blocks that start a line.
        md_text = re.sub(
            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
            r"\n\\[\2\\]\n",
            md_text,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Remove arithmatex span wrappers
        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)

        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)

        # Remove spaces between $ and formula content
        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)

        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)

        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)

        return cleaned_md

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.

        Args:
            md_text: Markdown text that may contain matrix environments.

        Returns:
            Text with vmatrix/Vmatrix rewritten as plain ``matrix``
            environments wrapped in explicit ``\\left``/``\\right`` bars.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = re.sub(
            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
            flags=re.DOTALL,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = re.sub(
            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in Word/OMML.

        Args:
            md_text: Markdown text that may contain ``\\left\\{`` / ``\\right\\}``.

        Returns:
            Text where whitespace directly inside those braces is replaced
            by a ``\\!`` negative thin space.
        """
        # Fix \left\{ spacing
        md_text = re.sub(
            r"\\left\\\{\s+",
            r"\\left\\{\\!",
            md_text,
        )

        # Fix \right\} spacing
        md_text = re.sub(
            r"\s+\\right\\\}",
            r"\\!\\right\\}",
            md_text,
        )

        return md_text

    @staticmethod
    def _strip_leading_alignment(content: str) -> str:
        """Drop ``&`` markers that open a row (not needed in ``array{l}``)."""
        return re.sub(r"(^|\\\\)\s*&", r"\1", content)

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.

        Args:
            md_text: Markdown text that may contain ``cases``/``aligned``.

        Returns:
            Text with those environments rewritten as ``array`` environments.
        """

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

        md_text = re.sub(
            r"\\begin\{cases\}(.*?)\\end\{cases\}",
            convert_cases,
            md_text,
            flags=re.DOTALL,
        )

        def convert_aligned_to_array(match: re.Match) -> str:
            content = self._strip_leading_alignment(match.group(1))
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

        # Brace-wrapped aligned blocks first, so the standalone rule below
        # only sees the aligned environments that have no enclosing brace.
        md_text = re.sub(
            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
            convert_aligned_to_array,
            md_text,
            flags=re.DOTALL,
        )

        def convert_standalone_aligned(match: re.Match) -> str:
            content = self._strip_leading_alignment(match.group(1))
            return r"\begin{array}{l}" + content + r"\end{array}"

        md_text = re.sub(
            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
            convert_standalone_aligned,
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def export_to_file(self, md_text: str, export_type: "ExportType" = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        The markdown is preprocessed, written to a temporary ``.md`` file and
        converted by pandoc into a sibling temporary output file. Both
        temporary files are removed before returning, whether or not the
        export succeeded.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.
        """
        # Validate up front: export_type becomes part of a file name below,
        # and an unknown value must not silently fall through to PDF.
        if export_type not in self.SUPPORTED_EXPORT_TYPES:
            raise ValueError(f"Unsupported export type: {export_type!r}")

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        # Create temp file for input; delete=False because pandoc reopens it
        # by path after we close it.
        tmp_in = tempfile.NamedTemporaryFile(suffix=".md", delete=False)
        md_path = tmp_in.name
        output_file = f"{md_path}.{export_type}"

        try:
            with tmp_in:
                tmp_in.write(cleaned_md.encode("utf-8"))

            if export_type == "docx":
                self._export_docx(md_path, output_file)
            else:  # pdf
                self._export_pdf(md_path, output_file)

            with open(output_file, "rb") as f:
                return f.read()
        except Exception as e:
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # The caller only receives bytes, so nothing else can ever clean
            # up these paths: always remove both input and output files.
            self._cleanup_files(md_path, output_file)

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Export to DOCX format using pypandoc.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the DOCX file is written to.
        """
        extra_args = [
            "--highlight-style=pygments",
            f"--reference-doc={self.REFERENCE_DOC}",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Export to PDF format using pypandoc with XeLaTeX.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the PDF file is written to.
        """
        extra_args = [
            "--pdf-engine=xelatex",
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ]
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist.

        Args:
            *paths: File paths to delete; missing files are ignored.
        """
        for path in paths:
            # EAFP: avoids the exists()/remove() race of a prior check.
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client.

        Args:
            file_path: Path to the exported file.
        """
        self._cleanup_files(file_path)
|
||||
|
||||
Reference in New Issue
Block a user