Files
doc_processer/app/services/converter.py
2025-12-31 17:38:32 +08:00

313 lines
9.3 KiB
Python

"""Markdown conversion and export service using pypandoc."""
import os
import re
import tempfile
from dataclasses import dataclass
from typing import Literal
import pypandoc
@dataclass
class ConvertResult:
    """Result of markdown conversion.

    Attributes:
        latex: The markdown rendered as LaTeX.
        mathml: The markdown rendered as HTML with formulas as MathML.
    """

    latex: str
    mathml: str
@dataclass
class ExportResult:
    """Result of markdown export.

    Attributes:
        file_path: Path to the exported file.
        content_type: Content type of the exported file (presumably the MIME
            type sent with the download -- confirm against the caller).
        download_name: Name for the download (presumably the filename
            suggested to the client -- confirm against the caller).
    """

    file_path: str
    content_type: str
    download_name: str
# Export formats accepted by Converter.export_to_file.
ExportType = Literal["docx", "pdf"]
class Converter:
    """Service for conversion and export operations.

    Markdown is converted through pandoc (via ``pypandoc``). Before an export
    the text is rewritten so that LaTeX formulas use the ``$``/``$$``
    delimiters and environments that render better in Word.
    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # Pandoc settings used by the exporters. They are class attributes so a
    # subclass or deployment can override them without touching the methods.
    HIGHLIGHT_STYLE = "pygments"
    # NOTE: relative path, so it is resolved against the working directory.
    REFERENCE_DOC = "app/pkg/reference.docx"
    PDF_ENGINE = "xelatex"
    PDF_MAIN_FONT = "Noto Sans CJK SC"

    # An "&" at the very start of the content or right after a "\\" row break.
    _LEADING_ALIGN_RE = re.compile(r"(^|\\\\)\s*&")

    def __init__(self):
        """Initialize converter."""

    def convert_to_formats(self, md_text: str) -> ConvertResult:
        """Convert markdown to LaTeX and MathML formats.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex and mathml fields. Both fields are empty
            strings when md_text is empty.

        Raises:
            RuntimeError: If conversion fails.
        """
        if not md_text:
            return ConvertResult(latex="", mathml="")

        try:
            # Convert to LaTeX
            latex_output = pypandoc.convert_text(
                md_text,
                "latex",
                format=self.INPUT_FORMAT,
            ).rstrip("\n")

            # Convert to HTML with MathML
            mathml_output = pypandoc.convert_text(
                md_text,
                "html",
                format=self.INPUT_FORMAT,
                extra_args=["--mathml"],
            ).rstrip("\n")
        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

        return ConvertResult(latex=latex_output, mathml=mathml_output)

    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}. This has to run first: every
        # remaining "\[" is treated as a block-formula opener below.
        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas that sit inside text
        md_text = re.sub(
            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
            r"\1\n\n\\[\3\\]\n\n\4",
            md_text,
            flags=re.DOTALL,
        )
        # ... and around block formulas that start on their own line
        md_text = re.sub(
            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
            r"\n\\[\2\\]\n",
            md_text,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Remove arithmatex span wrappers. DOTALL lets a wrapper whose
        # formula contains a line break be unwrapped as well.
        cleaned_md = re.sub(
            r'<span class="arithmatex">(.*?)</span>',
            r"\1",
            md_text,
            flags=re.DOTALL,
        )

        # Convert inline formulas: \( \) => $ $
        cleaned_md = cleaned_md.replace(r"\(", "$").replace(r"\)", "$")

        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = cleaned_md.replace(r"\[", "$$").replace(r"\]", "$$")

        # Remove spaces between $ and formula content.
        # The lookbehind/lookahead keep $$ block formulas from matching.
        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)

        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)

        # Convert cases and aligned environments
        return self._convert_special_environments(cleaned_md)

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.

        Args:
            md_text: Markdown text that may contain matrix environments.

        Returns:
            The text with both environments rewritten as plain matrix
            environments wrapped in explicit delimiters.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = re.sub(
            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
            flags=re.DOTALL,
        )
        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = re.sub(
            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
            flags=re.DOTALL,
        )
        return md_text

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment
        in Word/OMML.

        Args:
            md_text: Markdown text that may contain delimited braces.

        Returns:
            The text with whitespace next to the braces replaced by a
            negative thin space.
        """
        # Fix \left\{ spacing: whitespace after the brace becomes \!
        md_text = re.sub(r"\\left\\\{\s+", r"\\left\\{\\!", md_text)
        # Fix \right\} spacing: whitespace before the brace becomes \!
        md_text = re.sub(r"\s+\\right\\\}", r"\\!\\right\\}", md_text)
        return md_text

    def _aligned_to_array(self, content: str) -> str:
        """Wrap the body of an aligned environment in a one-column array."""
        # Leading & alignment markers are not needed in array{l}
        body = self._LEADING_ALIGN_RE.sub(r"\1", content)
        return r"\begin{array}{l}" + body + r"\end{array}"

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.

        Args:
            md_text: Markdown text that may contain the environments.

        Returns:
            The text with cases and aligned environments rewritten as arrays.
        """

        def convert_cases(match: re.Match) -> str:
            # cases keeps its two columns and gains an explicit left brace
            return (
                r"\left\{\begin{array}{ll}"
                + match.group(1)
                + r"\end{array}\right."
            )

        def convert_braced_aligned(match: re.Match) -> str:
            # The surrounding \left\{ ... \right. pair is kept as is
            return r"\left\{" + self._aligned_to_array(match.group(1)) + r"\right."

        def convert_standalone_aligned(match: re.Match) -> str:
            return self._aligned_to_array(match.group(1))

        md_text = re.sub(
            r"\\begin\{cases\}(.*?)\\end\{cases\}",
            convert_cases,
            md_text,
            flags=re.DOTALL,
        )
        md_text = re.sub(
            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
            convert_braced_aligned,
            md_text,
            flags=re.DOTALL,
        )
        md_text = re.sub(
            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
            convert_standalone_aligned,
            md_text,
            flags=re.DOTALL,
        )
        return md_text

    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        All intermediate files live in a temporary directory that is removed
        before this method returns, whether the export succeeds or fails.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.
        """
        exporters = {"docx": self._export_docx, "pdf": self._export_pdf}
        # Validate before anything touches the disk: export_type ends up in a
        # file name, so arbitrary values must never get that far.
        exporter = exporters.get(export_type)
        if exporter is None:
            raise ValueError(f"Unsupported export type: {export_type!r}")

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        try:
            with tempfile.TemporaryDirectory() as workdir:
                md_path = os.path.join(workdir, "input.md")
                output_file = os.path.join(workdir, "output." + export_type)

                # Written as bytes so the text reaches pandoc unmodified
                with open(md_path, "wb") as f_in:
                    f_in.write(cleaned_md.encode("utf-8"))

                exporter(md_path, output_file)

                with open(output_file, "rb") as f_out:
                    return f_out.read()
        except Exception as e:
            raise RuntimeError(f"Export failed: {e}") from e

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Export to DOCX format using pypandoc.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the DOCX file is written to.
        """
        extra_args = [
            f"--highlight-style={self.HIGHLIGHT_STYLE}",
            f"--reference-doc={self.REFERENCE_DOC}",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Export to PDF format using pypandoc with XeLaTeX.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the PDF file is written to.
        """
        extra_args = [
            f"--pdf-engine={self.PDF_ENGINE}",
            "-V",
            f"mainfont={self.PDF_MAIN_FONT}",
            f"--highlight-style={self.HIGHLIGHT_STYLE}",
        ]
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist.

        Args:
            *paths: Paths of the files to remove; missing files are skipped.
        """
        for path in paths:
            # Removing directly avoids the gap between an exists() check and
            # the removal; a file that is already gone is not an error.
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client.

        Args:
            file_path: Path to the exported file.
        """
        self._cleanup_files(file_path)