# File: doc_processer/app/services/converter.py (Python, 385 lines, 12 KiB)

"""Markdown conversion and export service using pypandoc."""
import os
import re
import tempfile
from dataclasses import dataclass
from typing import Literal
import pypandoc
@dataclass
class ConvertResult:
    """Outcome of converting markdown text into its target formats.

    Attributes:
        latex: LaTeX rendering of the markdown input.
        mathml: HTML rendering of the markdown input, with formulas
            emitted as MathML.
    """

    latex: str
    mathml: str
@dataclass
class ExportResult:
    """Outcome of exporting markdown to a file.

    Attributes:
        file_path: Location of the exported file.
        content_type: Presumably the MIME type of the exported file;
            confirm against the code that builds this result.
        download_name: Presumably the file name offered to the client;
            confirm against the code that builds this result.
    """

    file_path: str
    content_type: str
    download_name: str
# Export formats accepted by Converter.export_to_file.
ExportType = Literal["docx", "pdf"]
class Converter:
    """Service for conversion and export operations.

    Conversions are delegated to pandoc through ``pypandoc``. Before an
    export, the markdown is rewritten by a chain of regex passes so that
    LaTeX formulas render acceptably in Word (OMML) and PDF output.
    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # Word template handed to pandoc via --reference-doc. The path is
    # relative, so it depends on the working directory pandoc is run from.
    REFERENCE_DOC = "app/pkg/reference.docx"

    # Export formats accepted by export_to_file.
    _SUPPORTED_EXPORT_TYPES = ("docx", "pdf")

    def __init__(self):
        """Initialize converter."""

    def convert_to_formats(self, md_text: str) -> ConvertResult:
        """Convert markdown to LaTeX and MathML formats.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex and mathml fields. An empty string
            yields empty fields without invoking pandoc.

        Raises:
            RuntimeError: If conversion fails.
        """
        if md_text == "":
            return ConvertResult(latex="", mathml="")

        try:
            # Convert to LaTeX
            latex_output = pypandoc.convert_text(
                md_text,
                "latex",
                format=self.INPUT_FORMAT,
            ).rstrip("\n")

            # Convert to HTML with MathML
            mathml_output = pypandoc.convert_text(
                md_text,
                "html",
                format=self.INPUT_FORMAT,
                extra_args=["--mathml"],
            ).rstrip("\n")

            return ConvertResult(latex=latex_output, mathml=mathml_output)
        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

    def preprocess_for_export(self, md_text: str) -> str:
        r"""Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.
        The passes run in a fixed order: later passes rely on the
        ``\(...\)`` / ``\[...\]`` delimiters already being rewritten
        to ``$...$`` / ``$$...$$``.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}
        # (must run first so the "\[" is not mistaken for a block formula)
        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas embedded in a line
        md_text = re.sub(
            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
            r"\1\n\n\\[\3\\]\n\n\4",
            md_text,
            flags=re.DOTALL,
        )
        # Same for block formulas that already start a line
        md_text = re.sub(
            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
            r"\n\\[\2\\]\n",
            md_text,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Remove arithmatex span wrappers
        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)

        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)

        # Remove spaces between $ and formula content
        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)
        # Fix array environment column specifiers (remove spaces)
        cleaned_md = self._fix_array_column_specifiers(cleaned_md)
        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)
        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)
        # Handle LaTeX \tag{} commands for equation numbering
        cleaned_md = self._convert_tag_commands(cleaned_md)

        return cleaned_md

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.

        Args:
            md_text: Markdown text with LaTeX formulas.

        Returns:
            Markdown text where both environments are expressed as a plain
            ``matrix`` wrapped in explicit delimiters.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = re.sub(
            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
            flags=re.DOTALL,
        )
        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = re.sub(
            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
            flags=re.DOTALL,
        )
        return md_text

    def _fix_array_column_specifiers(self, md_text: str) -> str:
        """Fix array environment column specifiers by removing spaces.

        Pandoc's OMML converter doesn't accept spaces between column alignment
        specifiers in array environments. This converts patterns like
        {c c c c} to {cccc}.

        Args:
            md_text: Markdown text with LaTeX formulas.

        Returns:
            Markdown text with fixed array column specifiers.
        """

        def remove_spaces_in_specifier(match: re.Match) -> str:
            """Rebuild the array header with all whitespace stripped."""
            specifier_no_spaces = re.sub(r"\s+", "", match.group(1))
            return f"\\begin{{array}}{{{specifier_no_spaces}}}"

        # Pattern: \begin{array}{c c c ...} -> \begin{array}{ccc...}
        return re.sub(
            r"\\begin\{array\}\{([^}]+)\}",
            remove_spaces_in_specifier,
            md_text,
        )

    def _fix_brace_spacing(self, md_text: str) -> str:
        r"""Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space (``\!``) for proper
        alignment in Word/OMML.

        Args:
            md_text: Markdown text with LaTeX formulas.

        Returns:
            Markdown text with the whitespace after ``\left\{`` and before
            ``\right\}`` replaced by ``\!``.
        """
        # Fix \left\{ spacing
        md_text = re.sub(
            r"\\left\\\{\s+",
            r"\\left\\{\\!",
            md_text,
        )
        # Fix \right\} spacing
        md_text = re.sub(
            r"\s+\\right\\\}",
            r"\\!\\right\\}",
            md_text,
        )
        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
        r"""Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.
        The brace-wrapped ``aligned`` form is handled before the standalone
        form so that the generic pattern does not consume it first.

        Args:
            md_text: Markdown text with LaTeX formulas.

        Returns:
            Markdown text with ``cases`` and ``aligned`` rewritten as
            ``array`` environments.
        """

        def strip_leading_alignment(content: str) -> str:
            """Drop the & that starts each row (not needed in array{l})."""
            return re.sub(r"(^|\\\\)\s*&", r"\1", content)

        # Replacement callables are used (rather than template strings) so
        # the backslashes in the returned LaTeX are taken literally.
        def convert_cases(match: re.Match) -> str:
            return r"\left\{\begin{array}{ll}" + match.group(1) + r"\end{array}\right."

        def convert_aligned_to_array(match: re.Match) -> str:
            content = strip_leading_alignment(match.group(1))
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

        def convert_standalone_aligned(match: re.Match) -> str:
            content = strip_leading_alignment(match.group(1))
            return r"\begin{array}{l}" + content + r"\end{array}"

        md_text = re.sub(
            r"\\begin\{cases\}(.*?)\\end\{cases\}",
            convert_cases,
            md_text,
            flags=re.DOTALL,
        )
        md_text = re.sub(
            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
            convert_aligned_to_array,
            md_text,
            flags=re.DOTALL,
        )
        md_text = re.sub(
            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
            convert_standalone_aligned,
            md_text,
            flags=re.DOTALL,
        )
        return md_text

    def _convert_tag_commands(self, md_text: str) -> str:
        r"""Convert LaTeX \tag{} commands to Word-compatible format.

        The \tag{} command is not supported in Word OMML format, so we convert
        it to use simple spacing (\quad) to push the equation number to the
        right side. The tag remains inside the formula for better
        compatibility.

        Args:
            md_text: Markdown text containing LaTeX formulas with \tag{}.

        Returns:
            Markdown text with \tag{} commands converted to spacing format.
        """

        def convert_tag(match: re.Match) -> str:
            """Rewrite one display formula ending in a tag command."""
            formula_content = match.group(1)
            tag_content = match.group(2)
            # Replace \tag{...} with \quad (...) to push the number to the right
            # Keep it inside the formula for better Word compatibility
            return f"$${formula_content} \\quad ({tag_content})$$"

        # Match display formulas ($$...$$) whose last element is \tag{...};
        # optional whitespace is allowed between \tag and {
        return re.sub(
            r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$",
            convert_tag,
            md_text,
            flags=re.DOTALL,
        )

    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        The markdown is preprocessed, written to a temporary file and
        converted by pandoc into a second temporary file. Both temporary
        files are removed before returning, whether or not the export
        succeeded.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.
        """
        # Validate up front: the value becomes part of a filesystem path
        # below, and an unknown type must not silently fall back to PDF.
        if export_type not in self._SUPPORTED_EXPORT_TYPES:
            raise ValueError(f"Unsupported export type: {export_type!r}")

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        # Create temp file for input; delete=False because pandoc reopens
        # it by name after this handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
            f_in.write(cleaned_md.encode("utf-8"))
            md_path = f_in.name
        output_file = md_path + "." + export_type

        try:
            if export_type == "docx":
                self._export_docx(md_path, output_file)
            else:  # pdf
                self._export_pdf(md_path, output_file)
            with open(output_file, "rb") as f:
                return f.read()
        except Exception as e:
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # The result is returned as bytes, so neither temp file is
            # needed afterwards.
            self._cleanup_files(md_path, output_file)

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Export to DOCX format using pypandoc.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the DOCX file is written to.
        """
        extra_args = [
            "--highlight-style=pygments",
            f"--reference-doc={self.REFERENCE_DOC}",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Export to PDF format using pypandoc with XeLaTeX.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the PDF file is written to.
        """
        extra_args = [
            "--pdf-engine=xelatex",
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ]
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist.

        Args:
            *paths: Paths of the files to delete; missing files are ignored.
        """
        for path in paths:
            # Attempt the removal directly instead of checking first, so a
            # file that vanishes in between cannot raise.
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client.

        Args:
            file_path: Path to the exported file.
        """
        self._cleanup_files(file_path)