# app/services/converter.py

"""Markdown conversion and export service using pypandoc."""

import os
import re
import tempfile
from dataclasses import dataclass
from typing import Literal

import pypandoc


@dataclass
class ConvertResult:
    """Result of markdown conversion.

    Attributes:
        latex: LaTeX rendering of the source markdown, as produced by pandoc.
        mathml: HTML rendering of the source markdown in which math is
            emitted as MathML (pandoc ``--mathml``).
    """

    latex: str
    mathml: str


@dataclass
class ExportResult:
    """Result of markdown export.

    NOTE(review): nothing visible in this file constructs or returns this
    type (``Converter.export_to_file`` returns raw bytes). The attribute
    descriptions below are inferred from the field names only -- confirm
    against the callers before relying on them.

    Attributes:
        file_path: Presumably the path of the exported file on disk.
        content_type: Presumably the MIME type to serve the file with.
        download_name: Presumably the file name offered to the client.
    """

    file_path: str
    content_type: str
    download_name: str


# Export formats accepted by Converter.export_to_file.
ExportType = Literal["docx", "pdf"]


class Converter:
    """Service for conversion and export operations."""

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    def __init__(self):
        """Initialize converter."""

    def convert_to_formats(self, md_text: str) -> ConvertResult:
        """Convert markdown to LaTeX and to HTML with MathML.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult whose ``latex`` field holds pandoc's LaTeX output
            and whose ``mathml`` field holds pandoc's HTML output generated
            with ``--mathml``. Trailing newlines are stripped from both.
            An empty ``md_text`` yields two empty strings without invoking
            pandoc.

        Raises:
            RuntimeError: If either pandoc conversion fails; the original
                exception is chained as the cause.
        """
        # Empty input is a valid request, not an error: skip pandoc entirely.
        if md_text == "":
            return ConvertResult(latex="", mathml="")

        try:
            latex_output = pypandoc.convert_text(
                md_text,
                "latex",
                format=self.INPUT_FORMAT,
            ).rstrip("\n")

            # --mathml makes pandoc render formulas as MathML inside the HTML.
            mathml_output = pypandoc.convert_text(
                md_text,
                "html",
                format=self.INPUT_FORMAT,
                extra_args=["--mathml"],
            ).rstrip("\n")
        except Exception as e:
            # Service boundary: surface every pandoc failure as one error type.
            raise RuntimeError(f"Conversion failed: {e}") from e

        return ConvertResult(latex=latex_output, mathml=mathml_output)

    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}
        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas
        md_text = re.sub(
            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
            r"\1\n\n\\[\3\\]\n\n\4",
            md_text,
            flags=re.DOTALL,
        )
        md_text = re.sub(
            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
            r"\n\\[\2\\]\n",
            md_text,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Remove arithmatex span wrappers
        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)

        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)

        # Remove spaces between $ and formula content
        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)

        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)

        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)

        return cleaned_md

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = re.sub(
            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
            flags=re.DOTALL,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = re.sub(
            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in Word/OMML.
        """
        # Fix \left\{ spacing
        md_text = re.sub(
            r"\\left\\\{\s+",
            r"\\left\\{\\!",
            md_text,
        )

        # Fix \right\} spacing
        md_text = re.sub(
            r"\s+\\right\\\}",
            r"\\!\\right\\}",
            md_text,
        )

        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.
        """

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

        md_text = re.sub(
            r"\\begin\{cases\}(.*?)\\end\{cases\}",
            convert_cases,
            md_text,
            flags=re.DOTALL,
        )

        def convert_aligned_to_array(match: re.Match) -> str:
            content = match.group(1)
            # Remove leading & alignment markers (not needed in array{l})
            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

        md_text = re.sub(
            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
            convert_aligned_to_array,
            md_text,
            flags=re.DOTALL,
        )

        def convert_standalone_aligned(match: re.Match) -> str:
            content = match.group(1)
            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
            return r"\begin{array}{l}" + content + r"\end{array}"

        md_text = re.sub(
            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
            convert_standalone_aligned,
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.

        """

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        # Create temp file for input
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
            f_in.write(cleaned_md.encode("utf-8"))
            md_path = f_in.name

        output_file = md_path + "." + export_type

        try:
            if export_type == "docx":
                self._export_docx(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()
            else:  # pdf
                self._export_pdf(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()

        except Exception as e:
            # Cleanup on error
            self._cleanup_files(md_path, output_file)
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # Always cleanup input file
            if os.path.exists(md_path):
                os.remove(md_path)

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Export to DOCX format using pypandoc.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the DOCX file is written to.
        """
        extra_args = [
            "--highlight-style=pygments",
            # Relative path: it only resolves when pandoc is launched from a
            # working directory that contains app/pkg/reference.docx.
            "--reference-doc=app/pkg/reference.docx",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Render the markdown file at input_path to a PDF at output_path.

        The conversion goes through pypandoc with XeLaTeX as the PDF engine.

        Args:
            input_path: Path of the markdown file to convert.
            output_path: Path the PDF file is written to.
        """
        pandoc_flags = ["--pdf-engine=xelatex"]
        # -V passes a template variable; here it selects the main font.
        pandoc_flags += ["-V", "mainfont=Noto Sans CJK SC"]
        pandoc_flags.append("--highlight-style=pygments")

        pypandoc.convert_file(
            input_path,
            "pdf",
            outputfile=output_path,
            format=self.INPUT_FORMAT,
            extra_args=pandoc_flags,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist."""
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client.

        Args:
            file_path: Path to the exported file.
        """
        if os.path.exists(file_path):
            os.remove(file_path)
fix: refact logic 2025-12-31 17:38:32 +08:00			`"""Markdown conversion and export service using pypandoc."""`

			`import os`
			`import re`
			`import tempfile`
			`from dataclasses import dataclass`
			`from typing import Literal`

			`import pypandoc`


			`@dataclass`
			`class ConvertResult:`
			`"""Result of markdown conversion."""`

			`latex: str`
			`mathml: str`


			`@dataclass`
			`class ExportResult:`
			`"""Result of markdown export."""`

			`file_path: str`
			`content_type: str`
			`download_name: str`


			`ExportType = Literal["docx", "pdf"]`


			`class Converter:`
			`"""Service for conversion and export operations."""`

			`# Pandoc input format with LaTeX math extensions`
			`INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"`

			`def __init__(self):`
			`"""Initialize converter."""`

			`def convert_to_formats(self, md_text: str) -> ConvertResult:`
			`"""Convert markdown to LaTeX and MathML formats.`

			`Args:`
			`md_text: Markdown text to convert.`

			`Returns:`
			`ConvertResult with latex and mathml fields.`

			`Raises:`
			`ValueError: If md_text is empty.`
			`RuntimeError: If conversion fails.`
			`"""`
			`if md_text == "":`
			`return ConvertResult(latex="", mathml="")`

			`try:`
			`# Convert to LaTeX`
			`latex_output = pypandoc.convert_text(`
			`md_text,`
			`"latex",`
			`format=self.INPUT_FORMAT,`
			`).rstrip("\n")`

			`# Convert to HTML with MathML`
			`mathml_output = pypandoc.convert_text(`
			`md_text,`
			`"html",`
			`format=self.INPUT_FORMAT,`
			`extra_args=["--mathml"],`
			`).rstrip("\n")`

			`return ConvertResult(latex=latex_output, mathml=mathml_output)`

			`except Exception as e:`
			`raise RuntimeError(f"Conversion failed: {e}") from e`

			`def preprocess_for_export(self, md_text: str) -> str:`
			`"""Preprocess markdown text for export to docx/pdf.`

			`Handles LaTeX formula formatting, matrix environments, and`
			`other transformations needed for proper Word/PDF rendering.`

			`Args:`
			`md_text: Raw markdown text.`

			`Returns:`
			`Preprocessed markdown text.`
			`"""`
			`# Replace \[1mm] => \vspace{1mm}`
			`md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)`

			`# Add blank lines around \[...\] block formulas`
			`md_text = re.sub(`
			`r"([^\n])(\s)\\\[(.?)\\\]([^\n])",`
			`r"\1\n\n\\[\3\\]\n\n\4",`
			`md_text,`
			`flags=re.DOTALL,`
			`)`
			`md_text = re.sub(`
			`r"^(\s)\\\[(.?)\\\](\s*)(?=\n\|$)",`
			`r"\n\\[\2\\]\n",`
			`md_text,`
			`flags=re.MULTILINE \| re.DOTALL,`
			`)`

			`# Remove arithmatex span wrappers`
			`cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)`

			`# Convert inline formulas: \( \) => $ $`
			`cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)`
			`cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)`

			`# Convert block formulas: \[ \] => $$ $$`
			`cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)`
			`cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)`

			`# Remove spaces between $ and formula content`
			`# Use negative lookahead/lookbehind to avoid matching $$ block formulas`
			`cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)`

			`# Convert matrix environments for better Word rendering`
			`cleaned_md = self._convert_matrix_environments(cleaned_md)`

			`# Fix brace spacing for equation systems`
			`cleaned_md = self._fix_brace_spacing(cleaned_md)`

			`# Convert cases and aligned environments`
			`cleaned_md = self._convert_special_environments(cleaned_md)`

			`return cleaned_md`

			`def _convert_matrix_environments(self, md_text: str) -> str:`
			`"""Convert vmatrix/Vmatrix to left/right delimited forms.`

			`This fixes the vertical line height issues in Word.`
			`"""`
			`# vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|`
			`md_text = re.sub(`
			`r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",`
			`r"\\left\| \\begin{matrix}\1\\end{matrix} \\right\|",`
			`md_text,`
			`flags=re.DOTALL,`
			`)`

			`# Vmatrix -> \left\\| \begin{matrix}...\end{matrix} \right\\|`
			`md_text = re.sub(`
			`r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",`
			`r"\\left\\\| \\begin{matrix}\1\\end{matrix} \\right\\\|",`
			`md_text,`
			`flags=re.DOTALL,`
			`)`

			`return md_text`

			`def _fix_brace_spacing(self, md_text: str) -> str:`
			`"""Fix spacing issues with braces in equation systems.`

			`Removes whitespace and adds negative space for proper alignment in Word/OMML.`
			`"""`
			`# Fix \left\{ spacing`
			`md_text = re.sub(`
			`r"\\left\\\{\s+",`
			`r"\\left\\{\\!",`
			`md_text,`
			`)`

			`# Fix \right\} spacing`
			`md_text = re.sub(`
			`r"\s+\\right\\\}",`
			`r"\\!\\right\\}",`
			`md_text,`
			`)`

			`return md_text`

			`def _convert_special_environments(self, md_text: str) -> str:`
			`"""Convert cases and aligned environments to array format.`

			`These environments have better rendering support in Word/OMML.`
			`"""`

			`def convert_cases(match: re.Match) -> str:`
			`content = match.group(1)`
			`return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."`

			`md_text = re.sub(`
			`r"\\begin\{cases\}(.*?)\\end\{cases\}",`
			`convert_cases,`
			`md_text,`
			`flags=re.DOTALL,`
			`)`

			`def convert_aligned_to_array(match: re.Match) -> str:`
			`content = match.group(1)`
			`# Remove leading & alignment markers (not needed in array{l})`
			`content = re.sub(r"(^\|\\\\)\s*&", r"\1", content)`
			`return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."`

			`md_text = re.sub(`
			`r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",`
			`convert_aligned_to_array,`
			`md_text,`
			`flags=re.DOTALL,`
			`)`

			`def convert_standalone_aligned(match: re.Match) -> str:`
			`content = match.group(1)`
			`content = re.sub(r"(^\|\\\\)\s*&", r"\1", content)`
			`return r"\begin{array}{l}" + content + r"\end{array}"`

			`md_text = re.sub(`
			`r"\\begin\{aligned\}(.*?)\\end\{aligned\}",`
			`convert_standalone_aligned,`
			`md_text,`
			`flags=re.DOTALL,`
			`)`

			`return md_text`

			`def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:`
			`"""Export markdown to docx or pdf file.`

			`Args:`
			`md_text: Markdown text to export.`
			`export_type: Export format, either 'docx' or 'pdf'.`

			`Returns:`
			`bytes of the exported file.`

			`Raises:`
			`ValueError: If export_type is not supported.`
			`RuntimeError: If export fails.`

			`"""`

			`# Preprocess markdown`
			`cleaned_md = self.preprocess_for_export(md_text)`

			`# Create temp file for input`
			`with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:`
			`f_in.write(cleaned_md.encode("utf-8"))`
			`md_path = f_in.name`

			`output_file = md_path + "." + export_type`

			`try:`
			`if export_type == "docx":`
			`self._export_docx(md_path, output_file)`
			`with open(output_file, "rb") as f:`
			`return f.read()`
			`else: # pdf`
			`self._export_pdf(md_path, output_file)`
			`with open(output_file, "rb") as f:`
			`return f.read()`

			`except Exception as e:`
			`# Cleanup on error`
			`self._cleanup_files(md_path, output_file)`
			`raise RuntimeError(f"Export failed: {e}") from e`
			`finally:`
			`# Always cleanup input file`
			`if os.path.exists(md_path):`
			`os.remove(md_path)`

			`def _export_docx(self, input_path: str, output_path: str) -> None:`
			`"""Export to DOCX format using pypandoc."""`
			`extra_args = [`
			`"--highlight-style=pygments",`
			`f"--reference-doc=app/pkg/reference.docx",`
			`]`
			`pypandoc.convert_file(`
			`input_path,`
			`"docx",`
			`format=self.INPUT_FORMAT,`
			`outputfile=output_path,`
			`extra_args=extra_args,`
			`)`

			`def _export_pdf(self, input_path: str, output_path: str) -> None:`
			`"""Export to PDF format using pypandoc with XeLaTeX."""`
			`extra_args = [`
			`"--pdf-engine=xelatex",`
			`"-V",`
			`"mainfont=Noto Sans CJK SC",`
			`"--highlight-style=pygments",`
			`]`
			`pypandoc.convert_file(`
			`input_path,`
			`"pdf",`
			`format=self.INPUT_FORMAT,`
			`outputfile=output_path,`
			`extra_args=extra_args,`
			`)`

			`def _cleanup_files(self, *paths: str) -> None:`
			`"""Remove files if they exist."""`
			`for path in paths:`
			`if os.path.exists(path):`
			`os.remove(path)`

			`def cleanup_export_file(self, file_path: str) -> None:`
			`"""Cleanup exported file after sending response.`

			`Call this after sending the file to the client.`

			`Args:`
			`file_path: Path to the exported file.`
			`"""`
			`if os.path.exists(file_path):`
			`os.remove(file_path)`