# doc_processer/app/services/converter.py

"""Markdown conversion and export service using pypandoc."""

import os
import re
import tempfile
from dataclasses import dataclass
from typing import Literal

import pypandoc


@dataclass
class ConvertResult:
    """Result of markdown conversion.

    Carries the two renderings that ``Converter.convert_to_formats`` produces
    from a single markdown input.
    """

    # Pandoc "latex" output for the input markdown (trailing newlines stripped
    # by convert_to_formats); empty string for empty input.
    latex: str
    # Pandoc "html" output generated with the --mathml option (trailing
    # newlines stripped by convert_to_formats); empty string for empty input.
    mathml: str


@dataclass
class ExportResult:
    """Result of markdown export.

    NOTE(review): nothing in this module constructs this class;
    ``Converter.export_to_file`` returns raw bytes instead. Field meanings
    below are inferred from their names - confirm against callers.
    """

    # Presumably the filesystem path of the exported file - TODO confirm.
    file_path: str
    # Presumably the MIME type to send with the file - TODO confirm.
    content_type: str
    # Presumably the filename suggested to the client - TODO confirm.
    download_name: str


# Export formats accepted by Converter.export_to_file.
ExportType = Literal["docx", "pdf"]


class Converter:
    """Service for conversion and export operations."""

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    def __init__(self) -> None:
        """Initialize converter.

        No instance attributes are set here; the methods of this class rely
        only on the class-level ``INPUT_FORMAT`` constant.
        """

    def convert_to_formats(self, md_text: str) -> ConvertResult:
        """Convert markdown to LaTeX and MathML formats.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex and mathml fields.

        Raises:
            ValueError: If md_text is empty.
            RuntimeError: If conversion fails.
        """
        if md_text == "":
            return ConvertResult(latex="", mathml="")

        try:
            # Convert to LaTeX
            latex_output = pypandoc.convert_text(
                md_text,
                "latex",
                format=self.INPUT_FORMAT,
            ).rstrip("\n")

            # Convert to HTML with MathML
            mathml_output = pypandoc.convert_text(
                md_text,
                "html",
                format=self.INPUT_FORMAT,
                extra_args=["--mathml"],
            ).rstrip("\n")

            return ConvertResult(latex=latex_output, mathml=mathml_output)

        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}
        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas
        md_text = re.sub(
            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
            r"\1\n\n\\[\3\\]\n\n\4",
            md_text,
            flags=re.DOTALL,
        )
        md_text = re.sub(
            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
            r"\n\\[\2\\]\n",
            md_text,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Remove arithmatex span wrappers
        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)

        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)

        # Remove spaces between $ and formula content
        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)

        # Fix array environment column specifiers (remove spaces)
        cleaned_md = self._fix_array_column_specifiers(cleaned_md)

        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)

        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)

        # Handle LaTeX \tag{} commands for equation numbering
        cleaned_md = self._convert_tag_commands(cleaned_md)

        return cleaned_md

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = re.sub(
            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
            flags=re.DOTALL,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = re.sub(
            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def _fix_array_column_specifiers(self, md_text: str) -> str:
        """Fix array environment column specifiers by removing spaces.

        Pandoc's OMML converter doesn't accept spaces between column alignment
        specifiers in array environments. This converts patterns like
        {c c c c} to {cccc}.

        Args:
            md_text: Markdown text with LaTeX formulas.

        Returns:
            Markdown text with fixed array column specifiers.
        """

        def remove_spaces_in_specifier(match: re.Match) -> str:
            """Remove spaces from column specifier."""
            specifier = match.group(1)
            # Remove all spaces from the specifier
            specifier_no_spaces = re.sub(r"\s+", "", specifier)
            return f"\\begin{{array}}{{{specifier_no_spaces}}}"

        # Match \begin{array}{...} and remove spaces in the column specifier
        # Pattern: \begin{array}{c c c ...} -> \begin{array}{ccc...}
        md_text = re.sub(
            r"\\begin\{array\}\{([^}]+)\}",
            remove_spaces_in_specifier,
            md_text,
        )

        return md_text

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in Word/OMML.
        """
        # Fix \left\{ spacing
        md_text = re.sub(
            r"\\left\\\{\s+",
            r"\\left\\{\\!",
            md_text,
        )

        # Fix \right\} spacing
        md_text = re.sub(
            r"\s+\\right\\\}",
            r"\\!\\right\\}",
            md_text,
        )

        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.
        """

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

        md_text = re.sub(
            r"\\begin\{cases\}(.*?)\\end\{cases\}",
            convert_cases,
            md_text,
            flags=re.DOTALL,
        )

        def convert_aligned_to_array(match: re.Match) -> str:
            content = match.group(1)
            # Remove leading & alignment markers (not needed in array{l})
            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

        md_text = re.sub(
            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
            convert_aligned_to_array,
            md_text,
            flags=re.DOTALL,
        )

        def convert_standalone_aligned(match: re.Match) -> str:
            content = match.group(1)
            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
            return r"\begin{array}{l}" + content + r"\end{array}"

        md_text = re.sub(
            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
            convert_standalone_aligned,
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def _convert_tag_commands(self, md_text: str) -> str:
        """Convert LaTeX \\tag{} commands to Word-compatible format.

        The \\tag{} command is not supported in Word OMML format, so we convert it to
        use simple spacing (\quad) to push the equation number to the right side.
        The tag remains inside the formula for better compatibility.

        Args:
            md_text: Markdown text containing LaTeX formulas with \\tag{}.

        Returns:
            Markdown text with \\tag{} commands converted to spacing format.
        """

        def convert_tag(match: re.Match) -> str:
            """Convert a single \\tag{} command within a formula."""
            formula_content = match.group(1)
            tag_content = match.group(2)

            # Replace \tag{...} with \quad (...) to push the number to the right
            # Keep it inside the formula for better Word compatibility
            return f"$${formula_content} \\quad ({tag_content})$$"

        # Match display formulas ($$...$$) containing \\tag{...}
        # Pattern: $$...content...\\tag {?...}...$$
        # Allow optional space between \tag and {
        md_text = re.sub(
            r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$",
            convert_tag,
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.

        """

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        # Create temp file for input
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
            f_in.write(cleaned_md.encode("utf-8"))
            md_path = f_in.name

        output_file = md_path + "." + export_type

        try:
            if export_type == "docx":
                self._export_docx(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()
            else:  # pdf
                self._export_pdf(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()

        except Exception as e:
            # Cleanup on error
            self._cleanup_files(md_path, output_file)
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # Always cleanup input file
            if os.path.exists(md_path):
                os.remove(md_path)

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Export to DOCX format using pypandoc."""
        extra_args = [
            "--highlight-style=pygments",
            f"--reference-doc=app/pkg/reference.docx",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Export to PDF format using pypandoc with XeLaTeX."""
        extra_args = [
            "--pdf-engine=xelatex",
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ]
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist."""
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client.

        Args:
            file_path: Path to the exported file.
        """
        if os.path.exists(file_path):
            os.remove(file_path)