# doc_processer/app/services/converter.py

"""Markdown conversion and export service using pypandoc."""

import os
import re
import tempfile
from dataclasses import dataclass
from typing import Literal

import pypandoc


@dataclass
class ConvertResult:
    """Result of markdown conversion.

    Carries the two renderings returned by ``Converter.convert_to_formats``.
    Both fields are empty strings when the input markdown is empty.
    """

    # Pandoc "latex" output for the input, with trailing newlines stripped.
    latex: str
    # Pandoc "html" output produced with ``--mathml``, trailing newlines stripped.
    mathml: str


@dataclass
class ExportResult:
    """Result of markdown export.

    NOTE(review): nothing in the visible part of this module constructs this
    type (``Converter.export_to_file`` returns raw bytes) — confirm whether
    other callers still rely on it.
    """

    # Location of the exported file on disk — presumably the path later handed
    # to ``Converter.cleanup_export_file``; verify against callers.
    file_path: str
    # Presumably the MIME type to send with the file — TODO confirm.
    content_type: str
    # Presumably the file name suggested to the downloading client — TODO confirm.
    download_name: str


# Export formats accepted by ``Converter.export_to_file``.
ExportType = Literal["docx", "pdf"]


class Converter:
    """Service for conversion and export operations."""

    # Pandoc input format with LaTeX math extensions: markdown plus the
    # raw_tex, tex_math_dollars and tex_math_double_backslash extensions.
    # Every pypandoc call in this class passes it as ``format=``.
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    def __init__(self) -> None:
        """Initialize converter.

        The converter keeps no per-instance state; this constructor does
        nothing beyond creating the object.
        """

    def convert_to_formats(self, md_text: str) -> ConvertResult:
        """Convert markdown to LaTeX and MathML formats.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex and mathml fields.

        Raises:
            ValueError: If md_text is empty.
            RuntimeError: If conversion fails.
        """
        if md_text == "":
            return ConvertResult(latex="", mathml="")

        try:
            # Convert to LaTeX
            latex_output = pypandoc.convert_text(
                md_text,
                "latex",
                format=self.INPUT_FORMAT,
            ).rstrip("\n")

            # Convert to HTML with MathML
            mathml_output = pypandoc.convert_text(
                md_text,
                "html",
                format=self.INPUT_FORMAT,
                extra_args=["--mathml"],
            ).rstrip("\n")

            return ConvertResult(latex=latex_output, mathml=mathml_output)

        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm}
        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas
        md_text = re.sub(
            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
            r"\1\n\n\\[\3\\]\n\n\4",
            md_text,
            flags=re.DOTALL,
        )
        md_text = re.sub(
            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
            r"\n\\[\2\\]\n",
            md_text,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Remove arithmatex span wrappers
        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)

        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)

        # Remove spaces between $ and formula content
        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)

        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)

        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)

        return cleaned_md

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = re.sub(
            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
            flags=re.DOTALL,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = re.sub(
            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Removes whitespace and adds negative space for proper alignment in Word/OMML.
        """
        # Fix \left\{ spacing
        md_text = re.sub(
            r"\\left\\\{\s+",
            r"\\left\\{\\!",
            md_text,
        )

        # Fix \right\} spacing
        md_text = re.sub(
            r"\s+\\right\\\}",
            r"\\!\\right\\}",
            md_text,
        )

        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.
        """

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

        md_text = re.sub(
            r"\\begin\{cases\}(.*?)\\end\{cases\}",
            convert_cases,
            md_text,
            flags=re.DOTALL,
        )

        def convert_aligned_to_array(match: re.Match) -> str:
            content = match.group(1)
            # Remove leading & alignment markers (not needed in array{l})
            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

        md_text = re.sub(
            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
            convert_aligned_to_array,
            md_text,
            flags=re.DOTALL,
        )

        def convert_standalone_aligned(match: re.Match) -> str:
            content = match.group(1)
            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
            return r"\begin{array}{l}" + content + r"\end{array}"

        md_text = re.sub(
            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
            convert_standalone_aligned,
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.

        """

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        # Create temp file for input
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
            f_in.write(cleaned_md.encode("utf-8"))
            md_path = f_in.name

        output_file = md_path + "." + export_type

        try:
            if export_type == "docx":
                self._export_docx(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()
            else:  # pdf
                self._export_pdf(md_path, output_file)
                with open(output_file, "rb") as f:
                    return f.read()

        except Exception as e:
            # Cleanup on error
            self._cleanup_files(md_path, output_file)
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # Always cleanup input file
            if os.path.exists(md_path):
                os.remove(md_path)

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Export to DOCX format using pypandoc."""
        extra_args = [
            "--highlight-style=pygments",
            f"--reference-doc=app/pkg/reference.docx",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Export to PDF format using pypandoc with XeLaTeX."""
        extra_args = [
            "--pdf-engine=xelatex",
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ]
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist."""
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client.

        Args:
            file_path: Path to the exported file.
        """
        if os.path.exists(file_path):
            os.remove(file_path)