# doc_processer/app/services/docx_converter.py

"""Markdown to DOCX conversion service.

Reference implementation based on https://github.com/YogeLiu/markdown_2_docx
"""

import io
import re
from dataclasses import dataclass

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Inches, Pt


@dataclass
class MarkdownElement:
    """Parsed markdown element.

    One block-level unit of a markdown document. Instances are created by
    ``DocxConverter._parse_markdown`` and dispatched on ``type`` by
    ``DocxConverter._add_element_to_doc``.
    """

    # Element kind. Values produced in this file: heading, paragraph,
    # list_item, ordered_list_item, code_block, table, math
    type: str
    # Text payload: heading/list/paragraph text, the code or LaTeX source, or
    # (for tables) the raw markdown table lines joined with newlines
    content: str
    level: int = 0  # For headings (number of '#') and lists (indent // 2)
    language: str = ""  # For code blocks (tag following the opening fence)


class DocxConverter:
    """Converts markdown content to DOCX format.

    Conversion happens in two steps: ``_parse_markdown`` splits the input into
    block-level ``MarkdownElement`` objects, then the ``_add_*`` methods render
    each element into a python-docx document.

    Annotations that name ``Document`` or ``MarkdownElement`` are written as
    strings; they are hints for readers and are not evaluated when the class
    is defined.
    """

    def __init__(self):
        """Compile the regular expressions used for parsing."""
        self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
        self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$")
        self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$")
        # Fence line. The language tag may contain characters such as "+", "#"
        # or "-" (c++, c#, objective-c) and trailing whitespace is tolerated;
        # a fence that is not recognised would desynchronise every later fence.
        self.code_block_pattern = re.compile(r"^```([\w+#.-]*)\s*$")
        self.inline_code_pattern = re.compile(r"`([^`]+)`")
        # Emphasis and inline-math delimiters must hug their content (no
        # whitespace directly inside them) so that plain text such as
        # "2 * 3 * 4" or "$5 and $10" is not mistaken for markup.
        self.bold_pattern = re.compile(r"\*\*(?!\s)([^*]+?)(?<!\s)\*\*")
        self.italic_pattern = re.compile(r"\*(?!\s)([^*]+?)(?<!\s)\*")
        self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL)
        self.inline_math_pattern = re.compile(
            r"\$(?!\s)([^$]+?)(?<!\s)\$(?!\d)"
        )
        # Table delimiter row, e.g. "| --- | :---: |" or "--- | ---".
        self.table_separator_pattern = re.compile(
            r"^\s*\|?\s*:?-+:?\s*(\|\s*:?-+:?\s*)*\|?\s*$"
        )

    def convert(self, markdown: str) -> bytes:
        """Convert markdown content to DOCX.

        Args:
            markdown: Markdown content to convert.

        Returns:
            DOCX file as bytes.
        """
        doc = Document()
        for element in self._parse_markdown(markdown):
            self._add_element_to_doc(doc, element)

        # Serialise in memory; getvalue() returns the whole buffer regardless
        # of the current stream position.
        buffer = io.BytesIO()
        doc.save(buffer)
        return buffer.getvalue()

    def _parse_markdown(self, markdown: str) -> "list[MarkdownElement]":
        """Parse markdown into elements.

        Recognised blocks, checked in this order for every line: fenced code,
        ``$$`` math, heading, unordered list item, ordered list item, table,
        and finally a paragraph for any other non-blank line.

        Args:
            markdown: Markdown content.

        Returns:
            List of parsed elements.
        """
        elements: "list[MarkdownElement]" = []
        # Normalise CRLF / CR line endings so a stray "\r" neither defeats the
        # "$"-anchored patterns nor leaks into the rendered text.
        lines = markdown.replace("\r\n", "\n").replace("\r", "\n").split("\n")
        i = 0
        in_code_block = False
        code_content: "list[str]" = []
        code_language = ""

        while i < len(lines):
            line = lines[i]

            # Code block handling: a fence line toggles the code-block state.
            code_match = self.code_block_pattern.match(line)
            if code_match:
                if in_code_block:
                    elements.append(
                        MarkdownElement(
                            type="code_block",
                            content="\n".join(code_content),
                            language=code_language,
                        )
                    )
                    code_content = []
                    in_code_block = False
                else:
                    in_code_block = True
                    code_language = code_match.group(1)
                i += 1
                continue

            if in_code_block:
                code_content.append(line)
                i += 1
                continue

            # Math block ($$...$$)
            stripped = line.strip()
            if stripped.startswith("$$"):
                math_content = []
                if stripped == "$$":
                    # Delimiters on their own lines: collect up to the next "$$".
                    i += 1
                    while i < len(lines) and lines[i].strip() != "$$":
                        math_content.append(lines[i])
                        i += 1
                else:
                    # Single line $$...$$ or start of a multi-line block
                    content = stripped[2:]
                    if content.endswith("$$"):
                        math_content.append(content[:-2])
                    else:
                        math_content.append(content)
                        i += 1
                        while i < len(lines):
                            tail = lines[i].strip()
                            if tail.endswith("$$"):
                                # A bare closing "$$" carries no formula text;
                                # skip it so no empty trailing line is added.
                                if tail[:-2]:
                                    math_content.append(tail[:-2])
                                break
                            math_content.append(lines[i])
                            i += 1

                elements.append(
                    MarkdownElement(
                        type="math", content="\n".join(math_content).strip()
                    )
                )
                i += 1  # step past the closing delimiter line
                continue

            # Heading
            heading_match = self.heading_pattern.match(line)
            if heading_match:
                elements.append(
                    MarkdownElement(
                        type="heading",
                        content=heading_match.group(2),
                        level=len(heading_match.group(1)),
                    )
                )
                i += 1
                continue

            # Unordered list: two characters of indentation per nesting level
            list_match = self.list_pattern.match(line)
            if list_match:
                elements.append(
                    MarkdownElement(
                        type="list_item",
                        content=list_match.group(2),
                        level=len(list_match.group(1)) // 2,
                    )
                )
                i += 1
                continue

            # Ordered list
            ordered_match = self.ordered_list_pattern.match(line)
            if ordered_match:
                elements.append(
                    MarkdownElement(
                        type="ordered_list_item",
                        content=ordered_match.group(2),
                        level=len(ordered_match.group(1)) // 2,
                    )
                )
                i += 1
                continue

            # Table: a row containing "|" followed by a real delimiter row.
            if (
                "|" in line
                and i + 1 < len(lines)
                and self._is_table_separator(lines[i + 1])
            ):
                table_lines = [line, lines[i + 1]]
                i += 2
                while i < len(lines) and "|" in lines[i]:
                    table_lines.append(lines[i])
                    i += 1
                elements.append(
                    MarkdownElement(type="table", content="\n".join(table_lines))
                )
                continue

            # Regular paragraph (blank lines are skipped)
            if stripped:
                elements.append(MarkdownElement(type="paragraph", content=line))

            i += 1

        if in_code_block:
            # Unterminated fence: keep the collected code instead of silently
            # dropping it, minus blank lines trailing at the end of the input.
            while code_content and not code_content[-1].strip():
                code_content.pop()
            elements.append(
                MarkdownElement(
                    type="code_block",
                    content="\n".join(code_content),
                    language=code_language,
                )
            )

        return elements

    def _is_table_separator(self, line: str) -> bool:
        """Return True if *line* is a markdown table delimiter row.

        The row must contain at least one ``|`` so that a bare ``---`` rule
        below a line of text is not mistaken for a table.
        """
        return "|" in line and bool(self.table_separator_pattern.match(line))

    @staticmethod
    def _split_table_row(line: str) -> "list[str]":
        """Split a markdown table row into stripped cell texts.

        Only the optional outer pipes are removed; empty cells are kept so
        the remaining cells stay in their columns.
        """
        row = line.strip()
        if row.startswith("|"):
            row = row[1:]
        if row.endswith("|"):
            row = row[:-1]
        return [cell.strip() for cell in row.split("|")]

    def _add_element_to_doc(self, doc: "Document", element: "MarkdownElement") -> None:
        """Add a markdown element to the document.

        Elements with an unrecognised ``type`` are ignored.

        Args:
            doc: Word document.
            element: Parsed markdown element.
        """
        if element.type == "heading":
            self._add_heading(doc, element.content, element.level)
        elif element.type == "paragraph":
            self._add_paragraph(doc, element.content)
        elif element.type == "list_item":
            self._add_list_item(doc, element.content, element.level, ordered=False)
        elif element.type == "ordered_list_item":
            self._add_list_item(doc, element.content, element.level, ordered=True)
        elif element.type == "code_block":
            self._add_code_block(doc, element.content)
        elif element.type == "table":
            self._add_table(doc, element.content)
        elif element.type == "math":
            self._add_math(doc, element.content)

    def _add_heading(self, doc: "Document", content: str, level: int) -> None:
        """Add a heading to the document, rendering inline formatting."""
        # Map markdown levels to Word heading styles
        heading_level = max(0, min(level, 9))  # Word supports up to Heading 9
        heading = doc.add_heading("", level=heading_level)
        # Render the text through the inline formatter so markers such as
        # "**" are applied as formatting rather than printed literally.
        self._add_formatted_text(heading, content)

    def _add_paragraph(self, doc: "Document", content: str) -> None:
        """Add a paragraph with inline formatting."""
        para = doc.add_paragraph()
        self._add_formatted_text(para, content)

    def _tokenize_inline(self, content: str) -> "list[tuple[str, str]]":
        """Split text into ``(kind, text)`` tokens for inline rendering.

        ``kind`` is one of ``"text"``, ``"bold"``, ``"italic"``, ``"code"`` or
        ``"math"``. At each step the markup starting earliest wins; on a tie
        the pattern listed first wins.

        Args:
            content: One line of markdown text.

        Returns:
            Tokens in document order; an empty list for empty input.
        """
        patterns = (
            ("bold", self.bold_pattern),
            ("italic", self.italic_pattern),
            ("code", self.inline_code_pattern),
            # "$$...$$" is listed before "$...$" so inline display math is
            # taken as a whole instead of leaving stray "$" characters.
            ("math", self.math_block_pattern),
            ("math", self.inline_math_pattern),
        )
        tokens: "list[tuple[str, str]]" = []
        pos = 0
        while pos < len(content):
            best = None
            best_kind = ""
            for kind, pattern in patterns:
                found = pattern.search(content, pos)
                if found and (best is None or found.start() < best.start()):
                    best, best_kind = found, kind

            if best is None:
                tokens.append(("text", content[pos:]))
                break

            if best.start() > pos:
                tokens.append(("text", content[pos : best.start()]))
            tokens.append((best_kind, best.group(1)))
            pos = best.end()
        return tokens

    def _add_formatted_text(self, para, content: str) -> None:
        """Add text with inline formatting (bold, italic, code, math).

        Args:
            para: Paragraph-like object exposing ``add_run``.
            content: Markdown text of a single line.
        """
        for kind, text in self._tokenize_inline(content):
            run = para.add_run(text)
            if kind == "bold":
                run.bold = True
            elif kind in ("italic", "math"):
                # Inline math is shown as italic LaTeX source.
                run.italic = True
            elif kind == "code":
                run.font.name = "Courier New"
                run.font.size = Pt(10)

    def _add_list_item(
        self, doc: "Document", content: str, level: int, ordered: bool
    ) -> None:
        """Add a list item.

        Args:
            doc: Word document.
            content: Item text (inline formatting is rendered).
            level: Nesting level, 0 for top-level items.
            ordered: Use the "List Number" style instead of "List Bullet".
        """
        para = doc.add_paragraph(style="List Number" if ordered else "List Bullet")
        if level > 0:
            # Top-level items keep whatever indentation the list style defines;
            # forcing 0 would override it. Nested items step in by 0.25" per
            # level. NOTE(review): assumes the style's own indent is 0.25" —
            # confirm against the template in use.
            para.paragraph_format.left_indent = Inches(0.25 * (level + 1))
        self._add_formatted_text(para, content)

    def _add_code_block(self, doc: "Document", content: str) -> None:
        """Add a code block as a shaded, indented monospace paragraph."""
        para = doc.add_paragraph()

        # Add shading first: within w:pPr the w:shd element has to precede
        # w:ind, so it is appended before the indent is set.
        shading = OxmlElement("w:shd")
        shading.set(qn("w:val"), "clear")
        shading.set(qn("w:fill"), "F0F0F0")
        para._p.get_or_add_pPr().append(shading)

        para.paragraph_format.left_indent = Inches(0.5)

        run = para.add_run(content)
        run.font.name = "Courier New"
        run.font.size = Pt(9)

    def _add_table(self, doc: "Document", content: str) -> None:
        """Add a table from markdown table format.

        Args:
            doc: Word document.
            content: Header row, delimiter row and data rows, one per line.
                Content with fewer than two non-blank lines is ignored.
        """
        rows = [row.strip() for row in content.split("\n") if row.strip()]
        if len(rows) < 2:
            return

        # Parse header
        header = self._split_table_row(rows[0])

        # Create table
        table = doc.add_table(rows=1, cols=len(header))
        table.style = "Table Grid"

        # Add header
        for cell, text in zip(table.rows[0].cells, header):
            cell.text = text
            # Iterate instead of indexing runs[0] so a cell that ends up
            # without a run cannot raise IndexError.
            for run in cell.paragraphs[0].runs:
                run.bold = True

        # Add data rows (rows[1] is the delimiter row); zip() drops cells
        # beyond the header width.
        for line in rows[2:]:
            row_cells = table.add_row().cells
            for cell, text in zip(row_cells, self._split_table_row(line)):
                cell.text = text

    def _add_math(self, doc: "Document", content: str) -> None:
        """Add a math block.

        For proper OMML rendering, this would need more complex conversion.
        Currently renders as italic text with the LaTeX source.
        """
        para = doc.add_paragraph()
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        run = para.add_run(content)
        run.italic = True
        run.font.name = "Cambria Math"
        run.font.size = Pt(12)