"""Markdown to DOCX conversion service. Reference implementation based on https://github.com/YogeLiu/markdown_2_docx """ import io import re from dataclasses import dataclass from docx import Document from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.oxml import OxmlElement from docx.oxml.ns import qn from docx.shared import Inches, Pt @dataclass class MarkdownElement: """Parsed markdown element.""" type: str # heading, paragraph, list_item, code_block, table, math content: str level: int = 0 # For headings and lists language: str = "" # For code blocks class DocxConverter: """Converts markdown content to DOCX format.""" def __init__(self): """Initialize the converter.""" self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$") self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$") self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$") self.code_block_pattern = re.compile(r"^```(\w*)$") self.inline_code_pattern = re.compile(r"`([^`]+)`") self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*") self.italic_pattern = re.compile(r"\*([^*]+)\*") self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL) self.inline_math_pattern = re.compile(r"\$([^$]+)\$") def convert(self, markdown: str) -> bytes: """Convert markdown content to DOCX. Args: markdown: Markdown content to convert. Returns: DOCX file as bytes. """ doc = Document() elements = self._parse_markdown(markdown) for element in elements: self._add_element_to_doc(doc, element) # Save to bytes buffer = io.BytesIO() doc.save(buffer) buffer.seek(0) return buffer.getvalue() def _parse_markdown(self, markdown: str) -> list[MarkdownElement]: """Parse markdown into elements. Args: markdown: Markdown content. Returns: List of parsed elements. """ elements: list[MarkdownElement] = [] lines = markdown.split("\n") i = 0 in_code_block = False code_content = [] code_language = "" while i < len(lines): line = lines[i] # Code block handling code_match = self.code_block_pattern.match(line) if code_match: if in_code_block: elements.append( MarkdownElement( type="code_block", content="\n".join(code_content), language=code_language, ) ) code_content = [] in_code_block = False else: in_code_block = True code_language = code_match.group(1) i += 1 continue if in_code_block: code_content.append(line) i += 1 continue # Math block ($$...$$) if line.strip().startswith("$$"): math_content = [] if line.strip() == "$$": i += 1 while i < len(lines) and lines[i].strip() != "$$": math_content.append(lines[i]) i += 1 else: # Single line $$...$$ or start content = line.strip()[2:] if content.endswith("$$"): math_content.append(content[:-2]) else: math_content.append(content) i += 1 while i < len(lines): if lines[i].strip().endswith("$$"): math_content.append(lines[i].strip()[:-2]) break math_content.append(lines[i]) i += 1 elements.append( MarkdownElement(type="math", content="\n".join(math_content)) ) i += 1 continue # Heading heading_match = self.heading_pattern.match(line) if heading_match: level = len(heading_match.group(1)) content = heading_match.group(2) elements.append( MarkdownElement(type="heading", content=content, level=level) ) i += 1 continue # Unordered list list_match = self.list_pattern.match(line) if list_match: indent = len(list_match.group(1)) content = list_match.group(2) elements.append( MarkdownElement(type="list_item", content=content, level=indent // 2) ) i += 1 continue # Ordered list ordered_match = self.ordered_list_pattern.match(line) if ordered_match: indent = len(ordered_match.group(1)) content = ordered_match.group(2) elements.append( MarkdownElement( type="ordered_list_item", content=content, level=indent // 2 ) ) i += 1 continue # Table (simple detection) if "|" in line and i + 1 < len(lines) and "---" in lines[i + 1]: table_lines = [line] i += 1 while i < len(lines) and "|" in lines[i]: table_lines.append(lines[i]) i += 1 elements.append( MarkdownElement(type="table", content="\n".join(table_lines)) ) continue # Regular paragraph if line.strip(): elements.append(MarkdownElement(type="paragraph", content=line)) i += 1 return elements def _add_element_to_doc(self, doc: Document, element: MarkdownElement) -> None: """Add a markdown element to the document. Args: doc: Word document. element: Parsed markdown element. """ if element.type == "heading": self._add_heading(doc, element.content, element.level) elif element.type == "paragraph": self._add_paragraph(doc, element.content) elif element.type == "list_item": self._add_list_item(doc, element.content, element.level, ordered=False) elif element.type == "ordered_list_item": self._add_list_item(doc, element.content, element.level, ordered=True) elif element.type == "code_block": self._add_code_block(doc, element.content) elif element.type == "table": self._add_table(doc, element.content) elif element.type == "math": self._add_math(doc, element.content) def _add_heading(self, doc: Document, content: str, level: int) -> None: """Add a heading to the document.""" # Map markdown levels to Word heading styles heading_level = min(level, 9) # Word supports up to Heading 9 doc.add_heading(content, level=heading_level) def _add_paragraph(self, doc: Document, content: str) -> None: """Add a paragraph with inline formatting.""" para = doc.add_paragraph() self._add_formatted_text(para, content) def _add_formatted_text(self, para, content: str) -> None: """Add text with inline formatting (bold, italic, code).""" # Simple approach: process inline patterns remaining = content while remaining: # Find next formatting marker bold_match = self.bold_pattern.search(remaining) italic_match = self.italic_pattern.search(remaining) code_match = self.inline_code_pattern.search(remaining) math_match = self.inline_math_pattern.search(remaining) matches = [ (bold_match, "bold"), (italic_match, "italic"), (code_match, "code"), (math_match, "math"), ] matches = [(m, t) for m, t in matches if m] if not matches: para.add_run(remaining) break # Find earliest match earliest = min(matches, key=lambda x: x[0].start()) match, match_type = earliest # Add text before match if match.start() > 0: para.add_run(remaining[: match.start()]) # Add formatted text run = para.add_run(match.group(1)) if match_type == "bold": run.bold = True elif match_type == "italic": run.italic = True elif match_type == "code": run.font.name = "Courier New" run.font.size = Pt(10) elif match_type == "math": run.italic = True remaining = remaining[match.end() :] def _add_list_item( self, doc: Document, content: str, level: int, ordered: bool ) -> None: """Add a list item.""" para = doc.add_paragraph(style="List Bullet" if not ordered else "List Number") para.paragraph_format.left_indent = Inches(0.25 * level) self._add_formatted_text(para, content) def _add_code_block(self, doc: Document, content: str) -> None: """Add a code block.""" para = doc.add_paragraph() para.paragraph_format.left_indent = Inches(0.5) run = para.add_run(content) run.font.name = "Courier New" run.font.size = Pt(9) # Add shading shading = OxmlElement("w:shd") shading.set(qn("w:val"), "clear") shading.set(qn("w:fill"), "F0F0F0") para._p.get_or_add_pPr().append(shading) def _add_table(self, doc: Document, content: str) -> None: """Add a table from markdown table format.""" lines = [l.strip() for l in content.split("\n") if l.strip()] if len(lines) < 2: return # Parse header header = [c.strip() for c in lines[0].split("|") if c.strip()] # Skip separator line data_lines = lines[2:] if len(lines) > 2 else [] # Create table table = doc.add_table(rows=1, cols=len(header)) table.style = "Table Grid" # Add header header_cells = table.rows[0].cells for i, text in enumerate(header): header_cells[i].text = text header_cells[i].paragraphs[0].runs[0].bold = True # Add data rows for line in data_lines: cells = [c.strip() for c in line.split("|") if c.strip()] row_cells = table.add_row().cells for i, text in enumerate(cells): if i < len(row_cells): row_cells[i].text = text def _add_math(self, doc: Document, content: str) -> None: """Add a math block. For proper OMML rendering, this would need more complex conversion. Currently renders as italic text with the LaTeX source. """ para = doc.add_paragraph() para.alignment = WD_ALIGN_PARAGRAPH.CENTER run = para.add_run(content) run.italic = True run.font.name = "Cambria Math" run.font.size = Pt(12)