Files
doc_processer/app/services/docx_converter.py

336 lines
11 KiB
Python
Raw Normal View History

2025-12-29 17:34:58 +08:00
"""Markdown to DOCX conversion service.
Reference implementation based on https://github.com/YogeLiu/markdown_2_docx
"""
import io
import re
from dataclasses import dataclass
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Inches, Pt
@dataclass
class MarkdownElement:
    """One block-level element produced by the markdown parser.

    Instances are created by ``DocxConverter._parse_markdown`` and consumed
    by ``DocxConverter._add_element_to_doc``, which dispatches on ``type``.
    """

    # Element kind. The parser emits: heading, paragraph, list_item,
    # ordered_list_item, code_block, table, math.
    type: str
    # Text of the element. For tables this is the raw markdown rows joined
    # with newlines; for code and math blocks it is the block body.
    content: str
    # Heading depth (number of leading '#') or list nesting depth
    # (leading indent // 2); 0 for every other element kind.
    level: int = 0
    # Info string that followed the opening code fence; empty otherwise.
    language: str = ""
class DocxConverter:
    """Converts markdown content to DOCX format.

    The converter is line based: ``_parse_markdown`` turns the input into a
    flat list of ``MarkdownElement`` objects and ``_add_element_to_doc``
    renders each of them into a python-docx document.
    """

    def __init__(self):
        """Initialize the converter by compiling the patterns it uses."""
        self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
        self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$")
        self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$")
        # Tolerates trailing whitespace after the fence and info strings
        # such as "c++" or "objective-c", which a bare ``\w*`` rejects.
        self.code_block_pattern = re.compile(r"^```\s*([\w+#.-]*)\s*$")
        self.inline_code_pattern = re.compile(r"`([^`]+)`")
        self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*")
        self.italic_pattern = re.compile(r"\*([^*]+)\*")
        # Not referenced by the methods of this class; kept because it is a
        # public attribute.
        self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL)
        self.inline_math_pattern = re.compile(r"\$([^$]+)\$")

    def convert(self, markdown: str) -> bytes:
        """Convert markdown content to DOCX.

        Args:
            markdown: Markdown content to convert.

        Returns:
            DOCX file as bytes.
        """
        doc = Document()
        for element in self._parse_markdown(markdown):
            self._add_element_to_doc(doc, element)

        # Serialize in memory; getvalue() returns the whole buffer
        # regardless of the current stream position.
        buffer = io.BytesIO()
        doc.save(buffer)
        return buffer.getvalue()

    def _parse_markdown(self, markdown: str) -> "list[MarkdownElement]":
        """Parse markdown into a flat list of block-level elements.

        Recognized constructs, checked in this order for every line: fenced
        code blocks, ``$$`` math blocks, headings, unordered list items,
        ordered list items, pipe tables, and finally plain paragraphs (one
        per non-blank line).

        Args:
            markdown: Markdown content.

        Returns:
            List of parsed elements in document order.
        """
        elements: list[MarkdownElement] = []
        # Normalize CRLF / CR line endings so that '$'-anchored patterns
        # match and no stray '\r' ends up in the extracted text.
        lines = markdown.replace("\r\n", "\n").replace("\r", "\n").split("\n")
        i = 0
        in_code_block = False
        code_content: list[str] = []
        code_language = ""

        while i < len(lines):
            line = lines[i]

            # Code block handling: a fence line toggles the code state.
            code_match = self.code_block_pattern.match(line)
            if code_match:
                if in_code_block:
                    elements.append(
                        MarkdownElement(
                            type="code_block",
                            content="\n".join(code_content),
                            language=code_language,
                        )
                    )
                    code_content = []
                    in_code_block = False
                else:
                    in_code_block = True
                    code_language = code_match.group(1)
                i += 1
                continue

            if in_code_block:
                code_content.append(line)
                i += 1
                continue

            # Math block ($$...$$)
            if line.strip().startswith("$$"):
                math_content = []
                if line.strip() == "$$":
                    # Delimiters on their own lines: collect until the
                    # closing "$$" line (or the end of the input).
                    i += 1
                    while i < len(lines) and lines[i].strip() != "$$":
                        math_content.append(lines[i])
                        i += 1
                else:
                    # Single line $$...$$, or an opening line with content.
                    content = line.strip()[2:]
                    if content.endswith("$$"):
                        math_content.append(content[:-2])
                    else:
                        math_content.append(content)
                        i += 1
                        while i < len(lines):
                            if lines[i].strip().endswith("$$"):
                                math_content.append(lines[i].strip()[:-2])
                                break
                            math_content.append(lines[i])
                            i += 1
                elements.append(
                    MarkdownElement(type="math", content="\n".join(math_content))
                )
                # Step past the line that closed the block.
                i += 1
                continue

            # Heading
            heading_match = self.heading_pattern.match(line)
            if heading_match:
                elements.append(
                    MarkdownElement(
                        type="heading",
                        content=heading_match.group(2),
                        level=len(heading_match.group(1)),
                    )
                )
                i += 1
                continue

            # Unordered list; nesting level is derived from the indent,
            # two spaces per level.
            list_match = self.list_pattern.match(line)
            if list_match:
                elements.append(
                    MarkdownElement(
                        type="list_item",
                        content=list_match.group(2),
                        level=len(list_match.group(1)) // 2,
                    )
                )
                i += 1
                continue

            # Ordered list
            ordered_match = self.ordered_list_pattern.match(line)
            if ordered_match:
                elements.append(
                    MarkdownElement(
                        type="ordered_list_item",
                        content=ordered_match.group(2),
                        level=len(ordered_match.group(1)) // 2,
                    )
                )
                i += 1
                continue

            # Table (simple detection): a row containing '|' followed by a
            # line containing '---'. All following '|' lines belong to it.
            if "|" in line and i + 1 < len(lines) and "---" in lines[i + 1]:
                table_lines = [line]
                i += 1
                while i < len(lines) and "|" in lines[i]:
                    table_lines.append(lines[i])
                    i += 1
                elements.append(
                    MarkdownElement(type="table", content="\n".join(table_lines))
                )
                continue

            # Regular paragraph (blank lines are skipped)
            if line.strip():
                elements.append(MarkdownElement(type="paragraph", content=line))
            i += 1

        # An unterminated fence would otherwise discard everything collected
        # after it; emit what was gathered as a code block instead.
        if in_code_block:
            elements.append(
                MarkdownElement(
                    type="code_block",
                    content="\n".join(code_content),
                    language=code_language,
                )
            )
        return elements

    def _add_element_to_doc(self, doc: "Document", element: "MarkdownElement") -> None:
        """Add a markdown element to the document.

        Elements whose ``type`` is not recognized are ignored.

        Args:
            doc: Word document.
            element: Parsed markdown element.
        """
        if element.type == "heading":
            self._add_heading(doc, element.content, element.level)
        elif element.type == "paragraph":
            self._add_paragraph(doc, element.content)
        elif element.type == "list_item":
            self._add_list_item(doc, element.content, element.level, ordered=False)
        elif element.type == "ordered_list_item":
            self._add_list_item(doc, element.content, element.level, ordered=True)
        elif element.type == "code_block":
            self._add_code_block(doc, element.content)
        elif element.type == "table":
            self._add_table(doc, element.content)
        elif element.type == "math":
            self._add_math(doc, element.content)

    def _add_heading(self, doc: "Document", content: str, level: int) -> None:
        """Add a heading to the document."""
        # Map markdown levels to Word heading styles
        heading_level = min(level, 9)  # Word supports up to Heading 9
        doc.add_heading(content, level=heading_level)

    def _add_paragraph(self, doc: "Document", content: str) -> None:
        """Add a paragraph with inline formatting."""
        para = doc.add_paragraph()
        self._add_formatted_text(para, content)

    def _add_formatted_text(self, para, content: str) -> None:
        """Add text with inline formatting (bold, italic, code, math).

        The text is scanned left to right; at every step the inline pattern
        that matches earliest wins (ties go to the pattern listed first).
        Text between matches is added as plain runs.

        Args:
            para: Paragraph object that receives the runs.
            content: Text that may contain inline markdown markers.
        """
        inline_patterns = (
            (self.bold_pattern, "bold"),
            (self.italic_pattern, "italic"),
            (self.inline_code_pattern, "code"),
            (self.inline_math_pattern, "math"),
        )
        pos = 0
        while pos < len(content):
            # Search from the current position instead of re-slicing the
            # remaining text on every iteration.
            found = [
                (m, kind)
                for pattern, kind in inline_patterns
                if (m := pattern.search(content, pos))
            ]
            if not found:
                para.add_run(content[pos:])
                break

            match, match_type = min(found, key=lambda item: item[0].start())

            # Plain text that precedes the formatted span
            if match.start() > pos:
                para.add_run(content[pos : match.start()])

            # The span itself, without its markers
            run = para.add_run(match.group(1))
            if match_type == "bold":
                run.bold = True
            elif match_type == "italic":
                run.italic = True
            elif match_type == "code":
                run.font.name = "Courier New"
                run.font.size = Pt(10)
            elif match_type == "math":
                # Inline math is shown as italic LaTeX source.
                run.italic = True

            pos = match.end()

    def _add_list_item(
        self, doc: "Document", content: str, level: int, ordered: bool
    ) -> None:
        """Add a list item, indented by 0.25 inch per nesting level."""
        para = doc.add_paragraph(style="List Number" if ordered else "List Bullet")
        para.paragraph_format.left_indent = Inches(0.25 * level)
        self._add_formatted_text(para, content)

    def _add_code_block(self, doc: "Document", content: str) -> None:
        """Add a code block as a shaded, indented monospace paragraph."""
        para = doc.add_paragraph()

        # Attach the shading before the indent is set: w:shd has to precede
        # w:ind inside w:pPr, and the indent assigned through the
        # python-docx API is then placed after it.
        shading = OxmlElement("w:shd")
        shading.set(qn("w:val"), "clear")
        shading.set(qn("w:fill"), "F0F0F0")
        para._p.get_or_add_pPr().append(shading)

        para.paragraph_format.left_indent = Inches(0.5)
        run = para.add_run(content)
        run.font.name = "Courier New"
        run.font.size = Pt(9)

    @staticmethod
    def _split_table_row(line: str) -> "list[str]":
        """Split one markdown table row into stripped cell texts.

        Only the optional outer pipes are removed; empty cells in the middle
        of a row are kept so that later cells stay in their own column.
        """
        row = line.strip()
        if row.startswith("|"):
            row = row[1:]
        if row.endswith("|"):
            row = row[:-1]
        return [cell.strip() for cell in row.split("|")]

    def _add_table(self, doc: "Document", content: str) -> None:
        """Add a table from markdown table format.

        The first line is the header, the second line is assumed to be the
        separator row and is skipped, the remaining lines are data rows.
        Cells beyond the header width are dropped.

        Args:
            doc: Word document.
            content: Raw markdown table rows joined by newlines.
        """
        lines = [row.strip() for row in content.split("\n") if row.strip()]
        if len(lines) < 2:
            return

        header = self._split_table_row(lines[0])
        data_lines = lines[2:]

        table = doc.add_table(rows=1, cols=len(header))
        table.style = "Table Grid"

        # Header row in bold. The run is created explicitly so that the
        # bold flag does not depend on a run already existing in the cell.
        for cell, text in zip(table.rows[0].cells, header):
            cell.paragraphs[0].add_run(text).bold = True

        # Data rows; zip() stops at the shorter side, so surplus cells are
        # ignored and short rows leave the trailing columns empty.
        for line in data_lines:
            row_cells = table.add_row().cells
            for cell, text in zip(row_cells, self._split_table_row(line)):
                cell.text = text

    def _add_math(self, doc: "Document", content: str) -> None:
        """Add a math block.

        For proper OMML rendering, this would need more complex conversion.
        Currently renders as centered italic text with the LaTeX source.
        """
        para = doc.add_paragraph()
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.add_run(content)
        run.italic = True
        run.font.name = "Cambria Math"
        run.font.size = Pt(12)