init repo
This commit is contained in:
335
app/services/docx_converter.py
Normal file
335
app/services/docx_converter.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""Markdown to DOCX conversion service.
|
||||
|
||||
Reference implementation based on https://github.com/YogeLiu/markdown_2_docx
|
||||
"""
|
||||
|
||||
import io
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
from docx import Document
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.oxml import OxmlElement
|
||||
from docx.oxml.ns import qn
|
||||
from docx.shared import Inches, Pt
|
||||
|
||||
|
||||
@dataclass
class MarkdownElement:
    """One block-level unit produced by the markdown parser.

    Attributes:
        type: Kind of block. The parser in this module emits ``heading``,
            ``paragraph``, ``list_item``, ``ordered_list_item``,
            ``code_block``, ``table`` and ``math``.
        content: Raw text of the block with the block-level markers removed
            (inline markup is left untouched).
        level: Heading depth or list nesting depth; ``0`` when unused.
        language: Info string of a fenced code block; empty when unused.
    """

    type: str
    content: str
    level: int = 0  # only meaningful for headings and list items
    language: str = ""  # only meaningful for code blocks
|
||||
|
||||
|
||||
class DocxConverter:
    """Converts markdown content to DOCX format.

    The conversion is line based: ``_parse_markdown`` turns the input into a
    flat list of ``MarkdownElement`` blocks and each block is then rendered
    into a python-docx document by one of the ``_add_*`` methods.
    """

    def __init__(self):
        """Initialize the converter by compiling the patterns it uses."""
        self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
        self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$")
        self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$")
        # Accept info strings such as "c++", "c#", "objective-c" and ignore
        # trailing whitespace; otherwise the opening fence is missed and the
        # closing fence swallows the remainder of the document.
        self.code_block_pattern = re.compile(r"^```([\w+#.-]*)\s*$")
        self.inline_code_pattern = re.compile(r"`([^`]+)`")
        self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*")
        self.italic_pattern = re.compile(r"\*([^*]+)\*")
        # Not used by the parser below; kept because it is a public attribute.
        self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL)
        self.inline_math_pattern = re.compile(r"\$([^$]+)\$")

    def convert(self, markdown: str) -> bytes:
        """Convert markdown content to DOCX.

        Args:
            markdown: Markdown content to convert.

        Returns:
            DOCX file as bytes.
        """
        doc = Document()
        for element in self._parse_markdown(markdown):
            self._add_element_to_doc(doc, element)

        buffer = io.BytesIO()
        doc.save(buffer)
        # getvalue() returns the whole buffer regardless of the position.
        return buffer.getvalue()

    def _parse_markdown(self, markdown: str) -> list[MarkdownElement]:
        """Parse markdown into elements.

        Args:
            markdown: Markdown content. ``\\r\\n`` and ``\\r`` line endings are
                normalised to ``\\n`` before parsing.

        Returns:
            List of parsed elements in document order. Blank lines produce no
            element; every other non-matching line becomes its own paragraph.
        """
        elements: list[MarkdownElement] = []
        # Normalise line endings first: a stray "\r" defeats the "$"-anchored
        # fence pattern and would leak into heading / paragraph text.
        lines = markdown.replace("\r\n", "\n").replace("\r", "\n").split("\n")
        total = len(lines)
        i = 0
        in_code_block = False
        code_content: list[str] = []
        code_language = ""

        while i < total:
            line = lines[i]

            # Code fences toggle code-block mode.
            code_match = self.code_block_pattern.match(line)
            if code_match:
                if in_code_block:
                    elements.append(
                        MarkdownElement(
                            type="code_block",
                            content="\n".join(code_content),
                            language=code_language,
                        )
                    )
                    code_content = []
                    in_code_block = False
                else:
                    in_code_block = True
                    code_language = code_match.group(1)
                i += 1
                continue

            if in_code_block:
                code_content.append(line)
                i += 1
                continue

            # Math block ($$...$$)
            if line.strip().startswith("$$"):
                math_source, i = self._read_math_block(lines, i)
                elements.append(MarkdownElement(type="math", content=math_source))
                continue

            # Heading
            heading_match = self.heading_pattern.match(line)
            if heading_match:
                elements.append(
                    MarkdownElement(
                        type="heading",
                        content=heading_match.group(2),
                        level=len(heading_match.group(1)),
                    )
                )
                i += 1
                continue

            # Unordered list (two spaces of indentation per nesting level)
            list_match = self.list_pattern.match(line)
            if list_match:
                elements.append(
                    MarkdownElement(
                        type="list_item",
                        content=list_match.group(2),
                        level=len(list_match.group(1)) // 2,
                    )
                )
                i += 1
                continue

            # Ordered list
            ordered_match = self.ordered_list_pattern.match(line)
            if ordered_match:
                elements.append(
                    MarkdownElement(
                        type="ordered_list_item",
                        content=ordered_match.group(2),
                        level=len(ordered_match.group(1)) // 2,
                    )
                )
                i += 1
                continue

            # Table: a row with pipes followed by a real delimiter row. The
            # delimiter must itself contain a pipe, otherwise the header line
            # would be consumed into a one-line "table" and then discarded.
            if (
                "|" in line
                and i + 1 < total
                and self._is_table_separator(lines[i + 1])
            ):
                table_lines = [line]
                i += 1
                while i < total and "|" in lines[i]:
                    table_lines.append(lines[i])
                    i += 1
                elements.append(
                    MarkdownElement(type="table", content="\n".join(table_lines))
                )
                continue

            # Regular paragraph (blank lines are skipped)
            if line.strip():
                elements.append(MarkdownElement(type="paragraph", content=line))
            i += 1

        # An unterminated fence still carries content; keep it instead of
        # silently dropping everything after the opening fence.
        if in_code_block:
            elements.append(
                MarkdownElement(
                    type="code_block",
                    content="\n".join(code_content),
                    language=code_language,
                )
            )

        return elements

    @staticmethod
    def _read_math_block(lines: list[str], start: int) -> tuple[str, int]:
        """Collect a ``$$`` math block that begins at ``lines[start]``.

        Args:
            lines: All lines of the document.
            start: Index of the line whose stripped text starts with ``$$``.

        Returns:
            Tuple of the LaTeX source (lines joined with ``\\n``) and the index
            of the first line after the block.
        """
        opener = lines[start].strip()
        body: list[str] = []
        i = start

        if opener == "$$":
            # Delimiters on their own lines: read until the closing "$$".
            i += 1
            while i < len(lines) and lines[i].strip() != "$$":
                body.append(lines[i])
                i += 1
        else:
            rest = opener[2:]
            if rest.endswith("$$"):
                # Whole block on a single line: $$...$$
                body.append(rest[:-2])
            else:
                # Block opens on this line and closes on a later line that
                # ends with "$$".
                body.append(rest)
                i += 1
                while i < len(lines):
                    candidate = lines[i].strip()
                    if candidate.endswith("$$"):
                        body.append(candidate[:-2])
                        break
                    body.append(lines[i])
                    i += 1

        # Step past the closing line (or past the end when unterminated).
        return "\n".join(body), i + 1

    @staticmethod
    def _is_table_separator(line: str) -> bool:
        """Return True if ``line`` looks like a table delimiter row.

        The row must contain a pipe and a ``---`` run and consist only of
        pipes, hyphens, colons and whitespace.
        """
        stripped = line.strip()
        return (
            "|" in stripped
            and "---" in stripped
            and set(stripped) <= set("|:- \t")
        )

    @staticmethod
    def _split_table_row(row: str) -> list[str]:
        """Split one table row into stripped cell texts.

        Only the outer pipes are removed, so empty cells keep their column
        position. ``\\|`` is treated as a literal pipe inside a cell.
        """
        trimmed = row.strip()
        if trimmed.startswith("|"):
            trimmed = trimmed[1:]
        if trimmed.endswith("|") and not trimmed.endswith("\\|"):
            trimmed = trimmed[:-1]
        return [
            cell.strip().replace("\\|", "|")
            for cell in re.split(r"(?<!\\)\|", trimmed)
        ]

    def _add_element_to_doc(self, doc: Document, element: MarkdownElement) -> None:
        """Add a markdown element to the document.

        Elements with an unrecognised ``type`` are ignored.

        Args:
            doc: Word document.
            element: Parsed markdown element.
        """
        kind = element.type
        if kind == "heading":
            self._add_heading(doc, element.content, element.level)
        elif kind == "paragraph":
            self._add_paragraph(doc, element.content)
        elif kind == "list_item":
            self._add_list_item(doc, element.content, element.level, ordered=False)
        elif kind == "ordered_list_item":
            self._add_list_item(doc, element.content, element.level, ordered=True)
        elif kind == "code_block":
            self._add_code_block(doc, element.content)
        elif kind == "table":
            self._add_table(doc, element.content)
        elif kind == "math":
            self._add_math(doc, element.content)

    def _add_heading(self, doc: Document, content: str, level: int) -> None:
        """Add a heading, rendering inline markup like other text blocks.

        Args:
            doc: Word document.
            content: Heading text, possibly containing inline markup.
            level: Markdown heading level; capped at 9.
        """
        heading_level = min(level, 9)  # Word supports up to Heading 9
        # Create the heading empty and add runs through the shared inline
        # formatter so "**bold**" or "`code`" do not show up as literal markup.
        heading = doc.add_heading(level=heading_level)
        self._add_formatted_text(heading, content)

    def _add_paragraph(self, doc: Document, content: str) -> None:
        """Add a paragraph with inline formatting."""
        self._add_formatted_text(doc.add_paragraph(), content)

    def _add_formatted_text(self, para, content: str) -> None:
        """Add text with inline formatting (bold, italic, code, math).

        The text is scanned left to right; at each step the earliest match of
        any inline pattern is emitted as a formatted run, preceded by the plain
        text before it.

        Args:
            para: Paragraph-like object providing ``add_run``.
            content: Text that may contain inline markup.
        """
        # Order matters: on equal start positions the first entry wins.
        patterns = (
            (self.bold_pattern, "bold"),
            (self.italic_pattern, "italic"),
            (self.inline_code_pattern, "code"),
            (self.inline_math_pattern, "math"),
        )
        remaining = content

        while remaining:
            candidates = [
                (found, kind)
                for pattern, kind in patterns
                if (found := pattern.search(remaining))
            ]
            if not candidates:
                para.add_run(remaining)
                break

            match, kind = min(candidates, key=lambda item: item[0].start())

            if match.start() > 0:
                para.add_run(remaining[: match.start()])

            run = para.add_run(match.group(1))
            if kind == "bold":
                run.bold = True
            elif kind == "italic":
                run.italic = True
            elif kind == "code":
                run.font.name = "Courier New"
                run.font.size = Pt(10)
            elif kind == "math":
                # Inline math is shown as italic LaTeX source.
                run.italic = True

            remaining = remaining[match.end() :]

    def _add_list_item(
        self, doc: Document, content: str, level: int, ordered: bool
    ) -> None:
        """Add a list item.

        Args:
            doc: Word document.
            content: Item text, possibly containing inline markup.
            level: Nesting depth; each level adds 0.25 inch of left indent.
            ordered: Use the "List Number" style instead of "List Bullet".
        """
        para = doc.add_paragraph(style="List Number" if ordered else "List Bullet")
        para.paragraph_format.left_indent = Inches(0.25 * level)
        self._add_formatted_text(para, content)

    def _add_code_block(self, doc: Document, content: str) -> None:
        """Add a code block as a shaded, indented monospace paragraph."""
        para = doc.add_paragraph()

        # Add the shading while w:pPr is still empty and set the indent
        # afterwards: w:shd has to precede w:ind among the children of w:pPr,
        # and python-docx inserts w:ind at its schema position.
        shading = OxmlElement("w:shd")
        shading.set(qn("w:val"), "clear")
        shading.set(qn("w:fill"), "F0F0F0")
        para._p.get_or_add_pPr().append(shading)

        para.paragraph_format.left_indent = Inches(0.5)

        run = para.add_run(content)
        run.font.name = "Courier New"
        run.font.size = Pt(9)

    def _add_table(self, doc: Document, content: str) -> None:
        """Add a table from markdown table format.

        The first line is the header, the second the delimiter row, and the
        remaining lines are data rows. Cells beyond the header width are
        dropped; missing cells are left blank.

        Args:
            doc: Word document.
            content: Table lines joined with ``\\n``.
        """
        rows = [row.strip() for row in content.split("\n") if row.strip()]
        if len(rows) < 2:
            return

        header = self._split_table_row(rows[0])

        table = doc.add_table(rows=1, cols=len(header))
        table.style = "Table Grid"

        for cell, text in zip(table.rows[0].cells, header):
            para = cell.paragraphs[0]
            self._add_formatted_text(para, text)
            # Bold every run so header cells with inline markup stay bold too.
            for run in para.runs:
                run.bold = True

        # rows[1] is the delimiter row; everything after it is data.
        for row in rows[2:]:
            cells = table.add_row().cells
            for cell, text in zip(cells, self._split_table_row(row)):
                self._add_formatted_text(cell.paragraphs[0], text)

    def _add_math(self, doc: Document, content: str) -> None:
        """Add a math block.

        For proper OMML rendering, this would need more complex conversion.
        Currently renders as centered italic text with the LaTeX source.
        """
        para = doc.add_paragraph()
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        run = para.add_run(content)
        run.italic = True
        run.font.name = "Cambria Math"
        run.font.size = Pt(12)
|
||||
|
||||
Reference in New Issue
Block a user