fix: refactor logic
This commit is contained in:
55
.dockerignore
Normal file
55
.dockerignore
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
# Git
|
||||||
|
.git
|
||||||
|
.gitignore
|
||||||
|
|
||||||
|
# Python
|
||||||
|
.venv/
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
.Python
|
||||||
|
*.egg-info/
|
||||||
|
.eggs/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
.pytest_cache/
|
||||||
|
.coverage
|
||||||
|
htmlcov/
|
||||||
|
test/
|
||||||
|
tests/
|
||||||
|
|
||||||
|
# Linting & IDE
|
||||||
|
.ruff_cache/
|
||||||
|
.mypy_cache/
|
||||||
|
.cursor/
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# Environment
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
!.env.example
|
||||||
|
|
||||||
|
# Documentation (not needed in container)
|
||||||
|
*.md
|
||||||
|
!README.md
|
||||||
|
openspec/
|
||||||
|
|
||||||
|
# Models (mounted at runtime, not built into image)
|
||||||
|
app/model/doclayout/*.pdiparams
|
||||||
|
app/model/DocLayout/
|
||||||
|
app/model/PP-DocLayout/
|
||||||
|
|
||||||
|
# Misc
|
||||||
|
*.log
|
||||||
|
*.tmp
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
test/
|
||||||
|
|
||||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -71,3 +71,5 @@ htmlcov/
|
|||||||
uv.lock
|
uv.lock
|
||||||
|
|
||||||
model/
|
model/
|
||||||
|
|
||||||
|
test/
|
||||||
81
Dockerfile
81
Dockerfile
@@ -1,54 +1,73 @@
|
|||||||
# DocProcesser Dockerfile
|
# DocProcesser Dockerfile
|
||||||
# Optimized for RTX 5080 GPU deployment
|
# Optimized for RTX 5080 GPU deployment
|
||||||
|
|
||||||
# Use NVIDIA CUDA base image with Python 3.11
|
# Use NVIDIA CUDA runtime base image (Python 3.10 is installed below from the deadsnakes PPA)
|
||||||
FROM nvidia/cuda:12.8.0-runtime-ubuntu24.04
|
FROM nvidia/cuda:12.8.0-runtime-ubuntu24.04
|
||||||
|
|
||||||
# Set environment variables
|
# Set environment variables
|
||||||
ENV PYTHONUNBUFFERED=1 \
|
ENV PYTHONUNBUFFERED=1 \
|
||||||
PYTHONDONTWRITEBYTECODE=1 \
|
PYTHONDONTWRITEBYTECODE=1 \
|
||||||
PIP_NO_CACHE_DIR=1 \
|
PIP_NO_CACHE_DIR=1 \
|
||||||
PIP_DISABLE_PIP_VERSION_CHECK=1
|
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
||||||
|
# Model cache directories - mount these at runtime
|
||||||
|
MODELSCOPE_CACHE=/root/.cache/modelscope \
|
||||||
|
HF_HOME=/root/.cache/huggingface \
|
||||||
|
# Application config (override defaults for container)
|
||||||
|
# Use 127.0.0.1 for --network host mode, or override with -e for bridge mode
|
||||||
|
PP_DOCLAYOUT_MODEL_DIR=/root/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2 \
|
||||||
|
PADDLEOCR_VL_URL=http://127.0.0.1:8000/v1
|
||||||
|
|
||||||
# Set working directory
|
# Set working directory
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Install system dependencies
|
# Install system dependencies and Python 3.10 from deadsnakes PPA
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
python3.11 \
|
software-properties-common \
|
||||||
python3.11-venv \
|
&& add-apt-repository -y ppa:deadsnakes/ppa \
|
||||||
python3.11-dev \
|
&& apt-get update && apt-get install -y --no-install-recommends \
|
||||||
python3-pip \
|
python3.10 \
|
||||||
libgl1-mesa-glx \
|
python3.10-venv \
|
||||||
|
python3.10-dev \
|
||||||
|
python3.10-distutils \
|
||||||
|
libgl1 \
|
||||||
libglib2.0-0 \
|
libglib2.0-0 \
|
||||||
libsm6 \
|
libsm6 \
|
||||||
libxext6 \
|
libxext6 \
|
||||||
libxrender-dev \
|
libxrender-dev \
|
||||||
libgomp1 \
|
libgomp1 \
|
||||||
curl \
|
curl \
|
||||||
|
pandoc \
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
&& ln -sf /usr/bin/python3.11 /usr/bin/python \
|
&& ln -sf /usr/bin/python3.10 /usr/bin/python \
|
||||||
&& ln -sf /usr/bin/python3.11 /usr/bin/python3
|
&& ln -sf /usr/bin/python3.10 /usr/bin/python3 \
|
||||||
|
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
|
||||||
|
|
||||||
# Install uv for fast package management
|
# Install uv via pip (more reliable than install script)
|
||||||
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
RUN python3.10 -m pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
ENV PATH="/root/.local/bin:$PATH"
|
|
||||||
|
|
||||||
# Copy dependency files first for better caching
|
|
||||||
COPY pyproject.toml ./
|
|
||||||
|
|
||||||
# Create virtual environment and install dependencies
|
|
||||||
RUN uv venv /app/.venv
|
|
||||||
ENV PATH="/app/.venv/bin:$PATH"
|
ENV PATH="/app/.venv/bin:$PATH"
|
||||||
ENV VIRTUAL_ENV="/app/.venv"
|
ENV VIRTUAL_ENV="/app/.venv"
|
||||||
|
|
||||||
RUN uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e .
|
# Copy dependency files first for better caching
|
||||||
|
COPY pyproject.toml ./
|
||||||
|
COPY wheels/ ./wheels/
|
||||||
|
|
||||||
|
# Create virtual environment and install dependencies
|
||||||
|
RUN uv venv /app/.venv --python python3.10 \
|
||||||
|
&& uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . \
|
||||||
|
&& rm -rf ./wheels
|
||||||
|
|
||||||
# Copy application code
|
# Copy application code
|
||||||
COPY app/ ./app/
|
COPY app/ ./app/
|
||||||
|
|
||||||
# Create model directories (models should be mounted at runtime)
|
# Create model cache directories (mount from host at runtime)
|
||||||
RUN mkdir -p /app/app/model/DocLayout /app/app/model/PP-DocLayout
|
RUN mkdir -p /root/.cache/modelscope \
|
||||||
|
/root/.cache/huggingface \
|
||||||
|
/root/.paddlex \
|
||||||
|
/app/app/model/DocLayout \
|
||||||
|
/app/app/model/PP-DocLayout
|
||||||
|
|
||||||
|
# Declare volumes for model cache (mount at runtime to avoid re-downloading)
|
||||||
|
VOLUME ["/root/.cache/modelscope", "/root/.cache/huggingface", "/root/.paddlex"]
|
||||||
|
|
||||||
# Expose port
|
# Expose port
|
||||||
EXPOSE 8053
|
EXPOSE 8053
|
||||||
@@ -60,3 +79,21 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
|||||||
# Run the application
|
# Run the application
|
||||||
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"]
|
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"]
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Usage: Mount local model cache to avoid downloading
|
||||||
|
#
|
||||||
|
# Option 1: Use host network (simplest, can access localhost services)
|
||||||
|
# docker run --gpus all --network host \
|
||||||
|
# -v /home/yoge/.paddlex:/root/.paddlex:ro \
|
||||||
|
# -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \
|
||||||
|
# -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \
|
||||||
|
# doc_processer:latest
|
||||||
|
#
|
||||||
|
# Option 2: Use bridge network with host.docker.internal (Linux needs --add-host)
|
||||||
|
# docker run --gpus all -p 8053:8053 \
|
||||||
|
# --add-host=host.docker.internal:host-gateway \
|
||||||
|
# -v /home/yoge/.paddlex:/root/.paddlex:ro \
|
||||||
|
# -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \
|
||||||
|
# -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \
|
||||||
|
# doc_processer:latest
|
||||||
|
# =============================================================================
|
||||||
|
|||||||
@@ -3,34 +3,28 @@
|
|||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
from fastapi.responses import Response
|
from fastapi.responses import Response
|
||||||
|
|
||||||
from app.core.dependencies import get_docx_converter
|
from app.core.dependencies import get_converter
|
||||||
from app.schemas.convert import MarkdownToDocxRequest
|
from app.schemas.convert import MarkdownToDocxRequest
|
||||||
from app.services.docx_converter import DocxConverter
|
from app.services.converter import Converter
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
@router.post("/docx")
|
@router.post("/file")
|
||||||
async def convert_markdown_to_docx(
|
async def convert_markdown_to_docx(
|
||||||
request: MarkdownToDocxRequest,
|
request: MarkdownToDocxRequest,
|
||||||
converter: DocxConverter = Depends(get_docx_converter),
|
converter: Converter = Depends(get_converter),
|
||||||
) -> Response:
|
) -> Response:
|
||||||
"""Convert markdown content to DOCX file.
|
"""Convert markdown content to DOCX file.
|
||||||
|
|
||||||
Returns the generated DOCX file as a binary download.
|
Returns the generated DOCX file as a binary response.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
docx_bytes = converter.convert(request.markdown)
|
docx_bytes = converter.export_to_file(request.markdown, export_type="docx")
|
||||||
|
return Response(
|
||||||
|
content=docx_bytes,
|
||||||
|
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
headers={"Content-Disposition": f'attachment; filename="{request.filename}.docx"'},
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
|
raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
|
||||||
|
|
||||||
# Determine filename
|
|
||||||
filename = request.filename or "output"
|
|
||||||
if not filename.endswith(".docx"):
|
|
||||||
filename = f"{filename}.docx"
|
|
||||||
|
|
||||||
return Response(
|
|
||||||
content=docx_bytes,
|
|
||||||
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
||||||
headers={"Content-Disposition": f'attachment; filename="{filename}"'},
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -28,24 +28,15 @@ async def process_image_ocr(
|
|||||||
- Otherwise: use PaddleOCR-VL with formula prompt
|
- Otherwise: use PaddleOCR-VL with formula prompt
|
||||||
4. Convert output to LaTeX, Markdown, and MathML formats
|
4. Convert output to LaTeX, Markdown, and MathML formats
|
||||||
"""
|
"""
|
||||||
try:
|
|
||||||
# 1. Load and preprocess image
|
|
||||||
image = image_processor.preprocess(
|
|
||||||
image_url=request.image_url,
|
|
||||||
image_base64=request.image_base64,
|
|
||||||
)
|
|
||||||
except ValueError as e:
|
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
|
||||||
|
|
||||||
try:
|
image = image_processor.preprocess(
|
||||||
# 2. Detect layout
|
image_url=request.image_url,
|
||||||
layout_info = layout_detector.detect(image)
|
image_base64=request.image_base64,
|
||||||
except RuntimeError as e:
|
)
|
||||||
raise HTTPException(status_code=500, detail=f"Layout detection failed: {e}")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 3. Perform OCR based on layout
|
# 3. Perform OCR based on layout
|
||||||
ocr_result = ocr_service.recognize(image, layout_info)
|
ocr_result = ocr_service.recognize(image)
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
raise HTTPException(status_code=503, detail=str(e))
|
raise HTTPException(status_code=503, detail=str(e))
|
||||||
|
|
||||||
@@ -54,6 +45,4 @@ async def process_image_ocr(
|
|||||||
latex=ocr_result.get("latex", ""),
|
latex=ocr_result.get("latex", ""),
|
||||||
markdown=ocr_result.get("markdown", ""),
|
markdown=ocr_result.get("markdown", ""),
|
||||||
mathml=ocr_result.get("mathml", ""),
|
mathml=ocr_result.get("mathml", ""),
|
||||||
layout_info=layout_info,
|
|
||||||
recognition_mode=ocr_result.get("recognition_mode", ""),
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
import torch
|
import torch
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
class Settings(BaseSettings):
|
class Settings(BaseSettings):
|
||||||
@@ -21,11 +22,10 @@ class Settings(BaseSettings):
|
|||||||
debug: bool = False
|
debug: bool = False
|
||||||
|
|
||||||
# PaddleOCR-VL Settings
|
# PaddleOCR-VL Settings
|
||||||
paddleocr_vl_url: str = "http://localhost:8080/v1"
|
paddleocr_vl_url: str = "http://127.0.0.1:8000/v1"
|
||||||
|
|
||||||
# Model Paths
|
# Model Paths
|
||||||
doclayout_model_path: str = "app/model/DocLayout/best.pt"
|
pp_doclayout_model_dir: Optional[str] = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2"
|
||||||
pp_doclayout_model_dir: str = "app/model/PP-DocLayout/PP-DocLayoutV2"
|
|
||||||
|
|
||||||
# Image Processing
|
# Image Processing
|
||||||
max_image_size_mb: int = 10
|
max_image_size_mb: int = 10
|
||||||
@@ -37,11 +37,6 @@ class Settings(BaseSettings):
|
|||||||
host: str = "0.0.0.0"
|
host: str = "0.0.0.0"
|
||||||
port: int = 8053
|
port: int = 8053
|
||||||
|
|
||||||
@property
|
|
||||||
def doclayout_model_file(self) -> Path:
|
|
||||||
"""Get the DocLayout model file path."""
|
|
||||||
return Path(self.doclayout_model_path)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def pp_doclayout_dir(self) -> Path:
|
def pp_doclayout_dir(self) -> Path:
|
||||||
"""Get the PP-DocLayout model directory path."""
|
"""Get the PP-DocLayout model directory path."""
|
||||||
|
|||||||
@@ -3,20 +3,20 @@
|
|||||||
from app.services.image_processor import ImageProcessor
|
from app.services.image_processor import ImageProcessor
|
||||||
from app.services.layout_detector import LayoutDetector
|
from app.services.layout_detector import LayoutDetector
|
||||||
from app.services.ocr_service import OCRService
|
from app.services.ocr_service import OCRService
|
||||||
from app.services.docx_converter import DocxConverter
|
from app.services.converter import Converter
|
||||||
|
from app.core.config import get_settings
|
||||||
|
|
||||||
# Global instances (initialized on startup)
|
# Global instances (initialized on startup)
|
||||||
_layout_detector: LayoutDetector | None = None
|
_layout_detector: LayoutDetector | None = None
|
||||||
|
|
||||||
|
|
||||||
def init_layout_detector(model_path: str) -> None:
|
def init_layout_detector() -> None:
|
||||||
"""Initialize the global layout detector.
|
"""Initialize the global layout detector.
|
||||||
|
|
||||||
Called during application startup.
|
Called during application startup.
|
||||||
"""
|
"""
|
||||||
global _layout_detector
|
global _layout_detector
|
||||||
_layout_detector = LayoutDetector(model_path=model_path)
|
_layout_detector = LayoutDetector()
|
||||||
_layout_detector.load_model()
|
|
||||||
|
|
||||||
|
|
||||||
def get_layout_detector() -> LayoutDetector:
|
def get_layout_detector() -> LayoutDetector:
|
||||||
@@ -33,10 +33,15 @@ def get_image_processor() -> ImageProcessor:
|
|||||||
|
|
||||||
def get_ocr_service() -> OCRService:
|
def get_ocr_service() -> OCRService:
|
||||||
"""Get an OCR service instance."""
|
"""Get an OCR service instance."""
|
||||||
return OCRService()
|
return OCRService(
|
||||||
|
vl_server_url=get_settings().paddleocr_vl_url,
|
||||||
|
layout_detector=get_layout_detector(),
|
||||||
|
image_processor=get_image_processor(),
|
||||||
|
converter=get_converter(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_docx_converter() -> DocxConverter:
|
def get_converter() -> Converter:
|
||||||
"""Get a DOCX converter instance."""
|
"""Get a DOCX converter instance."""
|
||||||
return DocxConverter()
|
return Converter()
|
||||||
|
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ settings = get_settings()
|
|||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
"""Application lifespan handler for startup/shutdown."""
|
"""Application lifespan handler for startup/shutdown."""
|
||||||
# Startup: Load models
|
# Startup: Load models
|
||||||
init_layout_detector(model_path=settings.doclayout_model_path)
|
init_layout_detector()
|
||||||
|
|
||||||
yield
|
yield
|
||||||
|
|
||||||
@@ -37,3 +37,9 @@ app.include_router(api_router, prefix=settings.api_prefix)
|
|||||||
async def health_check():
|
async def health_check():
|
||||||
"""Health check endpoint."""
|
"""Health check endpoint."""
|
||||||
return {"status": "healthy"}
|
return {"status": "healthy"}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import uvicorn
|
||||||
|
uvicorn.run(app, host="0.0.0.0", port=8053)
|
||||||
BIN
app/pkg/reference.docx
Normal file
BIN
app/pkg/reference.docx
Normal file
Binary file not shown.
@@ -7,7 +7,7 @@ class MarkdownToDocxRequest(BaseModel):
|
|||||||
"""Request body for markdown to DOCX conversion endpoint."""
|
"""Request body for markdown to DOCX conversion endpoint."""
|
||||||
|
|
||||||
markdown: str = Field(..., description="Markdown content to convert")
|
markdown: str = Field(..., description="Markdown content to convert")
|
||||||
filename: str | None = Field(None, description="Optional output filename (without extension)")
|
filename: str = Field("texpixel", description="Optional output filename (without extension)")
|
||||||
|
|
||||||
@field_validator("markdown")
|
@field_validator("markdown")
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@@ -9,14 +9,15 @@ class LayoutRegion(BaseModel):
|
|||||||
type: str = Field(..., description="Region type: text, formula, table, figure")
|
type: str = Field(..., description="Region type: text, formula, table, figure")
|
||||||
bbox: list[float] = Field(..., description="Bounding box [x1, y1, x2, y2]")
|
bbox: list[float] = Field(..., description="Bounding box [x1, y1, x2, y2]")
|
||||||
confidence: float = Field(..., description="Detection confidence score")
|
confidence: float = Field(..., description="Detection confidence score")
|
||||||
|
score: float = Field(..., description="Detection score")
|
||||||
|
|
||||||
|
|
||||||
class LayoutInfo(BaseModel):
|
class LayoutInfo(BaseModel):
|
||||||
"""Layout detection information."""
|
"""Layout detection information."""
|
||||||
|
|
||||||
regions: list[LayoutRegion] = Field(default_factory=list)
|
regions: list[LayoutRegion] = Field(default_factory=list)
|
||||||
has_plain_text: bool = Field(False, description="Whether plain text was detected")
|
MixedRecognition: bool = Field(False, description="Whether mixed recognition was used")
|
||||||
has_formula: bool = Field(False, description="Whether formulas were detected")
|
# FormulaRecognition: bool = Field(False, description="Whether formula recognition (with prompt) was used")
|
||||||
|
|
||||||
|
|
||||||
class ImageOCRRequest(BaseModel):
|
class ImageOCRRequest(BaseModel):
|
||||||
|
|||||||
312
app/services/converter.py
Normal file
312
app/services/converter.py
Normal file
@@ -0,0 +1,312 @@
|
|||||||
|
"""Markdown conversion and export service using pypandoc."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
import pypandoc
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ConvertResult:
    """Result of markdown conversion.

    Produced by ``Converter.convert_to_formats``; both fields are empty
    strings when the input markdown was empty.
    """

    # LaTeX rendering of the markdown input (pandoc "latex" output).
    latex: str
    # HTML rendering with MathML math (pandoc "html" output with --mathml).
    mathml: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExportResult:
    """Result of markdown export."""

    # Path of the exported file.
    file_path: str
    # Content type of the exported file — presumably a MIME type; TODO confirm with callers.
    content_type: str
    # Name to present for the file when it is downloaded.
    download_name: str
|
||||||
|
|
||||||
|
|
||||||
|
ExportType = Literal["docx", "pdf"]
|
||||||
|
|
||||||
|
|
||||||
|
class Converter:
    """Service for conversion and export operations.

    Wraps pypandoc to (a) convert markdown into LaTeX and MathML strings and
    (b) export markdown to a DOCX or PDF file after a LaTeX-oriented
    preprocessing pass that makes formulas render better in Word/PDF.
    """

    # Pandoc input format with LaTeX math extensions
    INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"

    # Export formats accepted by export_to_file (mirrors the ExportType alias).
    _SUPPORTED_EXPORT_TYPES = ("docx", "pdf")

    def __init__(self):
        """Initialize converter (the service keeps no per-instance state)."""

    def convert_to_formats(self, md_text: str) -> "ConvertResult":
        """Convert markdown to LaTeX and MathML formats.

        Args:
            md_text: Markdown text to convert.

        Returns:
            ConvertResult with latex and mathml fields. An empty input string
            short-circuits to a result with both fields empty (pandoc is not
            invoked).

        Raises:
            RuntimeError: If conversion fails.
        """
        if md_text == "":
            return ConvertResult(latex="", mathml="")

        try:
            # Convert to LaTeX
            latex_output = pypandoc.convert_text(
                md_text,
                "latex",
                format=self.INPUT_FORMAT,
            ).rstrip("\n")

            # Convert to HTML with MathML
            mathml_output = pypandoc.convert_text(
                md_text,
                "html",
                format=self.INPUT_FORMAT,
                extra_args=["--mathml"],
            ).rstrip("\n")

            return ConvertResult(latex=latex_output, mathml=mathml_output)

        except Exception as e:
            raise RuntimeError(f"Conversion failed: {e}") from e

    def preprocess_for_export(self, md_text: str) -> str:
        """Preprocess markdown text for export to docx/pdf.

        Handles LaTeX formula formatting, matrix environments, and
        other transformations needed for proper Word/PDF rendering.
        The steps run in a fixed order because later steps rewrite the
        delimiters that earlier steps match on.

        Args:
            md_text: Raw markdown text.

        Returns:
            Preprocessed markdown text.
        """
        # Replace \[1mm] => \vspace{1mm} (must run before \[ is treated as a
        # block-formula opener below)
        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)

        # Add blank lines around \[...\] block formulas embedded in other text
        md_text = re.sub(
            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
            r"\1\n\n\\[\3\\]\n\n\4",
            md_text,
            flags=re.DOTALL,
        )
        # Put \[...\] formulas that start a line on lines of their own
        md_text = re.sub(
            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
            r"\n\\[\2\\]\n",
            md_text,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Remove arithmatex span wrappers
        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)

        # Convert inline formulas: \( \) => $ $
        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)

        # Convert block formulas: \[ \] => $$ $$
        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)

        # Remove spaces between $ and formula content
        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
        cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)

        # Convert matrix environments for better Word rendering
        cleaned_md = self._convert_matrix_environments(cleaned_md)

        # Fix brace spacing for equation systems
        cleaned_md = self._fix_brace_spacing(cleaned_md)

        # Convert cases and aligned environments
        cleaned_md = self._convert_special_environments(cleaned_md)

        return cleaned_md

    def _convert_matrix_environments(self, md_text: str) -> str:
        """Convert vmatrix/Vmatrix to left/right delimited forms.

        This fixes the vertical line height issues in Word.
        """
        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
        md_text = re.sub(
            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
            md_text,
            flags=re.DOTALL,
        )

        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
        md_text = re.sub(
            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def _fix_brace_spacing(self, md_text: str) -> str:
        """Fix spacing issues with braces in equation systems.

        Replaces the whitespace after ``\\left\\{`` and before ``\\right\\}``
        with a ``\\!`` negative space for proper alignment in Word/OMML.
        """
        # Fix \left\{ spacing
        md_text = re.sub(
            r"\\left\\\{\s+",
            r"\\left\\{\\!",
            md_text,
        )

        # Fix \right\} spacing
        md_text = re.sub(
            r"\s+\\right\\\}",
            r"\\!\\right\\}",
            md_text,
        )

        return md_text

    def _convert_special_environments(self, md_text: str) -> str:
        """Convert cases and aligned environments to array format.

        These environments have better rendering support in Word/OMML.
        The brace-wrapped ``aligned`` form is rewritten before the standalone
        form so the generic pattern does not consume it first.
        """

        def convert_cases(match: re.Match) -> str:
            content = match.group(1)
            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."

        md_text = re.sub(
            r"\\begin\{cases\}(.*?)\\end\{cases\}",
            convert_cases,
            md_text,
            flags=re.DOTALL,
        )

        def convert_aligned_to_array(match: re.Match) -> str:
            content = match.group(1)
            # Remove leading & alignment markers (not needed in array{l})
            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."

        md_text = re.sub(
            r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
            convert_aligned_to_array,
            md_text,
            flags=re.DOTALL,
        )

        def convert_standalone_aligned(match: re.Match) -> str:
            content = match.group(1)
            # Same & stripping as above, but without the surrounding brace
            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
            return r"\begin{array}{l}" + content + r"\end{array}"

        md_text = re.sub(
            r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
            convert_standalone_aligned,
            md_text,
            flags=re.DOTALL,
        )

        return md_text

    def export_to_file(self, md_text: str, export_type: "ExportType" = "docx") -> bytes:
        """Export markdown to docx or pdf file.

        The markdown is preprocessed, written to a temporary ``.md`` file and
        converted by pandoc into a temporary output file whose bytes are
        returned. Both temporary files are removed before returning, on
        success as well as on failure.

        Args:
            md_text: Markdown text to export.
            export_type: Export format, either 'docx' or 'pdf'.

        Returns:
            bytes of the exported file.

        Raises:
            ValueError: If export_type is not supported.
            RuntimeError: If export fails.
        """
        # Reject unknown formats up front instead of silently treating them as PDF.
        if export_type not in self._SUPPORTED_EXPORT_TYPES:
            raise ValueError(f"Unsupported export type: {export_type}")

        # Preprocess markdown
        cleaned_md = self.preprocess_for_export(md_text)

        # Create temp file for input; delete=False so pandoc can reopen it by path
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
            f_in.write(cleaned_md.encode("utf-8"))
            md_path = f_in.name

        output_file = md_path + "." + export_type

        try:
            if export_type == "docx":
                self._export_docx(md_path, output_file)
            else:  # pdf
                self._export_pdf(md_path, output_file)

            with open(output_file, "rb") as f:
                return f.read()

        except Exception as e:
            raise RuntimeError(f"Export failed: {e}") from e
        finally:
            # The content is returned as bytes, so neither temp file is needed
            # afterwards; removing only the input would leak one output file
            # per successful export.
            self._cleanup_files(md_path, output_file)

    def _export_docx(self, input_path: str, output_path: str) -> None:
        """Export to DOCX format using pypandoc."""
        extra_args = [
            "--highlight-style=pygments",
            # NOTE(review): relative path, resolved against the process working
            # directory — confirm the service is always started from the project root.
            "--reference-doc=app/pkg/reference.docx",
        ]
        pypandoc.convert_file(
            input_path,
            "docx",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _export_pdf(self, input_path: str, output_path: str) -> None:
        """Export to PDF format using pypandoc with XeLaTeX."""
        extra_args = [
            "--pdf-engine=xelatex",
            "-V",
            "mainfont=Noto Sans CJK SC",
            "--highlight-style=pygments",
        ]
        pypandoc.convert_file(
            input_path,
            "pdf",
            format=self.INPUT_FORMAT,
            outputfile=output_path,
            extra_args=extra_args,
        )

    def _cleanup_files(self, *paths: str) -> None:
        """Remove files if they exist."""
        for path in paths:
            # Attempt the removal directly rather than checking existence first,
            # so a file that disappears in between cannot raise.
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

    def cleanup_export_file(self, file_path: str) -> None:
        """Cleanup exported file after sending response.

        Call this after sending the file to the client. ``export_to_file``
        already removes its own temporary files, so this is only needed for
        files exported by other means.

        Args:
            file_path: Path to the exported file.
        """
        self._cleanup_files(file_path)
|
||||||
|
|
||||||
@@ -1,335 +0,0 @@
|
|||||||
"""Markdown to DOCX conversion service.
|
|
||||||
|
|
||||||
Reference implementation based on https://github.com/YogeLiu/markdown_2_docx
|
|
||||||
"""
|
|
||||||
|
|
||||||
import io
|
|
||||||
import re
|
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
from docx import Document
|
|
||||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
||||||
from docx.oxml import OxmlElement
|
|
||||||
from docx.oxml.ns import qn
|
|
||||||
from docx.shared import Inches, Pt
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class MarkdownElement:
|
|
||||||
"""Parsed markdown element."""
|
|
||||||
|
|
||||||
type: str # heading, paragraph, list_item, code_block, table, math
|
|
||||||
content: str
|
|
||||||
level: int = 0 # For headings and lists
|
|
||||||
language: str = "" # For code blocks
|
|
||||||
|
|
||||||
|
|
||||||
class DocxConverter:
    """Converts markdown content to DOCX format.

    The converter understands a line-oriented subset of markdown: ATX
    headings, bullet and numbered list items, fenced code blocks, pipe
    tables, ``$$`` math blocks, and inline bold / italic / code / math.
    Any other non-blank line becomes a plain paragraph.
    """

    # Delimiter row of a pipe table, e.g. "|---|:---:|" or "--- | ---".
    _TABLE_SEPARATOR = re.compile(r"^\s*\|?\s*:?-+:?\s*(\|\s*:?-+:?\s*)*\|?\s*$")

    def __init__(self):
        """Compile the regular expressions used by the parser and the inline formatter."""
        self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
        self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$")
        self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$")
        # Accept info strings such as "c++" or "objective-c" and trailing
        # whitespace; a fence that fails to match would otherwise be parsed
        # as a paragraph and leave the closing fence to open a new block.
        self.code_block_pattern = re.compile(r"^```([\w+#.-]*)\s*$")
        self.inline_code_pattern = re.compile(r"`([^`]+)`")
        self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*")
        self.italic_pattern = re.compile(r"\*([^*]+)\*")
        self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL)
        self.inline_math_pattern = re.compile(r"\$([^$]+)\$")

    def convert(self, markdown: str) -> bytes:
        """Convert markdown content to DOCX.

        Args:
            markdown: Markdown content to convert.

        Returns:
            DOCX file as bytes.
        """
        doc = Document()
        for element in self._parse_markdown(markdown):
            self._add_element_to_doc(doc, element)

        # Serialise in memory; getvalue() returns the whole buffer regardless
        # of the current stream position.
        buffer = io.BytesIO()
        doc.save(buffer)
        return buffer.getvalue()

    def _parse_markdown(self, markdown: str) -> list[MarkdownElement]:
        """Parse markdown into a flat list of block-level elements.

        Args:
            markdown: Markdown content.

        Returns:
            List of parsed elements, in document order. Blank lines produce
            no element.
        """
        elements: list[MarkdownElement] = []
        # Normalise line endings first so CRLF input does not leave a stray
        # "\r" at the end of every line (which would also defeat the fence
        # and "$$" comparisons below).
        lines = markdown.replace("\r\n", "\n").replace("\r", "\n").split("\n")
        i = 0
        in_code_block = False
        code_content: list[str] = []
        code_language = ""

        while i < len(lines):
            line = lines[i]

            # A fence line toggles code-block mode: the first one opens the
            # block (and records the language), the next one closes it.
            code_match = self.code_block_pattern.match(line)
            if code_match:
                if in_code_block:
                    elements.append(
                        MarkdownElement(
                            type="code_block",
                            content="\n".join(code_content),
                            language=code_language,
                        )
                    )
                    code_content = []
                    in_code_block = False
                else:
                    in_code_block = True
                    code_language = code_match.group(1)
                i += 1
                continue

            # Inside a fence every line is taken verbatim.
            if in_code_block:
                code_content.append(line)
                i += 1
                continue

            # Math block ($$...$$), single- or multi-line.
            if line.strip().startswith("$$"):
                math_text, i = self._read_math_block(lines, i)
                elements.append(MarkdownElement(type="math", content=math_text))
                continue

            # Heading: the number of "#" characters is the level.
            heading_match = self.heading_pattern.match(line)
            if heading_match:
                elements.append(
                    MarkdownElement(
                        type="heading",
                        content=heading_match.group(2),
                        level=len(heading_match.group(1)),
                    )
                )
                i += 1
                continue

            # Unordered list: two characters of leading whitespace per level.
            list_match = self.list_pattern.match(line)
            if list_match:
                elements.append(
                    MarkdownElement(
                        type="list_item",
                        content=list_match.group(2),
                        level=len(list_match.group(1)) // 2,
                    )
                )
                i += 1
                continue

            # Ordered list: same nesting rule as unordered lists.
            ordered_match = self.ordered_list_pattern.match(line)
            if ordered_match:
                elements.append(
                    MarkdownElement(
                        type="ordered_list_item",
                        content=ordered_match.group(2),
                        level=len(ordered_match.group(1)) // 2,
                    )
                )
                i += 1
                continue

            # Table: a line with "|" followed by a delimiter row. The
            # delimiter row is always consumed together with the header so
            # that _add_table can rely on it being the second line.
            if (
                "|" in line
                and i + 1 < len(lines)
                and self._is_table_separator(lines[i + 1])
            ):
                table_lines = [line, lines[i + 1]]
                i += 2
                while i < len(lines) and "|" in lines[i]:
                    table_lines.append(lines[i])
                    i += 1
                elements.append(
                    MarkdownElement(type="table", content="\n".join(table_lines))
                )
                continue

            # Regular paragraph; blank lines are skipped.
            if line.strip():
                elements.append(MarkdownElement(type="paragraph", content=line))
            i += 1

        # An unterminated fence must not swallow its content: emit whatever
        # was collected as a code block instead of dropping it.
        if in_code_block:
            elements.append(
                MarkdownElement(
                    type="code_block",
                    content="\n".join(code_content),
                    language=code_language,
                )
            )

        return elements

    def _read_math_block(self, lines: list[str], start: int) -> tuple[str, int]:
        """Collect the ``$$`` block that begins at ``lines[start]``.

        Args:
            lines: All lines of the document.
            start: Index of the line that starts with ``$$``.

        Returns:
            Tuple of (math source without the ``$$`` markers, index of the
            first line after the block).
        """
        opening = lines[start].strip()
        parts: list[str] = []
        i = start

        if opening == "$$":
            # Delimiters on their own lines: take everything up to the
            # closing "$$" line (or the end of input if it is missing).
            i += 1
            while i < len(lines) and lines[i].strip() != "$$":
                parts.append(lines[i])
                i += 1
        else:
            first = opening[2:]
            if first.endswith("$$"):
                # Whole block on a single line: $$ ... $$
                parts.append(first[:-2])
            else:
                # Block opened with trailing content; read until a line that
                # ends with "$$".
                parts.append(first)
                i += 1
                while i < len(lines):
                    candidate = lines[i].strip()
                    if candidate.endswith("$$"):
                        parts.append(candidate[:-2])
                        break
                    parts.append(lines[i])
                    i += 1

        # i points at the closing line (or past the end); step over it.
        return "\n".join(parts), i + 1

    def _is_table_separator(self, line: str) -> bool:
        """Return True if ``line`` is the delimiter row of a pipe table."""
        # Require a pipe or a run of three dashes so that a lone "-" under a
        # line of text is not mistaken for a one-column delimiter row.
        if "|" not in line and "---" not in line:
            return False
        return bool(self._TABLE_SEPARATOR.match(line))

    @staticmethod
    def _split_table_row(row: str) -> list[str]:
        """Split one table row into stripped cell texts, keeping empty cells."""
        row = row.strip()
        # Only the outer pipes are decoration; inner empty cells are data and
        # must keep their position.
        if row.startswith("|"):
            row = row[1:]
        if row.endswith("|"):
            row = row[:-1]
        return [cell.strip() for cell in row.split("|")]

    def _add_element_to_doc(self, doc: Document, element: MarkdownElement) -> None:
        """Add a markdown element to the document.

        Element types without a renderer are ignored.

        Args:
            doc: Word document.
            element: Parsed markdown element.
        """
        kind = element.type
        if kind == "heading":
            self._add_heading(doc, element.content, element.level)
        elif kind == "paragraph":
            self._add_paragraph(doc, element.content)
        elif kind == "list_item":
            self._add_list_item(doc, element.content, element.level, ordered=False)
        elif kind == "ordered_list_item":
            self._add_list_item(doc, element.content, element.level, ordered=True)
        elif kind == "code_block":
            self._add_code_block(doc, element.content)
        elif kind == "table":
            self._add_table(doc, element.content)
        elif kind == "math":
            self._add_math(doc, element.content)

    def _add_heading(self, doc: Document, content: str, level: int) -> None:
        """Add a heading to the document, capping the level at 9."""
        heading_level = min(level, 9)  # Word supports up to Heading 9
        doc.add_heading(content, level=heading_level)

    def _add_paragraph(self, doc: Document, content: str) -> None:
        """Add a paragraph with inline formatting."""
        para = doc.add_paragraph()
        self._add_formatted_text(para, content)

    def _add_formatted_text(self, para, content: str) -> None:
        """Append ``content`` to ``para`` as runs, applying inline formatting.

        The text is scanned left to right. At each step the earliest match
        among bold, italic, inline code and inline math is rendered as its own
        run, and any text before it is added as an unformatted run.

        Args:
            para: Paragraph to append runs to.
            content: Text that may contain inline markdown markers.
        """
        # Order matters only for ties on the start offset: min() keeps the
        # first entry, so earlier kinds win.
        inline_patterns = (
            ("bold", self.bold_pattern),
            ("italic", self.italic_pattern),
            ("code", self.inline_code_pattern),
            ("math", self.inline_math_pattern),
        )

        remaining = content
        while remaining:
            candidates = []
            for kind, pattern in inline_patterns:
                found = pattern.search(remaining)
                if found:
                    candidates.append((found, kind))

            if not candidates:
                para.add_run(remaining)
                break

            match, match_type = min(candidates, key=lambda c: c[0].start())

            # Plain text in front of the marker.
            if match.start() > 0:
                para.add_run(remaining[: match.start()])

            # The marked-up text itself, without its delimiters.
            run = para.add_run(match.group(1))
            if match_type == "bold":
                run.bold = True
            elif match_type == "italic":
                run.italic = True
            elif match_type == "code":
                run.font.name = "Courier New"
                run.font.size = Pt(10)
            elif match_type == "math":
                run.italic = True

            remaining = remaining[match.end() :]

    def _add_list_item(
        self, doc: Document, content: str, level: int, ordered: bool
    ) -> None:
        """Add a list item, indented by a quarter inch per nesting level.

        Args:
            doc: Word document.
            content: Item text; inline formatting is applied.
            level: Nesting depth, 0 for top-level items.
            ordered: Use the "List Number" style instead of "List Bullet".
        """
        style = "List Number" if ordered else "List Bullet"
        para = doc.add_paragraph(style=style)
        para.paragraph_format.left_indent = Inches(0.25 * level)
        self._add_formatted_text(para, content)

    def _add_code_block(self, doc: Document, content: str) -> None:
        """Add a code block as an indented, shaded, monospaced paragraph."""
        para = doc.add_paragraph()

        # Add the shading before the indent: inside w:pPr the schema lists
        # w:shd ahead of w:ind, and appending it after the indent element
        # would leave the children out of order.
        shading = OxmlElement("w:shd")
        shading.set(qn("w:val"), "clear")
        shading.set(qn("w:fill"), "F0F0F0")
        para._p.get_or_add_pPr().append(shading)

        para.paragraph_format.left_indent = Inches(0.5)

        run = para.add_run(content)
        run.font.name = "Courier New"
        run.font.size = Pt(9)

    def _add_table(self, doc: Document, content: str) -> None:
        """Add a table from markdown table format.

        The first line is the header, the second the delimiter row (skipped)
        and every following line a data row. Data cells beyond the header
        width are dropped; missing cells stay empty.

        Args:
            doc: Word document.
            content: Table source, one row per line.
        """
        rows = [row.strip() for row in content.split("\n") if row.strip()]
        if len(rows) < 2:
            return

        header = self._split_table_row(rows[0])
        if not header:
            return

        table = doc.add_table(rows=1, cols=len(header))
        table.style = "Table Grid"

        # Header row in bold. Adding the run explicitly avoids relying on the
        # cell already containing a run to format.
        for cell, text in zip(table.rows[0].cells, header):
            cell.paragraphs[0].add_run(text).bold = True

        # rows[1] is the delimiter row.
        for row in rows[2:]:
            row_cells = table.add_row().cells
            # zip() stops at the shorter side, so surplus cells are ignored.
            for cell, text in zip(row_cells, self._split_table_row(row)):
                cell.text = text

    def _add_math(self, doc: Document, content: str) -> None:
        """Add a math block.

        For proper OMML rendering, this would need more complex conversion.
        Currently renders as centred italic text with the LaTeX source.
        """
        para = doc.add_paragraph()
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        run = para.add_run(content)
        run.italic = True
        run.font.name = "Cambria Math"
        run.font.size = Pt(12)
|
|
||||||
|
|
||||||
@@ -116,7 +116,7 @@ class ImageProcessor:
|
|||||||
else:
|
else:
|
||||||
raise ValueError("Either image_url or image_base64 must be provided")
|
raise ValueError("Either image_url or image_base64 must be provided")
|
||||||
|
|
||||||
return self.add_padding(image)
|
return image
|
||||||
|
|
||||||
def image_to_base64(self, image: np.ndarray, format: str = "PNG") -> str:
|
def image_to_base64(self, image: np.ndarray, format: str = "PNG") -> str:
|
||||||
"""Convert numpy image to base64 string.
|
"""Convert numpy image to base64 string.
|
||||||
|
|||||||
@@ -1,122 +1,157 @@
|
|||||||
"""DocLayout-YOLO wrapper for document layout detection."""
|
"""PP-DocLayoutV2 wrapper for document layout detection."""
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from app.schemas.image import LayoutInfo, LayoutRegion
|
from app.schemas.image import LayoutInfo, LayoutRegion
|
||||||
from app.core.config import get_settings
|
from app.core.config import get_settings
|
||||||
|
from paddleocr import LayoutDetection
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
|
|
||||||
|
|
||||||
class LayoutDetector:
|
class LayoutDetector:
|
||||||
"""Wrapper for DocLayout-YOLO model."""
|
"""Layout detector for PP-DocLayoutV2."""
|
||||||
|
|
||||||
# Class names from DocLayout-YOLO
|
_layout_detector: Optional[LayoutDetection] = None
|
||||||
CLASS_NAMES = {
|
|
||||||
0: "title",
|
# PP-DocLayoutV2 class ID to label mapping
|
||||||
1: "plain_text",
|
CLS_ID_TO_LABEL: dict[int, str] = {
|
||||||
2: "abandon",
|
0: "abstract",
|
||||||
3: "figure",
|
1: "algorithm",
|
||||||
4: "figure_caption",
|
2: "aside_text",
|
||||||
5: "table",
|
3: "chart",
|
||||||
6: "table_caption",
|
4: "content",
|
||||||
7: "table_footnote",
|
5: "display_formula",
|
||||||
8: "isolate_formula",
|
6: "doc_title",
|
||||||
9: "formula_caption",
|
7: "figure_title",
|
||||||
|
8: "footer",
|
||||||
|
9: "footer_image",
|
||||||
|
10: "footnote",
|
||||||
|
11: "formula_number",
|
||||||
|
12: "header",
|
||||||
|
13: "header_image",
|
||||||
|
14: "image",
|
||||||
|
15: "inline_formula",
|
||||||
|
16: "number",
|
||||||
|
17: "paragraph_title",
|
||||||
|
18: "reference",
|
||||||
|
19: "reference_content",
|
||||||
|
20: "seal",
|
||||||
|
21: "table",
|
||||||
|
22: "text",
|
||||||
|
23: "vertical_text",
|
||||||
|
24: "vision_footnote",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Classes considered as plain text
|
# Mapping from raw labels to normalized region types
|
||||||
PLAIN_TEXT_CLASSES = {"title", "plain_text", "figure_caption", "table_caption", "table_footnote"}
|
LABEL_TO_TYPE: dict[str, str] = {
|
||||||
|
# Text types
|
||||||
|
"abstract": "text",
|
||||||
|
"algorithm": "text",
|
||||||
|
"aside_text": "text",
|
||||||
|
"content": "text",
|
||||||
|
"doc_title": "text",
|
||||||
|
"footer": "text",
|
||||||
|
"footnote": "text",
|
||||||
|
"header": "text",
|
||||||
|
"number": "text",
|
||||||
|
"paragraph_title": "text",
|
||||||
|
"reference": "text",
|
||||||
|
"reference_content": "text",
|
||||||
|
"text": "text",
|
||||||
|
"vertical_text": "text",
|
||||||
|
"vision_footnote": "text",
|
||||||
|
# Formula types
|
||||||
|
"display_formula": "formula",
|
||||||
|
"inline_formula": "formula",
|
||||||
|
"formula_number": "formula",
|
||||||
|
# Table types
|
||||||
|
"table": "table",
|
||||||
|
# Figure types
|
||||||
|
"chart": "figure",
|
||||||
|
"figure_title": "figure",
|
||||||
|
"footer_image": "figure",
|
||||||
|
"header_image": "figure",
|
||||||
|
"image": "figure",
|
||||||
|
"seal": "figure",
|
||||||
|
}
|
||||||
|
|
||||||
# Classes considered as formula
|
def __init__(self):
|
||||||
FORMULA_CLASSES = {"isolate_formula", "formula_caption"}
|
"""Initialize layout detector.
|
||||||
|
|
||||||
def __init__(self, model_path: str, confidence_threshold: float = 0.2):
|
|
||||||
"""Initialize the layout detector.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
model_path: Path to the DocLayout-YOLO model weights.
|
|
||||||
confidence_threshold: Minimum confidence for detections.
|
|
||||||
"""
|
"""
|
||||||
self.model_path = model_path
|
_ = self._get_layout_detector()
|
||||||
self.confidence_threshold = confidence_threshold
|
|
||||||
self.model = None
|
|
||||||
|
|
||||||
def load_model(self) -> None:
|
def _get_layout_detector(self):
|
||||||
"""Load the DocLayout-YOLO model.
|
"""Get or create LayoutDetection instance."""
|
||||||
|
if LayoutDetector._layout_detector is None:
|
||||||
|
LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")
|
||||||
|
return LayoutDetector._layout_detector
|
||||||
|
|
||||||
Raises:
|
def detect(self, image: np.ndarray) -> LayoutInfo:
|
||||||
RuntimeError: If model cannot be loaded.
|
"""Detect layout of the image using PP-DocLayoutV2.
|
||||||
"""
|
|
||||||
try:
|
|
||||||
from doclayout_yolo import YOLOv10
|
|
||||||
|
|
||||||
self.model = YOLOv10(self.model_path)
|
|
||||||
except Exception as e:
|
|
||||||
raise RuntimeError(f"Failed to load DocLayout-YOLO model: {e}") from e
|
|
||||||
|
|
||||||
def detect(self, image: np.ndarray, image_size: int = 1024) -> LayoutInfo:
|
|
||||||
"""Detect document layout regions.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image: Input image as numpy array in BGR format.
|
image: Input image as numpy array.
|
||||||
image_size: Image size for prediction.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
LayoutInfo with detected regions.
|
LayoutInfo with detected regions and flags.
|
||||||
|
|
||||||
Raises:
|
|
||||||
RuntimeError: If model not loaded.
|
|
||||||
"""
|
"""
|
||||||
if self.model is None:
|
layout_detector = self._get_layout_detector()
|
||||||
raise RuntimeError("Model not loaded. Call load_model() first.")
|
result = layout_detector.predict(image)
|
||||||
|
|
||||||
# Run prediction
|
|
||||||
results = self.model.predict(
|
|
||||||
image,
|
|
||||||
imgsz=image_size,
|
|
||||||
conf=self.confidence_threshold,
|
|
||||||
device=settings.device,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
# Parse the result
|
||||||
regions: list[LayoutRegion] = []
|
regions: list[LayoutRegion] = []
|
||||||
has_plain_text = False
|
mixed_recognition = False
|
||||||
has_formula = False
|
|
||||||
|
|
||||||
if results and len(results) > 0:
|
# Handle result format: [{'input_path': ..., 'page_index': None, 'boxes': [...]}]
|
||||||
result = results[0]
|
if isinstance(result, list) and len(result) > 0:
|
||||||
if result.boxes is not None:
|
first_result = result[0]
|
||||||
for box in result.boxes:
|
if isinstance(first_result, dict) and "boxes" in first_result:
|
||||||
cls_id = int(box.cls[0].item())
|
boxes = first_result.get("boxes", [])
|
||||||
confidence = float(box.conf[0].item())
|
else:
|
||||||
bbox = box.xyxy[0].tolist()
|
boxes = []
|
||||||
|
else:
|
||||||
|
boxes = []
|
||||||
|
|
||||||
class_name = self.CLASS_NAMES.get(cls_id, f"unknown_{cls_id}")
|
for box in boxes:
|
||||||
|
cls_id = box.get("cls_id")
|
||||||
|
label = box.get("label") or self.CLS_ID_TO_LABEL.get(cls_id, "other")
|
||||||
|
score = box.get("score", 0.0)
|
||||||
|
coordinate = box.get("coordinate", [0, 0, 0, 0])
|
||||||
|
|
||||||
# Map to simplified type
|
# Normalize label to region type
|
||||||
if class_name in self.PLAIN_TEXT_CLASSES:
|
region_type = self.LABEL_TO_TYPE.get(label, "text")
|
||||||
region_type = "text"
|
|
||||||
has_plain_text = True
|
|
||||||
elif class_name in self.FORMULA_CLASSES:
|
|
||||||
region_type = "formula"
|
|
||||||
has_formula = True
|
|
||||||
elif class_name in {"figure"}:
|
|
||||||
region_type = "figure"
|
|
||||||
elif class_name in {"table"}:
|
|
||||||
region_type = "table"
|
|
||||||
else:
|
|
||||||
region_type = class_name
|
|
||||||
|
|
||||||
regions.append(
|
regions.append(LayoutRegion(
|
||||||
LayoutRegion(
|
type=region_type,
|
||||||
type=region_type,
|
bbox=coordinate,
|
||||||
bbox=bbox,
|
confidence=score,
|
||||||
confidence=confidence,
|
score=score,
|
||||||
)
|
))
|
||||||
)
|
|
||||||
|
|
||||||
return LayoutInfo(
|
|
||||||
regions=regions,
|
mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions)
|
||||||
has_plain_text=has_plain_text,
|
|
||||||
has_formula=has_formula,
|
return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition)
|
||||||
)
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import cv2
|
||||||
|
from app.services.image_processor import ImageProcessor
|
||||||
|
|
||||||
|
layout_detector = LayoutDetector()
|
||||||
|
image_path = "test/timeout.png"
|
||||||
|
|
||||||
|
image = cv2.imread(image_path)
|
||||||
|
image_processor = ImageProcessor(padding_ratio=0.15)
|
||||||
|
image = image_processor.add_padding(image)
|
||||||
|
|
||||||
|
# Save the padded image for debugging
|
||||||
|
cv2.imwrite("debug_padded_image.png", image)
|
||||||
|
|
||||||
|
|
||||||
|
layout_info = layout_detector.detect(image)
|
||||||
|
print(layout_info)
|
||||||
@@ -1,14 +1,12 @@
|
|||||||
"""PaddleOCR-VL client service for text and formula recognition."""
|
"""PaddleOCR-VL client service for text and formula recognition."""
|
||||||
|
|
||||||
import io
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import cv2
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from app.core.config import get_settings
|
from app.core.config import get_settings
|
||||||
from app.schemas.image import LayoutInfo
|
from paddleocr import PaddleOCRVL
|
||||||
|
from typing import Optional
|
||||||
|
from app.services.layout_detector import LayoutDetector
|
||||||
|
from app.services.image_processor import ImageProcessor
|
||||||
|
from app.services.converter import Converter
|
||||||
|
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
|
|
||||||
@@ -16,52 +14,40 @@ settings = get_settings()
|
|||||||
class OCRService:
|
class OCRService:
|
||||||
"""Service for OCR using PaddleOCR-VL."""
|
"""Service for OCR using PaddleOCR-VL."""
|
||||||
|
|
||||||
FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format."
|
_pipeline: Optional[PaddleOCRVL] = None
|
||||||
|
_layout_detector: Optional[LayoutDetector] = None
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vl_server_url: str | None = None,
|
vl_server_url: str,
|
||||||
pp_doclayout_model_dir: str | None = None,
|
layout_detector: LayoutDetector,
|
||||||
|
image_processor: ImageProcessor,
|
||||||
|
converter: Converter,
|
||||||
):
|
):
|
||||||
"""Initialize OCR service.
|
"""Initialize OCR service.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vl_server_url: URL of the vLLM server for PaddleOCR-VL.
|
vl_server_url: URL of the vLLM server for PaddleOCR-VL.
|
||||||
pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory.
|
layout_detector: Layout detector instance.
|
||||||
|
image_processor: Image processor instance.
|
||||||
"""
|
"""
|
||||||
self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
|
self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
|
||||||
self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir
|
self.layout_detector = layout_detector
|
||||||
self._pipeline = None
|
self.image_processor = image_processor
|
||||||
|
self.converter = converter
|
||||||
def _get_pipeline(self):
|
def _get_pipeline(self):
|
||||||
"""Get or create PaddleOCR-VL pipeline.
|
"""Get or create PaddleOCR-VL pipeline.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
PaddleOCRVL pipeline instance.
|
PaddleOCRVL pipeline instance.
|
||||||
"""
|
"""
|
||||||
if self._pipeline is None:
|
if OCRService._pipeline is None:
|
||||||
from paddleocr import PaddleOCRVL
|
OCRService._pipeline = PaddleOCRVL(
|
||||||
|
|
||||||
self._pipeline = PaddleOCRVL(
|
|
||||||
vl_rec_backend="vllm-server",
|
vl_rec_backend="vllm-server",
|
||||||
vl_rec_server_url=self.vl_server_url,
|
vl_rec_server_url=self.vl_server_url,
|
||||||
layout_detection_model_name="PP-DocLayoutV2",
|
layout_detection_model_name="PP-DocLayoutV2",
|
||||||
layout_detection_model_dir=self.pp_doclayout_model_dir,
|
|
||||||
)
|
)
|
||||||
return self._pipeline
|
return OCRService._pipeline
|
||||||
|
|
||||||
def _save_temp_image(self, image: np.ndarray) -> str:
|
|
||||||
"""Save image to a temporary file.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
image: Image as numpy array in BGR format.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Path to temporary file.
|
|
||||||
"""
|
|
||||||
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
|
|
||||||
cv2.imwrite(f.name, image)
|
|
||||||
return f.name
|
|
||||||
|
|
||||||
def recognize_mixed(self, image: np.ndarray) -> dict:
|
def recognize_mixed(self, image: np.ndarray) -> dict:
|
||||||
"""Recognize mixed content (text + formulas) using PP-DocLayoutV2.
|
"""Recognize mixed content (text + formulas) using PP-DocLayoutV2.
|
||||||
@@ -77,30 +63,21 @@ class OCRService:
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
pipeline = self._get_pipeline()
|
pipeline = self._get_pipeline()
|
||||||
temp_path = self._save_temp_image(image)
|
|
||||||
|
|
||||||
try:
|
output = pipeline.predict(image, use_layout_detection=True)
|
||||||
results = list(pipeline.predict(temp_path))
|
|
||||||
|
|
||||||
markdown_content = ""
|
markdown_content = ""
|
||||||
for result in results:
|
|
||||||
# PaddleOCR-VL results can be saved to markdown
|
|
||||||
md_buffer = io.StringIO()
|
|
||||||
result.save_to_markdown(save_path=md_buffer)
|
|
||||||
markdown_content += md_buffer.getvalue()
|
|
||||||
|
|
||||||
# Convert markdown to other formats
|
for res in output:
|
||||||
latex = self._markdown_to_latex(markdown_content)
|
markdown_content += res.markdown.get("markdown_texts", "")
|
||||||
mathml = self._extract_mathml(markdown_content)
|
|
||||||
|
|
||||||
return {
|
convert_result = self.converter.convert_to_formats(markdown_content)
|
||||||
"markdown": markdown_content,
|
|
||||||
"latex": latex,
|
|
||||||
"mathml": mathml,
|
|
||||||
}
|
|
||||||
finally:
|
|
||||||
Path(temp_path).unlink(missing_ok=True)
|
|
||||||
|
|
||||||
|
return {
|
||||||
|
"markdown": markdown_content,
|
||||||
|
"latex": convert_result.latex,
|
||||||
|
"mathml": convert_result.mathml,
|
||||||
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Mixed recognition failed: {e}") from e
|
raise RuntimeError(f"Mixed recognition failed: {e}") from e
|
||||||
|
|
||||||
@@ -116,188 +93,49 @@ class OCRService:
|
|||||||
Dict with 'latex', 'markdown', 'mathml' keys.
|
Dict with 'latex', 'markdown', 'mathml' keys.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
import httpx
|
pipeline = self._get_pipeline()
|
||||||
|
|
||||||
temp_path = self._save_temp_image(image)
|
output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula")
|
||||||
|
|
||||||
try:
|
markdown_content = ""
|
||||||
# Use vLLM API directly for formula recognition
|
|
||||||
import base64
|
|
||||||
|
|
||||||
with open(temp_path, "rb") as f:
|
for res in output:
|
||||||
image_base64 = base64.b64encode(f.read()).decode("utf-8")
|
markdown_content += res.markdown.get("markdown_texts", "")
|
||||||
|
|
||||||
# Call vLLM server with formula prompt
|
convert_result = self.converter.convert_to_formats(markdown_content)
|
||||||
response = httpx.post(
|
|
||||||
f"{self.vl_server_url}/chat/completions",
|
|
||||||
json={
|
|
||||||
"model": "paddleocr-vl",
|
|
||||||
"messages": [
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{"type": "text", "text": self.FORMULA_PROMPT},
|
|
||||||
{
|
|
||||||
"type": "image_url",
|
|
||||||
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"max_tokens": 1024,
|
|
||||||
},
|
|
||||||
timeout=60.0,
|
|
||||||
)
|
|
||||||
response.raise_for_status()
|
|
||||||
result = response.json()
|
|
||||||
|
|
||||||
latex = result["choices"][0]["message"]["content"].strip()
|
return {
|
||||||
|
"latex": convert_result.latex,
|
||||||
# Convert latex to other formats
|
"mathml": convert_result.mathml,
|
||||||
markdown = self._latex_to_markdown(latex)
|
"markdown": markdown_content,
|
||||||
mathml = self._latex_to_mathml(latex)
|
}
|
||||||
|
|
||||||
return {
|
|
||||||
"latex": latex,
|
|
||||||
"markdown": markdown,
|
|
||||||
"mathml": mathml,
|
|
||||||
}
|
|
||||||
finally:
|
|
||||||
Path(temp_path).unlink(missing_ok=True)
|
|
||||||
|
|
||||||
except httpx.HTTPStatusError as e:
|
|
||||||
raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Formula recognition failed: {e}") from e
|
raise RuntimeError(f"Formula recognition failed: {e}") from e
|
||||||
|
|
||||||
def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict:
|
def recognize(self, image: np.ndarray) -> dict:
|
||||||
"""Recognize content based on layout detection results.
|
"""Recognize content using PaddleOCR-VL.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image: Input image as numpy array in BGR format.
|
image: Input image as numpy array in BGR format.
|
||||||
layout_info: Layout detection results.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict with recognition results including mode used.
|
Dict with 'latex', 'markdown', 'mathml' keys.
|
||||||
"""
|
"""
|
||||||
# Decision logic:
|
padded_image = self.image_processor.add_padding(image)
|
||||||
# - If plain text exists -> use mixed_recognition (PP-DocLayoutV2)
|
layout_info = self.layout_detector.detect(padded_image)
|
||||||
# - Otherwise -> use formula_recognition (VL with prompt)
|
if layout_info.MixedRecognition:
|
||||||
if layout_info.has_plain_text:
|
return self.recognize_mixed(image)
|
||||||
result = self.recognize_mixed(image)
|
|
||||||
result["recognition_mode"] = "mixed_recognition"
|
|
||||||
else:
|
else:
|
||||||
result = self.recognize_formula(image)
|
return self.recognize_formula(image)
|
||||||
result["recognition_mode"] = "formula_recognition"
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
def _markdown_to_latex(self, markdown: str) -> str:
|
if __name__ == "__main__":
|
||||||
"""Convert markdown to LaTeX.
|
import cv2
|
||||||
|
from app.services.image_processor import ImageProcessor
|
||||||
Simple conversion - wraps content in LaTeX document structure.
|
from app.services.layout_detector import LayoutDetector
|
||||||
|
image_processor = ImageProcessor(padding_ratio=0.15)
|
||||||
Args:
|
layout_detector = LayoutDetector()
|
||||||
markdown: Markdown content.
|
ocr_service = OCRService(image_processor=image_processor, layout_detector=layout_detector)
|
||||||
|
image = cv2.imread("test/image.png")
|
||||||
Returns:
|
ocr_result = ocr_service.recognize(image)
|
||||||
LaTeX representation.
|
print(ocr_result)
|
||||||
"""
|
|
||||||
# Basic conversion: preserve math blocks, convert structure
|
|
||||||
lines = []
|
|
||||||
in_code_block = False
|
|
||||||
|
|
||||||
for line in markdown.split("\n"):
|
|
||||||
if line.startswith("```"):
|
|
||||||
in_code_block = not in_code_block
|
|
||||||
if in_code_block:
|
|
||||||
lines.append("\\begin{verbatim}")
|
|
||||||
else:
|
|
||||||
lines.append("\\end{verbatim}")
|
|
||||||
elif in_code_block:
|
|
||||||
lines.append(line)
|
|
||||||
elif line.startswith("# "):
|
|
||||||
lines.append(f"\\section{{{line[2:]}}}")
|
|
||||||
elif line.startswith("## "):
|
|
||||||
lines.append(f"\\subsection{{{line[3:]}}}")
|
|
||||||
elif line.startswith("### "):
|
|
||||||
lines.append(f"\\subsubsection{{{line[4:]}}}")
|
|
||||||
elif line.startswith("- "):
|
|
||||||
lines.append(f"\\item {line[2:]}")
|
|
||||||
elif line.startswith("$$"):
|
|
||||||
lines.append(line.replace("$$", "\\[").replace("$$", "\\]"))
|
|
||||||
elif "$" in line:
|
|
||||||
# Keep inline math as-is
|
|
||||||
lines.append(line)
|
|
||||||
else:
|
|
||||||
lines.append(line)
|
|
||||||
|
|
||||||
return "\n".join(lines)
|
|
||||||
|
|
||||||
def _latex_to_markdown(self, latex: str) -> str:
|
|
||||||
"""Convert LaTeX to markdown.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
latex: LaTeX content.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Markdown representation.
|
|
||||||
"""
|
|
||||||
# Wrap LaTeX in markdown math block
|
|
||||||
if latex.strip():
|
|
||||||
return f"$$\n{latex}\n$$"
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def _latex_to_mathml(self, latex: str) -> str:
|
|
||||||
"""Convert LaTeX to MathML.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
latex: LaTeX content.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
MathML representation.
|
|
||||||
"""
|
|
||||||
# Basic LaTeX to MathML conversion
|
|
||||||
# For production, consider using latex2mathml library
|
|
||||||
if not latex.strip():
|
|
||||||
return ""
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Try to use latex2mathml if available
|
|
||||||
from latex2mathml.converter import convert
|
|
||||||
|
|
||||||
return convert(latex)
|
|
||||||
except ImportError:
|
|
||||||
# Fallback: wrap in basic MathML structure
|
|
||||||
return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
|
|
||||||
except Exception:
|
|
||||||
return f'<math xmlns="http://www.w3.org/1998/Math/MathML"><mtext>{latex}</mtext></math>'
|
|
||||||
|
|
||||||
def _extract_mathml(self, markdown: str) -> str:
|
|
||||||
"""Extract and convert math from markdown to MathML.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
markdown: Markdown content.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
MathML for any math content found.
|
|
||||||
"""
|
|
||||||
import re
|
|
||||||
|
|
||||||
# Find all math blocks
|
|
||||||
math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL)
|
|
||||||
inline_math = re.findall(r"\$([^$]+)\$", markdown)
|
|
||||||
|
|
||||||
all_math = math_blocks + inline_math
|
|
||||||
|
|
||||||
if not all_math:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
# Convert each to MathML and combine
|
|
||||||
mathml_parts = []
|
|
||||||
for latex in all_math:
|
|
||||||
mathml = self._latex_to_mathml(latex.strip())
|
|
||||||
if mathml:
|
|
||||||
mathml_parts.append(mathml)
|
|
||||||
|
|
||||||
return "\n".join(mathml_parts)
|
|
||||||
@@ -2,30 +2,36 @@
|
|||||||
name = "doc-processer"
|
name = "doc-processer"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
description = "Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX"
|
description = "Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX"
|
||||||
readme = "README.md"
|
requires-python = ">=3.10"
|
||||||
requires-python = ">=3.11"
|
|
||||||
license = { text = "MIT" }
|
license = { text = "MIT" }
|
||||||
authors = [
|
authors = [
|
||||||
{ name = "YogeLiu" }
|
{ name = "YogeLiu" }
|
||||||
]
|
]
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"fastapi>=0.115.0",
|
"fastapi==0.128.0",
|
||||||
"uvicorn[standard]>=0.32.0",
|
"uvicorn[standard]==0.40.0",
|
||||||
"opencv-python>=4.10.0",
|
"opencv-python==4.12.0.88",
|
||||||
"python-multipart>=0.0.12",
|
"python-multipart==0.0.21",
|
||||||
"pydantic>=2.10.0",
|
"pydantic==2.12.5",
|
||||||
"pydantic-settings>=2.6.0",
|
"pydantic-settings==2.12.0",
|
||||||
"httpx>=0.28.0",
|
"httpx==0.28.1",
|
||||||
"numpy>=1.26.0",
|
"numpy==2.2.6",
|
||||||
"pillow>=10.4.0",
|
"pillow==12.0.0",
|
||||||
"python-docx>=1.1.0",
|
"python-docx==1.2.0",
|
||||||
"paddleocr>=2.9.0",
|
"paddleocr==3.3.2",
|
||||||
"doclayout-yolo>=0.0.2",
|
"doclayout-yolo==0.0.4",
|
||||||
"latex2mathml>=3.77.0",
|
"latex2mathml==3.78.1",
|
||||||
"paddle>=1.2.0",
|
"paddle==1.2.0",
|
||||||
|
"pypandoc==1.16.2",
|
||||||
|
"paddlepaddle",
|
||||||
|
"paddleocr[doc-parser]",
|
||||||
|
"safetensors"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[tool.uv.sources]
|
||||||
|
paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" }
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
dev = [
|
dev = [
|
||||||
"pytest>=8.0.0",
|
"pytest>=8.0.0",
|
||||||
|
|||||||
Reference in New Issue
Block a user