diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..c01ecbc
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,53 @@
+# Git
+.git
+.gitignore
+
+# Python
+.venv/
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.egg-info/
+.eggs/
+dist/
+build/
+
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+test/
+tests/
+
+# Linting & IDE
+.ruff_cache/
+.mypy_cache/
+.cursor/
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Environment
+.env
+.env.*
+!.env.example
+
+# Documentation (not needed in container)
+*.md
+!README.md
+openspec/
+
+# Models (mounted at runtime, not built into image)
+app/model/doclayout/*.pdiparams
+app/model/DocLayout/
+app/model/PP-DocLayout/
+
+# Misc
+*.log
+*.tmp
+.DS_Store
+Thumbs.db
+
diff --git a/.gitignore b/.gitignore
index e49f677..d9d72c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,3 +71,5 @@ htmlcov/
uv.lock
model/
+
+test/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 3f3b60c..1586f2b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,54 +1,73 @@
# DocProcesser Dockerfile
# Optimized for RTX 5080 GPU deployment
-# Use NVIDIA CUDA base image with Python 3.11
+# Use NVIDIA CUDA base image; Python 3.10 is installed below from the deadsnakes PPA
FROM nvidia/cuda:12.8.0-runtime-ubuntu24.04
# Set environment variables
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=1 \
- PIP_DISABLE_PIP_VERSION_CHECK=1
+ PIP_DISABLE_PIP_VERSION_CHECK=1 \
+ # Model cache directories - mount these at runtime
+ MODELSCOPE_CACHE=/root/.cache/modelscope \
+ HF_HOME=/root/.cache/huggingface \
+ # Application config (override defaults for container)
+ # Use 127.0.0.1 for --network host mode, or override with -e for bridge mode
+ PP_DOCLAYOUT_MODEL_DIR=/root/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2 \
+ PADDLEOCR_VL_URL=http://127.0.0.1:8000/v1
# Set working directory
WORKDIR /app
-# Install system dependencies
+# Install system dependencies and Python 3.10 from deadsnakes PPA
RUN apt-get update && apt-get install -y --no-install-recommends \
- python3.11 \
- python3.11-venv \
- python3.11-dev \
- python3-pip \
- libgl1-mesa-glx \
+ software-properties-common \
+ && add-apt-repository -y ppa:deadsnakes/ppa \
+ && apt-get update && apt-get install -y --no-install-recommends \
+ python3.10 \
+ python3.10-venv \
+ python3.10-dev \
+ python3.10-distutils \
+ libgl1 \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender-dev \
libgomp1 \
curl \
+ pandoc \
&& rm -rf /var/lib/apt/lists/* \
- && ln -sf /usr/bin/python3.11 /usr/bin/python \
- && ln -sf /usr/bin/python3.11 /usr/bin/python3
+ && ln -sf /usr/bin/python3.10 /usr/bin/python \
+ && ln -sf /usr/bin/python3.10 /usr/bin/python3 \
+ && curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
-# Install uv for fast package management
-RUN curl -LsSf https://astral.sh/uv/install.sh | sh
-ENV PATH="/root/.local/bin:$PATH"
-
-# Copy dependency files first for better caching
-COPY pyproject.toml ./
-
-# Create virtual environment and install dependencies
-RUN uv venv /app/.venv
+# Install uv via pip (more reliable than install script)
+RUN python3.10 -m pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
ENV PATH="/app/.venv/bin:$PATH"
ENV VIRTUAL_ENV="/app/.venv"
-RUN uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e .
+# Copy dependency files first for better caching
+COPY pyproject.toml ./
+COPY wheels/ ./wheels/
+
+# Create virtual environment and install dependencies
+RUN uv venv /app/.venv --python python3.10 \
+ && uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . \
+ && rm -rf ./wheels
# Copy application code
COPY app/ ./app/
-# Create model directories (models should be mounted at runtime)
-RUN mkdir -p /app/app/model/DocLayout /app/app/model/PP-DocLayout
+# Create model cache directories (mount from host at runtime)
+RUN mkdir -p /root/.cache/modelscope \
+ /root/.cache/huggingface \
+ /root/.paddlex \
+ /app/app/model/DocLayout \
+ /app/app/model/PP-DocLayout
+
+# Declare volumes for model cache (mount at runtime to avoid re-downloading)
+VOLUME ["/root/.cache/modelscope", "/root/.cache/huggingface", "/root/.paddlex"]
# Expose port
EXPOSE 8053
@@ -60,3 +79,21 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"]
+# =============================================================================
+# Usage: Mount local model cache to avoid downloading
+#
+# Option 1: Use host network (simplest, can access localhost services)
+# docker run --gpus all --network host \
+# -v /home/yoge/.paddlex:/root/.paddlex:ro \
+# -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \
+# -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \
+# doc_processer:latest
+#
+# Option 2: Use bridge network with host.docker.internal (Linux needs --add-host)
+# docker run --gpus all -p 8053:8053 \
+# --add-host=host.docker.internal:host-gateway \
+# -v /home/yoge/.paddlex:/root/.paddlex:ro \
+# -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \
+# -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \
+# doc_processer:latest
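+#
+# Build first (the tag is only an example):
+#   docker build -t doc_processer:latest .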
+# =============================================================================
diff --git a/app/api/v1/endpoints/convert.py b/app/api/v1/endpoints/convert.py
index 256c085..ea381fd 100644
--- a/app/api/v1/endpoints/convert.py
+++ b/app/api/v1/endpoints/convert.py
@@ -3,34 +3,28 @@
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import Response
-from app.core.dependencies import get_docx_converter
+from app.core.dependencies import get_converter
from app.schemas.convert import MarkdownToDocxRequest
-from app.services.docx_converter import DocxConverter
+from app.services.converter import Converter
router = APIRouter()
-@router.post("/docx")
+@router.post("/file")
async def convert_markdown_to_docx(
request: MarkdownToDocxRequest,
- converter: DocxConverter = Depends(get_docx_converter),
+ converter: Converter = Depends(get_converter),
) -> Response:
"""Convert markdown content to DOCX file.
- Returns the generated DOCX file as a binary download.
+ Returns the generated DOCX file as a binary response.
"""
try:
- docx_bytes = converter.convert(request.markdown)
+ docx_bytes = converter.export_to_file(request.markdown, export_type="docx")
+ return Response(
+ content=docx_bytes,
+ media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ headers={"Content-Disposition": f'attachment; filename="{request.filename}.docx"'},
+ )
except Exception as e:
raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
-
- # Determine filename
- filename = request.filename or "output"
- if not filename.endswith(".docx"):
- filename = f"{filename}.docx"
-
- return Response(
- content=docx_bytes,
- media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- headers={"Content-Disposition": f'attachment; filename="{filename}"'},
- )
diff --git a/app/api/v1/endpoints/image.py b/app/api/v1/endpoints/image.py
index c194213..635ebf7 100644
--- a/app/api/v1/endpoints/image.py
+++ b/app/api/v1/endpoints/image.py
@@ -28,24 +28,15 @@ async def process_image_ocr(
- Otherwise: use PaddleOCR-VL with formula prompt
4. Convert output to LaTeX, Markdown, and MathML formats
"""
- try:
- # 1. Load and preprocess image
- image = image_processor.preprocess(
- image_url=request.image_url,
- image_base64=request.image_base64,
- )
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- try:
- # 2. Detect layout
- layout_info = layout_detector.detect(image)
- except RuntimeError as e:
- raise HTTPException(status_code=500, detail=f"Layout detection failed: {e}")
+    try:
+        image = image_processor.preprocess(
+            image_url=request.image_url,
+            image_base64=request.image_base64,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
try:
# 3. Perform OCR based on layout
- ocr_result = ocr_service.recognize(image, layout_info)
+ ocr_result = ocr_service.recognize(image)
except RuntimeError as e:
raise HTTPException(status_code=503, detail=str(e))
@@ -54,6 +45,4 @@ async def process_image_ocr(
latex=ocr_result.get("latex", ""),
markdown=ocr_result.get("markdown", ""),
mathml=ocr_result.get("mathml", ""),
- layout_info=layout_info,
- recognition_mode=ocr_result.get("recognition_mode", ""),
)
diff --git a/app/core/config.py b/app/core/config.py
index af18a14..c3d81a7 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -5,6 +5,7 @@ from pathlib import Path
from pydantic_settings import BaseSettings, SettingsConfigDict
import torch
+from typing import Optional
class Settings(BaseSettings):
@@ -21,11 +22,10 @@ class Settings(BaseSettings):
debug: bool = False
# PaddleOCR-VL Settings
- paddleocr_vl_url: str = "http://localhost:8080/v1"
+ paddleocr_vl_url: str = "http://127.0.0.1:8000/v1"
# Model Paths
- doclayout_model_path: str = "app/model/DocLayout/best.pt"
- pp_doclayout_model_dir: str = "app/model/PP-DocLayout/PP-DocLayoutV2"
+ pp_doclayout_model_dir: Optional[str] = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2"
# Image Processing
max_image_size_mb: int = 10
@@ -37,11 +37,6 @@ class Settings(BaseSettings):
host: str = "0.0.0.0"
port: int = 8053
- @property
- def doclayout_model_file(self) -> Path:
- """Get the DocLayout model file path."""
- return Path(self.doclayout_model_path)
-
@property
def pp_doclayout_dir(self) -> Path:
"""Get the PP-DocLayout model directory path."""
diff --git a/app/core/dependencies.py b/app/core/dependencies.py
index dcd04ae..ea19022 100644
--- a/app/core/dependencies.py
+++ b/app/core/dependencies.py
@@ -3,20 +3,20 @@
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector
from app.services.ocr_service import OCRService
-from app.services.docx_converter import DocxConverter
+from app.services.converter import Converter
+from app.core.config import get_settings
# Global instances (initialized on startup)
_layout_detector: LayoutDetector | None = None
-def init_layout_detector(model_path: str) -> None:
+def init_layout_detector() -> None:
"""Initialize the global layout detector.
Called during application startup.
"""
global _layout_detector
- _layout_detector = LayoutDetector(model_path=model_path)
- _layout_detector.load_model()
+ _layout_detector = LayoutDetector()
def get_layout_detector() -> LayoutDetector:
@@ -33,10 +33,15 @@ def get_image_processor() -> ImageProcessor:
def get_ocr_service() -> OCRService:
"""Get an OCR service instance."""
- return OCRService()
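+    # Per-request construction is cheap: the heavy PaddleOCR-VL pipeline and the
+    # layout model are shared singletons inside OCRService and LayoutDetector.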
+ return OCRService(
+ vl_server_url=get_settings().paddleocr_vl_url,
+ layout_detector=get_layout_detector(),
+ image_processor=get_image_processor(),
+ converter=get_converter(),
+ )
-def get_docx_converter() -> DocxConverter:
+def get_converter() -> Converter:
"""Get a DOCX converter instance."""
- return DocxConverter()
+ return Converter()
diff --git a/app/main.py b/app/main.py
index 174b5ae..88d9fe2 100644
--- a/app/main.py
+++ b/app/main.py
@@ -15,7 +15,7 @@ settings = get_settings()
async def lifespan(app: FastAPI):
"""Application lifespan handler for startup/shutdown."""
# Startup: Load models
- init_layout_detector(model_path=settings.doclayout_model_path)
+ init_layout_detector()
yield
@@ -37,3 +37,9 @@ app.include_router(api_router, prefix=settings.api_prefix)
async def health_check():
"""Health check endpoint."""
return {"status": "healthy"}
+
+
+
+if __name__ == "__main__":
+ import uvicorn
+ uvicorn.run(app, host="0.0.0.0", port=8053)
\ No newline at end of file
diff --git a/app/pkg/reference.docx b/app/pkg/reference.docx
new file mode 100644
index 0000000..9f8bdb4
Binary files /dev/null and b/app/pkg/reference.docx differ
diff --git a/app/schemas/convert.py b/app/schemas/convert.py
index 09661e3..97f933e 100644
--- a/app/schemas/convert.py
+++ b/app/schemas/convert.py
@@ -7,7 +7,7 @@ class MarkdownToDocxRequest(BaseModel):
"""Request body for markdown to DOCX conversion endpoint."""
markdown: str = Field(..., description="Markdown content to convert")
- filename: str | None = Field(None, description="Optional output filename (without extension)")
+ filename: str = Field("texpixel", description="Optional output filename (without extension)")
@field_validator("markdown")
@classmethod
diff --git a/app/schemas/image.py b/app/schemas/image.py
index ed81233..3378843 100644
--- a/app/schemas/image.py
+++ b/app/schemas/image.py
@@ -9,14 +9,15 @@ class LayoutRegion(BaseModel):
type: str = Field(..., description="Region type: text, formula, table, figure")
bbox: list[float] = Field(..., description="Bounding box [x1, y1, x2, y2]")
confidence: float = Field(..., description="Detection confidence score")
+ score: float = Field(..., description="Detection score")
class LayoutInfo(BaseModel):
"""Layout detection information."""
regions: list[LayoutRegion] = Field(default_factory=list)
- has_plain_text: bool = Field(False, description="Whether plain text was detected")
- has_formula: bool = Field(False, description="Whether formulas were detected")
+ MixedRecognition: bool = Field(False, description="Whether mixed recognition was used")
+ # FormulaRecognition: bool = Field(False, description="Whether formula recognition (with prompt) was used")
class ImageOCRRequest(BaseModel):
diff --git a/app/services/converter.py b/app/services/converter.py
new file mode 100644
index 0000000..4cf73a8
--- /dev/null
+++ b/app/services/converter.py
@@ -0,0 +1,312 @@
+"""Markdown conversion and export service using pypandoc."""
+
+import os
+import re
+import tempfile
+from dataclasses import dataclass
+from typing import Literal
+
+import pypandoc
+
+
+@dataclass
+class ConvertResult:
+ """Result of markdown conversion."""
+
+ latex: str
+ mathml: str
+
+
+@dataclass
+class ExportResult:
+ """Result of markdown export."""
+
+ file_path: str
+ content_type: str
+ download_name: str
+
+
+ExportType = Literal["docx", "pdf"]
+
+
+class Converter:
+ """Service for conversion and export operations."""
+
+ # Pandoc input format with LaTeX math extensions
+ INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
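+    # raw_tex passes remaining LaTeX commands through; tex_math_dollars enables
+    # $...$ / $$...$$ math; tex_math_double_backslash enables \\(...\\) and \\[...\\].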
+
+ def __init__(self):
+ """Initialize converter."""
+
+ def convert_to_formats(self, md_text: str) -> ConvertResult:
+ """Convert markdown to LaTeX and MathML formats.
+
+ Args:
+ md_text: Markdown text to convert.
+
+ Returns:
+ ConvertResult with latex and mathml fields.
+
+ Raises:
+ ValueError: If md_text is empty.
+ RuntimeError: If conversion fails.
+ """
+ if md_text == "":
+ return ConvertResult(latex="", mathml="")
+
+ try:
+ # Convert to LaTeX
+ latex_output = pypandoc.convert_text(
+ md_text,
+ "latex",
+ format=self.INPUT_FORMAT,
+ ).rstrip("\n")
+
+ # Convert to HTML with MathML
+ mathml_output = pypandoc.convert_text(
+ md_text,
+ "html",
+ format=self.INPUT_FORMAT,
+ extra_args=["--mathml"],
+ ).rstrip("\n")
+
+ return ConvertResult(latex=latex_output, mathml=mathml_output)
+
+ except Exception as e:
+ raise RuntimeError(f"Conversion failed: {e}") from e
+
+ def preprocess_for_export(self, md_text: str) -> str:
+ """Preprocess markdown text for export to docx/pdf.
+
+ Handles LaTeX formula formatting, matrix environments, and
+ other transformations needed for proper Word/PDF rendering.
+
+ Args:
+ md_text: Raw markdown text.
+
+ Returns:
+ Preprocessed markdown text.
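+
+        Example (illustrative): inline ``\\(E=mc^2\\)`` becomes ``$E=mc^2$`` and a
+        ``\\[ ... \\]`` display block becomes a blank-line separated ``$$ ... $$`` block.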
+ """
+ # Replace \[1mm] => \vspace{1mm}
+ md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)
+
+ # Add blank lines around \[...\] block formulas
+ md_text = re.sub(
+ r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
+ r"\1\n\n\\[\3\\]\n\n\4",
+ md_text,
+ flags=re.DOTALL,
+ )
+ md_text = re.sub(
+ r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
+ r"\n\\[\2\\]\n",
+ md_text,
+ flags=re.MULTILINE | re.DOTALL,
+ )
+
+ # Remove arithmatex span wrappers
+        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)
+
+ # Convert inline formulas: \( \) => $ $
+ cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
+ cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
+
+ # Convert block formulas: \[ \] => $$ $$
+ cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
+ cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
+
+ # Remove spaces between $ and formula content
+ # Use negative lookahead/lookbehind to avoid matching $$ block formulas
+ cleaned_md = re.sub(r"(? str:
+ """Convert vmatrix/Vmatrix to left/right delimited forms.
+
+ This fixes the vertical line height issues in Word.
+ """
+ # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
+ md_text = re.sub(
+ r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
+ r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
+ md_text,
+ flags=re.DOTALL,
+ )
+
+ # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
+ md_text = re.sub(
+ r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
+ r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
+ md_text,
+ flags=re.DOTALL,
+ )
+
+ return md_text
+
+ def _fix_brace_spacing(self, md_text: str) -> str:
+ """Fix spacing issues with braces in equation systems.
+
+ Removes whitespace and adds negative space for proper alignment in Word/OMML.
+ """
+ # Fix \left\{ spacing
+ md_text = re.sub(
+ r"\\left\\\{\s+",
+ r"\\left\\{\\!",
+ md_text,
+ )
+
+ # Fix \right\} spacing
+ md_text = re.sub(
+ r"\s+\\right\\\}",
+ r"\\!\\right\\}",
+ md_text,
+ )
+
+ return md_text
+
+ def _convert_special_environments(self, md_text: str) -> str:
+ """Convert cases and aligned environments to array format.
+
+ These environments have better rendering support in Word/OMML.
+ """
+
+ def convert_cases(match: re.Match) -> str:
+ content = match.group(1)
+ return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."
+
+ md_text = re.sub(
+ r"\\begin\{cases\}(.*?)\\end\{cases\}",
+ convert_cases,
+ md_text,
+ flags=re.DOTALL,
+ )
+
+ def convert_aligned_to_array(match: re.Match) -> str:
+ content = match.group(1)
+ # Remove leading & alignment markers (not needed in array{l})
+ content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
+ return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."
+
+ md_text = re.sub(
+ r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
+ convert_aligned_to_array,
+ md_text,
+ flags=re.DOTALL,
+ )
+
+ def convert_standalone_aligned(match: re.Match) -> str:
+ content = match.group(1)
+ content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
+ return r"\begin{array}{l}" + content + r"\end{array}"
+
+ md_text = re.sub(
+ r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
+ convert_standalone_aligned,
+ md_text,
+ flags=re.DOTALL,
+ )
+
+ return md_text
+
+ def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
+ """Export markdown to docx or pdf file.
+
+ Args:
+ md_text: Markdown text to export.
+ export_type: Export format, either 'docx' or 'pdf'.
+
+ Returns:
+ bytes of the exported file.
+
+ Raises:
+ ValueError: If export_type is not supported.
+ RuntimeError: If export fails.
+
+ """
+
+
+        if export_type not in ("docx", "pdf"):
+            raise ValueError(f"Unsupported export type: {export_type}")
+
+        # Preprocess markdown
+
+ # Create temp file for input
+ with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in:
+ f_in.write(cleaned_md.encode("utf-8"))
+ md_path = f_in.name
+
+ output_file = md_path + "." + export_type
+
+        try:
+            if export_type == "docx":
+                self._export_docx(md_path, output_file)
+            else:  # pdf
+                self._export_pdf(md_path, output_file)
+            with open(output_file, "rb") as f:
+                return f.read()
+        except Exception as e:
+            raise RuntimeError(f"Export failed: {e}") from e
+        finally:
+            # Always clean up the temporary input and output files
+            self._cleanup_files(md_path, output_file)
+
+ def _export_docx(self, input_path: str, output_path: str) -> None:
+ """Export to DOCX format using pypandoc."""
+ extra_args = [
+ "--highlight-style=pygments",
+ f"--reference-doc=app/pkg/reference.docx",
+ ]
+ pypandoc.convert_file(
+ input_path,
+ "docx",
+ format=self.INPUT_FORMAT,
+ outputfile=output_path,
+ extra_args=extra_args,
+ )
+
+ def _export_pdf(self, input_path: str, output_path: str) -> None:
+ """Export to PDF format using pypandoc with XeLaTeX."""
+ extra_args = [
+ "--pdf-engine=xelatex",
+ "-V",
+ "mainfont=Noto Sans CJK SC",
+ "--highlight-style=pygments",
+ ]
+ pypandoc.convert_file(
+ input_path,
+ "pdf",
+ format=self.INPUT_FORMAT,
+ outputfile=output_path,
+ extra_args=extra_args,
+ )
+
+ def _cleanup_files(self, *paths: str) -> None:
+ """Remove files if they exist."""
+ for path in paths:
+ if os.path.exists(path):
+ os.remove(path)
+
+ def cleanup_export_file(self, file_path: str) -> None:
+ """Cleanup exported file after sending response.
+
+ Call this after sending the file to the client.
+
+ Args:
+ file_path: Path to the exported file.
+ """
+ if os.path.exists(file_path):
+ os.remove(file_path)
+
diff --git a/app/services/docx_converter.py b/app/services/docx_converter.py
deleted file mode 100644
index 6364507..0000000
--- a/app/services/docx_converter.py
+++ /dev/null
@@ -1,335 +0,0 @@
-"""Markdown to DOCX conversion service.
-
-Reference implementation based on https://github.com/YogeLiu/markdown_2_docx
-"""
-
-import io
-import re
-from dataclasses import dataclass
-
-from docx import Document
-from docx.enum.text import WD_ALIGN_PARAGRAPH
-from docx.oxml import OxmlElement
-from docx.oxml.ns import qn
-from docx.shared import Inches, Pt
-
-
-@dataclass
-class MarkdownElement:
- """Parsed markdown element."""
-
- type: str # heading, paragraph, list_item, code_block, table, math
- content: str
- level: int = 0 # For headings and lists
- language: str = "" # For code blocks
-
-
-class DocxConverter:
- """Converts markdown content to DOCX format."""
-
- def __init__(self):
- """Initialize the converter."""
- self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$")
- self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$")
- self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$")
- self.code_block_pattern = re.compile(r"^```(\w*)$")
- self.inline_code_pattern = re.compile(r"`([^`]+)`")
- self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*")
- self.italic_pattern = re.compile(r"\*([^*]+)\*")
- self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL)
- self.inline_math_pattern = re.compile(r"\$([^$]+)\$")
-
- def convert(self, markdown: str) -> bytes:
- """Convert markdown content to DOCX.
-
- Args:
- markdown: Markdown content to convert.
-
- Returns:
- DOCX file as bytes.
- """
- doc = Document()
- elements = self._parse_markdown(markdown)
-
- for element in elements:
- self._add_element_to_doc(doc, element)
-
- # Save to bytes
- buffer = io.BytesIO()
- doc.save(buffer)
- buffer.seek(0)
- return buffer.getvalue()
-
- def _parse_markdown(self, markdown: str) -> list[MarkdownElement]:
- """Parse markdown into elements.
-
- Args:
- markdown: Markdown content.
-
- Returns:
- List of parsed elements.
- """
- elements: list[MarkdownElement] = []
- lines = markdown.split("\n")
- i = 0
- in_code_block = False
- code_content = []
- code_language = ""
-
- while i < len(lines):
- line = lines[i]
-
- # Code block handling
- code_match = self.code_block_pattern.match(line)
- if code_match:
- if in_code_block:
- elements.append(
- MarkdownElement(
- type="code_block",
- content="\n".join(code_content),
- language=code_language,
- )
- )
- code_content = []
- in_code_block = False
- else:
- in_code_block = True
- code_language = code_match.group(1)
- i += 1
- continue
-
- if in_code_block:
- code_content.append(line)
- i += 1
- continue
-
- # Math block ($$...$$)
- if line.strip().startswith("$$"):
- math_content = []
- if line.strip() == "$$":
- i += 1
- while i < len(lines) and lines[i].strip() != "$$":
- math_content.append(lines[i])
- i += 1
- else:
- # Single line $$...$$ or start
- content = line.strip()[2:]
- if content.endswith("$$"):
- math_content.append(content[:-2])
- else:
- math_content.append(content)
- i += 1
- while i < len(lines):
- if lines[i].strip().endswith("$$"):
- math_content.append(lines[i].strip()[:-2])
- break
- math_content.append(lines[i])
- i += 1
-
- elements.append(
- MarkdownElement(type="math", content="\n".join(math_content))
- )
- i += 1
- continue
-
- # Heading
- heading_match = self.heading_pattern.match(line)
- if heading_match:
- level = len(heading_match.group(1))
- content = heading_match.group(2)
- elements.append(
- MarkdownElement(type="heading", content=content, level=level)
- )
- i += 1
- continue
-
- # Unordered list
- list_match = self.list_pattern.match(line)
- if list_match:
- indent = len(list_match.group(1))
- content = list_match.group(2)
- elements.append(
- MarkdownElement(type="list_item", content=content, level=indent // 2)
- )
- i += 1
- continue
-
- # Ordered list
- ordered_match = self.ordered_list_pattern.match(line)
- if ordered_match:
- indent = len(ordered_match.group(1))
- content = ordered_match.group(2)
- elements.append(
- MarkdownElement(
- type="ordered_list_item", content=content, level=indent // 2
- )
- )
- i += 1
- continue
-
- # Table (simple detection)
- if "|" in line and i + 1 < len(lines) and "---" in lines[i + 1]:
- table_lines = [line]
- i += 1
- while i < len(lines) and "|" in lines[i]:
- table_lines.append(lines[i])
- i += 1
- elements.append(
- MarkdownElement(type="table", content="\n".join(table_lines))
- )
- continue
-
- # Regular paragraph
- if line.strip():
- elements.append(MarkdownElement(type="paragraph", content=line))
-
- i += 1
-
- return elements
-
- def _add_element_to_doc(self, doc: Document, element: MarkdownElement) -> None:
- """Add a markdown element to the document.
-
- Args:
- doc: Word document.
- element: Parsed markdown element.
- """
- if element.type == "heading":
- self._add_heading(doc, element.content, element.level)
- elif element.type == "paragraph":
- self._add_paragraph(doc, element.content)
- elif element.type == "list_item":
- self._add_list_item(doc, element.content, element.level, ordered=False)
- elif element.type == "ordered_list_item":
- self._add_list_item(doc, element.content, element.level, ordered=True)
- elif element.type == "code_block":
- self._add_code_block(doc, element.content)
- elif element.type == "table":
- self._add_table(doc, element.content)
- elif element.type == "math":
- self._add_math(doc, element.content)
-
- def _add_heading(self, doc: Document, content: str, level: int) -> None:
- """Add a heading to the document."""
- # Map markdown levels to Word heading styles
- heading_level = min(level, 9) # Word supports up to Heading 9
- doc.add_heading(content, level=heading_level)
-
- def _add_paragraph(self, doc: Document, content: str) -> None:
- """Add a paragraph with inline formatting."""
- para = doc.add_paragraph()
- self._add_formatted_text(para, content)
-
- def _add_formatted_text(self, para, content: str) -> None:
- """Add text with inline formatting (bold, italic, code)."""
- # Simple approach: process inline patterns
- remaining = content
-
- while remaining:
- # Find next formatting marker
- bold_match = self.bold_pattern.search(remaining)
- italic_match = self.italic_pattern.search(remaining)
- code_match = self.inline_code_pattern.search(remaining)
- math_match = self.inline_math_pattern.search(remaining)
-
- matches = [
- (bold_match, "bold"),
- (italic_match, "italic"),
- (code_match, "code"),
- (math_match, "math"),
- ]
- matches = [(m, t) for m, t in matches if m]
-
- if not matches:
- para.add_run(remaining)
- break
-
- # Find earliest match
- earliest = min(matches, key=lambda x: x[0].start())
- match, match_type = earliest
-
- # Add text before match
- if match.start() > 0:
- para.add_run(remaining[: match.start()])
-
- # Add formatted text
- run = para.add_run(match.group(1))
- if match_type == "bold":
- run.bold = True
- elif match_type == "italic":
- run.italic = True
- elif match_type == "code":
- run.font.name = "Courier New"
- run.font.size = Pt(10)
- elif match_type == "math":
- run.italic = True
-
- remaining = remaining[match.end() :]
-
- def _add_list_item(
- self, doc: Document, content: str, level: int, ordered: bool
- ) -> None:
- """Add a list item."""
- para = doc.add_paragraph(style="List Bullet" if not ordered else "List Number")
- para.paragraph_format.left_indent = Inches(0.25 * level)
- self._add_formatted_text(para, content)
-
- def _add_code_block(self, doc: Document, content: str) -> None:
- """Add a code block."""
- para = doc.add_paragraph()
- para.paragraph_format.left_indent = Inches(0.5)
-
- run = para.add_run(content)
- run.font.name = "Courier New"
- run.font.size = Pt(9)
-
- # Add shading
- shading = OxmlElement("w:shd")
- shading.set(qn("w:val"), "clear")
- shading.set(qn("w:fill"), "F0F0F0")
- para._p.get_or_add_pPr().append(shading)
-
- def _add_table(self, doc: Document, content: str) -> None:
- """Add a table from markdown table format."""
- lines = [l.strip() for l in content.split("\n") if l.strip()]
- if len(lines) < 2:
- return
-
- # Parse header
- header = [c.strip() for c in lines[0].split("|") if c.strip()]
-
- # Skip separator line
- data_lines = lines[2:] if len(lines) > 2 else []
-
- # Create table
- table = doc.add_table(rows=1, cols=len(header))
- table.style = "Table Grid"
-
- # Add header
- header_cells = table.rows[0].cells
- for i, text in enumerate(header):
- header_cells[i].text = text
- header_cells[i].paragraphs[0].runs[0].bold = True
-
- # Add data rows
- for line in data_lines:
- cells = [c.strip() for c in line.split("|") if c.strip()]
- row_cells = table.add_row().cells
- for i, text in enumerate(cells):
- if i < len(row_cells):
- row_cells[i].text = text
-
- def _add_math(self, doc: Document, content: str) -> None:
- """Add a math block.
-
- For proper OMML rendering, this would need more complex conversion.
- Currently renders as italic text with the LaTeX source.
- """
- para = doc.add_paragraph()
- para.alignment = WD_ALIGN_PARAGRAPH.CENTER
-
- run = para.add_run(content)
- run.italic = True
- run.font.name = "Cambria Math"
- run.font.size = Pt(12)
-
diff --git a/app/services/image_processor.py b/app/services/image_processor.py
index 34a6419..d7abed1 100644
--- a/app/services/image_processor.py
+++ b/app/services/image_processor.py
@@ -116,7 +116,7 @@ class ImageProcessor:
else:
raise ValueError("Either image_url or image_base64 must be provided")
- return self.add_padding(image)
+ return image
def image_to_base64(self, image: np.ndarray, format: str = "PNG") -> str:
"""Convert numpy image to base64 string.
diff --git a/app/services/layout_detector.py b/app/services/layout_detector.py
index b7ed407..3cd8446 100644
--- a/app/services/layout_detector.py
+++ b/app/services/layout_detector.py
@@ -1,122 +1,157 @@
-"""DocLayout-YOLO wrapper for document layout detection."""
+"""PP-DocLayoutV2 wrapper for document layout detection."""
import numpy as np
from app.schemas.image import LayoutInfo, LayoutRegion
from app.core.config import get_settings
+from paddleocr import LayoutDetection
+from typing import Optional
settings = get_settings()
class LayoutDetector:
- """Wrapper for DocLayout-YOLO model."""
+ """Layout detector for PP-DocLayoutV2."""
- # Class names from DocLayout-YOLO
- CLASS_NAMES = {
- 0: "title",
- 1: "plain_text",
- 2: "abandon",
- 3: "figure",
- 4: "figure_caption",
- 5: "table",
- 6: "table_caption",
- 7: "table_footnote",
- 8: "isolate_formula",
- 9: "formula_caption",
+ _layout_detector: Optional[LayoutDetection] = None
+
+ # PP-DocLayoutV2 class ID to label mapping
+ CLS_ID_TO_LABEL: dict[int, str] = {
+ 0: "abstract",
+ 1: "algorithm",
+ 2: "aside_text",
+ 3: "chart",
+ 4: "content",
+ 5: "display_formula",
+ 6: "doc_title",
+ 7: "figure_title",
+ 8: "footer",
+ 9: "footer_image",
+ 10: "footnote",
+ 11: "formula_number",
+ 12: "header",
+ 13: "header_image",
+ 14: "image",
+ 15: "inline_formula",
+ 16: "number",
+ 17: "paragraph_title",
+ 18: "reference",
+ 19: "reference_content",
+ 20: "seal",
+ 21: "table",
+ 22: "text",
+ 23: "vertical_text",
+ 24: "vision_footnote",
}
- # Classes considered as plain text
- PLAIN_TEXT_CLASSES = {"title", "plain_text", "figure_caption", "table_caption", "table_footnote"}
+ # Mapping from raw labels to normalized region types
+ LABEL_TO_TYPE: dict[str, str] = {
+ # Text types
+ "abstract": "text",
+ "algorithm": "text",
+ "aside_text": "text",
+ "content": "text",
+ "doc_title": "text",
+ "footer": "text",
+ "footnote": "text",
+ "header": "text",
+ "number": "text",
+ "paragraph_title": "text",
+ "reference": "text",
+ "reference_content": "text",
+ "text": "text",
+ "vertical_text": "text",
+ "vision_footnote": "text",
+ # Formula types
+ "display_formula": "formula",
+ "inline_formula": "formula",
+ "formula_number": "formula",
+ # Table types
+ "table": "table",
+ # Figure types
+ "chart": "figure",
+ "figure_title": "figure",
+ "footer_image": "figure",
+ "header_image": "figure",
+ "image": "figure",
+ "seal": "figure",
+ }
- # Classes considered as formula
- FORMULA_CLASSES = {"isolate_formula", "formula_caption"}
-
- def __init__(self, model_path: str, confidence_threshold: float = 0.2):
- """Initialize the layout detector.
+ def __init__(self):
+ """Initialize layout detector.
Args:
- model_path: Path to the DocLayout-YOLO model weights.
- confidence_threshold: Minimum confidence for detections.
"""
- self.model_path = model_path
- self.confidence_threshold = confidence_threshold
- self.model = None
+ _ = self._get_layout_detector()
- def load_model(self) -> None:
- """Load the DocLayout-YOLO model.
+ def _get_layout_detector(self):
+ """Get or create LayoutDetection instance."""
+ if LayoutDetector._layout_detector is None:
+ LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")
+ return LayoutDetector._layout_detector
- Raises:
- RuntimeError: If model cannot be loaded.
- """
- try:
- from doclayout_yolo import YOLOv10
-
- self.model = YOLOv10(self.model_path)
- except Exception as e:
- raise RuntimeError(f"Failed to load DocLayout-YOLO model: {e}") from e
-
- def detect(self, image: np.ndarray, image_size: int = 1024) -> LayoutInfo:
- """Detect document layout regions.
+ def detect(self, image: np.ndarray) -> LayoutInfo:
+ """Detect layout of the image using PP-DocLayoutV2.
Args:
- image: Input image as numpy array in BGR format.
- image_size: Image size for prediction.
+ image: Input image as numpy array.
Returns:
- LayoutInfo with detected regions.
-
- Raises:
- RuntimeError: If model not loaded.
+ LayoutInfo with detected regions and flags.
"""
- if self.model is None:
- raise RuntimeError("Model not loaded. Call load_model() first.")
-
- # Run prediction
- results = self.model.predict(
- image,
- imgsz=image_size,
- conf=self.confidence_threshold,
- device=settings.device,
- )
+ layout_detector = self._get_layout_detector()
+ result = layout_detector.predict(image)
+ # Parse the result
regions: list[LayoutRegion] = []
- has_plain_text = False
- has_formula = False
+ mixed_recognition = False
- if results and len(results) > 0:
- result = results[0]
- if result.boxes is not None:
- for box in result.boxes:
- cls_id = int(box.cls[0].item())
- confidence = float(box.conf[0].item())
- bbox = box.xyxy[0].tolist()
+ # Handle result format: [{'input_path': ..., 'page_index': None, 'boxes': [...]}]
+ if isinstance(result, list) and len(result) > 0:
+ first_result = result[0]
+ if isinstance(first_result, dict) and "boxes" in first_result:
+ boxes = first_result.get("boxes", [])
+ else:
+ boxes = []
+ else:
+ boxes = []
- class_name = self.CLASS_NAMES.get(cls_id, f"unknown_{cls_id}")
+ for box in boxes:
+ cls_id = box.get("cls_id")
+ label = box.get("label") or self.CLS_ID_TO_LABEL.get(cls_id, "other")
+ score = box.get("score", 0.0)
+ coordinate = box.get("coordinate", [0, 0, 0, 0])
- # Map to simplified type
- if class_name in self.PLAIN_TEXT_CLASSES:
- region_type = "text"
- has_plain_text = True
- elif class_name in self.FORMULA_CLASSES:
- region_type = "formula"
- has_formula = True
- elif class_name in {"figure"}:
- region_type = "figure"
- elif class_name in {"table"}:
- region_type = "table"
- else:
- region_type = class_name
+ # Normalize label to region type
+ region_type = self.LABEL_TO_TYPE.get(label, "text")
- regions.append(
- LayoutRegion(
- type=region_type,
- bbox=bbox,
- confidence=confidence,
- )
- )
+ regions.append(LayoutRegion(
+ type=region_type,
+ bbox=coordinate,
+ confidence=score,
+ score=score,
+ ))
- return LayoutInfo(
- regions=regions,
- has_plain_text=has_plain_text,
- has_formula=has_formula,
- )
+
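+        # Flag mixed (text + formula) content when any text region is detected
+        # with a score above 0.85; the OCR service uses this flag to pick a mode.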
+ mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions)
+
+ return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition)
+
+
+if __name__ == "__main__":
+ import cv2
+ from app.services.image_processor import ImageProcessor
+
+ layout_detector = LayoutDetector()
+ image_path = "test/timeout.png"
+
+ image = cv2.imread(image_path)
+ image_processor = ImageProcessor(padding_ratio=0.15)
+ image = image_processor.add_padding(image)
+
+ # Save the padded image for debugging
+ cv2.imwrite("debug_padded_image.png", image)
+
+
+ layout_info = layout_detector.detect(image)
+ print(layout_info)
\ No newline at end of file
diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py
index 8c7fe41..5b65798 100644
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -1,14 +1,12 @@
"""PaddleOCR-VL client service for text and formula recognition."""
-import io
-import tempfile
-from pathlib import Path
-
-import cv2
import numpy as np
-
from app.core.config import get_settings
-from app.schemas.image import LayoutInfo
+from paddleocr import PaddleOCRVL
+from typing import Optional
+from app.services.layout_detector import LayoutDetector
+from app.services.image_processor import ImageProcessor
+from app.services.converter import Converter
settings = get_settings()
@@ -16,52 +14,40 @@ settings = get_settings()
class OCRService:
"""Service for OCR using PaddleOCR-VL."""
- FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format."
+ _pipeline: Optional[PaddleOCRVL] = None
+ _layout_detector: Optional[LayoutDetector] = None
def __init__(
self,
- vl_server_url: str | None = None,
- pp_doclayout_model_dir: str | None = None,
+ vl_server_url: str,
+ layout_detector: LayoutDetector,
+ image_processor: ImageProcessor,
+ converter: Converter,
):
"""Initialize OCR service.
Args:
vl_server_url: URL of the vLLM server for PaddleOCR-VL.
- pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory.
+ layout_detector: Layout detector instance.
+            image_processor: Image processor instance.
+            converter: Converter used to produce LaTeX/MathML output.
"""
self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
- self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir
- self._pipeline = None
-
- def _get_pipeline(self):
+ self.layout_detector = layout_detector
+ self.image_processor = image_processor
+ self.converter = converter
+ def _get_pipeline(self):
"""Get or create PaddleOCR-VL pipeline.
Returns:
PaddleOCRVL pipeline instance.
"""
- if self._pipeline is None:
- from paddleocr import PaddleOCRVL
-
- self._pipeline = PaddleOCRVL(
+ if OCRService._pipeline is None:
+ OCRService._pipeline = PaddleOCRVL(
vl_rec_backend="vllm-server",
vl_rec_server_url=self.vl_server_url,
layout_detection_model_name="PP-DocLayoutV2",
- layout_detection_model_dir=self.pp_doclayout_model_dir,
)
- return self._pipeline
-
- def _save_temp_image(self, image: np.ndarray) -> str:
- """Save image to a temporary file.
-
- Args:
- image: Image as numpy array in BGR format.
-
- Returns:
- Path to temporary file.
- """
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
- cv2.imwrite(f.name, image)
- return f.name
+ return OCRService._pipeline
def recognize_mixed(self, image: np.ndarray) -> dict:
"""Recognize mixed content (text + formulas) using PP-DocLayoutV2.
@@ -77,30 +63,21 @@ class OCRService:
"""
try:
pipeline = self._get_pipeline()
- temp_path = self._save_temp_image(image)
- try:
- results = list(pipeline.predict(temp_path))
+ output = pipeline.predict(image, use_layout_detection=True)
- markdown_content = ""
- for result in results:
- # PaddleOCR-VL results can be saved to markdown
- md_buffer = io.StringIO()
- result.save_to_markdown(save_path=md_buffer)
- markdown_content += md_buffer.getvalue()
+ markdown_content = ""
- # Convert markdown to other formats
- latex = self._markdown_to_latex(markdown_content)
- mathml = self._extract_mathml(markdown_content)
+ for res in output:
+ markdown_content += res.markdown.get("markdown_texts", "")
- return {
- "markdown": markdown_content,
- "latex": latex,
- "mathml": mathml,
- }
- finally:
- Path(temp_path).unlink(missing_ok=True)
+ convert_result = self.converter.convert_to_formats(markdown_content)
+ return {
+ "markdown": markdown_content,
+ "latex": convert_result.latex,
+ "mathml": convert_result.mathml,
+ }
except Exception as e:
raise RuntimeError(f"Mixed recognition failed: {e}") from e
@@ -116,188 +93,49 @@ class OCRService:
Dict with 'latex', 'markdown', 'mathml' keys.
"""
try:
- import httpx
+ pipeline = self._get_pipeline()
- temp_path = self._save_temp_image(image)
+ output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula")
- try:
- # Use vLLM API directly for formula recognition
- import base64
+ markdown_content = ""
- with open(temp_path, "rb") as f:
- image_base64 = base64.b64encode(f.read()).decode("utf-8")
+ for res in output:
+ markdown_content += res.markdown.get("markdown_texts", "")
- # Call vLLM server with formula prompt
- response = httpx.post(
- f"{self.vl_server_url}/chat/completions",
- json={
- "model": "paddleocr-vl",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "text", "text": self.FORMULA_PROMPT},
- {
- "type": "image_url",
- "image_url": {"url": f"data:image/png;base64,{image_base64}"},
- },
- ],
- }
- ],
- "max_tokens": 1024,
- },
- timeout=60.0,
- )
- response.raise_for_status()
- result = response.json()
+ convert_result = self.converter.convert_to_formats(markdown_content)
- latex = result["choices"][0]["message"]["content"].strip()
-
- # Convert latex to other formats
- markdown = self._latex_to_markdown(latex)
- mathml = self._latex_to_mathml(latex)
-
- return {
- "latex": latex,
- "markdown": markdown,
- "mathml": mathml,
- }
- finally:
- Path(temp_path).unlink(missing_ok=True)
-
- except httpx.HTTPStatusError as e:
- raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e
+ return {
+ "latex": convert_result.latex,
+ "mathml": convert_result.mathml,
+ "markdown": markdown_content,
+ }
except Exception as e:
raise RuntimeError(f"Formula recognition failed: {e}") from e
- def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict:
- """Recognize content based on layout detection results.
+ def recognize(self, image: np.ndarray) -> dict:
+ """Recognize content using PaddleOCR-VL.
Args:
image: Input image as numpy array in BGR format.
- layout_info: Layout detection results.
Returns:
- Dict with recognition results including mode used.
+ Dict with 'latex', 'markdown', 'mathml' keys.
"""
- # Decision logic:
- # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2)
- # - Otherwise -> use formula_recognition (VL with prompt)
- if layout_info.has_plain_text:
- result = self.recognize_mixed(image)
- result["recognition_mode"] = "mixed_recognition"
+ padded_image = self.image_processor.add_padding(image)
+ layout_info = self.layout_detector.detect(padded_image)
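+        # If layout detection flagged confident text regions, run full document
+        # (mixed) recognition; otherwise treat the whole image as a formula.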
+ if layout_info.MixedRecognition:
+ return self.recognize_mixed(image)
else:
- result = self.recognize_formula(image)
- result["recognition_mode"] = "formula_recognition"
+ return self.recognize_formula(image)
- return result
- def _markdown_to_latex(self, markdown: str) -> str:
- """Convert markdown to LaTeX.
-
- Simple conversion - wraps content in LaTeX document structure.
-
- Args:
- markdown: Markdown content.
-
- Returns:
- LaTeX representation.
- """
- # Basic conversion: preserve math blocks, convert structure
- lines = []
- in_code_block = False
-
- for line in markdown.split("\n"):
- if line.startswith("```"):
- in_code_block = not in_code_block
- if in_code_block:
- lines.append("\\begin{verbatim}")
- else:
- lines.append("\\end{verbatim}")
- elif in_code_block:
- lines.append(line)
- elif line.startswith("# "):
- lines.append(f"\\section{{{line[2:]}}}")
- elif line.startswith("## "):
- lines.append(f"\\subsection{{{line[3:]}}}")
- elif line.startswith("### "):
- lines.append(f"\\subsubsection{{{line[4:]}}}")
- elif line.startswith("- "):
- lines.append(f"\\item {line[2:]}")
- elif line.startswith("$$"):
- lines.append(line.replace("$$", "\\[").replace("$$", "\\]"))
- elif "$" in line:
- # Keep inline math as-is
- lines.append(line)
- else:
- lines.append(line)
-
- return "\n".join(lines)
-
- def _latex_to_markdown(self, latex: str) -> str:
- """Convert LaTeX to markdown.
-
- Args:
- latex: LaTeX content.
-
- Returns:
- Markdown representation.
- """
- # Wrap LaTeX in markdown math block
- if latex.strip():
- return f"$$\n{latex}\n$$"
- return ""
-
- def _latex_to_mathml(self, latex: str) -> str:
- """Convert LaTeX to MathML.
-
- Args:
- latex: LaTeX content.
-
- Returns:
- MathML representation.
- """
- # Basic LaTeX to MathML conversion
- # For production, consider using latex2mathml library
- if not latex.strip():
- return ""
-
- try:
- # Try to use latex2mathml if available
- from latex2mathml.converter import convert
-
- return convert(latex)
- except ImportError:
- # Fallback: wrap in basic MathML structure
- return f''
- except Exception:
- return f''
-
- def _extract_mathml(self, markdown: str) -> str:
- """Extract and convert math from markdown to MathML.
-
- Args:
- markdown: Markdown content.
-
- Returns:
- MathML for any math content found.
- """
- import re
-
- # Find all math blocks
- math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL)
- inline_math = re.findall(r"\$([^$]+)\$", markdown)
-
- all_math = math_blocks + inline_math
-
- if not all_math:
- return ""
-
- # Convert each to MathML and combine
- mathml_parts = []
- for latex in all_math:
- mathml = self._latex_to_mathml(latex.strip())
- if mathml:
- mathml_parts.append(mathml)
-
- return "\n".join(mathml_parts)
+if __name__ == "__main__":
+    import cv2
+
+    image_processor = ImageProcessor(padding_ratio=0.15)
+    layout_detector = LayoutDetector()
+    ocr_service = OCRService(
+        vl_server_url=settings.paddleocr_vl_url,
+        layout_detector=layout_detector,
+        image_processor=image_processor,
+        converter=Converter(),
+    )
+    image = cv2.imread("test/image.png")
+    ocr_result = ocr_service.recognize(image)
+    print(ocr_result)
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 92c9177..50a6860 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,30 +2,36 @@
name = "doc-processer"
version = "0.1.0"
description = "Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX"
-readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.10"
license = { text = "MIT" }
authors = [
{ name = "YogeLiu" }
]
dependencies = [
- "fastapi>=0.115.0",
- "uvicorn[standard]>=0.32.0",
- "opencv-python>=4.10.0",
- "python-multipart>=0.0.12",
- "pydantic>=2.10.0",
- "pydantic-settings>=2.6.0",
- "httpx>=0.28.0",
- "numpy>=1.26.0",
- "pillow>=10.4.0",
- "python-docx>=1.1.0",
- "paddleocr>=2.9.0",
- "doclayout-yolo>=0.0.2",
- "latex2mathml>=3.77.0",
- "paddle>=1.2.0",
+ "fastapi==0.128.0",
+ "uvicorn[standard]==0.40.0",
+ "opencv-python==4.12.0.88",
+ "python-multipart==0.0.21",
+ "pydantic==2.12.5",
+ "pydantic-settings==2.12.0",
+ "httpx==0.28.1",
+ "numpy==2.2.6",
+ "pillow==12.0.0",
+ "python-docx==1.2.0",
+ "paddleocr==3.3.2",
+ "doclayout-yolo==0.0.4",
+ "latex2mathml==3.78.1",
+ "paddle==1.2.0",
+ "pypandoc==1.16.2",
+ "paddlepaddle",
+ "paddleocr[doc-parser]",
+ "safetensors"
]
+[tool.uv.sources]
+paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" }
+
[project.optional-dependencies]
dev = [
"pytest>=8.0.0",