commit 874fd383ccdafbcdeb205c774b30a19fca242eda
Author: liuyuanchuang
Date:   Mon Dec 29 17:34:58 2025 +0800

    init repo

diff --git a/.cursor/commands/openspec-apply.md b/.cursor/commands/openspec-apply.md
new file mode 100644
index 0000000..99a9148
--- /dev/null
+++ b/.cursor/commands/openspec-apply.md
@@ -0,0 +1,23 @@
+---
+name: /openspec-apply
+id: openspec-apply
+category: OpenSpec
+description: Implement an approved OpenSpec change and keep tasks in sync.
+---
+
+**Guardrails**
+- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required.
+- Keep changes tightly scoped to the requested outcome.
+- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications.
+
+**Steps**
+Track these steps as TODOs and complete them one by one.
+1. Read `changes/<change-id>/proposal.md`, `design.md` (if present), and `tasks.md` to confirm scope and acceptance criteria.
+2. Work through tasks sequentially, keeping edits minimal and focused on the requested change.
+3. Confirm completion before updating statuses—make sure every item in `tasks.md` is finished.
+4. Update the checklist after all work is done so each task is marked `- [x]` and reflects reality.
+5. Reference `openspec list` or `openspec show <change-id>` when additional context is required.
+
+**Reference**
+- Use `openspec show <change-id> --json --deltas-only` if you need additional context from the proposal while implementing.
+
diff --git a/.cursor/commands/openspec-archive.md b/.cursor/commands/openspec-archive.md
new file mode 100644
index 0000000..492b3ed
--- /dev/null
+++ b/.cursor/commands/openspec-archive.md
@@ -0,0 +1,27 @@
+---
+name: /openspec-archive
+id: openspec-archive
+category: OpenSpec
+description: Archive a deployed OpenSpec change and update specs.
+---
+
+**Guardrails**
+- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required.
+- Keep changes tightly scoped to the requested outcome.
+- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications.
+
+**Steps**
+1. Determine the change ID to archive:
+   - If this prompt already includes a specific change ID (for example supplied through slash-command arguments), use that value after trimming whitespace.
+   - If the conversation references a change loosely (for example by title or summary), run `openspec list` to surface likely IDs, share the relevant candidates, and confirm which one the user intends.
+   - Otherwise, review the conversation, run `openspec list`, and ask the user which change to archive; wait for a confirmed change ID before proceeding.
+   - If you still cannot identify a single change ID, stop and tell the user you cannot archive anything yet.
+2. Validate the change ID by running `openspec list` (or `openspec show <change-id>`) and stop if the change is missing, already archived, or otherwise not ready to archive.
+3. Run `openspec archive <change-id> --yes` so the CLI moves the change and applies spec updates without prompts (use `--skip-specs` only for tooling-only work).
+4. Review the command output to confirm the target specs were updated and the change landed in `changes/archive/`.
+5. Validate with `openspec validate --strict` and inspect with `openspec show <change-id>` if anything looks off.
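+
+A minimal sketch of the happy path, assuming a hypothetical change ID `add-example-feature`:
+
+```bash
+# add-example-feature is a hypothetical change ID used for illustration
+openspec show add-example-feature           # confirm the change exists and is ready
+openspec archive add-example-feature --yes  # move the change and apply spec updates
+openspec validate --strict                  # confirm everything still validates
+```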
+
+**Reference**
+- Use `openspec list` to confirm change IDs before archiving.
+- Inspect refreshed specs with `openspec list --specs` and address any validation issues before handing off.
+
diff --git a/.cursor/commands/openspec-proposal.md b/.cursor/commands/openspec-proposal.md
new file mode 100644
index 0000000..25f1a3f
--- /dev/null
+++ b/.cursor/commands/openspec-proposal.md
@@ -0,0 +1,28 @@
+---
+name: /openspec-proposal
+id: openspec-proposal
+category: OpenSpec
+description: Scaffold a new OpenSpec change and validate strictly.
+---
+
+**Guardrails**
+- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required.
+- Keep changes tightly scoped to the requested outcome.
+- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications.
+- Identify any vague or ambiguous details and ask the necessary follow-up questions before editing files.
+- Do not write any code during the proposal stage. Only create design documents (proposal.md, tasks.md, design.md, and spec deltas). Implementation happens in the apply stage after approval.
+
+**Steps**
+1. Review `openspec/project.md`, run `openspec list` and `openspec list --specs`, and inspect related code or docs (e.g., via `rg`/`ls`) to ground the proposal in current behaviour; note any gaps that require clarification.
+2. Choose a unique verb-led `change-id` and scaffold `proposal.md`, `tasks.md`, and `design.md` (when needed) under `openspec/changes/<change-id>/`.
+3. Map the change into concrete capabilities or requirements, breaking multi-scope efforts into distinct spec deltas with clear relationships and sequencing.
+4. Capture architectural reasoning in `design.md` when the solution spans multiple systems, introduces new patterns, or demands trade-off discussion before committing to specs.
+5. Draft spec deltas in `changes/<change-id>/specs/<capability>/spec.md` (one folder per capability) using `## ADDED|MODIFIED|REMOVED Requirements` with at least one `#### Scenario:` per requirement and cross-reference related capabilities when relevant.
+6. Draft `tasks.md` as an ordered list of small, verifiable work items that deliver user-visible progress, include validation (tests, tooling), and highlight dependencies or parallelizable work.
+7. Validate with `openspec validate <change-id> --strict` and resolve every issue before sharing the proposal.
+
+**Reference**
+- Use `openspec show <change-id> --json --deltas-only` or `openspec show <spec-id> --type spec` to inspect details when validation fails.
+- Search existing requirements with `rg -n "Requirement:|Scenario:" openspec/specs` before writing new ones.
+- Explore the codebase with `rg <pattern>`, `ls`, or direct file reads so proposals align with current implementation realities.
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f8ab18c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,73 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+.venv/
+venv/
+ENV/
+env/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+
+# Environment
+.env
+.env.local
+.env.*.local
+
+# Models (large files - download separately)
+models/
+*.pt
+*.onnx
+*.pdmodel
+*.pdiparams
+
+# Logs
+*.log
+logs/
+
+# Temporary files
+tmp/
+temp/
+*.tmp
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Test
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+
+# Docker
+.docker/
+
+# uv
+uv.lock
+
+model/*
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..0669699
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,18 @@
+<!-- OPENSPEC:START -->
+# OpenSpec Instructions
+
+These instructions are for AI assistants working in this project.
+
+Always open `@/openspec/AGENTS.md` when the request:
+- Mentions planning or proposals (words like proposal, spec, change, plan)
+- Introduces new capabilities, breaking changes, architecture shifts, or big performance/security work
+- Sounds ambiguous and you need the authoritative spec before coding
+
+Use `@/openspec/AGENTS.md` to learn:
+- How to create and apply change proposals
+- Spec format and conventions
+- Project structure and guidelines
+
+Keep this managed block so 'openspec update' can refresh the instructions.
+
+<!-- OPENSPEC:END -->
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..3f3b60c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,62 @@
+# DocProcesser Dockerfile
+# Optimized for RTX 5080 GPU deployment
+
+# Use NVIDIA CUDA base image on Ubuntu 24.04 (ships Python 3.12)
+FROM nvidia/cuda:12.8.0-runtime-ubuntu24.04
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies (Ubuntu 24.04 ships Python 3.12; the python3.11
+# and libgl1-mesa-glx packages are not available in its repositories)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.12 \
+    python3.12-venv \
+    python3.12-dev \
+    python3-pip \
+    libgl1 \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1 \
+    curl \
+    && rm -rf /var/lib/apt/lists/* \
+    && ln -sf /usr/bin/python3.12 /usr/bin/python \
+    && ln -sf /usr/bin/python3.12 /usr/bin/python3
+
+# Install uv for fast package management
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:$PATH"
+
+# Copy dependency files first for better caching
+COPY pyproject.toml ./
+
+# Create virtual environment and install dependencies
+RUN uv venv /app/.venv
+ENV PATH="/app/.venv/bin:$PATH"
+ENV VIRTUAL_ENV="/app/.venv"
+
+RUN uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e .
+
+# Copy application code
+COPY app/ ./app/
+
+# Create model directories (models should be mounted at runtime)
+RUN mkdir -p /app/app/model/DocLayout /app/app/model/PP-DocLayout
+
+# Expose port
+EXPOSE 8053
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8053/health || exit 1
+
+# Run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"]
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b6f8c9a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,199 @@
+# DocProcesser
+
+Document processing API built with FastAPI.
Converts images to LaTeX/Markdown/MathML and Markdown to DOCX. + +## Features + +- **Image OCR API** (`POST /doc_process/v1/image/ocr`) + - Accept images via URL or base64 + - Automatic layout detection using DocLayout-YOLO + - Text and formula recognition via PaddleOCR-VL + - Output in LaTeX, Markdown, and MathML formats + +- **Markdown to DOCX API** (`POST /doc_process/v1/convert/docx`) + - Convert markdown content to Word documents + - Preserve formatting, tables, and code blocks + +## Prerequisites + +- Python 3.11+ +- NVIDIA GPU with CUDA support (RTX 5080 recommended) +- PaddleOCR-VL service running via vLLM (default: `http://localhost:8080/v1`) +- Pre-downloaded models: + - DocLayout-YOLO + - PP-DocLayoutV2 + +## Quick Start + +### 1. Install Dependencies + +Using [uv](https://github.com/astral-sh/uv): + +```bash +# Install uv if not already installed +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Create virtual environment and install dependencies +uv venv +source .venv/bin/activate # On Windows: .venv\Scripts\activate +uv pip install -e . +``` + +### 2. Download Models + +Download the required models and place them in the `models/` directory: + +```bash +mkdir -p models/DocLayout models/PP-DocLayout + +# DocLayout-YOLO (from HuggingFace) +# https://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench +# Place the .pt file in models/DocLayout/ + +# PP-DocLayoutV2 (from PaddlePaddle) +# Place the model files in models/PP-DocLayout/ +``` + +### 3. Configure Environment + +Create a `.env` file: + +```bash +# PaddleOCR-VL vLLM server URL +PADDLEOCR_VL_URL=http://localhost:8080/v1 + +# Model paths +DOCLAYOUT_MODEL_PATH=models/DocLayout/doclayout_yolo_docstructbench_imgsz1024.pt +PP_DOCLAYOUT_MODEL_DIR=models/PP-DocLayout + +# Server settings +HOST=0.0.0.0 +PORT=8053 +``` + +### 4. Run the Server + +```bash +uvicorn app.main:app --host 0.0.0.0 --port 8053 +``` + +## Docker Deployment + +### Build and Run with GPU + +```bash +# Build the image +docker build -t doc-processer . 
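+# Note: the build context must include pyproject.toml and app/, which the
+# Dockerfile copies into the image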
+
+# Run with GPU support
+docker run --gpus all -p 8053:8053 \
+  -v ./models/DocLayout:/app/models/DocLayout:ro \
+  -v ./models/PP-DocLayout:/app/models/PP-DocLayout:ro \
+  -e DOCLAYOUT_MODEL_PATH=/app/models/DocLayout/doclayout_yolo_docstructbench_imgsz1024.pt \
+  -e PP_DOCLAYOUT_MODEL_DIR=/app/models/PP-DocLayout \
+  -e PADDLEOCR_VL_URL=http://host.docker.internal:8080/v1 \
+  doc-processer
+```
+
+The two model-path variables point the app at the mounted volumes; without them it falls back to the baked-in `app/model/` defaults and startup fails.
+
+### Using Docker Compose
+
+```bash
+# Start the service with GPU
+docker-compose up -d doc-processer
+
+# Or without GPU (CPU mode)
+docker-compose --profile cpu up -d doc-processer-cpu
+```
+
+## API Usage
+
+### Image OCR
+
+```bash
+# Using image URL
+curl -X POST http://localhost:8053/doc_process/v1/image/ocr \
+  -H "Content-Type: application/json" \
+  -d '{"image_url": "https://example.com/document.png"}'
+
+# Using base64 image
+curl -X POST http://localhost:8053/doc_process/v1/image/ocr \
+  -H "Content-Type: application/json" \
+  -d '{"image_base64": "iVBORw0KGgo..."}'
+```
+
+Response:
+```json
+{
+  "latex": "\\section{Title}...",
+  "markdown": "# Title\n...",
+  "mathml": "...",
+  "layout_info": {
+    "regions": [
+      {"type": "text", "bbox": [10, 20, 100, 50], "confidence": 0.95}
+    ],
+    "has_plain_text": true,
+    "has_formula": false
+  },
+  "recognition_mode": "mixed_recognition"
+}
+```
+
+### Markdown to DOCX
+
+```bash
+curl -X POST http://localhost:8053/doc_process/v1/convert/docx \
+  -H "Content-Type: application/json" \
+  -d '{"markdown": "# Hello World\n\nThis is a test.", "filename": "output"}' \
+  --output output.docx
+```
+
+## Project Structure
+
+```
+doc_processer/
+├── app/
+│   ├── api/v1/
+│   │   ├── endpoints/
+│   │   │   ├── image.py        # Image OCR endpoint
+│   │   │   └── convert.py      # Markdown to DOCX endpoint
+│   │   └── router.py
+│   ├── core/
+│   │   ├── config.py           # Settings
+│   │   └── dependencies.py     # DI providers
+│   ├── services/
+│   │   ├── image_processor.py  # OpenCV preprocessing
+│   │   ├── layout_detector.py  # DocLayout-YOLO
+│   │   ├── ocr_service.py      # PaddleOCR-VL client
+│   │   └── docx_converter.py   # Markdown to DOCX
+│   ├── schemas/
+│   │   ├── image.py
+│   │   └── convert.py
+│   └── main.py
+├── models/                     # Pre-downloaded models (git-ignored)
+├── Dockerfile
+├── docker-compose.yml
+├── pyproject.toml
+└── README.md
+```
+
+## Processing Pipeline
+
+### Image OCR Flow
+
+1. **Input**: Accept `image_url` or `image_base64`
+2. **Preprocessing**: Add 30% whitespace padding using OpenCV
+3. **Layout Detection**: DocLayout-YOLO detects regions (text, formula, table, figure)
+4. **Recognition**:
+   - If plain text detected → PP-DocLayoutV2 for mixed content recognition
+   - Otherwise → PaddleOCR-VL with formula prompt
+5.
**Output Conversion**: Generate LaTeX, Markdown, and MathML + +## Hardware Requirements + +- **Minimum**: 8GB GPU VRAM +- **Recommended**: RTX 5080 16GB or equivalent +- **CPU**: 4+ cores +- **RAM**: 16GB+ + +## License + +MIT + diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/v1/__init__.py b/app/api/v1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/v1/endpoints/__init__.py b/app/api/v1/endpoints/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/v1/endpoints/convert.py b/app/api/v1/endpoints/convert.py new file mode 100644 index 0000000..0ffa29a --- /dev/null +++ b/app/api/v1/endpoints/convert.py @@ -0,0 +1,37 @@ +"""Markdown to DOCX conversion endpoint.""" + +from fastapi import APIRouter, Depends, HTTPException +from fastapi.responses import Response + +from app.core.dependencies import get_docx_converter +from app.schemas.convert import MarkdownToDocxRequest +from app.services.docx_converter import DocxConverter + +router = APIRouter() + + +@router.post("/docx") +async def convert_markdown_to_docx( + request: MarkdownToDocxRequest, + converter: DocxConverter = Depends(get_docx_converter), +) -> Response: + """Convert markdown content to DOCX file. + + Returns the generated DOCX file as a binary download. + """ + try: + docx_bytes = converter.convert(request.markdown) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") + + # Determine filename + filename = request.filename or "output" + if not filename.endswith(".docx"): + filename = f"{filename}.docx" + + return Response( + content=docx_bytes, + media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + headers={"Content-Disposition": f'attachment; filename="{filename}"'}, + ) + diff --git a/app/api/v1/endpoints/image.py b/app/api/v1/endpoints/image.py new file mode 100644 index 0000000..c194213 --- /dev/null +++ b/app/api/v1/endpoints/image.py @@ -0,0 +1,59 @@ +"""Image OCR endpoint.""" + +from fastapi import APIRouter, Depends, HTTPException + +from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service +from app.schemas.image import ImageOCRRequest, ImageOCRResponse +from app.services.image_processor import ImageProcessor +from app.services.layout_detector import LayoutDetector +from app.services.ocr_service import OCRService + +router = APIRouter() + + +@router.post("/ocr", response_model=ImageOCRResponse) +async def process_image_ocr( + request: ImageOCRRequest, + image_processor: ImageProcessor = Depends(get_image_processor), + layout_detector: LayoutDetector = Depends(get_layout_detector), + ocr_service: OCRService = Depends(get_ocr_service), +) -> ImageOCRResponse: + """Process an image and extract content as LaTeX, Markdown, and MathML. + + The processing pipeline: + 1. Load and preprocess image (add 30% whitespace padding) + 2. Detect layout using DocLayout-YOLO + 3. Based on layout: + - If plain text exists: use PP-DocLayoutV2 for mixed recognition + - Otherwise: use PaddleOCR-VL with formula prompt + 4. Convert output to LaTeX, Markdown, and MathML formats + """ + try: + # 1. Load and preprocess image + image = image_processor.preprocess( + image_url=request.image_url, + image_base64=request.image_base64, + ) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + + try: + # 2. 
Detect layout + layout_info = layout_detector.detect(image) + except RuntimeError as e: + raise HTTPException(status_code=500, detail=f"Layout detection failed: {e}") + + try: + # 3. Perform OCR based on layout + ocr_result = ocr_service.recognize(image, layout_info) + except RuntimeError as e: + raise HTTPException(status_code=503, detail=str(e)) + + # 4. Return response + return ImageOCRResponse( + latex=ocr_result.get("latex", ""), + markdown=ocr_result.get("markdown", ""), + mathml=ocr_result.get("mathml", ""), + layout_info=layout_info, + recognition_mode=ocr_result.get("recognition_mode", ""), + ) diff --git a/app/api/v1/router.py b/app/api/v1/router.py new file mode 100644 index 0000000..a553985 --- /dev/null +++ b/app/api/v1/router.py @@ -0,0 +1,13 @@ +"""API v1 router combining all endpoints.""" + +from fastapi import APIRouter + +from app.api.v1.endpoints import convert, image + +api_router = APIRouter() + +# Include image processing endpoints +api_router.include_router(image.router, prefix="/image", tags=["Image OCR"]) + +# Include conversion endpoints +api_router.include_router(convert.router, prefix="/convert", tags=["Conversion"]) diff --git a/app/core/__init__.py b/app/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/core/config.py b/app/core/config.py new file mode 100644 index 0000000..0a57ad1 --- /dev/null +++ b/app/core/config.py @@ -0,0 +1,52 @@ +"""Application configuration using Pydantic Settings.""" + +from functools import lru_cache +from pathlib import Path + +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + """Application settings loaded from environment variables.""" + + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + case_sensitive=False, + ) + + # API Settings + api_prefix: str = "/doc_process/v1" + debug: bool = False + + # PaddleOCR-VL Settings + paddleocr_vl_url: str = "http://localhost:8080/v1" + + # Model Paths + doclayout_model_path: str = "app/model/DocLayout" + pp_doclayout_model_dir: str = "app/model/PP-DocLayout" + + # Image Processing + max_image_size_mb: int = 10 + image_padding_ratio: float = 0.15 # 15% on each side = 30% total expansion + + # Server Settings + host: str = "0.0.0.0" + port: int = 8053 + + @property + def doclayout_model_file(self) -> Path: + """Get the DocLayout model file path.""" + return Path(self.doclayout_model_path) + + @property + def pp_doclayout_dir(self) -> Path: + """Get the PP-DocLayout model directory path.""" + return Path(self.pp_doclayout_model_dir) + + +@lru_cache +def get_settings() -> Settings: + """Get cached settings instance.""" + return Settings() + diff --git a/app/core/dependencies.py b/app/core/dependencies.py new file mode 100644 index 0000000..dcd04ae --- /dev/null +++ b/app/core/dependencies.py @@ -0,0 +1,42 @@ +"""Application dependencies.""" + +from app.services.image_processor import ImageProcessor +from app.services.layout_detector import LayoutDetector +from app.services.ocr_service import OCRService +from app.services.docx_converter import DocxConverter + +# Global instances (initialized on startup) +_layout_detector: LayoutDetector | None = None + + +def init_layout_detector(model_path: str) -> None: + """Initialize the global layout detector. + + Called during application startup. 
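+
+    Invoked from the FastAPI lifespan handler in app/main.py before the
+    application starts serving requests.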
+ """ + global _layout_detector + _layout_detector = LayoutDetector(model_path=model_path) + _layout_detector.load_model() + + +def get_layout_detector() -> LayoutDetector: + """Get the global layout detector instance.""" + if _layout_detector is None: + raise RuntimeError("Layout detector not initialized. Call init_layout_detector() first.") + return _layout_detector + + +def get_image_processor() -> ImageProcessor: + """Get an image processor instance.""" + return ImageProcessor() + + +def get_ocr_service() -> OCRService: + """Get an OCR service instance.""" + return OCRService() + + +def get_docx_converter() -> DocxConverter: + """Get a DOCX converter instance.""" + return DocxConverter() + diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..174b5ae --- /dev/null +++ b/app/main.py @@ -0,0 +1,39 @@ +"""FastAPI application entry point.""" + +from contextlib import asynccontextmanager + +from fastapi import FastAPI + +from app.api.v1.router import api_router +from app.core.config import get_settings +from app.core.dependencies import init_layout_detector + +settings = get_settings() + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Application lifespan handler for startup/shutdown.""" + # Startup: Load models + init_layout_detector(model_path=settings.doclayout_model_path) + + yield + + # Shutdown: Cleanup happens automatically + + +app = FastAPI( + title="DocProcesser API", + description="Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX", + version="0.1.0", + lifespan=lifespan, +) + +# Include API router +app.include_router(api_router, prefix=settings.api_prefix) + + +@app.get("/health") +async def health_check(): + """Health check endpoint.""" + return {"status": "healthy"} diff --git a/app/model/__init__.py b/app/model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/schemas/convert.py b/app/schemas/convert.py new file mode 100644 index 0000000..09661e3 --- /dev/null +++ b/app/schemas/convert.py @@ -0,0 +1,19 @@ +"""Request and response schemas for markdown to DOCX conversion endpoint.""" + +from pydantic import BaseModel, Field, field_validator + + +class MarkdownToDocxRequest(BaseModel): + """Request body for markdown to DOCX conversion endpoint.""" + + markdown: str = Field(..., description="Markdown content to convert") + filename: str | None = Field(None, description="Optional output filename (without extension)") + + @field_validator("markdown") + @classmethod + def validate_markdown_not_empty(cls, v: str) -> str: + """Validate that markdown content is not empty.""" + if not v or not v.strip(): + raise ValueError("Markdown content cannot be empty") + return v + diff --git a/app/schemas/image.py b/app/schemas/image.py new file mode 100644 index 0000000..ed81233 --- /dev/null +++ b/app/schemas/image.py @@ -0,0 +1,48 @@ +"""Request and response schemas for image OCR endpoint.""" + +from pydantic import BaseModel, Field, model_validator + + +class LayoutRegion(BaseModel): + """A detected layout region in the document.""" + + type: str = Field(..., description="Region type: text, formula, table, figure") + bbox: list[float] = Field(..., description="Bounding box [x1, y1, x2, y2]") + confidence: float = Field(..., description="Detection confidence score") + + +class LayoutInfo(BaseModel): + """Layout detection information.""" + + regions: list[LayoutRegion] = Field(default_factory=list) + 
has_plain_text: bool = Field(False, description="Whether plain text was detected") + has_formula: bool = Field(False, description="Whether formulas were detected") + + +class ImageOCRRequest(BaseModel): + """Request body for image OCR endpoint.""" + + image_url: str | None = Field(None, description="URL to fetch the image from") + image_base64: str | None = Field(None, description="Base64-encoded image data") + + @model_validator(mode="after") + def validate_input(self): + """Validate that exactly one of image_url or image_base64 is provided.""" + if self.image_url is None and self.image_base64 is None: + raise ValueError("Either image_url or image_base64 must be provided") + if self.image_url is not None and self.image_base64 is not None: + raise ValueError("Only one of image_url or image_base64 should be provided") + return self + + +class ImageOCRResponse(BaseModel): + """Response body for image OCR endpoint.""" + + latex: str = Field("", description="LaTeX representation of the content") + markdown: str = Field("", description="Markdown representation of the content") + mathml: str = Field("", description="MathML representation (empty if no math detected)") + layout_info: LayoutInfo = Field(default_factory=LayoutInfo) + recognition_mode: str = Field( + "", description="Recognition mode used: mixed_recognition or formula_recognition" + ) + diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/services/docx_converter.py b/app/services/docx_converter.py new file mode 100644 index 0000000..6364507 --- /dev/null +++ b/app/services/docx_converter.py @@ -0,0 +1,335 @@ +"""Markdown to DOCX conversion service. + +Reference implementation based on https://github.com/YogeLiu/markdown_2_docx +""" + +import io +import re +from dataclasses import dataclass + +from docx import Document +from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.oxml import OxmlElement +from docx.oxml.ns import qn +from docx.shared import Inches, Pt + + +@dataclass +class MarkdownElement: + """Parsed markdown element.""" + + type: str # heading, paragraph, list_item, code_block, table, math + content: str + level: int = 0 # For headings and lists + language: str = "" # For code blocks + + +class DocxConverter: + """Converts markdown content to DOCX format.""" + + def __init__(self): + """Initialize the converter.""" + self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$") + self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$") + self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$") + self.code_block_pattern = re.compile(r"^```(\w*)$") + self.inline_code_pattern = re.compile(r"`([^`]+)`") + self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*") + self.italic_pattern = re.compile(r"\*([^*]+)\*") + self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL) + self.inline_math_pattern = re.compile(r"\$([^$]+)\$") + + def convert(self, markdown: str) -> bytes: + """Convert markdown content to DOCX. + + Args: + markdown: Markdown content to convert. + + Returns: + DOCX file as bytes. + """ + doc = Document() + elements = self._parse_markdown(markdown) + + for element in elements: + self._add_element_to_doc(doc, element) + + # Save to bytes + buffer = io.BytesIO() + doc.save(buffer) + buffer.seek(0) + return buffer.getvalue() + + def _parse_markdown(self, markdown: str) -> list[MarkdownElement]: + """Parse markdown into elements. + + Args: + markdown: Markdown content. + + Returns: + List of parsed elements. 
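+
+        Example:
+            >>> DocxConverter()._parse_markdown("# Title")[0].type
+            'heading'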
+ """ + elements: list[MarkdownElement] = [] + lines = markdown.split("\n") + i = 0 + in_code_block = False + code_content = [] + code_language = "" + + while i < len(lines): + line = lines[i] + + # Code block handling + code_match = self.code_block_pattern.match(line) + if code_match: + if in_code_block: + elements.append( + MarkdownElement( + type="code_block", + content="\n".join(code_content), + language=code_language, + ) + ) + code_content = [] + in_code_block = False + else: + in_code_block = True + code_language = code_match.group(1) + i += 1 + continue + + if in_code_block: + code_content.append(line) + i += 1 + continue + + # Math block ($$...$$) + if line.strip().startswith("$$"): + math_content = [] + if line.strip() == "$$": + i += 1 + while i < len(lines) and lines[i].strip() != "$$": + math_content.append(lines[i]) + i += 1 + else: + # Single line $$...$$ or start + content = line.strip()[2:] + if content.endswith("$$"): + math_content.append(content[:-2]) + else: + math_content.append(content) + i += 1 + while i < len(lines): + if lines[i].strip().endswith("$$"): + math_content.append(lines[i].strip()[:-2]) + break + math_content.append(lines[i]) + i += 1 + + elements.append( + MarkdownElement(type="math", content="\n".join(math_content)) + ) + i += 1 + continue + + # Heading + heading_match = self.heading_pattern.match(line) + if heading_match: + level = len(heading_match.group(1)) + content = heading_match.group(2) + elements.append( + MarkdownElement(type="heading", content=content, level=level) + ) + i += 1 + continue + + # Unordered list + list_match = self.list_pattern.match(line) + if list_match: + indent = len(list_match.group(1)) + content = list_match.group(2) + elements.append( + MarkdownElement(type="list_item", content=content, level=indent // 2) + ) + i += 1 + continue + + # Ordered list + ordered_match = self.ordered_list_pattern.match(line) + if ordered_match: + indent = len(ordered_match.group(1)) + content = ordered_match.group(2) + elements.append( + MarkdownElement( + type="ordered_list_item", content=content, level=indent // 2 + ) + ) + i += 1 + continue + + # Table (simple detection) + if "|" in line and i + 1 < len(lines) and "---" in lines[i + 1]: + table_lines = [line] + i += 1 + while i < len(lines) and "|" in lines[i]: + table_lines.append(lines[i]) + i += 1 + elements.append( + MarkdownElement(type="table", content="\n".join(table_lines)) + ) + continue + + # Regular paragraph + if line.strip(): + elements.append(MarkdownElement(type="paragraph", content=line)) + + i += 1 + + return elements + + def _add_element_to_doc(self, doc: Document, element: MarkdownElement) -> None: + """Add a markdown element to the document. + + Args: + doc: Word document. + element: Parsed markdown element. 
+ """ + if element.type == "heading": + self._add_heading(doc, element.content, element.level) + elif element.type == "paragraph": + self._add_paragraph(doc, element.content) + elif element.type == "list_item": + self._add_list_item(doc, element.content, element.level, ordered=False) + elif element.type == "ordered_list_item": + self._add_list_item(doc, element.content, element.level, ordered=True) + elif element.type == "code_block": + self._add_code_block(doc, element.content) + elif element.type == "table": + self._add_table(doc, element.content) + elif element.type == "math": + self._add_math(doc, element.content) + + def _add_heading(self, doc: Document, content: str, level: int) -> None: + """Add a heading to the document.""" + # Map markdown levels to Word heading styles + heading_level = min(level, 9) # Word supports up to Heading 9 + doc.add_heading(content, level=heading_level) + + def _add_paragraph(self, doc: Document, content: str) -> None: + """Add a paragraph with inline formatting.""" + para = doc.add_paragraph() + self._add_formatted_text(para, content) + + def _add_formatted_text(self, para, content: str) -> None: + """Add text with inline formatting (bold, italic, code).""" + # Simple approach: process inline patterns + remaining = content + + while remaining: + # Find next formatting marker + bold_match = self.bold_pattern.search(remaining) + italic_match = self.italic_pattern.search(remaining) + code_match = self.inline_code_pattern.search(remaining) + math_match = self.inline_math_pattern.search(remaining) + + matches = [ + (bold_match, "bold"), + (italic_match, "italic"), + (code_match, "code"), + (math_match, "math"), + ] + matches = [(m, t) for m, t in matches if m] + + if not matches: + para.add_run(remaining) + break + + # Find earliest match + earliest = min(matches, key=lambda x: x[0].start()) + match, match_type = earliest + + # Add text before match + if match.start() > 0: + para.add_run(remaining[: match.start()]) + + # Add formatted text + run = para.add_run(match.group(1)) + if match_type == "bold": + run.bold = True + elif match_type == "italic": + run.italic = True + elif match_type == "code": + run.font.name = "Courier New" + run.font.size = Pt(10) + elif match_type == "math": + run.italic = True + + remaining = remaining[match.end() :] + + def _add_list_item( + self, doc: Document, content: str, level: int, ordered: bool + ) -> None: + """Add a list item.""" + para = doc.add_paragraph(style="List Bullet" if not ordered else "List Number") + para.paragraph_format.left_indent = Inches(0.25 * level) + self._add_formatted_text(para, content) + + def _add_code_block(self, doc: Document, content: str) -> None: + """Add a code block.""" + para = doc.add_paragraph() + para.paragraph_format.left_indent = Inches(0.5) + + run = para.add_run(content) + run.font.name = "Courier New" + run.font.size = Pt(9) + + # Add shading + shading = OxmlElement("w:shd") + shading.set(qn("w:val"), "clear") + shading.set(qn("w:fill"), "F0F0F0") + para._p.get_or_add_pPr().append(shading) + + def _add_table(self, doc: Document, content: str) -> None: + """Add a table from markdown table format.""" + lines = [l.strip() for l in content.split("\n") if l.strip()] + if len(lines) < 2: + return + + # Parse header + header = [c.strip() for c in lines[0].split("|") if c.strip()] + + # Skip separator line + data_lines = lines[2:] if len(lines) > 2 else [] + + # Create table + table = doc.add_table(rows=1, cols=len(header)) + table.style = "Table Grid" + + # Add header + header_cells = 
table.rows[0].cells + for i, text in enumerate(header): + header_cells[i].text = text + header_cells[i].paragraphs[0].runs[0].bold = True + + # Add data rows + for line in data_lines: + cells = [c.strip() for c in line.split("|") if c.strip()] + row_cells = table.add_row().cells + for i, text in enumerate(cells): + if i < len(row_cells): + row_cells[i].text = text + + def _add_math(self, doc: Document, content: str) -> None: + """Add a math block. + + For proper OMML rendering, this would need more complex conversion. + Currently renders as italic text with the LaTeX source. + """ + para = doc.add_paragraph() + para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + run = para.add_run(content) + run.italic = True + run.font.name = "Cambria Math" + run.font.size = Pt(12) + diff --git a/app/services/image_processor.py b/app/services/image_processor.py new file mode 100644 index 0000000..e9c0e26 --- /dev/null +++ b/app/services/image_processor.py @@ -0,0 +1,139 @@ +"""Image preprocessing service using OpenCV.""" + +import base64 +import io +from urllib.request import urlopen + +import cv2 +import numpy as np +from PIL import Image + +from app.core.config import get_settings + +settings = get_settings() + + +class ImageProcessor: + """Service for image preprocessing operations.""" + + def __init__(self, padding_ratio: float | None = None): + """Initialize with padding ratio. + + Args: + padding_ratio: Ratio for padding on each side (default from settings). + 0.15 means 15% padding on each side = 30% total expansion. + """ + self.padding_ratio = padding_ratio or settings.image_padding_ratio + + def load_image_from_url(self, url: str) -> np.ndarray: + """Load image from URL. + + Args: + url: Image URL to fetch. + + Returns: + Image as numpy array in BGR format. + + Raises: + ValueError: If image cannot be loaded from URL. + """ + try: + with urlopen(url, timeout=30) as response: + image_data = response.read() + image = Image.open(io.BytesIO(image_data)) + return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) + except Exception as e: + raise ValueError(f"Failed to load image from URL: {e}") from e + + def load_image_from_base64(self, base64_str: str) -> np.ndarray: + """Load image from base64 string. + + Args: + base64_str: Base64-encoded image data. + + Returns: + Image as numpy array in BGR format. + + Raises: + ValueError: If image cannot be decoded. + """ + try: + # Handle data URL format + if "," in base64_str: + base64_str = base64_str.split(",", 1)[1] + + image_data = base64.b64decode(base64_str) + image = Image.open(io.BytesIO(image_data)) + return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) + except Exception as e: + raise ValueError(f"Failed to decode base64 image: {e}") from e + + def add_padding(self, image: np.ndarray) -> np.ndarray: + """Add whitespace padding around the image. + + Adds padding equal to padding_ratio * max(height, width) on each side. + This expands the image by approximately 30% total (15% on each side). + + Args: + image: Input image as numpy array in BGR format. + + Returns: + Padded image as numpy array. 
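+
+        Example: with the default ratio of 0.15, a 1000x800 image gains
+        int(0.15 * 1000) = 150 pixels of white border on each side.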
+ """ + height, width = image.shape[:2] + padding = int(max(height, width) * self.padding_ratio) + + # Add white padding on all sides + padded_image = cv2.copyMakeBorder( + image, + top=padding, + bottom=padding, + left=padding, + right=padding, + borderType=cv2.BORDER_CONSTANT, + value=[255, 255, 255], # White + ) + + return padded_image + + def preprocess(self, image_url: str | None, image_base64: str | None) -> np.ndarray: + """Load and preprocess image with padding. + + Args: + image_url: URL to fetch image from (optional). + image_base64: Base64-encoded image (optional). + + Returns: + Preprocessed image with padding. + + Raises: + ValueError: If neither input is provided or loading fails. + """ + if image_url: + image = self.load_image_from_url(image_url) + elif image_base64: + image = self.load_image_from_base64(image_base64) + else: + raise ValueError("Either image_url or image_base64 must be provided") + + return self.add_padding(image) + + def image_to_base64(self, image: np.ndarray, format: str = "PNG") -> str: + """Convert numpy image to base64 string. + + Args: + image: Image as numpy array in BGR format. + format: Output format (PNG, JPEG). + + Returns: + Base64-encoded image string. + """ + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + pil_image = Image.fromarray(image_rgb) + + buffer = io.BytesIO() + pil_image.save(buffer, format=format) + buffer.seek(0) + + return base64.b64encode(buffer.getvalue()).decode("utf-8") + diff --git a/app/services/layout_detector.py b/app/services/layout_detector.py new file mode 100644 index 0000000..03bb020 --- /dev/null +++ b/app/services/layout_detector.py @@ -0,0 +1,119 @@ +"""DocLayout-YOLO wrapper for document layout detection.""" + +import numpy as np + +from app.schemas.image import LayoutInfo, LayoutRegion + + +class LayoutDetector: + """Wrapper for DocLayout-YOLO model.""" + + # Class names from DocLayout-YOLO + CLASS_NAMES = { + 0: "title", + 1: "plain_text", + 2: "abandon", + 3: "figure", + 4: "figure_caption", + 5: "table", + 6: "table_caption", + 7: "table_footnote", + 8: "isolate_formula", + 9: "formula_caption", + } + + # Classes considered as plain text + PLAIN_TEXT_CLASSES = {"title", "plain_text", "figure_caption", "table_caption", "table_footnote"} + + # Classes considered as formula + FORMULA_CLASSES = {"isolate_formula", "formula_caption"} + + def __init__(self, model_path: str, confidence_threshold: float = 0.2): + """Initialize the layout detector. + + Args: + model_path: Path to the DocLayout-YOLO model weights. + confidence_threshold: Minimum confidence for detections. + """ + self.model_path = model_path + self.confidence_threshold = confidence_threshold + self.model = None + + def load_model(self) -> None: + """Load the DocLayout-YOLO model. + + Raises: + RuntimeError: If model cannot be loaded. + """ + try: + from doclayout_yolo import YOLOv10 + + self.model = YOLOv10(self.model_path) + except Exception as e: + raise RuntimeError(f"Failed to load DocLayout-YOLO model: {e}") from e + + def detect(self, image: np.ndarray, image_size: int = 1024) -> LayoutInfo: + """Detect document layout regions. + + Args: + image: Input image as numpy array in BGR format. + image_size: Image size for prediction. + + Returns: + LayoutInfo with detected regions. + + Raises: + RuntimeError: If model not loaded. + """ + if self.model is None: + raise RuntimeError("Model not loaded. 
Call load_model() first.") + + # Run prediction + results = self.model.predict( + image, + imgsz=image_size, + conf=self.confidence_threshold, + device="cuda:0", + ) + + regions: list[LayoutRegion] = [] + has_plain_text = False + has_formula = False + + if results and len(results) > 0: + result = results[0] + if result.boxes is not None: + for box in result.boxes: + cls_id = int(box.cls[0].item()) + confidence = float(box.conf[0].item()) + bbox = box.xyxy[0].tolist() + + class_name = self.CLASS_NAMES.get(cls_id, f"unknown_{cls_id}") + + # Map to simplified type + if class_name in self.PLAIN_TEXT_CLASSES: + region_type = "text" + has_plain_text = True + elif class_name in self.FORMULA_CLASSES: + region_type = "formula" + has_formula = True + elif class_name in {"figure"}: + region_type = "figure" + elif class_name in {"table"}: + region_type = "table" + else: + region_type = class_name + + regions.append( + LayoutRegion( + type=region_type, + bbox=bbox, + confidence=confidence, + ) + ) + + return LayoutInfo( + regions=regions, + has_plain_text=has_plain_text, + has_formula=has_formula, + ) diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py new file mode 100644 index 0000000..8c7fe41 --- /dev/null +++ b/app/services/ocr_service.py @@ -0,0 +1,303 @@ +"""PaddleOCR-VL client service for text and formula recognition.""" + +import io +import tempfile +from pathlib import Path + +import cv2 +import numpy as np + +from app.core.config import get_settings +from app.schemas.image import LayoutInfo + +settings = get_settings() + + +class OCRService: + """Service for OCR using PaddleOCR-VL.""" + + FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format." + + def __init__( + self, + vl_server_url: str | None = None, + pp_doclayout_model_dir: str | None = None, + ): + """Initialize OCR service. + + Args: + vl_server_url: URL of the vLLM server for PaddleOCR-VL. + pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory. + """ + self.vl_server_url = vl_server_url or settings.paddleocr_vl_url + self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir + self._pipeline = None + + def _get_pipeline(self): + """Get or create PaddleOCR-VL pipeline. + + Returns: + PaddleOCRVL pipeline instance. + """ + if self._pipeline is None: + from paddleocr import PaddleOCRVL + + self._pipeline = PaddleOCRVL( + vl_rec_backend="vllm-server", + vl_rec_server_url=self.vl_server_url, + layout_detection_model_name="PP-DocLayoutV2", + layout_detection_model_dir=self.pp_doclayout_model_dir, + ) + return self._pipeline + + def _save_temp_image(self, image: np.ndarray) -> str: + """Save image to a temporary file. + + Args: + image: Image as numpy array in BGR format. + + Returns: + Path to temporary file. + """ + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + cv2.imwrite(f.name, image) + return f.name + + def recognize_mixed(self, image: np.ndarray) -> dict: + """Recognize mixed content (text + formulas) using PP-DocLayoutV2. + + This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware + recognition of mixed content. + + Args: + image: Input image as numpy array in BGR format. + + Returns: + Dict with 'markdown', 'latex', 'mathml' keys. 
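+
+        Raises:
+            RuntimeError: If the pipeline or the vLLM backend fails.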
+ """ + try: + pipeline = self._get_pipeline() + temp_path = self._save_temp_image(image) + + try: + results = list(pipeline.predict(temp_path)) + + markdown_content = "" + for result in results: + # PaddleOCR-VL results can be saved to markdown + md_buffer = io.StringIO() + result.save_to_markdown(save_path=md_buffer) + markdown_content += md_buffer.getvalue() + + # Convert markdown to other formats + latex = self._markdown_to_latex(markdown_content) + mathml = self._extract_mathml(markdown_content) + + return { + "markdown": markdown_content, + "latex": latex, + "mathml": mathml, + } + finally: + Path(temp_path).unlink(missing_ok=True) + + except Exception as e: + raise RuntimeError(f"Mixed recognition failed: {e}") from e + + def recognize_formula(self, image: np.ndarray) -> dict: + """Recognize formula/math content using PaddleOCR-VL with prompt. + + This mode uses PaddleOCR-VL directly with a formula recognition prompt. + + Args: + image: Input image as numpy array in BGR format. + + Returns: + Dict with 'latex', 'markdown', 'mathml' keys. + """ + try: + import httpx + + temp_path = self._save_temp_image(image) + + try: + # Use vLLM API directly for formula recognition + import base64 + + with open(temp_path, "rb") as f: + image_base64 = base64.b64encode(f.read()).decode("utf-8") + + # Call vLLM server with formula prompt + response = httpx.post( + f"{self.vl_server_url}/chat/completions", + json={ + "model": "paddleocr-vl", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": self.FORMULA_PROMPT}, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{image_base64}"}, + }, + ], + } + ], + "max_tokens": 1024, + }, + timeout=60.0, + ) + response.raise_for_status() + result = response.json() + + latex = result["choices"][0]["message"]["content"].strip() + + # Convert latex to other formats + markdown = self._latex_to_markdown(latex) + mathml = self._latex_to_mathml(latex) + + return { + "latex": latex, + "markdown": markdown, + "mathml": mathml, + } + finally: + Path(temp_path).unlink(missing_ok=True) + + except httpx.HTTPStatusError as e: + raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e + except Exception as e: + raise RuntimeError(f"Formula recognition failed: {e}") from e + + def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict: + """Recognize content based on layout detection results. + + Args: + image: Input image as numpy array in BGR format. + layout_info: Layout detection results. + + Returns: + Dict with recognition results including mode used. + """ + # Decision logic: + # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2) + # - Otherwise -> use formula_recognition (VL with prompt) + if layout_info.has_plain_text: + result = self.recognize_mixed(image) + result["recognition_mode"] = "mixed_recognition" + else: + result = self.recognize_formula(image) + result["recognition_mode"] = "formula_recognition" + + return result + + def _markdown_to_latex(self, markdown: str) -> str: + """Convert markdown to LaTeX. + + Simple conversion - wraps content in LaTeX document structure. + + Args: + markdown: Markdown content. + + Returns: + LaTeX representation. 
+ """ + # Basic conversion: preserve math blocks, convert structure + lines = [] + in_code_block = False + + for line in markdown.split("\n"): + if line.startswith("```"): + in_code_block = not in_code_block + if in_code_block: + lines.append("\\begin{verbatim}") + else: + lines.append("\\end{verbatim}") + elif in_code_block: + lines.append(line) + elif line.startswith("# "): + lines.append(f"\\section{{{line[2:]}}}") + elif line.startswith("## "): + lines.append(f"\\subsection{{{line[3:]}}}") + elif line.startswith("### "): + lines.append(f"\\subsubsection{{{line[4:]}}}") + elif line.startswith("- "): + lines.append(f"\\item {line[2:]}") + elif line.startswith("$$"): + lines.append(line.replace("$$", "\\[").replace("$$", "\\]")) + elif "$" in line: + # Keep inline math as-is + lines.append(line) + else: + lines.append(line) + + return "\n".join(lines) + + def _latex_to_markdown(self, latex: str) -> str: + """Convert LaTeX to markdown. + + Args: + latex: LaTeX content. + + Returns: + Markdown representation. + """ + # Wrap LaTeX in markdown math block + if latex.strip(): + return f"$$\n{latex}\n$$" + return "" + + def _latex_to_mathml(self, latex: str) -> str: + """Convert LaTeX to MathML. + + Args: + latex: LaTeX content. + + Returns: + MathML representation. + """ + # Basic LaTeX to MathML conversion + # For production, consider using latex2mathml library + if not latex.strip(): + return "" + + try: + # Try to use latex2mathml if available + from latex2mathml.converter import convert + + return convert(latex) + except ImportError: + # Fallback: wrap in basic MathML structure + return f'{latex}' + except Exception: + return f'{latex}' + + def _extract_mathml(self, markdown: str) -> str: + """Extract and convert math from markdown to MathML. + + Args: + markdown: Markdown content. + + Returns: + MathML for any math content found. + """ + import re + + # Find all math blocks + math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL) + inline_math = re.findall(r"\$([^$]+)\$", markdown) + + all_math = math_blocks + inline_math + + if not all_math: + return "" + + # Convert each to MathML and combine + mathml_parts = [] + for latex in all_math: + mathml = self._latex_to_mathml(latex.strip()) + if mathml: + mathml_parts.append(mathml) + + return "\n".join(mathml_parts) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..7c8cf36 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,53 @@ +version: "3.8" + +services: + doc-processer: + build: + context: . + dockerfile: Dockerfile + container_name: doc-processer + ports: + - "8053:8053" + environment: + - PADDLEOCR_VL_URL=http://host.docker.internal:8080/v1 + - DOCLAYOUT_MODEL_PATH=/app/models/DocLayout/doclayout_yolo_docstructbench_imgsz1024.pt + - PP_DOCLAYOUT_MODEL_DIR=/app/models/PP-DocLayout + - MAX_IMAGE_SIZE_MB=10 + volumes: + # Mount pre-downloaded models (adjust paths as needed) + - ./models/DocLayout:/app/models/DocLayout:ro + - ./models/PP-DocLayout:/app/models/PP-DocLayout:ro + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8053/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s + + # Optional: Local development without GPU + doc-processer-cpu: + build: + context: . 
+      dockerfile: Dockerfile
+    container_name: doc-processer-cpu
+    ports:
+      - "8054:8053"
+    environment:
+      - PADDLEOCR_VL_URL=http://host.docker.internal:8080/v1
+      - DOCLAYOUT_MODEL_PATH=/app/models/DocLayout/doclayout_yolo_docstructbench_imgsz1024.pt
+      - PP_DOCLAYOUT_MODEL_DIR=/app/models/PP-DocLayout
+    volumes:
+      - ./models/DocLayout:/app/models/DocLayout:ro
+      - ./models/PP-DocLayout:/app/models/PP-DocLayout:ro
+    profiles:
+      - cpu
+    restart: unless-stopped
+
diff --git a/openspec/AGENTS.md b/openspec/AGENTS.md
new file mode 100644
index 0000000..96ab0bb
--- /dev/null
+++ b/openspec/AGENTS.md
@@ -0,0 +1,456 @@
+# OpenSpec Instructions
+
+Instructions for AI coding assistants using OpenSpec for spec-driven development.
+
+## TL;DR Quick Checklist
+
+- Search existing work: `openspec spec list --long`, `openspec list` (use `rg` only for full-text search)
+- Decide scope: new capability vs modify existing capability
+- Pick a unique `change-id`: kebab-case, verb-led (`add-`, `update-`, `remove-`, `refactor-`)
+- Scaffold: `proposal.md`, `tasks.md`, `design.md` (only if needed), and delta specs per affected capability
+- Write deltas: use `## ADDED|MODIFIED|REMOVED|RENAMED Requirements`; include at least one `#### Scenario:` per requirement
+- Validate: `openspec validate [change-id] --strict` and fix issues
+- Request approval: Do not start implementation until proposal is approved
+
+## Three-Stage Workflow
+
+### Stage 1: Creating Changes
+Create proposal when you need to:
+- Add features or functionality
+- Make breaking changes (API, schema)
+- Change architecture or patterns
+- Optimize performance (changes behavior)
+- Update security patterns
+
+Triggers (examples):
+- "Help me create a change proposal"
+- "Help me plan a change"
+- "Help me create a proposal"
+- "I want to create a spec proposal"
+- "I want to create a spec"
+
+Loose matching guidance:
+- Contains one of: `proposal`, `change`, `spec`
+- With one of: `create`, `plan`, `make`, `start`, `help`
+
+Skip proposal for:
+- Bug fixes (restore intended behavior)
+- Typos, formatting, comments
+- Dependency updates (non-breaking)
+- Configuration changes
+- Tests for existing behavior
+
+**Workflow**
+1. Review `openspec/project.md`, `openspec list`, and `openspec list --specs` to understand current context.
+2. Choose a unique verb-led `change-id` and scaffold `proposal.md`, `tasks.md`, optional `design.md`, and spec deltas under `openspec/changes/<change-id>/`.
+3. Draft spec deltas using `## ADDED|MODIFIED|REMOVED Requirements` with at least one `#### Scenario:` per requirement.
+4. Run `openspec validate <change-id> --strict` and resolve any issues before sharing the proposal.
+
+### Stage 2: Implementing Changes
+Track these steps as TODOs and complete them one by one.
+1. **Read proposal.md** - Understand what's being built
+2. **Read design.md** (if exists) - Review technical decisions
+3. **Read tasks.md** - Get implementation checklist
+4. **Implement tasks sequentially** - Complete in order
+5. **Confirm completion** - Ensure every item in `tasks.md` is finished before updating statuses
+6. **Update checklist** - After all work is done, set every task to `- [x]` so the list reflects reality
+7.
**Approval gate** - Do not start implementation until the proposal is reviewed and approved

### Stage 3: Archiving Changes
After deployment, create a separate PR to:
- Move `changes/[name]/` → `changes/archive/YYYY-MM-DD-[name]/`
- Update `specs/` if capabilities changed
- Use `openspec archive <change-id> --skip-specs --yes` for tooling-only changes (always pass the change ID explicitly)
- Run `openspec validate --strict` to confirm the archived change passes checks

## Before Any Task

**Context Checklist:**
- [ ] Read relevant specs in `specs/[capability]/spec.md`
- [ ] Check pending changes in `changes/` for conflicts
- [ ] Read `openspec/project.md` for conventions
- [ ] Run `openspec list` to see active changes
- [ ] Run `openspec list --specs` to see existing capabilities

**Before Creating Specs:**
- Always check if capability already exists
- Prefer modifying existing specs over creating duplicates
- Use `openspec show [spec]` to review current state
- If request is ambiguous, ask 1–2 clarifying questions before scaffolding

### Search Guidance
- Enumerate specs: `openspec spec list --long` (or `--json` for scripts)
- Enumerate changes: `openspec list` (or `openspec change list --json` - deprecated but available)
- Show details:
  - Spec: `openspec show <spec-id> --type spec` (use `--json` for filters)
  - Change: `openspec show <change-id> --json --deltas-only`
- Full-text search (use ripgrep): `rg -n "Requirement:|Scenario:" openspec/specs`

## Quick Start

### CLI Commands

```bash
# Essential commands
openspec list                # List active changes
openspec list --specs        # List specifications
openspec show [item]         # Display change or spec
openspec validate [item]     # Validate changes or specs
openspec archive [--yes|-y]  # Archive after deployment (add --yes for non-interactive runs)

# Project management
openspec init [path]         # Initialize OpenSpec
openspec update [path]       # Update instruction files

# Interactive mode
openspec show                # Prompts for selection
openspec validate            # Bulk validation mode

# Debugging
openspec show [change] --json --deltas-only
openspec validate [change] --strict
```

### Command Flags

- `--json` - Machine-readable output
- `--type change|spec` - Disambiguate items
- `--strict` - Comprehensive validation
- `--no-interactive` - Disable prompts
- `--skip-specs` - Archive without spec updates
- `--yes`/`-y` - Skip confirmation prompts (non-interactive archive)

## Directory Structure

```
openspec/
├── project.md               # Project conventions
├── specs/                   # Current truth - what IS built
│   └── [capability]/        # Single focused capability
│       ├── spec.md          # Requirements and scenarios
│       └── design.md        # Technical patterns
├── changes/                 # Proposals - what SHOULD change
│   ├── [change-name]/
│   │   ├── proposal.md      # Why, what, impact
│   │   ├── tasks.md         # Implementation checklist
│   │   ├── design.md        # Technical decisions (optional; see criteria)
│   │   └── specs/           # Delta changes
│   │       └── [capability]/
│   │           └── spec.md  # ADDED/MODIFIED/REMOVED
│   └── archive/             # Completed changes
```

## Creating Change Proposals

### Decision Tree

```
New request?
├─ Bug fix restoring spec behavior? → Fix directly
├─ Typo/format/comment? → Fix directly
├─ New feature/capability? → Create proposal
├─ Breaking change? → Create proposal
├─ Architecture change? → Create proposal
└─ Unclear? → Create proposal (safer)
```

### Proposal Structure

1. **Create directory:** `changes/[change-id]/` (kebab-case, verb-led, unique)

2.
**Write proposal.md:** +```markdown +# Change: [Brief description of change] + +## Why +[1-2 sentences on problem/opportunity] + +## What Changes +- [Bullet list of changes] +- [Mark breaking changes with **BREAKING**] + +## Impact +- Affected specs: [list capabilities] +- Affected code: [key files/systems] +``` + +3. **Create spec deltas:** `specs/[capability]/spec.md` +```markdown +## ADDED Requirements +### Requirement: New Feature +The system SHALL provide... + +#### Scenario: Success case +- **WHEN** user performs action +- **THEN** expected result + +## MODIFIED Requirements +### Requirement: Existing Feature +[Complete modified requirement] + +## REMOVED Requirements +### Requirement: Old Feature +**Reason**: [Why removing] +**Migration**: [How to handle] +``` +If multiple capabilities are affected, create multiple delta files under `changes/[change-id]/specs//spec.md`—one per capability. + +4. **Create tasks.md:** +```markdown +## 1. Implementation +- [ ] 1.1 Create database schema +- [ ] 1.2 Implement API endpoint +- [ ] 1.3 Add frontend component +- [ ] 1.4 Write tests +``` + +5. **Create design.md when needed:** +Create `design.md` if any of the following apply; otherwise omit it: +- Cross-cutting change (multiple services/modules) or a new architectural pattern +- New external dependency or significant data model changes +- Security, performance, or migration complexity +- Ambiguity that benefits from technical decisions before coding + +Minimal `design.md` skeleton: +```markdown +## Context +[Background, constraints, stakeholders] + +## Goals / Non-Goals +- Goals: [...] +- Non-Goals: [...] + +## Decisions +- Decision: [What and why] +- Alternatives considered: [Options + rationale] + +## Risks / Trade-offs +- [Risk] → Mitigation + +## Migration Plan +[Steps, rollback] + +## Open Questions +- [...] +``` + +## Spec File Format + +### Critical: Scenario Formatting + +**CORRECT** (use #### headers): +```markdown +#### Scenario: User login success +- **WHEN** valid credentials provided +- **THEN** return JWT token +``` + +**WRONG** (don't use bullets or bold): +```markdown +- **Scenario: User login** ❌ +**Scenario**: User login ❌ +### Scenario: User login ❌ +``` + +Every requirement MUST have at least one scenario. + +### Requirement Wording +- Use SHALL/MUST for normative requirements (avoid should/may unless intentionally non-normative) + +### Delta Operations + +- `## ADDED Requirements` - New capabilities +- `## MODIFIED Requirements` - Changed behavior +- `## REMOVED Requirements` - Deprecated features +- `## RENAMED Requirements` - Name changes + +Headers matched with `trim(header)` - whitespace ignored. + +#### When to use ADDED vs MODIFIED +- ADDED: Introduces a new capability or sub-capability that can stand alone as a requirement. Prefer ADDED when the change is orthogonal (e.g., adding "Slash Command Configuration") rather than altering the semantics of an existing requirement. +- MODIFIED: Changes the behavior, scope, or acceptance criteria of an existing requirement. Always paste the full, updated requirement content (header + all scenarios). The archiver will replace the entire requirement with what you provide here; partial deltas will drop previous details. +- RENAMED: Use when only the name changes. If you also change behavior, use RENAMED (name) plus MODIFIED (content) referencing the new name. + +Common pitfall: Using MODIFIED to add a new concern without including the previous text. This causes loss of detail at archive time. 
If you aren’t explicitly changing the existing requirement, add a new requirement under ADDED instead.
+
+Authoring a MODIFIED requirement correctly:
+1) Locate the existing requirement in `openspec/specs/<capability>/spec.md`.
+2) Copy the entire requirement block (from `### Requirement: ...` through its scenarios).
+3) Paste it under `## MODIFIED Requirements` and edit to reflect the new behavior.
+4) Ensure the header text matches exactly (whitespace-insensitive) and keep at least one `#### Scenario:`.
+
+Example for RENAMED:
+```markdown
+## RENAMED Requirements
+- FROM: `### Requirement: Login`
+- TO: `### Requirement: User Authentication`
+```
+
+## Troubleshooting
+
+### Common Errors
+
+**"Change must have at least one delta"**
+- Check `changes/[name]/specs/` exists with .md files
+- Verify files have operation prefixes (## ADDED Requirements)
+
+**"Requirement must have at least one scenario"**
+- Check scenarios use `#### Scenario:` format (4 hashtags)
+- Don't use bullet points or bold for scenario headers
+
+**Silent scenario parsing failures**
+- Exact format required: `#### Scenario: Name`
+- Debug with: `openspec show [change] --json --deltas-only`
+
+### Validation Tips
+
+```bash
+# Always use strict mode for comprehensive checks
+openspec validate [change] --strict
+
+# Debug delta parsing
+openspec show [change] --json | jq '.deltas'
+
+# Check specific requirement
+openspec show [spec] --json -r 1
+```
+
+## Happy Path Script
+
+```bash
+# 1) Explore current state
+openspec spec list --long
+openspec list
+# Optional full-text search:
+# rg -n "Requirement:|Scenario:" openspec/specs
+# rg -n "^#|Requirement:" openspec/changes
+
+# 2) Choose change id and scaffold
+CHANGE=add-two-factor-auth
+mkdir -p openspec/changes/$CHANGE/specs/auth
+printf "## Why\n...\n\n## What Changes\n- ...\n\n## Impact\n- ...\n" > openspec/changes/$CHANGE/proposal.md
+printf "## 1. Implementation\n- [ ] 1.1 ...\n" > openspec/changes/$CHANGE/tasks.md
+
+# 3) Add deltas (example)
+cat > openspec/changes/$CHANGE/specs/auth/spec.md << 'EOF'
+## ADDED Requirements
+### Requirement: Two-Factor Authentication
+Users MUST provide a second factor during login.
+
+#### Scenario: OTP required
+- **WHEN** valid credentials are provided
+- **THEN** an OTP challenge is required
+EOF
+
+# 4) Validate
+openspec validate $CHANGE --strict
+```
+
+## Multi-Capability Example
+
+```
+openspec/changes/add-2fa-notify/
+├── proposal.md
+├── tasks.md
+└── specs/
+    ├── auth/
+    │   └── spec.md          # ADDED: Two-Factor Authentication
+    └── notifications/
+        └── spec.md          # ADDED: OTP email notification
+```
+
+auth/spec.md
+```markdown
+## ADDED Requirements
+### Requirement: Two-Factor Authentication
+...
+```
+
+notifications/spec.md
+```markdown
+## ADDED Requirements
+### Requirement: OTP Email Notification
+...
+``` + +## Best Practices + +### Simplicity First +- Default to <100 lines of new code +- Single-file implementations until proven insufficient +- Avoid frameworks without clear justification +- Choose boring, proven patterns + +### Complexity Triggers +Only add complexity with: +- Performance data showing current solution too slow +- Concrete scale requirements (>1000 users, >100MB data) +- Multiple proven use cases requiring abstraction + +### Clear References +- Use `file.ts:42` format for code locations +- Reference specs as `specs/auth/spec.md` +- Link related changes and PRs + +### Capability Naming +- Use verb-noun: `user-auth`, `payment-capture` +- Single purpose per capability +- 10-minute understandability rule +- Split if description needs "AND" + +### Change ID Naming +- Use kebab-case, short and descriptive: `add-two-factor-auth` +- Prefer verb-led prefixes: `add-`, `update-`, `remove-`, `refactor-` +- Ensure uniqueness; if taken, append `-2`, `-3`, etc. + +## Tool Selection Guide + +| Task | Tool | Why | +|------|------|-----| +| Find files by pattern | Glob | Fast pattern matching | +| Search code content | Grep | Optimized regex search | +| Read specific files | Read | Direct file access | +| Explore unknown scope | Task | Multi-step investigation | + +## Error Recovery + +### Change Conflicts +1. Run `openspec list` to see active changes +2. Check for overlapping specs +3. Coordinate with change owners +4. Consider combining proposals + +### Validation Failures +1. Run with `--strict` flag +2. Check JSON output for details +3. Verify spec file format +4. Ensure scenarios properly formatted + +### Missing Context +1. Read project.md first +2. Check related specs +3. Review recent archives +4. Ask for clarification + +## Quick Reference + +### Stage Indicators +- `changes/` - Proposed, not yet built +- `specs/` - Built and deployed +- `archive/` - Completed changes + +### File Purposes +- `proposal.md` - Why and what +- `tasks.md` - Implementation steps +- `design.md` - Technical decisions +- `spec.md` - Requirements and behavior + +### CLI Essentials +```bash +openspec list # What's in progress? +openspec show [item] # View details +openspec validate --strict # Is it correct? +openspec archive [--yes|-y] # Mark complete (add --yes for automation) +``` + +Remember: Specs are truth. Changes are proposals. Keep them in sync. diff --git a/openspec/changes/add-doc-processing-api/design.md b/openspec/changes/add-doc-processing-api/design.md new file mode 100644 index 0000000..4b04058 --- /dev/null +++ b/openspec/changes/add-doc-processing-api/design.md @@ -0,0 +1,107 @@ +## Context + +This is the initial implementation of the DocProcesser service. The system integrates multiple external models and services: + +- DocLayout-YOLO for document layout analysis +- PaddleOCR-VL with PP-DocLayoutV2 for text and formula recognition (deployed via vLLM) +- markdown_2_docx for document conversion + +Target deployment: Ubuntu machine with RTX 5080 GPU (16GB VRAM), Python 3.11.0. 
+
+## Goals / Non-Goals
+
+**Goals:**
+
+- Clean FastAPI project structure following best practices
+- Image preprocessing with OpenCV (30% padding)
+- Layout-aware OCR routing using DocLayout-YOLO
+- Text and formula recognition via PaddleOCR-VL
+- Markdown to DOCX conversion
+- GPU-enabled Docker deployment
+
+**Non-Goals:**
+
+- Authentication/authorization (can be added later)
+- Rate limiting
+- Persistent storage
+- Training or fine-tuning models
+
+## Decisions
+
+### Project Structure
+
+Follow FastAPI best practices with modular organization:
+
+```
+app/
+├── api/
+│   └── v1/
+│       ├── endpoints/
+│       │   ├── image.py      # Image OCR endpoint
+│       │   └── convert.py    # Markdown to DOCX endpoint
+│       └── router.py
+├── core/
+│   └── config.py             # Settings and environment config
+├── model/
+│   ├── DocLayout
+│   └── PP-DocLayout
+├── services/
+│   ├── image_processor.py    # OpenCV preprocessing
+│   ├── layout_detector.py    # DocLayout-YOLO wrapper
+│   ├── ocr_service.py        # PaddleOCR-VL client
+│   └── docx_converter.py     # markdown_2_docx wrapper
+├── schemas/
+│   ├── image.py              # Request/response models for image OCR
+│   └── convert.py            # Request/response models for conversion
+└── main.py                   # FastAPI app initialization
+```
+
+**Rationale:** Separation of concerns between API layer, business logic (services), and data models (schemas).
+
+### Image Preprocessing
+
+- Use OpenCV `cv2.copyMakeBorder()` to add 30% whitespace padding
+- Padding color: white `[255, 255, 255]`
+- This matches DocLayout-YOLO's demo.py pattern
+
+### Layout Detection Flow
+
+1. DocLayout-YOLO detects layout regions (plain text, formulas, tables, figures)
+2. If plain text exists, route to PaddleOCR-VL with PP-DocLayoutV2; otherwise route to PaddleOCR-VL with a formula-recognition prompt
+3. PaddleOCR-VL combined with PP-DocLayoutV2 handles mixed-content recognition internally; PaddleOCR-VL combined with the prompt handles formula-only recognition
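+
+To make this flow concrete, here is a minimal sketch of the padding-and-routing decision. It is illustrative only: it assumes the `doclayout_yolo` package's `YOLOv10` interface, the helper names are placeholders, and the plain-text label name should be verified against the loaded model's `names` mapping.
+
+```python
+import cv2
+from doclayout_yolo import YOLOv10
+
+PLAIN_TEXT_LABEL = "plain text"  # assumed label name; confirm via the model's names mapping
+
+
+def pad_image(image, ratio: float = 0.15):
+    """Add white padding of 15% of the longer side per edge (~30% total expansion)."""
+    pad = int(max(image.shape[:2]) * ratio)
+    return cv2.copyMakeBorder(
+        image, pad, pad, pad, pad, cv2.BORDER_CONSTANT, value=[255, 255, 255]
+    )
+
+
+def route(image, model: YOLOv10) -> str:
+    """Decide which PaddleOCR-VL mode to use for one padded image."""
+    padded = pad_image(image)
+    result = model.predict(padded, imgsz=1024, conf=0.2, device="cuda:0")[0]
+    labels = {result.names[int(c)] for c in result.boxes.cls}
+    return "mixed_recognition" if PLAIN_TEXT_LABEL in labels else "formula_recognition"
+```
+
+The padded image, not the original, is what gets forwarded to PaddleOCR-VL, matching the preprocessing decision above.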
+
+### External Service Integration
+
+- PaddleOCR-VL: Connect to vLLM server at configurable URL (default: `http://localhost:8080/v1`)
+- DocLayout-YOLO: Load model from pre-downloaded path (not downloaded in container)
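+
+A minimal client sketch for this call follows. It assumes the vLLM server exposes the standard OpenAI-compatible `/chat/completions` route and accepts base64 data-URL images; the model name and prompt handling are placeholders, not the service's confirmed identifiers.
+
+```python
+import base64
+
+import httpx
+
+PADDLEOCR_VL_URL = "http://localhost:8080/v1"  # overridden by the PADDLEOCR_VL_URL env var
+
+
+async def recognize(image_bytes: bytes, prompt: str) -> str:
+    """Send one image plus an instruction prompt to the PaddleOCR-VL server."""
+    data_url = "data:image/png;base64," + base64.b64encode(image_bytes).decode()
+    payload = {
+        "model": "PaddleOCR-VL",  # placeholder; use the name the vLLM server actually serves
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": data_url}},
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ],
+    }
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        resp = await client.post(f"{PADDLEOCR_VL_URL}/chat/completions", json=payload)
+        resp.raise_for_status()
+        return resp.json()["choices"][0]["message"]["content"]
+```
+
+Retry logic with exponential backoff (see the risk table below) would wrap this call rather than live inside it.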
+
+### Docker Strategy
+
+- Base image: NVIDIA CUDA with Python 3.11
+- Pre-install OpenCV dependencies (`libgl1-mesa-glx`, `libglib2.0-0`)
+- Mount model directory for DocLayout-YOLO weights
+- Expose port 8053
+- Use Uvicorn with multiple workers
+
+## Risks / Trade-offs
+
+| Risk                               | Mitigation                                                          |
+| ---------------------------------- | ------------------------------------------------------------------- |
+| PaddleOCR-VL service unavailable   | Health check endpoint, retry logic with exponential backoff         |
+| Large image memory consumption     | Configure max image size, resize before processing                  |
+| DocLayout-YOLO model loading time  | Load model once at startup, keep in memory                          |
+| GPU memory contention              | DocLayout-YOLO uses GPU; PaddleOCR-VL runs on separate vLLM server  |
+
+## Configuration
+
+Environment variables:
+
+- `PADDLEOCR_VL_URL`: vLLM server URL (default: `http://localhost:8080/v1`)
+- `DOCLAYOUT_MODEL_PATH`: Path to DocLayout-YOLO weights
+- `PP_DOCLAYOUT_MODEL_DIR`: Path to PP-DocLayoutV2 model directory
+- `MAX_IMAGE_SIZE_MB`: Maximum upload size (default: 10)
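+
+These map directly onto a Pydantic Settings class. A minimal sketch (the field-to-variable pairing mirrors the list above; the path defaults follow the docker-compose mounts and are assumptions, not requirements):
+
+```python
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    """Each field is overridable via the matching environment variable."""
+
+    paddleocr_vl_url: str = "http://localhost:8080/v1"  # PADDLEOCR_VL_URL
+    doclayout_model_path: str = (  # DOCLAYOUT_MODEL_PATH
+        "/app/models/DocLayout/doclayout_yolo_docstructbench_imgsz1024.pt"
+    )
+    pp_doclayout_model_dir: str = "/app/models/PP-DocLayout"  # PP_DOCLAYOUT_MODEL_DIR
+    max_image_size_mb: int = 10  # MAX_IMAGE_SIZE_MB
+
+
+settings = Settings()
+```
+
+Pydantic Settings matches environment variables to field names case-insensitively, so no per-field configuration is needed here.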
+
+## Open Questions
+
+- Should we add async queue for large batch processing? (Defer to future change)
+- Do we need WebSocket for progress updates? (Defer to future change)
diff --git a/openspec/changes/add-doc-processing-api/proposal.md b/openspec/changes/add-doc-processing-api/proposal.md
new file mode 100644
index 0000000..b5592de
--- /dev/null
+++ b/openspec/changes/add-doc-processing-api/proposal.md
@@ -0,0 +1,31 @@
+# Change: Add Document Processing API
+
+## Why
+
+DocProcesser needs a FastAPI backend to accept images (via URL or base64) and convert them to LaTeX/Markdown/MathML, plus a markdown-to-DOCX conversion endpoint. This establishes the core functionality of the project.
+
+## What Changes
+
+- **BREAKING**: Initial project setup (new FastAPI project structure)
+- Add image-to-OCR API endpoint (`POST /doc_process/v1/image/ocr`)
+  - Accept `image_url` or `image_base64` input
+  - Preprocess with OpenCV (30% whitespace padding)
+  - Use DocLayout-YOLO for layout detection
+  - Route to PaddleOCR-VL (with PP-DocLayoutV2) for text/formula recognition
+  - If a `plain_text` element exists, use PP-DocLayoutV2 to recognize the image as `mixed_recognition`; otherwise call the PaddleOCR-VL API directly with a formula-recognition prompt as `formula_recognition`
+  - Referencing the markdown_2_docx code, convert the markdown to LaTeX and MathML for `mixed_recognition`, and convert the LaTeX to Markdown and MathML for `formula_recognition`
+  - Return LaTeX, Markdown, and MathML outputs
+- Add markdown-to-DOCX API endpoint (`POST /doc_process/v1/convert/docx`)
+  - Accept markdown content
+  - Use the markdown_2_docx library for conversion (http://github.com/YogeLiu/markdown_2_docx)
+  - Return DOCX file
+- Add Dockerfile for GPU-enabled deployment (RTX 5080, port 8053)
+
+## Impact
+
+- Affected specs: `image-ocr`, `markdown-docx`
+- Affected code: New project structure under `app/`
+- External dependencies:
+  - DocLayout-YOLO (pre-downloaded model, not fetched in container)
+  - PaddleOCR-VL with vLLM backend (external service at localhost:8080)
+  - markdown_2_docx library
diff --git a/openspec/changes/add-doc-processing-api/specs/image-ocr/spec.md b/openspec/changes/add-doc-processing-api/specs/image-ocr/spec.md
new file mode 100644
index 0000000..5ce690d
--- /dev/null
+++ b/openspec/changes/add-doc-processing-api/specs/image-ocr/spec.md
@@ -0,0 +1,137 @@
+## ADDED Requirements
+
+### Requirement: Image Input Acceptance
+
+The system SHALL accept images via `POST /api/v1/image/ocr` endpoint with either:
+
+- `image_url`: A publicly accessible URL to the image
+- `image_base64`: Base64-encoded image data
+
+The system SHALL return an error if neither input is provided or if both are provided simultaneously.
+
+#### Scenario: Image URL provided
+
+- **WHEN** a valid `image_url` is provided in the request body
+- **THEN** the system SHALL download the image and process it
+- **AND** return OCR results in the response
+
+#### Scenario: Base64 image provided
+
+- **WHEN** a valid `image_base64` string is provided in the request body
+- **THEN** the system SHALL decode the image and process it
+- **AND** return OCR results in the response
+
+#### Scenario: Invalid input
+
+- **WHEN** neither `image_url` nor `image_base64` is provided
+- **THEN** the system SHALL return HTTP 422 with validation error
+
+---
+
+### Requirement: Image Preprocessing with Padding
+
+The system SHALL preprocess all input images by adding 30% whitespace padding around the image borders using OpenCV.
+
+The padding calculation: `padding = int(max(height, width) * 0.15)` on each side (totaling 30% expansion).
+
+The padding color SHALL be white (`RGB: 255, 255, 255`).
+
+#### Scenario: Image padding applied
+
+- **WHEN** an image of dimensions 1000x800 pixels is received
+- **THEN** the system SHALL add approximately 150 pixels of white padding on each side
+- **AND** the resulting image dimensions SHALL be approximately 1300x1100 pixels
+
+---
+
+### Requirement: Layout Detection with DocLayout-YOLO
+
+The system SHALL use the DocLayout-YOLO model to detect document layout regions including:
+
+- Plain text blocks
+- Formulas/equations
+- Tables
+- Figures
+
+The model SHALL be loaded from a pre-configured local path (not downloaded at runtime).
+
+#### Scenario: Layout detection success
+
+- **WHEN** a padded image is passed to DocLayout-YOLO
+- **THEN** the system SHALL return detected regions with bounding boxes and class labels
+- **AND** confidence scores for each detection
+
+#### Scenario: Model not available
+
+- **WHEN** the DocLayout-YOLO model file is not found at the configured path
+- **THEN** the system SHALL fail startup with a clear error message
+
+---
+
+### Requirement: OCR Processing with PaddleOCR-VL
+
+The system SHALL send images to PaddleOCR-VL (via vLLM backend) for text and formula recognition.
+
+PaddleOCR-VL SHALL be configured with PP-DocLayoutV2 for document layout understanding.
+
+The system SHALL handle both plain text and formula/math content.
+
+#### Scenario: Plain text recognition
+
+- **WHEN** DocLayout-YOLO detects plain text regions
+- **THEN** the system SHALL send the image to PaddleOCR-VL
+- **AND** return recognized text content
+
+#### Scenario: Formula recognition
+
+- **WHEN** DocLayout-YOLO detects formula/equation regions
+- **THEN** the system SHALL send the image to PaddleOCR-VL
+- **AND** return formula content in LaTeX format
+
+#### Scenario: Mixed content handling
+
+- **WHEN** DocLayout-YOLO detects both text and formula regions
+- **THEN** the system SHALL process all regions via PaddleOCR-VL with PP-DocLayoutV2
+- **AND** return combined results preserving document structure
+
+#### Scenario: PaddleOCR-VL service unavailable
+
+- **WHEN** the PaddleOCR-VL vLLM server is unreachable
+- **THEN** the system SHALL return HTTP 503 with service unavailable error
+
+---
+
+### Requirement: Multi-Format Output
+
+The system SHALL return OCR results in multiple formats:
+
+- `latex`: LaTeX representation of the content
+- `markdown`: Markdown representation of the content
+- `mathml`: MathML representation for mathematical content
+
+#### Scenario: Successful OCR response
+
+- **WHEN** image processing completes successfully
+- **THEN** the response SHALL include:
+  - `latex`: string containing LaTeX output
+  - `markdown`: string containing Markdown output
+  - `mathml`: string containing MathML output (empty string if no math detected)
+- **AND** HTTP status code SHALL be 200
+
+#### Scenario: Response structure
+
+- **WHEN** the OCR endpoint returns successfully
+- **THEN** the response body SHALL be JSON with structure:
+
+```json
+{
+  "latex": "...",
+  "markdown": "...",
+  "mathml": "...",
+  "layout_info": {
+    "regions": [
+      {"type": "text|formula|table|figure", "bbox": [x1, y1, x2, y2], "confidence": 0.95}
+    ]
+  }
+}
+```
diff --git a/openspec/changes/add-doc-processing-api/specs/markdown-docx/spec.md b/openspec/changes/add-doc-processing-api/specs/markdown-docx/spec.md
new file mode 100644
index 0000000..6b27820
--- /dev/null
+++ b/openspec/changes/add-doc-processing-api/specs/markdown-docx/spec.md
@@ -0,0 +1,93 @@
+## ADDED Requirements
+
+### Requirement: Markdown Input Acceptance
+
+The system SHALL accept markdown content via `POST /api/v1/convert/docx` endpoint.
+
+The request body SHALL contain:
+- `markdown`: string containing the markdown content to convert
+
+#### Scenario: Valid markdown provided
+
+- **WHEN** valid markdown content is provided in the request body
+- **THEN** the system SHALL process and convert it to DOCX format
+
+#### Scenario: Empty markdown
+
+- **WHEN** an empty `markdown` string is provided
+- **THEN** the system SHALL return HTTP 422 with validation error
+
+---
+
+### Requirement: DOCX Conversion
+
+The system SHALL convert markdown content to DOCX format using the markdown_2_docx library.
+
+The conversion SHALL preserve:
+- Headings (H1-H6)
+- Paragraphs
+- Bold and italic formatting
+- Lists (ordered and unordered)
+- Code blocks
+- Tables
+- Images (if embedded as base64 or accessible URLs)
+
+#### Scenario: Basic markdown conversion
+
+- **WHEN** markdown with headings, paragraphs, and formatting is provided
+- **THEN** the system SHALL generate a valid DOCX file
+- **AND** the DOCX SHALL preserve the document structure
+
+#### Scenario: Complex markdown with tables
+
+- **WHEN** markdown containing tables is provided
+- **THEN** the system SHALL convert tables to Word table format
+- **AND** preserve table structure and content
+
+#### Scenario: Markdown with math formulas
+
+- **WHEN** markdown containing LaTeX math expressions is provided
+- **THEN** the system SHALL convert math to OMML (Office Math Markup Language) format
+- **AND** render correctly in Microsoft Word
+
+---
+
+### Requirement: DOCX File Response
+
+The system SHALL return the generated DOCX file as a binary download.
+
+The response SHALL include:
+- Content-Type: `application/vnd.openxmlformats-officedocument.wordprocessingml.document`
+- Content-Disposition: `attachment; filename="output.docx"`
+
+#### Scenario: Successful conversion response
+
+- **WHEN** markdown conversion completes successfully
+- **THEN** the response SHALL be the DOCX file binary
+- **AND** HTTP status code SHALL be 200
+- **AND** appropriate headers for file download SHALL be set
+
+#### Scenario: Custom filename
+
+- **WHEN** an optional `filename` parameter is provided in the request
+- **THEN** the Content-Disposition header SHALL use the provided filename
+- **AND** append `.docx` extension if not present
+
+---
+
+### Requirement: Error Handling
+
+The system SHALL provide clear error responses for conversion failures.
+
+#### Scenario: Conversion failure
+
+- **WHEN** markdown_2_docx fails to convert the content
+- **THEN** the system SHALL return HTTP 500 with error details
+- **AND** the error message SHALL describe the failure reason
+
+#### Scenario: Malformed markdown
+
+- **WHEN** severely malformed markdown is provided
+- **THEN** the system SHALL attempt best-effort conversion
+- **AND** log a warning about potential formatting issues
+
diff --git a/openspec/changes/add-doc-processing-api/tasks.md b/openspec/changes/add-doc-processing-api/tasks.md
new file mode 100644
index 0000000..ef8d1f3
--- /dev/null
+++ b/openspec/changes/add-doc-processing-api/tasks.md
@@ -0,0 +1,34 @@
+## 1. Project Scaffolding
+
+- [x] 1.1 Create FastAPI project structure (`app/`, `api/`, `core/`, `services/`, `schemas/`)
+- [x] 1.2 Use uv to manage dependencies (fastapi, uvicorn, opencv-python, python-multipart, pydantic, httpx)
+- [x] 1.3 Create `app/main.py` with FastAPI app initialization
+- [x] 1.4 Create `app/core/config.py` with Pydantic Settings
+
+## 2. Image OCR API
+
+- [x] 2.1 Create request/response schemas in `app/schemas/image.py`
+- [x] 2.2 Implement image preprocessing service with OpenCV padding (`app/services/image_processor.py`)
+- [x] 2.3 Implement DocLayout-YOLO wrapper (`app/services/layout_detector.py`)
+- [x] 2.4 Implement PaddleOCR-VL client (`app/services/ocr_service.py`)
+- [x] 2.5 Create image OCR endpoint (`app/api/v1/endpoints/image.py`)
+- [x] 2.6 Wire up router and test endpoint
+
+## 3. Markdown to DOCX API
+
+- [x] 3.1 Create request/response schemas in `app/schemas/convert.py`
+- [x] 3.2 Integrate markdown_2_docx library (`app/services/docx_converter.py`)
+- [x] 3.3 Create conversion endpoint (`app/api/v1/endpoints/convert.py`)
+- [x] 3.4 Wire up router and test endpoint
+
+## 4. Deployment
+
+- [x] 4.1 Create Dockerfile with CUDA base image for RTX 5080
+- [x] 4.2 Create docker-compose.yml (optional, for local development)
+- [x] 4.3 Document deployment steps in README
+
+## 5. Validation
+
+- [ ] 5.1 Test image OCR endpoint with sample images
+- [ ] 5.2 Test markdown to DOCX conversion
+- [ ] 5.3 Verify Docker build and GPU access
diff --git a/openspec/project.md b/openspec/project.md
new file mode 100644
index 0000000..3d87cc8
--- /dev/null
+++ b/openspec/project.md
@@ -0,0 +1,42 @@
+# Project Context
+
+## Purpose
+
+This project is DocProcesser, which converts images to LaTeX, Markdown, MathML, OMML, etc.
+It is a FastAPI web project: it accepts requests from upstream, processes the image itself or sends it to third-party services, and returns the result to the upstream caller.
+
+## Tech Stack
+
+- python
+- fastapi
+
+## Project Conventions
+
+### Code Style
+
+[Describe your code style preferences, formatting rules, and naming conventions]
+
+### Architecture Patterns
+
+[Document your architectural decisions and patterns]
+
+### Testing Strategy
+
+[Explain your testing approach and requirements]
+
+### Git Workflow
+
+[Describe your branching strategy and commit conventions]
+
+## Domain Context
+
+- DocLayout
+  A YOLO model that recognizes document layout (books, papers, newspapers); it is used to determine whether an image contains plain text.
+
+## Important Constraints
+
+[List any technical, business, or regulatory constraints]
+
+## External Dependencies
+
+[Document key external services, APIs, or systems]
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..42abbfb
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,53 @@
+[project]
+name = "doc-processer"
+version = "0.1.0"
+description = "Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX"
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "MIT" }
+authors = [
+    { name = "YogeLiu" }
+]
+
+dependencies = [
+    "fastapi>=0.115.0",
+    "uvicorn[standard]>=0.32.0",
+    "opencv-python>=4.10.0",
+    "python-multipart>=0.0.12",
+    "pydantic>=2.10.0",
+    "pydantic-settings>=2.6.0",
+    "httpx>=0.28.0",
+    "numpy>=1.26.0",
+    "pillow>=10.4.0",
+    "python-docx>=1.1.0",
+    "paddleocr>=2.9.0",
+    "doclayout-yolo>=0.0.2",
+    "latex2mathml>=3.77.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-asyncio>=0.24.0",
+    "ruff>=0.8.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["app"]
+
+[tool.ruff]
+target-version = "py311"
+line-length = 100
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "UP"]
+ignore = ["E501"]
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]