commit 874fd383ccdafbcdeb205c774b30a19fca242eda
Author: liuyuanchuang
Date:   Mon Dec 29 17:34:58 2025 +0800

    init repo

diff --git a/.cursor/commands/openspec-apply.md b/.cursor/commands/openspec-apply.md
new file mode 100644
index 0000000..99a9148
--- /dev/null
+++ b/.cursor/commands/openspec-apply.md
@@ -0,0 +1,23 @@
+---
+name: /openspec-apply
+id: openspec-apply
+category: OpenSpec
+description: Implement an approved OpenSpec change and keep tasks in sync.
+---
+
+**Guardrails**
+- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required.
+- Keep changes tightly scoped to the requested outcome.
+- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications.
+
+**Steps**
+Track these steps as TODOs and complete them one by one.
+1. Read `changes/<change-id>/proposal.md`, `design.md` (if present), and `tasks.md` to confirm scope and acceptance criteria.
+2. Work through tasks sequentially, keeping edits minimal and focused on the requested change.
+3. Confirm completion before updating statuses—make sure every item in `tasks.md` is finished.
+4. Update the checklist after all work is done so each task is marked `- [x]` and reflects reality.
+5. Reference `openspec list` or `openspec show <change-id>` when additional context is required.
+
+**Reference**
+- Use `openspec show <change-id> --json --deltas-only` if you need additional context from the proposal while implementing.
+
diff --git a/.cursor/commands/openspec-archive.md b/.cursor/commands/openspec-archive.md
new file mode 100644
index 0000000..492b3ed
--- /dev/null
+++ b/.cursor/commands/openspec-archive.md
@@ -0,0 +1,27 @@
+---
+name: /openspec-archive
+id: openspec-archive
+category: OpenSpec
+description: Archive a deployed OpenSpec change and update specs.
+---
+
+**Guardrails**
+- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required.
+- Keep changes tightly scoped to the requested outcome.
+- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications.
+
+**Steps**
+1. Determine the change ID to archive:
+   - If this prompt already includes a specific change ID (for example supplied through slash-command arguments), use that value after trimming whitespace.
+   - If the conversation references a change loosely (for example by title or summary), run `openspec list` to surface likely IDs, share the relevant candidates, and confirm which one the user intends.
+   - Otherwise, review the conversation, run `openspec list`, and ask the user which change to archive; wait for a confirmed change ID before proceeding.
+   - If you still cannot identify a single change ID, stop and tell the user you cannot archive anything yet.
+2. Validate the change ID by running `openspec list` (or `openspec show <change-id>`) and stop if the change is missing, already archived, or otherwise not ready to archive.
+3. Run `openspec archive <change-id> --yes` so the CLI moves the change and applies spec updates without prompts (use `--skip-specs` only for tooling-only work).
+4. Review the command output to confirm the target specs were updated and the change landed in `changes/archive/`.
+5. Validate with `openspec validate --strict` and inspect with `openspec show <change-id>` if anything looks off.
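+
+A minimal sketch of the happy path, assuming a hypothetical change ID `add-example-feature`:
+
+```bash
+# add-example-feature is a hypothetical change ID used for illustration
+openspec show add-example-feature           # confirm the change exists and is ready
+openspec archive add-example-feature --yes  # move the change and apply spec updates
+openspec validate --strict                  # confirm everything still validates
+```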
+
+**Reference**
+- Use `openspec list` to confirm change IDs before archiving.
+- Inspect refreshed specs with `openspec list --specs` and address any validation issues before handing off.
+
diff --git a/.cursor/commands/openspec-proposal.md b/.cursor/commands/openspec-proposal.md
new file mode 100644
index 0000000..25f1a3f
--- /dev/null
+++ b/.cursor/commands/openspec-proposal.md
@@ -0,0 +1,28 @@
+---
+name: /openspec-proposal
+id: openspec-proposal
+category: OpenSpec
+description: Scaffold a new OpenSpec change and validate strictly.
+---
+
+**Guardrails**
+- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required.
+- Keep changes tightly scoped to the requested outcome.
+- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications.
+- Identify any vague or ambiguous details and ask the necessary follow-up questions before editing files.
+- Do not write any code during the proposal stage. Only create design documents (proposal.md, tasks.md, design.md, and spec deltas). Implementation happens in the apply stage after approval.
+
+**Steps**
+1. Review `openspec/project.md`, run `openspec list` and `openspec list --specs`, and inspect related code or docs (e.g., via `rg`/`ls`) to ground the proposal in current behaviour; note any gaps that require clarification.
+2. Choose a unique verb-led `change-id` and scaffold `proposal.md`, `tasks.md`, and `design.md` (when needed) under `openspec/changes/<change-id>/`.
+3. Map the change into concrete capabilities or requirements, breaking multi-scope efforts into distinct spec deltas with clear relationships and sequencing.
+4. Capture architectural reasoning in `design.md` when the solution spans multiple systems, introduces new patterns, or demands trade-off discussion before committing to specs.
+5. Draft spec deltas in `changes/<change-id>/specs/<capability>/spec.md` (one folder per capability) using `## ADDED|MODIFIED|REMOVED Requirements` with at least one `#### Scenario:` per requirement and cross-reference related capabilities when relevant.
+6. Draft `tasks.md` as an ordered list of small, verifiable work items that deliver user-visible progress, include validation (tests, tooling), and highlight dependencies or parallelizable work.
+7. Validate with `openspec validate <change-id> --strict` and resolve every issue before sharing the proposal.
+
+**Reference**
+- Use `openspec show <change-id> --json --deltas-only` or `openspec show <spec-id> --type spec` to inspect details when validation fails.
+- Search existing requirements with `rg -n "Requirement:|Scenario:" openspec/specs` before writing new ones.
+- Explore the codebase with `rg <pattern>`, `ls`, or direct file reads so proposals align with current implementation realities.
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f8ab18c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,73 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+.venv/
+venv/
+ENV/
+env/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+
+# Environment
+.env
+.env.local
+.env.*.local
+
+# Models (large files - download separately)
+models/
+*.pt
+*.onnx
+*.pdmodel
+*.pdiparams
+
+# Logs
+*.log
+logs/
+
+# Temporary files
+tmp/
+temp/
+*.tmp
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Test
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+
+# Docker
+.docker/
+
+# uv
+uv.lock
+
+model/*
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..0669699
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,18 @@
+<!-- OPENSPEC:START -->
+# OpenSpec Instructions
+
+These instructions are for AI assistants working in this project.
+
+Always open `@/openspec/AGENTS.md` when the request:
+- Mentions planning or proposals (words like proposal, spec, change, plan)
+- Introduces new capabilities, breaking changes, architecture shifts, or big performance/security work
+- Sounds ambiguous and you need the authoritative spec before coding
+
+Use `@/openspec/AGENTS.md` to learn:
+- How to create and apply change proposals
+- Spec format and conventions
+- Project structure and guidelines
+
+Keep this managed block so 'openspec update' can refresh the instructions.
+
+<!-- OPENSPEC:END -->
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..3f3b60c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,62 @@
+# DocProcesser Dockerfile
+# Optimized for RTX 5080 GPU deployment
+
+# Use NVIDIA CUDA base image on Ubuntu 24.04 (ships Python 3.12)
+FROM nvidia/cuda:12.8.0-runtime-ubuntu24.04
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies (Ubuntu 24.04 ships Python 3.12; the python3.11
+# and libgl1-mesa-glx packages are not available in its repositories)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.12 \
+    python3.12-venv \
+    python3.12-dev \
+    python3-pip \
+    libgl1 \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1 \
+    curl \
+    && rm -rf /var/lib/apt/lists/* \
+    && ln -sf /usr/bin/python3.12 /usr/bin/python \
+    && ln -sf /usr/bin/python3.12 /usr/bin/python3
+
+# Install uv for fast package management
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:$PATH"
+
+# Copy dependency files first for better caching
+COPY pyproject.toml ./
+
+# Create virtual environment and install dependencies
+RUN uv venv /app/.venv
+ENV PATH="/app/.venv/bin:$PATH"
+ENV VIRTUAL_ENV="/app/.venv"
+
+RUN uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e .
+
+# Copy application code
+COPY app/ ./app/
+
+# Create model directories (models should be mounted at runtime)
+RUN mkdir -p /app/app/model/DocLayout /app/app/model/PP-DocLayout
+
+# Expose port
+EXPOSE 8053
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8053/health || exit 1
+
+# Run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"]
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b6f8c9a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,199 @@
+# DocProcesser
+
+Document processing API built with FastAPI.
Converts images to LaTeX/Markdown/MathML and Markdown to DOCX. + +## Features + +- **Image OCR API** (`POST /doc_process/v1/image/ocr`) + - Accept images via URL or base64 + - Automatic layout detection using DocLayout-YOLO + - Text and formula recognition via PaddleOCR-VL + - Output in LaTeX, Markdown, and MathML formats + +- **Markdown to DOCX API** (`POST /doc_process/v1/convert/docx`) + - Convert markdown content to Word documents + - Preserve formatting, tables, and code blocks + +## Prerequisites + +- Python 3.11+ +- NVIDIA GPU with CUDA support (RTX 5080 recommended) +- PaddleOCR-VL service running via vLLM (default: `http://localhost:8080/v1`) +- Pre-downloaded models: + - DocLayout-YOLO + - PP-DocLayoutV2 + +## Quick Start + +### 1. Install Dependencies + +Using [uv](https://github.com/astral-sh/uv): + +```bash +# Install uv if not already installed +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Create virtual environment and install dependencies +uv venv +source .venv/bin/activate # On Windows: .venv\Scripts\activate +uv pip install -e . +``` + +### 2. Download Models + +Download the required models and place them in the `models/` directory: + +```bash +mkdir -p models/DocLayout models/PP-DocLayout + +# DocLayout-YOLO (from HuggingFace) +# https://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench +# Place the .pt file in models/DocLayout/ + +# PP-DocLayoutV2 (from PaddlePaddle) +# Place the model files in models/PP-DocLayout/ +``` + +### 3. Configure Environment + +Create a `.env` file: + +```bash +# PaddleOCR-VL vLLM server URL +PADDLEOCR_VL_URL=http://localhost:8080/v1 + +# Model paths +DOCLAYOUT_MODEL_PATH=models/DocLayout/doclayout_yolo_docstructbench_imgsz1024.pt +PP_DOCLAYOUT_MODEL_DIR=models/PP-DocLayout + +# Server settings +HOST=0.0.0.0 +PORT=8053 +``` + +### 4. Run the Server + +```bash +uvicorn app.main:app --host 0.0.0.0 --port 8053 +``` + +## Docker Deployment + +### Build and Run with GPU + +```bash +# Build the image +docker build -t doc-processer . 
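+# Note: the build context must include pyproject.toml and app/, which the
+# Dockerfile copies into the image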
+
+# Run with GPU support
+docker run --gpus all -p 8053:8053 \
+  -v ./models/DocLayout:/app/models/DocLayout:ro \
+  -v ./models/PP-DocLayout:/app/models/PP-DocLayout:ro \
+  -e DOCLAYOUT_MODEL_PATH=/app/models/DocLayout/doclayout_yolo_docstructbench_imgsz1024.pt \
+  -e PP_DOCLAYOUT_MODEL_DIR=/app/models/PP-DocLayout \
+  -e PADDLEOCR_VL_URL=http://host.docker.internal:8080/v1 \
+  doc-processer
+```
+
+The two model-path variables point the app at the mounted volumes; without them it falls back to the baked-in `app/model/` defaults and startup fails.
+
+### Using Docker Compose
+
+```bash
+# Start the service with GPU
+docker-compose up -d doc-processer
+
+# Or without GPU (CPU mode)
+docker-compose --profile cpu up -d doc-processer-cpu
+```
+
+## API Usage
+
+### Image OCR
+
+```bash
+# Using image URL
+curl -X POST http://localhost:8053/doc_process/v1/image/ocr \
+  -H "Content-Type: application/json" \
+  -d '{"image_url": "https://example.com/document.png"}'
+
+# Using base64 image
+curl -X POST http://localhost:8053/doc_process/v1/image/ocr \
+  -H "Content-Type: application/json" \
+  -d '{"image_base64": "iVBORw0KGgo..."}'
+```
+
+Response:
+```json
+{
+  "latex": "\\section{Title}...",
+  "markdown": "# Title\n...",
+  "mathml": "...",
+  "layout_info": {
+    "regions": [
+      {"type": "text", "bbox": [10, 20, 100, 50], "confidence": 0.95}
+    ],
+    "has_plain_text": true,
+    "has_formula": false
+  },
+  "recognition_mode": "mixed_recognition"
+}
+```
+
+### Markdown to DOCX
+
+```bash
+curl -X POST http://localhost:8053/doc_process/v1/convert/docx \
+  -H "Content-Type: application/json" \
+  -d '{"markdown": "# Hello World\n\nThis is a test.", "filename": "output"}' \
+  --output output.docx
+```
+
+## Project Structure
+
+```
+doc_processer/
+├── app/
+│   ├── api/v1/
+│   │   ├── endpoints/
+│   │   │   ├── image.py        # Image OCR endpoint
+│   │   │   └── convert.py      # Markdown to DOCX endpoint
+│   │   └── router.py
+│   ├── core/
+│   │   ├── config.py           # Settings
+│   │   └── dependencies.py     # DI providers
+│   ├── services/
+│   │   ├── image_processor.py  # OpenCV preprocessing
+│   │   ├── layout_detector.py  # DocLayout-YOLO
+│   │   ├── ocr_service.py      # PaddleOCR-VL client
+│   │   └── docx_converter.py   # Markdown to DOCX
+│   ├── schemas/
+│   │   ├── image.py
+│   │   └── convert.py
+│   └── main.py
+├── models/                     # Pre-downloaded models (git-ignored)
+├── Dockerfile
+├── docker-compose.yml
+├── pyproject.toml
+└── README.md
+```
+
+## Processing Pipeline
+
+### Image OCR Flow
+
+1. **Input**: Accept `image_url` or `image_base64`
+2. **Preprocessing**: Add 30% whitespace padding using OpenCV
+3. **Layout Detection**: DocLayout-YOLO detects regions (text, formula, table, figure)
+4. **Recognition**:
+   - If plain text detected → PP-DocLayoutV2 for mixed content recognition
+   - Otherwise → PaddleOCR-VL with formula prompt
+5.
**Output Conversion**: Generate LaTeX, Markdown, and MathML + +## Hardware Requirements + +- **Minimum**: 8GB GPU VRAM +- **Recommended**: RTX 5080 16GB or equivalent +- **CPU**: 4+ cores +- **RAM**: 16GB+ + +## License + +MIT + diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/v1/__init__.py b/app/api/v1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/v1/endpoints/__init__.py b/app/api/v1/endpoints/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/v1/endpoints/convert.py b/app/api/v1/endpoints/convert.py new file mode 100644 index 0000000..0ffa29a --- /dev/null +++ b/app/api/v1/endpoints/convert.py @@ -0,0 +1,37 @@ +"""Markdown to DOCX conversion endpoint.""" + +from fastapi import APIRouter, Depends, HTTPException +from fastapi.responses import Response + +from app.core.dependencies import get_docx_converter +from app.schemas.convert import MarkdownToDocxRequest +from app.services.docx_converter import DocxConverter + +router = APIRouter() + + +@router.post("/docx") +async def convert_markdown_to_docx( + request: MarkdownToDocxRequest, + converter: DocxConverter = Depends(get_docx_converter), +) -> Response: + """Convert markdown content to DOCX file. + + Returns the generated DOCX file as a binary download. + """ + try: + docx_bytes = converter.convert(request.markdown) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") + + # Determine filename + filename = request.filename or "output" + if not filename.endswith(".docx"): + filename = f"{filename}.docx" + + return Response( + content=docx_bytes, + media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + headers={"Content-Disposition": f'attachment; filename="{filename}"'}, + ) + diff --git a/app/api/v1/endpoints/image.py b/app/api/v1/endpoints/image.py new file mode 100644 index 0000000..c194213 --- /dev/null +++ b/app/api/v1/endpoints/image.py @@ -0,0 +1,59 @@ +"""Image OCR endpoint.""" + +from fastapi import APIRouter, Depends, HTTPException + +from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service +from app.schemas.image import ImageOCRRequest, ImageOCRResponse +from app.services.image_processor import ImageProcessor +from app.services.layout_detector import LayoutDetector +from app.services.ocr_service import OCRService + +router = APIRouter() + + +@router.post("/ocr", response_model=ImageOCRResponse) +async def process_image_ocr( + request: ImageOCRRequest, + image_processor: ImageProcessor = Depends(get_image_processor), + layout_detector: LayoutDetector = Depends(get_layout_detector), + ocr_service: OCRService = Depends(get_ocr_service), +) -> ImageOCRResponse: + """Process an image and extract content as LaTeX, Markdown, and MathML. + + The processing pipeline: + 1. Load and preprocess image (add 30% whitespace padding) + 2. Detect layout using DocLayout-YOLO + 3. Based on layout: + - If plain text exists: use PP-DocLayoutV2 for mixed recognition + - Otherwise: use PaddleOCR-VL with formula prompt + 4. Convert output to LaTeX, Markdown, and MathML formats + """ + try: + # 1. Load and preprocess image + image = image_processor.preprocess( + image_url=request.image_url, + image_base64=request.image_base64, + ) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + + try: + # 2. 
Detect layout + layout_info = layout_detector.detect(image) + except RuntimeError as e: + raise HTTPException(status_code=500, detail=f"Layout detection failed: {e}") + + try: + # 3. Perform OCR based on layout + ocr_result = ocr_service.recognize(image, layout_info) + except RuntimeError as e: + raise HTTPException(status_code=503, detail=str(e)) + + # 4. Return response + return ImageOCRResponse( + latex=ocr_result.get("latex", ""), + markdown=ocr_result.get("markdown", ""), + mathml=ocr_result.get("mathml", ""), + layout_info=layout_info, + recognition_mode=ocr_result.get("recognition_mode", ""), + ) diff --git a/app/api/v1/router.py b/app/api/v1/router.py new file mode 100644 index 0000000..a553985 --- /dev/null +++ b/app/api/v1/router.py @@ -0,0 +1,13 @@ +"""API v1 router combining all endpoints.""" + +from fastapi import APIRouter + +from app.api.v1.endpoints import convert, image + +api_router = APIRouter() + +# Include image processing endpoints +api_router.include_router(image.router, prefix="/image", tags=["Image OCR"]) + +# Include conversion endpoints +api_router.include_router(convert.router, prefix="/convert", tags=["Conversion"]) diff --git a/app/core/__init__.py b/app/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/core/config.py b/app/core/config.py new file mode 100644 index 0000000..0a57ad1 --- /dev/null +++ b/app/core/config.py @@ -0,0 +1,52 @@ +"""Application configuration using Pydantic Settings.""" + +from functools import lru_cache +from pathlib import Path + +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + """Application settings loaded from environment variables.""" + + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + case_sensitive=False, + ) + + # API Settings + api_prefix: str = "/doc_process/v1" + debug: bool = False + + # PaddleOCR-VL Settings + paddleocr_vl_url: str = "http://localhost:8080/v1" + + # Model Paths + doclayout_model_path: str = "app/model/DocLayout" + pp_doclayout_model_dir: str = "app/model/PP-DocLayout" + + # Image Processing + max_image_size_mb: int = 10 + image_padding_ratio: float = 0.15 # 15% on each side = 30% total expansion + + # Server Settings + host: str = "0.0.0.0" + port: int = 8053 + + @property + def doclayout_model_file(self) -> Path: + """Get the DocLayout model file path.""" + return Path(self.doclayout_model_path) + + @property + def pp_doclayout_dir(self) -> Path: + """Get the PP-DocLayout model directory path.""" + return Path(self.pp_doclayout_model_dir) + + +@lru_cache +def get_settings() -> Settings: + """Get cached settings instance.""" + return Settings() + diff --git a/app/core/dependencies.py b/app/core/dependencies.py new file mode 100644 index 0000000..dcd04ae --- /dev/null +++ b/app/core/dependencies.py @@ -0,0 +1,42 @@ +"""Application dependencies.""" + +from app.services.image_processor import ImageProcessor +from app.services.layout_detector import LayoutDetector +from app.services.ocr_service import OCRService +from app.services.docx_converter import DocxConverter + +# Global instances (initialized on startup) +_layout_detector: LayoutDetector | None = None + + +def init_layout_detector(model_path: str) -> None: + """Initialize the global layout detector. + + Called during application startup. 
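+
+    Invoked from the FastAPI lifespan handler in app/main.py before the
+    application starts serving requests.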
+ """ + global _layout_detector + _layout_detector = LayoutDetector(model_path=model_path) + _layout_detector.load_model() + + +def get_layout_detector() -> LayoutDetector: + """Get the global layout detector instance.""" + if _layout_detector is None: + raise RuntimeError("Layout detector not initialized. Call init_layout_detector() first.") + return _layout_detector + + +def get_image_processor() -> ImageProcessor: + """Get an image processor instance.""" + return ImageProcessor() + + +def get_ocr_service() -> OCRService: + """Get an OCR service instance.""" + return OCRService() + + +def get_docx_converter() -> DocxConverter: + """Get a DOCX converter instance.""" + return DocxConverter() + diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..174b5ae --- /dev/null +++ b/app/main.py @@ -0,0 +1,39 @@ +"""FastAPI application entry point.""" + +from contextlib import asynccontextmanager + +from fastapi import FastAPI + +from app.api.v1.router import api_router +from app.core.config import get_settings +from app.core.dependencies import init_layout_detector + +settings = get_settings() + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Application lifespan handler for startup/shutdown.""" + # Startup: Load models + init_layout_detector(model_path=settings.doclayout_model_path) + + yield + + # Shutdown: Cleanup happens automatically + + +app = FastAPI( + title="DocProcesser API", + description="Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX", + version="0.1.0", + lifespan=lifespan, +) + +# Include API router +app.include_router(api_router, prefix=settings.api_prefix) + + +@app.get("/health") +async def health_check(): + """Health check endpoint.""" + return {"status": "healthy"} diff --git a/app/model/__init__.py b/app/model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/schemas/convert.py b/app/schemas/convert.py new file mode 100644 index 0000000..09661e3 --- /dev/null +++ b/app/schemas/convert.py @@ -0,0 +1,19 @@ +"""Request and response schemas for markdown to DOCX conversion endpoint.""" + +from pydantic import BaseModel, Field, field_validator + + +class MarkdownToDocxRequest(BaseModel): + """Request body for markdown to DOCX conversion endpoint.""" + + markdown: str = Field(..., description="Markdown content to convert") + filename: str | None = Field(None, description="Optional output filename (without extension)") + + @field_validator("markdown") + @classmethod + def validate_markdown_not_empty(cls, v: str) -> str: + """Validate that markdown content is not empty.""" + if not v or not v.strip(): + raise ValueError("Markdown content cannot be empty") + return v + diff --git a/app/schemas/image.py b/app/schemas/image.py new file mode 100644 index 0000000..ed81233 --- /dev/null +++ b/app/schemas/image.py @@ -0,0 +1,48 @@ +"""Request and response schemas for image OCR endpoint.""" + +from pydantic import BaseModel, Field, model_validator + + +class LayoutRegion(BaseModel): + """A detected layout region in the document.""" + + type: str = Field(..., description="Region type: text, formula, table, figure") + bbox: list[float] = Field(..., description="Bounding box [x1, y1, x2, y2]") + confidence: float = Field(..., description="Detection confidence score") + + +class LayoutInfo(BaseModel): + """Layout detection information.""" + + regions: list[LayoutRegion] = Field(default_factory=list) + 
has_plain_text: bool = Field(False, description="Whether plain text was detected") + has_formula: bool = Field(False, description="Whether formulas were detected") + + +class ImageOCRRequest(BaseModel): + """Request body for image OCR endpoint.""" + + image_url: str | None = Field(None, description="URL to fetch the image from") + image_base64: str | None = Field(None, description="Base64-encoded image data") + + @model_validator(mode="after") + def validate_input(self): + """Validate that exactly one of image_url or image_base64 is provided.""" + if self.image_url is None and self.image_base64 is None: + raise ValueError("Either image_url or image_base64 must be provided") + if self.image_url is not None and self.image_base64 is not None: + raise ValueError("Only one of image_url or image_base64 should be provided") + return self + + +class ImageOCRResponse(BaseModel): + """Response body for image OCR endpoint.""" + + latex: str = Field("", description="LaTeX representation of the content") + markdown: str = Field("", description="Markdown representation of the content") + mathml: str = Field("", description="MathML representation (empty if no math detected)") + layout_info: LayoutInfo = Field(default_factory=LayoutInfo) + recognition_mode: str = Field( + "", description="Recognition mode used: mixed_recognition or formula_recognition" + ) + diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/services/docx_converter.py b/app/services/docx_converter.py new file mode 100644 index 0000000..6364507 --- /dev/null +++ b/app/services/docx_converter.py @@ -0,0 +1,335 @@ +"""Markdown to DOCX conversion service. + +Reference implementation based on https://github.com/YogeLiu/markdown_2_docx +""" + +import io +import re +from dataclasses import dataclass + +from docx import Document +from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.oxml import OxmlElement +from docx.oxml.ns import qn +from docx.shared import Inches, Pt + + +@dataclass +class MarkdownElement: + """Parsed markdown element.""" + + type: str # heading, paragraph, list_item, code_block, table, math + content: str + level: int = 0 # For headings and lists + language: str = "" # For code blocks + + +class DocxConverter: + """Converts markdown content to DOCX format.""" + + def __init__(self): + """Initialize the converter.""" + self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$") + self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$") + self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$") + self.code_block_pattern = re.compile(r"^```(\w*)$") + self.inline_code_pattern = re.compile(r"`([^`]+)`") + self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*") + self.italic_pattern = re.compile(r"\*([^*]+)\*") + self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL) + self.inline_math_pattern = re.compile(r"\$([^$]+)\$") + + def convert(self, markdown: str) -> bytes: + """Convert markdown content to DOCX. + + Args: + markdown: Markdown content to convert. + + Returns: + DOCX file as bytes. + """ + doc = Document() + elements = self._parse_markdown(markdown) + + for element in elements: + self._add_element_to_doc(doc, element) + + # Save to bytes + buffer = io.BytesIO() + doc.save(buffer) + buffer.seek(0) + return buffer.getvalue() + + def _parse_markdown(self, markdown: str) -> list[MarkdownElement]: + """Parse markdown into elements. + + Args: + markdown: Markdown content. + + Returns: + List of parsed elements. 
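+
+        Example:
+            >>> DocxConverter()._parse_markdown("# Title")[0].type
+            'heading'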
+ """ + elements: list[MarkdownElement] = [] + lines = markdown.split("\n") + i = 0 + in_code_block = False + code_content = [] + code_language = "" + + while i < len(lines): + line = lines[i] + + # Code block handling + code_match = self.code_block_pattern.match(line) + if code_match: + if in_code_block: + elements.append( + MarkdownElement( + type="code_block", + content="\n".join(code_content), + language=code_language, + ) + ) + code_content = [] + in_code_block = False + else: + in_code_block = True + code_language = code_match.group(1) + i += 1 + continue + + if in_code_block: + code_content.append(line) + i += 1 + continue + + # Math block ($$...$$) + if line.strip().startswith("$$"): + math_content = [] + if line.strip() == "$$": + i += 1 + while i < len(lines) and lines[i].strip() != "$$": + math_content.append(lines[i]) + i += 1 + else: + # Single line $$...$$ or start + content = line.strip()[2:] + if content.endswith("$$"): + math_content.append(content[:-2]) + else: + math_content.append(content) + i += 1 + while i < len(lines): + if lines[i].strip().endswith("$$"): + math_content.append(lines[i].strip()[:-2]) + break + math_content.append(lines[i]) + i += 1 + + elements.append( + MarkdownElement(type="math", content="\n".join(math_content)) + ) + i += 1 + continue + + # Heading + heading_match = self.heading_pattern.match(line) + if heading_match: + level = len(heading_match.group(1)) + content = heading_match.group(2) + elements.append( + MarkdownElement(type="heading", content=content, level=level) + ) + i += 1 + continue + + # Unordered list + list_match = self.list_pattern.match(line) + if list_match: + indent = len(list_match.group(1)) + content = list_match.group(2) + elements.append( + MarkdownElement(type="list_item", content=content, level=indent // 2) + ) + i += 1 + continue + + # Ordered list + ordered_match = self.ordered_list_pattern.match(line) + if ordered_match: + indent = len(ordered_match.group(1)) + content = ordered_match.group(2) + elements.append( + MarkdownElement( + type="ordered_list_item", content=content, level=indent // 2 + ) + ) + i += 1 + continue + + # Table (simple detection) + if "|" in line and i + 1 < len(lines) and "---" in lines[i + 1]: + table_lines = [line] + i += 1 + while i < len(lines) and "|" in lines[i]: + table_lines.append(lines[i]) + i += 1 + elements.append( + MarkdownElement(type="table", content="\n".join(table_lines)) + ) + continue + + # Regular paragraph + if line.strip(): + elements.append(MarkdownElement(type="paragraph", content=line)) + + i += 1 + + return elements + + def _add_element_to_doc(self, doc: Document, element: MarkdownElement) -> None: + """Add a markdown element to the document. + + Args: + doc: Word document. + element: Parsed markdown element. 
+ """ + if element.type == "heading": + self._add_heading(doc, element.content, element.level) + elif element.type == "paragraph": + self._add_paragraph(doc, element.content) + elif element.type == "list_item": + self._add_list_item(doc, element.content, element.level, ordered=False) + elif element.type == "ordered_list_item": + self._add_list_item(doc, element.content, element.level, ordered=True) + elif element.type == "code_block": + self._add_code_block(doc, element.content) + elif element.type == "table": + self._add_table(doc, element.content) + elif element.type == "math": + self._add_math(doc, element.content) + + def _add_heading(self, doc: Document, content: str, level: int) -> None: + """Add a heading to the document.""" + # Map markdown levels to Word heading styles + heading_level = min(level, 9) # Word supports up to Heading 9 + doc.add_heading(content, level=heading_level) + + def _add_paragraph(self, doc: Document, content: str) -> None: + """Add a paragraph with inline formatting.""" + para = doc.add_paragraph() + self._add_formatted_text(para, content) + + def _add_formatted_text(self, para, content: str) -> None: + """Add text with inline formatting (bold, italic, code).""" + # Simple approach: process inline patterns + remaining = content + + while remaining: + # Find next formatting marker + bold_match = self.bold_pattern.search(remaining) + italic_match = self.italic_pattern.search(remaining) + code_match = self.inline_code_pattern.search(remaining) + math_match = self.inline_math_pattern.search(remaining) + + matches = [ + (bold_match, "bold"), + (italic_match, "italic"), + (code_match, "code"), + (math_match, "math"), + ] + matches = [(m, t) for m, t in matches if m] + + if not matches: + para.add_run(remaining) + break + + # Find earliest match + earliest = min(matches, key=lambda x: x[0].start()) + match, match_type = earliest + + # Add text before match + if match.start() > 0: + para.add_run(remaining[: match.start()]) + + # Add formatted text + run = para.add_run(match.group(1)) + if match_type == "bold": + run.bold = True + elif match_type == "italic": + run.italic = True + elif match_type == "code": + run.font.name = "Courier New" + run.font.size = Pt(10) + elif match_type == "math": + run.italic = True + + remaining = remaining[match.end() :] + + def _add_list_item( + self, doc: Document, content: str, level: int, ordered: bool + ) -> None: + """Add a list item.""" + para = doc.add_paragraph(style="List Bullet" if not ordered else "List Number") + para.paragraph_format.left_indent = Inches(0.25 * level) + self._add_formatted_text(para, content) + + def _add_code_block(self, doc: Document, content: str) -> None: + """Add a code block.""" + para = doc.add_paragraph() + para.paragraph_format.left_indent = Inches(0.5) + + run = para.add_run(content) + run.font.name = "Courier New" + run.font.size = Pt(9) + + # Add shading + shading = OxmlElement("w:shd") + shading.set(qn("w:val"), "clear") + shading.set(qn("w:fill"), "F0F0F0") + para._p.get_or_add_pPr().append(shading) + + def _add_table(self, doc: Document, content: str) -> None: + """Add a table from markdown table format.""" + lines = [l.strip() for l in content.split("\n") if l.strip()] + if len(lines) < 2: + return + + # Parse header + header = [c.strip() for c in lines[0].split("|") if c.strip()] + + # Skip separator line + data_lines = lines[2:] if len(lines) > 2 else [] + + # Create table + table = doc.add_table(rows=1, cols=len(header)) + table.style = "Table Grid" + + # Add header + header_cells = 
table.rows[0].cells + for i, text in enumerate(header): + header_cells[i].text = text + header_cells[i].paragraphs[0].runs[0].bold = True + + # Add data rows + for line in data_lines: + cells = [c.strip() for c in line.split("|") if c.strip()] + row_cells = table.add_row().cells + for i, text in enumerate(cells): + if i < len(row_cells): + row_cells[i].text = text + + def _add_math(self, doc: Document, content: str) -> None: + """Add a math block. + + For proper OMML rendering, this would need more complex conversion. + Currently renders as italic text with the LaTeX source. + """ + para = doc.add_paragraph() + para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + run = para.add_run(content) + run.italic = True + run.font.name = "Cambria Math" + run.font.size = Pt(12) + diff --git a/app/services/image_processor.py b/app/services/image_processor.py new file mode 100644 index 0000000..e9c0e26 --- /dev/null +++ b/app/services/image_processor.py @@ -0,0 +1,139 @@ +"""Image preprocessing service using OpenCV.""" + +import base64 +import io +from urllib.request import urlopen + +import cv2 +import numpy as np +from PIL import Image + +from app.core.config import get_settings + +settings = get_settings() + + +class ImageProcessor: + """Service for image preprocessing operations.""" + + def __init__(self, padding_ratio: float | None = None): + """Initialize with padding ratio. + + Args: + padding_ratio: Ratio for padding on each side (default from settings). + 0.15 means 15% padding on each side = 30% total expansion. + """ + self.padding_ratio = padding_ratio or settings.image_padding_ratio + + def load_image_from_url(self, url: str) -> np.ndarray: + """Load image from URL. + + Args: + url: Image URL to fetch. + + Returns: + Image as numpy array in BGR format. + + Raises: + ValueError: If image cannot be loaded from URL. + """ + try: + with urlopen(url, timeout=30) as response: + image_data = response.read() + image = Image.open(io.BytesIO(image_data)) + return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) + except Exception as e: + raise ValueError(f"Failed to load image from URL: {e}") from e + + def load_image_from_base64(self, base64_str: str) -> np.ndarray: + """Load image from base64 string. + + Args: + base64_str: Base64-encoded image data. + + Returns: + Image as numpy array in BGR format. + + Raises: + ValueError: If image cannot be decoded. + """ + try: + # Handle data URL format + if "," in base64_str: + base64_str = base64_str.split(",", 1)[1] + + image_data = base64.b64decode(base64_str) + image = Image.open(io.BytesIO(image_data)) + return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) + except Exception as e: + raise ValueError(f"Failed to decode base64 image: {e}") from e + + def add_padding(self, image: np.ndarray) -> np.ndarray: + """Add whitespace padding around the image. + + Adds padding equal to padding_ratio * max(height, width) on each side. + This expands the image by approximately 30% total (15% on each side). + + Args: + image: Input image as numpy array in BGR format. + + Returns: + Padded image as numpy array. 
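+
+        Example: with the default ratio of 0.15, a 1000x800 image gains
+        int(0.15 * 1000) = 150 pixels of white border on each side.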
+ """ + height, width = image.shape[:2] + padding = int(max(height, width) * self.padding_ratio) + + # Add white padding on all sides + padded_image = cv2.copyMakeBorder( + image, + top=padding, + bottom=padding, + left=padding, + right=padding, + borderType=cv2.BORDER_CONSTANT, + value=[255, 255, 255], # White + ) + + return padded_image + + def preprocess(self, image_url: str | None, image_base64: str | None) -> np.ndarray: + """Load and preprocess image with padding. + + Args: + image_url: URL to fetch image from (optional). + image_base64: Base64-encoded image (optional). + + Returns: + Preprocessed image with padding. + + Raises: + ValueError: If neither input is provided or loading fails. + """ + if image_url: + image = self.load_image_from_url(image_url) + elif image_base64: + image = self.load_image_from_base64(image_base64) + else: + raise ValueError("Either image_url or image_base64 must be provided") + + return self.add_padding(image) + + def image_to_base64(self, image: np.ndarray, format: str = "PNG") -> str: + """Convert numpy image to base64 string. + + Args: + image: Image as numpy array in BGR format. + format: Output format (PNG, JPEG). + + Returns: + Base64-encoded image string. + """ + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + pil_image = Image.fromarray(image_rgb) + + buffer = io.BytesIO() + pil_image.save(buffer, format=format) + buffer.seek(0) + + return base64.b64encode(buffer.getvalue()).decode("utf-8") + diff --git a/app/services/layout_detector.py b/app/services/layout_detector.py new file mode 100644 index 0000000..03bb020 --- /dev/null +++ b/app/services/layout_detector.py @@ -0,0 +1,119 @@ +"""DocLayout-YOLO wrapper for document layout detection.""" + +import numpy as np + +from app.schemas.image import LayoutInfo, LayoutRegion + + +class LayoutDetector: + """Wrapper for DocLayout-YOLO model.""" + + # Class names from DocLayout-YOLO + CLASS_NAMES = { + 0: "title", + 1: "plain_text", + 2: "abandon", + 3: "figure", + 4: "figure_caption", + 5: "table", + 6: "table_caption", + 7: "table_footnote", + 8: "isolate_formula", + 9: "formula_caption", + } + + # Classes considered as plain text + PLAIN_TEXT_CLASSES = {"title", "plain_text", "figure_caption", "table_caption", "table_footnote"} + + # Classes considered as formula + FORMULA_CLASSES = {"isolate_formula", "formula_caption"} + + def __init__(self, model_path: str, confidence_threshold: float = 0.2): + """Initialize the layout detector. + + Args: + model_path: Path to the DocLayout-YOLO model weights. + confidence_threshold: Minimum confidence for detections. + """ + self.model_path = model_path + self.confidence_threshold = confidence_threshold + self.model = None + + def load_model(self) -> None: + """Load the DocLayout-YOLO model. + + Raises: + RuntimeError: If model cannot be loaded. + """ + try: + from doclayout_yolo import YOLOv10 + + self.model = YOLOv10(self.model_path) + except Exception as e: + raise RuntimeError(f"Failed to load DocLayout-YOLO model: {e}") from e + + def detect(self, image: np.ndarray, image_size: int = 1024) -> LayoutInfo: + """Detect document layout regions. + + Args: + image: Input image as numpy array in BGR format. + image_size: Image size for prediction. + + Returns: + LayoutInfo with detected regions. + + Raises: + RuntimeError: If model not loaded. + """ + if self.model is None: + raise RuntimeError("Model not loaded. 
Call load_model() first.") + + # Run prediction + results = self.model.predict( + image, + imgsz=image_size, + conf=self.confidence_threshold, + device="cuda:0", + ) + + regions: list[LayoutRegion] = [] + has_plain_text = False + has_formula = False + + if results and len(results) > 0: + result = results[0] + if result.boxes is not None: + for box in result.boxes: + cls_id = int(box.cls[0].item()) + confidence = float(box.conf[0].item()) + bbox = box.xyxy[0].tolist() + + class_name = self.CLASS_NAMES.get(cls_id, f"unknown_{cls_id}") + + # Map to simplified type + if class_name in self.PLAIN_TEXT_CLASSES: + region_type = "text" + has_plain_text = True + elif class_name in self.FORMULA_CLASSES: + region_type = "formula" + has_formula = True + elif class_name in {"figure"}: + region_type = "figure" + elif class_name in {"table"}: + region_type = "table" + else: + region_type = class_name + + regions.append( + LayoutRegion( + type=region_type, + bbox=bbox, + confidence=confidence, + ) + ) + + return LayoutInfo( + regions=regions, + has_plain_text=has_plain_text, + has_formula=has_formula, + ) diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py new file mode 100644 index 0000000..8c7fe41 --- /dev/null +++ b/app/services/ocr_service.py @@ -0,0 +1,303 @@ +"""PaddleOCR-VL client service for text and formula recognition.""" + +import io +import tempfile +from pathlib import Path + +import cv2 +import numpy as np + +from app.core.config import get_settings +from app.schemas.image import LayoutInfo + +settings = get_settings() + + +class OCRService: + """Service for OCR using PaddleOCR-VL.""" + + FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format." + + def __init__( + self, + vl_server_url: str | None = None, + pp_doclayout_model_dir: str | None = None, + ): + """Initialize OCR service. + + Args: + vl_server_url: URL of the vLLM server for PaddleOCR-VL. + pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory. + """ + self.vl_server_url = vl_server_url or settings.paddleocr_vl_url + self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir + self._pipeline = None + + def _get_pipeline(self): + """Get or create PaddleOCR-VL pipeline. + + Returns: + PaddleOCRVL pipeline instance. + """ + if self._pipeline is None: + from paddleocr import PaddleOCRVL + + self._pipeline = PaddleOCRVL( + vl_rec_backend="vllm-server", + vl_rec_server_url=self.vl_server_url, + layout_detection_model_name="PP-DocLayoutV2", + layout_detection_model_dir=self.pp_doclayout_model_dir, + ) + return self._pipeline + + def _save_temp_image(self, image: np.ndarray) -> str: + """Save image to a temporary file. + + Args: + image: Image as numpy array in BGR format. + + Returns: + Path to temporary file. + """ + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + cv2.imwrite(f.name, image) + return f.name + + def recognize_mixed(self, image: np.ndarray) -> dict: + """Recognize mixed content (text + formulas) using PP-DocLayoutV2. + + This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware + recognition of mixed content. + + Args: + image: Input image as numpy array in BGR format. + + Returns: + Dict with 'markdown', 'latex', 'mathml' keys. 
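+
+        Raises:
+            RuntimeError: If the pipeline or the vLLM backend fails.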
+ """ + try: + pipeline = self._get_pipeline() + temp_path = self._save_temp_image(image) + + try: + results = list(pipeline.predict(temp_path)) + + markdown_content = "" + for result in results: + # PaddleOCR-VL results can be saved to markdown + md_buffer = io.StringIO() + result.save_to_markdown(save_path=md_buffer) + markdown_content += md_buffer.getvalue() + + # Convert markdown to other formats + latex = self._markdown_to_latex(markdown_content) + mathml = self._extract_mathml(markdown_content) + + return { + "markdown": markdown_content, + "latex": latex, + "mathml": mathml, + } + finally: + Path(temp_path).unlink(missing_ok=True) + + except Exception as e: + raise RuntimeError(f"Mixed recognition failed: {e}") from e + + def recognize_formula(self, image: np.ndarray) -> dict: + """Recognize formula/math content using PaddleOCR-VL with prompt. + + This mode uses PaddleOCR-VL directly with a formula recognition prompt. + + Args: + image: Input image as numpy array in BGR format. + + Returns: + Dict with 'latex', 'markdown', 'mathml' keys. + """ + try: + import httpx + + temp_path = self._save_temp_image(image) + + try: + # Use vLLM API directly for formula recognition + import base64 + + with open(temp_path, "rb") as f: + image_base64 = base64.b64encode(f.read()).decode("utf-8") + + # Call vLLM server with formula prompt + response = httpx.post( + f"{self.vl_server_url}/chat/completions", + json={ + "model": "paddleocr-vl", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": self.FORMULA_PROMPT}, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{image_base64}"}, + }, + ], + } + ], + "max_tokens": 1024, + }, + timeout=60.0, + ) + response.raise_for_status() + result = response.json() + + latex = result["choices"][0]["message"]["content"].strip() + + # Convert latex to other formats + markdown = self._latex_to_markdown(latex) + mathml = self._latex_to_mathml(latex) + + return { + "latex": latex, + "markdown": markdown, + "mathml": mathml, + } + finally: + Path(temp_path).unlink(missing_ok=True) + + except httpx.HTTPStatusError as e: + raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e + except Exception as e: + raise RuntimeError(f"Formula recognition failed: {e}") from e + + def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict: + """Recognize content based on layout detection results. + + Args: + image: Input image as numpy array in BGR format. + layout_info: Layout detection results. + + Returns: + Dict with recognition results including mode used. + """ + # Decision logic: + # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2) + # - Otherwise -> use formula_recognition (VL with prompt) + if layout_info.has_plain_text: + result = self.recognize_mixed(image) + result["recognition_mode"] = "mixed_recognition" + else: + result = self.recognize_formula(image) + result["recognition_mode"] = "formula_recognition" + + return result + + def _markdown_to_latex(self, markdown: str) -> str: + """Convert markdown to LaTeX. + + Simple conversion - wraps content in LaTeX document structure. + + Args: + markdown: Markdown content. + + Returns: + LaTeX representation. 
+ """ + # Basic conversion: preserve math blocks, convert structure + lines = [] + in_code_block = False + + for line in markdown.split("\n"): + if line.startswith("```"): + in_code_block = not in_code_block + if in_code_block: + lines.append("\\begin{verbatim}") + else: + lines.append("\\end{verbatim}") + elif in_code_block: + lines.append(line) + elif line.startswith("# "): + lines.append(f"\\section{{{line[2:]}}}") + elif line.startswith("## "): + lines.append(f"\\subsection{{{line[3:]}}}") + elif line.startswith("### "): + lines.append(f"\\subsubsection{{{line[4:]}}}") + elif line.startswith("- "): + lines.append(f"\\item {line[2:]}") + elif line.startswith("$$"): + lines.append(line.replace("$$", "\\[").replace("$$", "\\]")) + elif "$" in line: + # Keep inline math as-is + lines.append(line) + else: + lines.append(line) + + return "\n".join(lines) + + def _latex_to_markdown(self, latex: str) -> str: + """Convert LaTeX to markdown. + + Args: + latex: LaTeX content. + + Returns: + Markdown representation. + """ + # Wrap LaTeX in markdown math block + if latex.strip(): + return f"$$\n{latex}\n$$" + return "" + + def _latex_to_mathml(self, latex: str) -> str: + """Convert LaTeX to MathML. + + Args: + latex: LaTeX content. + + Returns: + MathML representation. + """ + # Basic LaTeX to MathML conversion + # For production, consider using latex2mathml library + if not latex.strip(): + return "" + + try: + # Try to use latex2mathml if available + from latex2mathml.converter import convert + + return convert(latex) + except ImportError: + # Fallback: wrap in basic MathML structure + return f'{latex}' + except Exception: + return f'{latex}' + + def _extract_mathml(self, markdown: str) -> str: + """Extract and convert math from markdown to MathML. + + Args: + markdown: Markdown content. + + Returns: + MathML for any math content found. + """ + import re + + # Find all math blocks + math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL) + inline_math = re.findall(r"\$([^$]+)\$", markdown) + + all_math = math_blocks + inline_math + + if not all_math: + return "" + + # Convert each to MathML and combine + mathml_parts = [] + for latex in all_math: + mathml = self._latex_to_mathml(latex.strip()) + if mathml: + mathml_parts.append(mathml) + + return "\n".join(mathml_parts) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..7c8cf36 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,53 @@ +version: "3.8" + +services: + doc-processer: + build: + context: . + dockerfile: Dockerfile + container_name: doc-processer + ports: + - "8053:8053" + environment: + - PADDLEOCR_VL_URL=http://host.docker.internal:8080/v1 + - DOCLAYOUT_MODEL_PATH=/app/models/DocLayout/doclayout_yolo_docstructbench_imgsz1024.pt + - PP_DOCLAYOUT_MODEL_DIR=/app/models/PP-DocLayout + - MAX_IMAGE_SIZE_MB=10 + volumes: + # Mount pre-downloaded models (adjust paths as needed) + - ./models/DocLayout:/app/models/DocLayout:ro + - ./models/PP-DocLayout:/app/models/PP-DocLayout:ro + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8053/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s + + # Optional: Local development without GPU + doc-processer-cpu: + build: + context: . 
+      dockerfile: Dockerfile
+    container_name: doc-processer-cpu
+    ports:
+      - "8054:8053"
+    environment:
+      - PADDLEOCR_VL_URL=http://host.docker.internal:8080/v1
+      - DOCLAYOUT_MODEL_PATH=/app/models/DocLayout/doclayout_yolo_docstructbench_imgsz1024.pt
+      - PP_DOCLAYOUT_MODEL_DIR=/app/models/PP-DocLayout
+    volumes:
+      - ./models/DocLayout:/app/models/DocLayout:ro
+      - ./models/PP-DocLayout:/app/models/PP-DocLayout:ro
+    profiles:
+      - cpu
+    restart: unless-stopped
+
diff --git a/openspec/AGENTS.md b/openspec/AGENTS.md
new file mode 100644
index 0000000..96ab0bb
--- /dev/null
+++ b/openspec/AGENTS.md
@@ -0,0 +1,456 @@
+# OpenSpec Instructions
+
+Instructions for AI coding assistants using OpenSpec for spec-driven development.
+
+## TL;DR Quick Checklist
+
+- Search existing work: `openspec spec list --long`, `openspec list` (use `rg` only for full-text search)
+- Decide scope: new capability vs modify existing capability
+- Pick a unique `change-id`: kebab-case, verb-led (`add-`, `update-`, `remove-`, `refactor-`)
+- Scaffold: `proposal.md`, `tasks.md`, `design.md` (only if needed), and delta specs per affected capability
+- Write deltas: use `## ADDED|MODIFIED|REMOVED|RENAMED Requirements`; include at least one `#### Scenario:` per requirement
+- Validate: `openspec validate [change-id] --strict` and fix issues
+- Request approval: Do not start implementation until proposal is approved
+
+## Three-Stage Workflow
+
+### Stage 1: Creating Changes
+Create proposal when you need to:
+- Add features or functionality
+- Make breaking changes (API, schema)
+- Change architecture or patterns
+- Optimize performance (changes behavior)
+- Update security patterns
+
+Triggers (examples):
+- "Help me create a change proposal"
+- "Help me plan a change"
+- "Help me create a proposal"
+- "I want to create a spec proposal"
+- "I want to create a spec"
+
+Loose matching guidance:
+- Contains one of: `proposal`, `change`, `spec`
+- With one of: `create`, `plan`, `make`, `start`, `help`
+
+Skip proposal for:
+- Bug fixes (restore intended behavior)
+- Typos, formatting, comments
+- Dependency updates (non-breaking)
+- Configuration changes
+- Tests for existing behavior
+
+**Workflow**
+1. Review `openspec/project.md`, `openspec list`, and `openspec list --specs` to understand current context.
+2. Choose a unique verb-led `change-id` and scaffold `proposal.md`, `tasks.md`, optional `design.md`, and spec deltas under `openspec/changes/<change-id>/`.
+3. Draft spec deltas using `## ADDED|MODIFIED|REMOVED Requirements` with at least one `#### Scenario:` per requirement.
+4. Run `openspec validate <change-id> --strict` and resolve any issues before sharing the proposal.
+
+### Stage 2: Implementing Changes
+Track these steps as TODOs and complete them one by one.
+1. **Read proposal.md** - Understand what's being built
+2. **Read design.md** (if exists) - Review technical decisions
+3. **Read tasks.md** - Get implementation checklist
+4. **Implement tasks sequentially** - Complete in order
+5. **Confirm completion** - Ensure every item in `tasks.md` is finished before updating statuses
+6. **Update checklist** - After all work is done, set every task to `- [x]` so the list reflects reality
+7.
**Approval gate** - Do not start implementation until the proposal is reviewed and approved

### Stage 3: Archiving Changes
After deployment, create a separate PR to:
- Move `changes/[name]/` → `changes/archive/YYYY-MM-DD-[name]/`
- Update `specs/` if capabilities changed
- Use `openspec archive <change-id> --skip-specs --yes` for tooling-only changes (always pass the change ID explicitly)
- Run `openspec validate --strict` to confirm the archived change passes checks

## Before Any Task

**Context Checklist:**
- [ ] Read relevant specs in `specs/[capability]/spec.md`
- [ ] Check pending changes in `changes/` for conflicts
- [ ] Read `openspec/project.md` for conventions
- [ ] Run `openspec list` to see active changes
- [ ] Run `openspec list --specs` to see existing capabilities

**Before Creating Specs:**
- Always check if capability already exists
- Prefer modifying existing specs over creating duplicates
- Use `openspec show [spec]` to review current state
- If request is ambiguous, ask 1–2 clarifying questions before scaffolding

### Search Guidance
- Enumerate specs: `openspec spec list --long` (or `--json` for scripts)
- Enumerate changes: `openspec list` (or `openspec change list --json` - deprecated but available)
- Show details:
  - Spec: `openspec show <spec-id> --type spec` (use `--json` for filters)
  - Change: `openspec show <change-id> --json --deltas-only`
- Full-text search (use ripgrep): `rg -n "Requirement:|Scenario:" openspec/specs`

## Quick Start

### CLI Commands

```bash
# Essential commands
openspec list                # List active changes
openspec list --specs        # List specifications
openspec show [item]         # Display change or spec
openspec validate [item]     # Validate changes or specs
openspec archive [--yes|-y]  # Archive after deployment (add --yes for non-interactive runs)

# Project management
openspec init [path]         # Initialize OpenSpec
openspec update [path]       # Update instruction files

# Interactive mode
openspec show                # Prompts for selection
openspec validate            # Bulk validation mode

# Debugging
openspec show [change] --json --deltas-only
openspec validate [change] --strict
```

### Command Flags

- `--json` - Machine-readable output
- `--type change|spec` - Disambiguate items
- `--strict` - Comprehensive validation
- `--no-interactive` - Disable prompts
- `--skip-specs` - Archive without spec updates
- `--yes`/`-y` - Skip confirmation prompts (non-interactive archive)

## Directory Structure

```
openspec/
├── project.md               # Project conventions
├── specs/                   # Current truth - what IS built
│   └── [capability]/        # Single focused capability
│       ├── spec.md          # Requirements and scenarios
│       └── design.md        # Technical patterns
├── changes/                 # Proposals - what SHOULD change
│   ├── [change-name]/
│   │   ├── proposal.md      # Why, what, impact
│   │   ├── tasks.md         # Implementation checklist
│   │   ├── design.md        # Technical decisions (optional; see criteria)
│   │   └── specs/           # Delta changes
│   │       └── [capability]/
│   │           └── spec.md  # ADDED/MODIFIED/REMOVED
│   └── archive/             # Completed changes
```

## Creating Change Proposals

### Decision Tree

```
New request?
├─ Bug fix restoring spec behavior? → Fix directly
├─ Typo/format/comment? → Fix directly
├─ New feature/capability? → Create proposal
├─ Breaking change? → Create proposal
├─ Architecture change? → Create proposal
└─ Unclear? → Create proposal (safer)
```

### Proposal Structure

1. **Create directory:** `changes/[change-id]/` (kebab-case, verb-led, unique)

2.
**Write proposal.md:** +```markdown +# Change: [Brief description of change] + +## Why +[1-2 sentences on problem/opportunity] + +## What Changes +- [Bullet list of changes] +- [Mark breaking changes with **BREAKING**] + +## Impact +- Affected specs: [list capabilities] +- Affected code: [key files/systems] +``` + +3. **Create spec deltas:** `specs/[capability]/spec.md` +```markdown +## ADDED Requirements +### Requirement: New Feature +The system SHALL provide... + +#### Scenario: Success case +- **WHEN** user performs action +- **THEN** expected result + +## MODIFIED Requirements +### Requirement: Existing Feature +[Complete modified requirement] + +## REMOVED Requirements +### Requirement: Old Feature +**Reason**: [Why removing] +**Migration**: [How to handle] +``` +If multiple capabilities are affected, create multiple delta files under `changes/[change-id]/specs//spec.md`—one per capability. + +4. **Create tasks.md:** +```markdown +## 1. Implementation +- [ ] 1.1 Create database schema +- [ ] 1.2 Implement API endpoint +- [ ] 1.3 Add frontend component +- [ ] 1.4 Write tests +``` + +5. **Create design.md when needed:** +Create `design.md` if any of the following apply; otherwise omit it: +- Cross-cutting change (multiple services/modules) or a new architectural pattern +- New external dependency or significant data model changes +- Security, performance, or migration complexity +- Ambiguity that benefits from technical decisions before coding + +Minimal `design.md` skeleton: +```markdown +## Context +[Background, constraints, stakeholders] + +## Goals / Non-Goals +- Goals: [...] +- Non-Goals: [...] + +## Decisions +- Decision: [What and why] +- Alternatives considered: [Options + rationale] + +## Risks / Trade-offs +- [Risk] → Mitigation + +## Migration Plan +[Steps, rollback] + +## Open Questions +- [...] +``` + +## Spec File Format + +### Critical: Scenario Formatting + +**CORRECT** (use #### headers): +```markdown +#### Scenario: User login success +- **WHEN** valid credentials provided +- **THEN** return JWT token +``` + +**WRONG** (don't use bullets or bold): +```markdown +- **Scenario: User login** ❌ +**Scenario**: User login ❌ +### Scenario: User login ❌ +``` + +Every requirement MUST have at least one scenario. + +### Requirement Wording +- Use SHALL/MUST for normative requirements (avoid should/may unless intentionally non-normative) + +### Delta Operations + +- `## ADDED Requirements` - New capabilities +- `## MODIFIED Requirements` - Changed behavior +- `## REMOVED Requirements` - Deprecated features +- `## RENAMED Requirements` - Name changes + +Headers matched with `trim(header)` - whitespace ignored. + +#### When to use ADDED vs MODIFIED +- ADDED: Introduces a new capability or sub-capability that can stand alone as a requirement. Prefer ADDED when the change is orthogonal (e.g., adding "Slash Command Configuration") rather than altering the semantics of an existing requirement. +- MODIFIED: Changes the behavior, scope, or acceptance criteria of an existing requirement. Always paste the full, updated requirement content (header + all scenarios). The archiver will replace the entire requirement with what you provide here; partial deltas will drop previous details. +- RENAMED: Use when only the name changes. If you also change behavior, use RENAMED (name) plus MODIFIED (content) referencing the new name. + +Common pitfall: Using MODIFIED to add a new concern without including the previous text. This causes loss of detail at archive time. 
If you aren’t explicitly changing the existing requirement, add a new requirement under ADDED instead.
+
+Authoring a MODIFIED requirement correctly:
+1) Locate the existing requirement in `openspec/specs/<capability>/spec.md`.
+2) Copy the entire requirement block (from `### Requirement: ...` through its scenarios).
+3) Paste it under `## MODIFIED Requirements` and edit to reflect the new behavior.
+4) Ensure the header text matches exactly (whitespace-insensitive) and keep at least one `#### Scenario:`.
+
+Example for RENAMED:
+```markdown
+## RENAMED Requirements
+- FROM: `### Requirement: Login`
+- TO: `### Requirement: User Authentication`
+```
+
+## Troubleshooting
+
+### Common Errors
+
+**"Change must have at least one delta"**
+- Check `changes/[name]/specs/` exists with .md files
+- Verify files have operation prefixes (## ADDED Requirements)
+
+**"Requirement must have at least one scenario"**
+- Check scenarios use `#### Scenario:` format (4 hashtags)
+- Don't use bullet points or bold for scenario headers
+
+**Silent scenario parsing failures**
+- Exact format required: `#### Scenario: Name`
+- Debug with: `openspec show [change] --json --deltas-only`
+
+### Validation Tips
+
+```bash
+# Always use strict mode for comprehensive checks
+openspec validate [change] --strict
+
+# Debug delta parsing
+openspec show [change] --json | jq '.deltas'
+
+# Check specific requirement
+openspec show [spec] --json -r 1
+```
+
+## Happy Path Script
+
+```bash
+# 1) Explore current state
+openspec spec list --long
+openspec list
+# Optional full-text search:
+# rg -n "Requirement:|Scenario:" openspec/specs
+# rg -n "^#|Requirement:" openspec/changes
+
+# 2) Choose change id and scaffold
+CHANGE=add-two-factor-auth
+mkdir -p openspec/changes/$CHANGE/specs/auth
+printf "## Why\n...\n\n## What Changes\n- ...\n\n## Impact\n- ...\n" > openspec/changes/$CHANGE/proposal.md
+printf "## 1. Implementation\n- [ ] 1.1 ...\n" > openspec/changes/$CHANGE/tasks.md
+
+# 3) Add deltas (example)
+cat > openspec/changes/$CHANGE/specs/auth/spec.md << 'EOF'
+## ADDED Requirements
+### Requirement: Two-Factor Authentication
+Users MUST provide a second factor during login.
+
+#### Scenario: OTP required
+- **WHEN** valid credentials are provided
+- **THEN** an OTP challenge is required
+EOF
+
+# 4) Validate
+openspec validate $CHANGE --strict
+```
+
+## Multi-Capability Example
+
+```
+openspec/changes/add-2fa-notify/
+├── proposal.md
+├── tasks.md
+└── specs/
+    ├── auth/
+    │   └── spec.md          # ADDED: Two-Factor Authentication
+    └── notifications/
+        └── spec.md          # ADDED: OTP email notification
+```
+
+auth/spec.md
+```markdown
+## ADDED Requirements
+### Requirement: Two-Factor Authentication
+...
+```
+
+notifications/spec.md
+```markdown
+## ADDED Requirements
+### Requirement: OTP Email Notification
+...
+``` + +## Best Practices + +### Simplicity First +- Default to <100 lines of new code +- Single-file implementations until proven insufficient +- Avoid frameworks without clear justification +- Choose boring, proven patterns + +### Complexity Triggers +Only add complexity with: +- Performance data showing current solution too slow +- Concrete scale requirements (>1000 users, >100MB data) +- Multiple proven use cases requiring abstraction + +### Clear References +- Use `file.ts:42` format for code locations +- Reference specs as `specs/auth/spec.md` +- Link related changes and PRs + +### Capability Naming +- Use verb-noun: `user-auth`, `payment-capture` +- Single purpose per capability +- 10-minute understandability rule +- Split if description needs "AND" + +### Change ID Naming +- Use kebab-case, short and descriptive: `add-two-factor-auth` +- Prefer verb-led prefixes: `add-`, `update-`, `remove-`, `refactor-` +- Ensure uniqueness; if taken, append `-2`, `-3`, etc. + +## Tool Selection Guide + +| Task | Tool | Why | +|------|------|-----| +| Find files by pattern | Glob | Fast pattern matching | +| Search code content | Grep | Optimized regex search | +| Read specific files | Read | Direct file access | +| Explore unknown scope | Task | Multi-step investigation | + +## Error Recovery + +### Change Conflicts +1. Run `openspec list` to see active changes +2. Check for overlapping specs +3. Coordinate with change owners +4. Consider combining proposals + +### Validation Failures +1. Run with `--strict` flag +2. Check JSON output for details +3. Verify spec file format +4. Ensure scenarios properly formatted + +### Missing Context +1. Read project.md first +2. Check related specs +3. Review recent archives +4. Ask for clarification + +## Quick Reference + +### Stage Indicators +- `changes/` - Proposed, not yet built +- `specs/` - Built and deployed +- `archive/` - Completed changes + +### File Purposes +- `proposal.md` - Why and what +- `tasks.md` - Implementation steps +- `design.md` - Technical decisions +- `spec.md` - Requirements and behavior + +### CLI Essentials +```bash +openspec list # What's in progress? +openspec show [item] # View details +openspec validate --strict # Is it correct? +openspec archive [--yes|-y] # Mark complete (add --yes for automation) +``` + +Remember: Specs are truth. Changes are proposals. Keep them in sync. diff --git a/openspec/changes/add-doc-processing-api/design.md b/openspec/changes/add-doc-processing-api/design.md new file mode 100644 index 0000000..4b04058 --- /dev/null +++ b/openspec/changes/add-doc-processing-api/design.md @@ -0,0 +1,107 @@ +## Context + +This is the initial implementation of the DocProcesser service. The system integrates multiple external models and services: + +- DocLayout-YOLO for document layout analysis +- PaddleOCR-VL with PP-DocLayoutV2 for text and formula recognition (deployed via vLLM) +- markdown_2_docx for document conversion + +Target deployment: Ubuntu machine with RTX 5080 GPU (16GB VRAM), Python 3.11.0. 
+
+## Goals / Non-Goals
+
+**Goals:**
+
+- Clean FastAPI project structure following best practices
+- Image preprocessing with OpenCV (30% padding)
+- Layout-aware OCR routing using DocLayout-YOLO
+- Text and formula recognition via PaddleOCR-VL
+- Markdown to DOCX conversion
+- GPU-enabled Docker deployment
+
+**Non-Goals:**
+
+- Authentication/authorization (can be added later)
+- Rate limiting
+- Persistent storage
+- Training or fine-tuning models
+
+## Decisions
+
+### Project Structure
+
+Follow FastAPI best practices with modular organization:
+
+```
+app/
+├── api/
+│   └── v1/
+│       ├── endpoints/
+│       │   ├── image.py      # Image OCR endpoint
+│       │   └── convert.py    # Markdown to DOCX endpoint
+│       └── router.py
+├── core/
+│   └── config.py             # Settings and environment config
+├── model/
+│   ├── DocLayout
+│   └── PP-DocLayout
+├── services/
+│   ├── image_processor.py    # OpenCV preprocessing
+│   ├── layout_detector.py    # DocLayout-YOLO wrapper
+│   ├── ocr_service.py        # PaddleOCR-VL client
+│   └── docx_converter.py     # markdown_2_docx wrapper
+├── schemas/
+│   ├── image.py              # Request/response models for image OCR
+│   └── convert.py            # Request/response models for conversion
+└── main.py                   # FastAPI app initialization
+```
+
+**Rationale:** Separation of concerns between API layer, business logic (services), and data models (schemas).
+
+### Image Preprocessing
+
+- Use OpenCV `cv2.copyMakeBorder()` to add 30% whitespace padding
+- Padding color: white `[255, 255, 255]`
+- This matches DocLayout-YOLO's demo.py pattern
+
+### Layout Detection Flow
+
+1. DocLayout-YOLO detects layout regions (plain text, formulas, tables, figures)
+2. If plain text exists, route to PaddleOCR-VL with PP-DocLayoutV2; otherwise route to PaddleOCR-VL with a formula-recognition prompt
+3. PaddleOCR-VL combined with PP-DocLayoutV2 handles mixed-content recognition internally; PaddleOCR-VL combined with the prompt handles formula-only recognition
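+
+To make this flow concrete, here is a minimal sketch of the padding-and-routing decision. It is illustrative only: it assumes the `doclayout_yolo` package's `YOLOv10` interface, the helper names are placeholders, and the plain-text label name should be verified against the loaded model's `names` mapping.
+
+```python
+import cv2
+from doclayout_yolo import YOLOv10
+
+PLAIN_TEXT_LABEL = "plain text"  # assumed label name; confirm via the model's names mapping
+
+
+def pad_image(image, ratio: float = 0.15):
+    """Add white padding of 15% of the longer side per edge (~30% total expansion)."""
+    pad = int(max(image.shape[:2]) * ratio)
+    return cv2.copyMakeBorder(
+        image, pad, pad, pad, pad, cv2.BORDER_CONSTANT, value=[255, 255, 255]
+    )
+
+
+def route(image, model: YOLOv10) -> str:
+    """Decide which PaddleOCR-VL mode to use for one padded image."""
+    padded = pad_image(image)
+    result = model.predict(padded, imgsz=1024, conf=0.2, device="cuda:0")[0]
+    labels = {result.names[int(c)] for c in result.boxes.cls}
+    return "mixed_recognition" if PLAIN_TEXT_LABEL in labels else "formula_recognition"
+```
+
+The padded image, not the original, is what gets forwarded to PaddleOCR-VL, matching the preprocessing decision above.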
+
+### External Service Integration
+
+- PaddleOCR-VL: Connect to vLLM server at configurable URL (default: `http://localhost:8080/v1`)
+- DocLayout-YOLO: Load model from pre-downloaded path (not downloaded in container)
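+
+A minimal client sketch for this call follows. It assumes the vLLM server exposes the standard OpenAI-compatible `/chat/completions` route and accepts base64 data-URL images; the model name and prompt handling are placeholders, not the service's confirmed identifiers.
+
+```python
+import base64
+
+import httpx
+
+PADDLEOCR_VL_URL = "http://localhost:8080/v1"  # overridden by the PADDLEOCR_VL_URL env var
+
+
+async def recognize(image_bytes: bytes, prompt: str) -> str:
+    """Send one image plus an instruction prompt to the PaddleOCR-VL server."""
+    data_url = "data:image/png;base64," + base64.b64encode(image_bytes).decode()
+    payload = {
+        "model": "PaddleOCR-VL",  # placeholder; use the name the vLLM server actually serves
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": data_url}},
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ],
+    }
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        resp = await client.post(f"{PADDLEOCR_VL_URL}/chat/completions", json=payload)
+        resp.raise_for_status()
+        return resp.json()["choices"][0]["message"]["content"]
+```
+
+Retry logic with exponential backoff (see the risk table below) would wrap this call rather than live inside it.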
+
+### Docker Strategy
+
+- Base image: NVIDIA CUDA with Python 3.11
+- Pre-install OpenCV dependencies (`libgl1-mesa-glx`, `libglib2.0-0`)
+- Mount model directory for DocLayout-YOLO weights
+- Expose port 8053
+- Use Uvicorn with multiple workers
+
+## Risks / Trade-offs
+
+| Risk                               | Mitigation                                                          |
+| ---------------------------------- | ------------------------------------------------------------------- |
+| PaddleOCR-VL service unavailable   | Health check endpoint, retry logic with exponential backoff         |
+| Large image memory consumption     | Configure max image size, resize before processing                  |
+| DocLayout-YOLO model loading time  | Load model once at startup, keep in memory                          |
+| GPU memory contention              | DocLayout-YOLO uses GPU; PaddleOCR-VL runs on separate vLLM server  |
+
+## Configuration
+
+Environment variables:
+
+- `PADDLEOCR_VL_URL`: vLLM server URL (default: `http://localhost:8080/v1`)
+- `DOCLAYOUT_MODEL_PATH`: Path to DocLayout-YOLO weights
+- `PP_DOCLAYOUT_MODEL_DIR`: Path to PP-DocLayoutV2 model directory
+- `MAX_IMAGE_SIZE_MB`: Maximum upload size (default: 10)
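+
+These map directly onto a Pydantic Settings class. A minimal sketch (the field-to-variable pairing mirrors the list above; the path defaults follow the docker-compose mounts and are assumptions, not requirements):
+
+```python
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    """Each field is overridable via the matching environment variable."""
+
+    paddleocr_vl_url: str = "http://localhost:8080/v1"  # PADDLEOCR_VL_URL
+    doclayout_model_path: str = (  # DOCLAYOUT_MODEL_PATH
+        "/app/models/DocLayout/doclayout_yolo_docstructbench_imgsz1024.pt"
+    )
+    pp_doclayout_model_dir: str = "/app/models/PP-DocLayout"  # PP_DOCLAYOUT_MODEL_DIR
+    max_image_size_mb: int = 10  # MAX_IMAGE_SIZE_MB
+
+
+settings = Settings()
+```
+
+Pydantic Settings matches environment variables to field names case-insensitively, so no per-field configuration is needed here.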
+
+## Open Questions
+
+- Should we add async queue for large batch processing? (Defer to future change)
+- Do we need WebSocket for progress updates? (Defer to future change)
diff --git a/openspec/changes/add-doc-processing-api/proposal.md b/openspec/changes/add-doc-processing-api/proposal.md
new file mode 100644
index 0000000..b5592de
--- /dev/null
+++ b/openspec/changes/add-doc-processing-api/proposal.md
@@ -0,0 +1,31 @@
+# Change: Add Document Processing API
+
+## Why
+
+DocProcesser needs a FastAPI backend to accept images (via URL or base64) and convert them to LaTeX/Markdown/MathML, plus a markdown-to-DOCX conversion endpoint. This establishes the core functionality of the project.
+
+## What Changes
+
+- **BREAKING**: Initial project setup (new FastAPI project structure)
+- Add image-to-OCR API endpoint (`POST /doc_process/v1/image/ocr`)
+  - Accept `image_url` or `image_base64` input
+  - Preprocess with OpenCV (30% whitespace padding)
+  - Use DocLayout-YOLO for layout detection
+  - Route to PaddleOCR-VL (with PP-DocLayoutV2) for text/formula recognition
+  - If a `plain_text` element exists, use PP-DocLayoutV2 to recognize the image as `mixed_recognition`; otherwise call the PaddleOCR-VL API directly with a formula-recognition prompt as `formula_recognition`
+  - Referencing the markdown_2_docx code, convert the markdown to LaTeX and MathML for `mixed_recognition`, and convert the LaTeX to Markdown and MathML for `formula_recognition`
+  - Return LaTeX, Markdown, and MathML outputs
+- Add markdown-to-DOCX API endpoint (`POST /doc_process/v1/convert/docx`)
+  - Accept markdown content
+  - Use the markdown_2_docx library for conversion (http://github.com/YogeLiu/markdown_2_docx)
+  - Return DOCX file
+- Add Dockerfile for GPU-enabled deployment (RTX 5080, port 8053)
+
+## Impact
+
+- Affected specs: `image-ocr`, `markdown-docx`
+- Affected code: New project structure under `app/`
+- External dependencies:
+  - DocLayout-YOLO (pre-downloaded model, not fetched in container)
+  - PaddleOCR-VL with vLLM backend (external service at localhost:8080)
+  - markdown_2_docx library
diff --git a/openspec/changes/add-doc-processing-api/specs/image-ocr/spec.md b/openspec/changes/add-doc-processing-api/specs/image-ocr/spec.md
new file mode 100644
index 0000000..5ce690d
--- /dev/null
+++ b/openspec/changes/add-doc-processing-api/specs/image-ocr/spec.md
@@ -0,0 +1,137 @@
+## ADDED Requirements
+
+### Requirement: Image Input Acceptance
+
+The system SHALL accept images via `POST /api/v1/image/ocr` endpoint with either:
+
+- `image_url`: A publicly accessible URL to the image
+- `image_base64`: Base64-encoded image data
+
+The system SHALL return an error if neither input is provided or if both are provided simultaneously.
+
+#### Scenario: Image URL provided
+
+- **WHEN** a valid `image_url` is provided in the request body
+- **THEN** the system SHALL download the image and process it
+- **AND** return OCR results in the response
+
+#### Scenario: Base64 image provided
+
+- **WHEN** a valid `image_base64` string is provided in the request body
+- **THEN** the system SHALL decode the image and process it
+- **AND** return OCR results in the response
+
+#### Scenario: Invalid input
+
+- **WHEN** neither `image_url` nor `image_base64` is provided
+- **THEN** the system SHALL return HTTP 422 with validation error
+
+---
+
+### Requirement: Image Preprocessing with Padding
+
+The system SHALL preprocess all input images by adding 30% whitespace padding around the image borders using OpenCV.
+
+The padding calculation: `padding = int(max(height, width) * 0.15)` on each side (totaling 30% expansion).
+
+The padding color SHALL be white (`RGB: 255, 255, 255`).
+
+#### Scenario: Image padding applied
+
+- **WHEN** an image of dimensions 1000x800 pixels is received
+- **THEN** the system SHALL add approximately 150 pixels of white padding on each side
+- **AND** the resulting image dimensions SHALL be approximately 1300x1100 pixels
+
+---
+
+### Requirement: Layout Detection with DocLayout-YOLO
+
+The system SHALL use the DocLayout-YOLO model to detect document layout regions including:
+
+- Plain text blocks
+- Formulas/equations
+- Tables
+- Figures
+
+The model SHALL be loaded from a pre-configured local path (not downloaded at runtime).
+
+#### Scenario: Layout detection success
+
+- **WHEN** a padded image is passed to DocLayout-YOLO
+- **THEN** the system SHALL return detected regions with bounding boxes and class labels
+- **AND** confidence scores for each detection
+
+#### Scenario: Model not available
+
+- **WHEN** the DocLayout-YOLO model file is not found at the configured path
+- **THEN** the system SHALL fail startup with a clear error message
+
+---
+
+### Requirement: OCR Processing with PaddleOCR-VL
+
+The system SHALL send images to PaddleOCR-VL (via vLLM backend) for text and formula recognition.
+
+PaddleOCR-VL SHALL be configured with PP-DocLayoutV2 for document layout understanding.
+
+The system SHALL handle both plain text and formula/math content.
+
+#### Scenario: Plain text recognition
+
+- **WHEN** DocLayout-YOLO detects plain text regions
+- **THEN** the system SHALL send the image to PaddleOCR-VL
+- **AND** return recognized text content
+
+#### Scenario: Formula recognition
+
+- **WHEN** DocLayout-YOLO detects formula/equation regions
+- **THEN** the system SHALL send the image to PaddleOCR-VL
+- **AND** return formula content in LaTeX format
+
+#### Scenario: Mixed content handling
+
+- **WHEN** DocLayout-YOLO detects both text and formula regions
+- **THEN** the system SHALL process all regions via PaddleOCR-VL with PP-DocLayoutV2
+- **AND** return combined results preserving document structure
+
+#### Scenario: PaddleOCR-VL service unavailable
+
+- **WHEN** the PaddleOCR-VL vLLM server is unreachable
+- **THEN** the system SHALL return HTTP 503 with service unavailable error
+
+---
+
+### Requirement: Multi-Format Output
+
+The system SHALL return OCR results in multiple formats:
+
+- `latex`: LaTeX representation of the content
+- `markdown`: Markdown representation of the content
+- `mathml`: MathML representation for mathematical content
+
+#### Scenario: Successful OCR response
+
+- **WHEN** image processing completes successfully
+- **THEN** the response SHALL include:
+  - `latex`: string containing LaTeX output
+  - `markdown`: string containing Markdown output
+  - `mathml`: string containing MathML output (empty string if no math detected)
+- **AND** HTTP status code SHALL be 200
+
+#### Scenario: Response structure
+
+- **WHEN** the OCR endpoint returns successfully
+- **THEN** the response body SHALL be JSON with structure:
+
+```json
+{
+  "latex": "...",
+  "markdown": "...",
+  "mathml": "...",
+  "layout_info": {
+    "regions": [
+      {"type": "text|formula|table|figure", "bbox": [x1, y1, x2, y2], "confidence": 0.95}
+    ]
+  }
+}
+```
diff --git a/openspec/changes/add-doc-processing-api/specs/markdown-docx/spec.md b/openspec/changes/add-doc-processing-api/specs/markdown-docx/spec.md
new file mode 100644
index 0000000..6b27820
--- /dev/null
+++ b/openspec/changes/add-doc-processing-api/specs/markdown-docx/spec.md
@@ -0,0 +1,93 @@
+## ADDED Requirements
+
+### Requirement: Markdown Input Acceptance
+
+The system SHALL accept markdown content via `POST /api/v1/convert/docx` endpoint.
+
+The request body SHALL contain:
+- `markdown`: string containing the markdown content to convert
+
+#### Scenario: Valid markdown provided
+
+- **WHEN** valid markdown content is provided in the request body
+- **THEN** the system SHALL process and convert it to DOCX format
+
+#### Scenario: Empty markdown
+
+- **WHEN** an empty `markdown` string is provided
+- **THEN** the system SHALL return HTTP 422 with validation error
+
+---
+
+### Requirement: DOCX Conversion
+
+The system SHALL convert markdown content to DOCX format using the markdown_2_docx library.
+
+The conversion SHALL preserve:
+- Headings (H1-H6)
+- Paragraphs
+- Bold and italic formatting
+- Lists (ordered and unordered)
+- Code blocks
+- Tables
+- Images (if embedded as base64 or accessible URLs)
+
+#### Scenario: Basic markdown conversion
+
+- **WHEN** markdown with headings, paragraphs, and formatting is provided
+- **THEN** the system SHALL generate a valid DOCX file
+- **AND** the DOCX SHALL preserve the document structure
+
+#### Scenario: Complex markdown with tables
+
+- **WHEN** markdown containing tables is provided
+- **THEN** the system SHALL convert tables to Word table format
+- **AND** preserve table structure and content
+
+#### Scenario: Markdown with math formulas
+
+- **WHEN** markdown containing LaTeX math expressions is provided
+- **THEN** the system SHALL convert math to OMML (Office Math Markup Language) format
+- **AND** render correctly in Microsoft Word
+
+---
+
+### Requirement: DOCX File Response
+
+The system SHALL return the generated DOCX file as a binary download.
+
+The response SHALL include:
+- Content-Type: `application/vnd.openxmlformats-officedocument.wordprocessingml.document`
+- Content-Disposition: `attachment; filename="output.docx"`
+
+#### Scenario: Successful conversion response
+
+- **WHEN** markdown conversion completes successfully
+- **THEN** the response SHALL be the DOCX file binary
+- **AND** HTTP status code SHALL be 200
+- **AND** appropriate headers for file download SHALL be set
+
+#### Scenario: Custom filename
+
+- **WHEN** an optional `filename` parameter is provided in the request
+- **THEN** the Content-Disposition header SHALL use the provided filename
+- **AND** append `.docx` extension if not present
+
+---
+
+### Requirement: Error Handling
+
+The system SHALL provide clear error responses for conversion failures.
+
+#### Scenario: Conversion failure
+
+- **WHEN** markdown_2_docx fails to convert the content
+- **THEN** the system SHALL return HTTP 500 with error details
+- **AND** the error message SHALL describe the failure reason
+
+#### Scenario: Malformed markdown
+
+- **WHEN** severely malformed markdown is provided
+- **THEN** the system SHALL attempt best-effort conversion
+- **AND** log a warning about potential formatting issues
+
diff --git a/openspec/changes/add-doc-processing-api/tasks.md b/openspec/changes/add-doc-processing-api/tasks.md
new file mode 100644
index 0000000..ef8d1f3
--- /dev/null
+++ b/openspec/changes/add-doc-processing-api/tasks.md
@@ -0,0 +1,34 @@
+## 1. Project Scaffolding
+
+- [x] 1.1 Create FastAPI project structure (`app/`, `api/`, `core/`, `services/`, `schemas/`)
+- [x] 1.2 Use uv to manage dependencies (fastapi, uvicorn, opencv-python, python-multipart, pydantic, httpx)
+- [x] 1.3 Create `app/main.py` with FastAPI app initialization
+- [x] 1.4 Create `app/core/config.py` with Pydantic Settings
+
+## 2. Image OCR API
+
+- [x] 2.1 Create request/response schemas in `app/schemas/image.py`
+- [x] 2.2 Implement image preprocessing service with OpenCV padding (`app/services/image_processor.py`)
+- [x] 2.3 Implement DocLayout-YOLO wrapper (`app/services/layout_detector.py`)
+- [x] 2.4 Implement PaddleOCR-VL client (`app/services/ocr_service.py`)
+- [x] 2.5 Create image OCR endpoint (`app/api/v1/endpoints/image.py`)
+- [x] 2.6 Wire up router and test endpoint
+
+## 3. Markdown to DOCX API
+
+- [x] 3.1 Create request/response schemas in `app/schemas/convert.py`
+- [x] 3.2 Integrate markdown_2_docx library (`app/services/docx_converter.py`)
+- [x] 3.3 Create conversion endpoint (`app/api/v1/endpoints/convert.py`)
+- [x] 3.4 Wire up router and test endpoint
+
+## 4. Deployment
+
+- [x] 4.1 Create Dockerfile with CUDA base image for RTX 5080
+- [x] 4.2 Create docker-compose.yml (optional, for local development)
+- [x] 4.3 Document deployment steps in README
+
+## 5. Validation
+
+- [ ] 5.1 Test image OCR endpoint with sample images
+- [ ] 5.2 Test markdown to DOCX conversion
+- [ ] 5.3 Verify Docker build and GPU access
diff --git a/openspec/project.md b/openspec/project.md
new file mode 100644
index 0000000..3d87cc8
--- /dev/null
+++ b/openspec/project.md
@@ -0,0 +1,42 @@
+# Project Context
+
+## Purpose
+
+This project is DocProcesser, which converts images to LaTeX, Markdown, MathML, OMML, etc.
+It is a FastAPI web project: it accepts requests from upstream, processes the image itself or sends it to third-party services, and returns the result to the upstream caller.
+
+## Tech Stack
+
+- python
+- fastapi
+
+## Project Conventions
+
+### Code Style
+
+[Describe your code style preferences, formatting rules, and naming conventions]
+
+### Architecture Patterns
+
+[Document your architectural decisions and patterns]
+
+### Testing Strategy
+
+[Explain your testing approach and requirements]
+
+### Git Workflow
+
+[Describe your branching strategy and commit conventions]
+
+## Domain Context
+
+- DocLayout
+  A YOLO model that recognizes document layout (books, papers, newspapers); it is used to determine whether an image contains plain text.
+
+## Important Constraints
+
+[List any technical, business, or regulatory constraints]
+
+## External Dependencies
+
+[Document key external services, APIs, or systems]
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..42abbfb
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,53 @@
+[project]
+name = "doc-processer"
+version = "0.1.0"
+description = "Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX"
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "MIT" }
+authors = [
+    { name = "YogeLiu" }
+]
+
+dependencies = [
+    "fastapi>=0.115.0",
+    "uvicorn[standard]>=0.32.0",
+    "opencv-python>=4.10.0",
+    "python-multipart>=0.0.12",
+    "pydantic>=2.10.0",
+    "pydantic-settings>=2.6.0",
+    "httpx>=0.28.0",
+    "numpy>=1.26.0",
+    "pillow>=10.4.0",
+    "python-docx>=1.1.0",
+    "paddleocr>=2.9.0",
+    "doclayout-yolo>=0.0.2",
+    "latex2mathml>=3.77.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-asyncio>=0.24.0",
+    "ruff>=0.8.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["app"]
+
+[tool.ruff]
+target-version = "py311"
+line-length = 100
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "UP"]
+ignore = ["E501"]
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]