# doc_processer/app/schemas/image.py

"""Request and response schemas for image OCR endpoint."""

from pydantic import BaseModel, Field, model_validator


class LayoutRegion(BaseModel):
    """Single layout region reported by layout detection.

    Every field is required; none of them declares a default value.
    """

    # Region category as a free-form string (see the field description for
    # the values the API documents).
    type: str = Field(
        description="Region type: text, formula, table, figure",
    )
    # Box coordinates as a list of floats; the list length is not validated.
    bbox: list[float] = Field(
        description="Bounding box [x1, y1, x2, y2]",
    )
    confidence: float = Field(
        description="Detection confidence score",
    )
    score: float = Field(
        description="Detection score",
    )


class LayoutInfo(BaseModel):
    """Layout detection results: the detected regions plus a mode flag.

    Both fields are optional. ``regions`` defaults to a fresh empty list and
    ``MixedRecognition`` defaults to ``False``.
    """

    regions: list[LayoutRegion] = Field(default_factory=list)
    MixedRecognition: bool = Field(
        default=False,
        description="Whether mixed recognition was used",
    )
    # Disabled field, kept for reference only:
    # FormulaRecognition: bool = Field(False, description="Whether formula recognition (with prompt) was used")

class ImageOCRRequest(BaseModel):
    """Request body for image OCR endpoint.

    The image is supplied through exactly one of ``image_url`` or
    ``image_base64``; ``model_name`` selects the OCR model and defaults to
    ``"mineru"``.
    """

    # pydantic v2 can treat the ``model_`` prefix as a protected namespace and
    # emit a UserWarning for the ``model_name`` field when the class is
    # defined. Clearing the protected namespaces for this model keeps the
    # public field name unchanged while avoiding that warning.
    model_config = {"protected_namespaces": ()}

    image_url: str | None = Field(None, description="URL to fetch the image from")
    image_base64: str | None = Field(None, description="Base64-encoded image data")
    model_name: str = Field("mineru", description="Name of the model to use for OCR")

    @model_validator(mode="after")
    def validate_input(self) -> "ImageOCRRequest":
        """Validate that exactly one of image_url or image_base64 is provided.

        Returns:
            The validated model instance, unchanged.

        Raises:
            ValueError: If neither source is given, or if both are given.
        """
        # Only ``None`` counts as "not provided"; an empty string is still
        # treated as a supplied value.
        supplied = [
            source
            for source in (self.image_url, self.image_base64)
            if source is not None
        ]
        if not supplied:
            raise ValueError("Either image_url or image_base64 must be provided")
        if len(supplied) > 1:
            raise ValueError("Only one of image_url or image_base64 should be provided")
        return self


class ImageOCRResponse(BaseModel):
    """Payload returned by the image OCR endpoint.

    Every field has a default: the string fields default to ``""`` and
    ``layout_info`` defaults to a newly constructed ``LayoutInfo``.
    """

    latex: str = Field(
        default="",
        description="LaTeX representation of the content",
    )
    markdown: str = Field(
        default="",
        description="Markdown representation of the content",
    )
    mathml: str = Field(
        default="",
        description="MathML representation (empty if no math detected)",
    )
    layout_info: LayoutInfo = Field(default_factory=LayoutInfo)
    recognition_mode: str = Field(
        default="",
        description="Recognition mode used: mixed_recognition or formula_recognition",
    )