# TexTeller/texteller/api/detection/detect.py

from typing import List

from onnxruntime import InferenceSession

from texteller.types import Bbox

from .preprocess import Compose

# Static settings for the LaTeX detection model. ``latex_detect`` passes the
# "preprocess" list to ``Compose`` and translates predicted class indices into
# names through "label_list".
# NOTE(review): the other keys presumably mirror an exported Paddle inference
# config -- confirm before relying on them.
_config = {
    "mode": "paddle",
    "draw_threshold": 0.5,
    "metric": "COCO",
    "use_dynamic_shape": False,
    "arch": "DETR",
    "min_subgraph_size": 3,
    # Ordered list of preprocessing step specs; each entry names its step via
    # the "type" key and carries that step's parameters alongside it.
    "preprocess": [
        {
            "interp": 2,
            "keep_ratio": False,
            "target_size": [1600, 1600],
            "type": "Resize",
        },
        {
            "mean": [0.0, 0.0, 0.0],
            "norm_type": "none",
            "std": [1.0, 1.0, 1.0],
            "type": "NormalizeImage",
        },
        {
            "type": "Permute",
        },
    ],
    # Position in this list == class index emitted by the model.
    "label_list": [
        "isolated",
        "embedding",
    ],
}


def latex_detect(img_path: str, predictor: InferenceSession) -> List[Bbox]:
    """
    Detect LaTeX formulas in an image and classify them as isolated or embedded.

    This function uses an ONNX model to detect LaTeX formulas in images. The model
    identifies two types of LaTeX formulas:
    - 'isolated': Standalone LaTeX formulas (typically displayed equations)
    - 'embedding': Inline LaTeX formulas embedded within text

    The image is run through the preprocessing pipeline described by
    ``_config["preprocess"]``, fed to ``predictor``, and the first model output
    is read row by row as ``[class_id, score, xmin, ymin, xmax, ymax]``. Only
    rows whose score is strictly greater than ``_config["draw_threshold"]`` are
    returned.

    Args:
        img_path: Path to the input image file
        predictor: ONNX InferenceSession model for LaTeX detection

    Returns:
        List of Bbox objects representing the detected LaTeX formulas with their
        positions, classifications, and confidence scores. The list is empty
        when no detection passes the score threshold.

    Raises:
        IndexError: If an accepted detection carries a class id that is not a
            valid index into ``_config["label_list"]``.

    Example:
        >>> from texteller.api import load_latexdet_model, latex_detect
        >>> model = load_latexdet_model()
        >>> bboxes = latex_detect("path/to/image.png", model)
    """
    transforms = Compose(_config["preprocess"])
    inputs = transforms(img_path)
    inputs_name = [var.name for var in predictor.get_inputs()]
    # Indexing with ``None`` prepends a length-1 axis to each model input
    # (presumably the batch dimension -- assumes the values are numpy arrays).
    inputs = {k: inputs[k][None,] for k in inputs_name}

    outputs = predictor.run(output_names=None, input_feed=inputs)[0]

    labels = _config["label_list"]
    # Single source of truth for the score cut-off instead of a duplicated
    # magic number.
    threshold = _config["draw_threshold"]

    res = []
    for output in outputs:
        score = output[1]
        # Gate on the score first so that rejected rows are never used for
        # label lookup or integer conversion. Written as ``not (a > b)`` so a
        # NaN score is rejected too.
        if not (score > threshold):
            continue

        cls_id = int(output[0])
        # A negative id would silently wrap around to the last label through
        # Python's negative indexing. NOTE(review): PaddleDetection-style
        # exports appear to use negative ids for padding rows -- confirm.
        if cls_id < 0:
            continue
        cls_name = labels[cls_id]

        # Clamp the top-left corner to the image origin.
        xmin = int(max(output[2], 0))
        ymin = int(max(output[3], 0))
        xmax = int(output[4])
        ymax = int(output[5])

        # Argument order mirrors the original call: height precedes width.
        res.append(Bbox(xmin, ymin, ymax - ymin, xmax - xmin, cls_name, score))

    return res