texteller/models/det_model/inference.py

import os
import time
import yaml
import numpy as np
import cv2

from tqdm import tqdm
from typing import List
from .preprocess import Compose
from .Bbox import Bbox


# Set of supported architecture names (a set, not a dictionary).
# PredictConfig.check_model accepts a config when any of these names occurs
# as a substring of the config's 'arch' value.
SUPPORT_MODELS = {
    'YOLO',
    'PPYOLOE',
    'RCNN',
    'SSD',
    'Face',
    'FCOS',
    'SOLOv2',
    'TTFNet',
    'S2ANet',
    'JDE',
    'FairMOT',
    'DeepSORT',
    'GFL',
    'PicoDet',
    'CenterNet',
    'TOOD',
    'RetinaNet',
    'StrongBaseline',
    'STGCN',
    'YOLOX',
    'HRNet',
    'DETR',
}


class PredictConfig(object):
    """Settings for preprocessing, postprocessing and visualization.

    The values are read from an exported ``infer_cfg.yml`` file. The keys
    ``arch``, ``Preprocess``, ``min_subgraph_size``, ``label_list`` and
    ``use_dynamic_shape`` are required (a missing one raises ``KeyError``);
    ``draw_threshold``, ``mask``, ``tracker``, ``NMS`` and ``fpn_stride`` are
    optional and fall back to defaults.

    Args:
        infer_config (str): path of infer_cfg.yml

    Raises:
        ValueError: if the configured ``arch`` is not a supported model type.
    """

    def __init__(self, infer_config):
        # Parse the YAML config. The encoding is given explicitly so that the
        # file is decoded the same way regardless of the platform's locale
        # (the implicit default of open() is locale dependent).
        with open(infer_config, encoding='utf-8') as f:
            yml_conf = yaml.safe_load(f)
        self.check_model(yml_conf)

        # Required entries.
        self.arch = yml_conf['arch']
        self.preprocess_infos = yml_conf['Preprocess']
        self.min_subgraph_size = yml_conf['min_subgraph_size']
        self.label_list = yml_conf['label_list']
        self.use_dynamic_shape = yml_conf['use_dynamic_shape']

        # Optional entries with defaults.
        self.draw_threshold = yml_conf.get("draw_threshold", 0.5)
        self.mask = yml_conf.get("mask", False)
        self.tracker = yml_conf.get("tracker", None)
        self.nms = yml_conf.get("NMS", None)
        self.fpn_stride = yml_conf.get("fpn_stride", None)

        # One drawing color per label; the pool is cycled when there are more
        # labels than colors.
        color_pool = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)]
        self.colors = {
            label: color_pool[i % len(color_pool)] for i, label in enumerate(self.label_list)
        }

        if self.arch == 'RCNN' and yml_conf.get('export_onnx', False):
            print('The RCNN export model is used for ONNX and it only supports batch_size = 1')
        self.print_config()

    def check_model(self, yml_conf):
        """Verify that the configured architecture is supported.

        A config is accepted when any name in ``SUPPORT_MODELS`` occurs as a
        substring of ``yml_conf['arch']``.

        Returns:
            bool: ``True`` when the architecture is supported.

        Raises:
            ValueError: loaded model not in supported model type
        """
        arch = yml_conf['arch']
        if any(support_model in arch for support_model in SUPPORT_MODELS):
            return True
        raise ValueError("Unsupported arch: {}, expect {}".format(arch, SUPPORT_MODELS))

    def print_config(self):
        """Print the model architecture and the ordered preprocess ops."""
        print('-----------  Model Configuration -----------')
        print(f'Model Arch: {self.arch}')
        print('Transform Order: ')
        for op_info in self.preprocess_infos:
            op_type = op_info['type']
            print(f'--transform op: {op_type}')
        print('--------------------------------------------')


def draw_bbox(image, outputs, infer_config):
    """Draw every detection scoring above the draw threshold onto ``image``.

    Args:
        image: image passed to ``cv2.rectangle`` / ``cv2.putText``; it is
            drawn on in place.
        outputs: iterable of rows ``(cls_id, score, xmin, ymin, xmax, ymax)``.
        infer_config: object providing ``draw_threshold``, ``label_list`` and
            ``colors`` (a mapping from label to color).

    Returns:
        The same ``image`` object that was passed in.
    """
    for cls_id, score, left, top, right, bottom in outputs:
        # Rows that do not exceed the threshold are left undrawn.
        if not score > infer_config.draw_threshold:
            continue

        name = infer_config.label_list[int(cls_id)]
        box_color = infer_config.colors[name]

        top_left = (int(left), int(top))
        bottom_right = (int(right), int(bottom))
        cv2.rectangle(image, top_left, bottom_right, box_color, 2)

        # The caption is placed 5 pixels above the box's top edge.
        caption = f"{name}: {score:.2f}"
        caption_origin = (int(left), int(top - 5))
        cv2.putText(
            image, caption, caption_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2
        )
    return image


def predict_image(imgsave_dir, infer_config, predictor, img_list):
    """Run detection on a list of images and save the visual results.

    For every readable image the function runs ``predictor`` and then writes,
    below ``imgsave_dir``:

    * ``subimages/``  - one JPEG crop per accepted detection,
    * ``mask_white/`` - the image with accepted boxes filled white,
    * ``draw_box/``   - the image with accepted boxes and labels drawn.

    A detection row ``(cls_id, score, xmin, ymin, xmax, ymax)`` is accepted
    when ``cls_id > -1`` and ``score > infer_config.draw_threshold``.
    Per-image and average inference times are printed; the first inferred
    image is excluded from the average.

    Args:
        imgsave_dir: root directory for all saved results.
        infer_config: object providing ``preprocess_infos``, ``arch``,
            ``draw_threshold``, ``label_list`` and ``colors``.
        predictor: object exposing ``get_inputs()`` and
            ``run(output_names=..., input_feed=...)``.
        img_list: iterable of image paths.

    Returns:
        None. Paths that could not be read are printed at the end.
    """
    # load preprocess transforms
    transforms = Compose(infer_config.preprocess_infos)
    err_img_list = []

    # Create all output directories once instead of on every iteration.
    subimg_save_dir = os.path.join(imgsave_dir, 'subimages')
    draw_box_dir = os.path.join(imgsave_dir, 'draw_box')
    mask_white_dir = os.path.join(imgsave_dir, 'mask_white')
    for directory in (subimg_save_dir, draw_box_dir, mask_white_dir):
        os.makedirs(directory, exist_ok=True)

    # These do not change between images, so look them up once.
    inputs_name = [var.name for var in predictor.get_inputs()]
    threshold = infer_config.draw_threshold

    first_image_skipped = False
    total_time = 0
    num_images = 0
    # predict image
    for img_path in tqdm(img_list):
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: Could not read image {img_path}. Skipping...")
            err_img_list.append(img_path)
            continue

        # The transform pipeline is given the path, not the decoded image.
        sample = transforms(img_path)
        # [None,] prepends a batch axis to every model input.
        inputs = {k: sample[k][None,] for k in inputs_name}

        # perf_counter is monotonic and high resolution, unlike time.time().
        start_time = time.perf_counter()
        outputs = predictor.run(output_names=None, input_feed=inputs)
        inference_time = time.perf_counter() - start_time

        # The first inference is excluded from the running average.
        if not first_image_skipped:
            first_image_skipped = True
        else:
            total_time += inference_time
            num_images += 1
        print(
            f"ONNXRuntime predict time for {os.path.basename(img_path)}: {inference_time:.4f} seconds"
        )

        raw_output = np.array(outputs[0])
        # Apply one acceptance rule everywhere: rows with a negative class id
        # are treated as non-detections, exactly as in the printed summary.
        detections = [row for row in raw_output if row[0] > -1 and row[1] > threshold]

        print("ONNXRuntime predict: ")
        if infer_config.arch in ["HRNet"]:
            print(raw_output)
        else:
            for bbox in detections:
                print(f"{int(bbox[0])} {bbox[1]} " f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}")

        # Save the subimages (crop from the original image)
        height, width = img.shape[:2]
        stem = os.path.splitext(os.path.basename(img_path))[0]
        for cls_id, score, xmin, ymin, xmax, ymax in detections:
            label = infer_config.label_list[int(cls_id)]
            # Clamp to the image so a negative coordinate cannot turn into a
            # wrap-around (negative) slice index.
            x0, x1 = (int(min(max(v, 0), width)) for v in (xmin, xmax))
            y0, y1 = (int(min(max(v, 0), height)) for v in (ymin, ymax))
            # Skip crops that are empty in either dimension; cv2.imwrite
            # cannot encode an empty image.
            if x1 <= x0 or y1 <= y0:
                continue

            subimg = img[y0:y1, x0:x1]
            subimg_filename = f"{stem}_{label}_{xmin:.2f}_{ymin:.2f}_{xmax:.2f}_{ymax:.2f}.jpg"
            subimg_path = os.path.join(subimg_save_dir, subimg_filename)
            cv2.imwrite(subimg_path, subimg)

        # Build the masked copy before drawing, because draw_bbox draws on
        # `img` in place.
        img_with_mask = img.copy()
        for cls_id, score, xmin, ymin, xmax, ymax in detections:
            cv2.rectangle(
                img_with_mask,
                (int(xmin), int(ymin)),
                (int(xmax), int(ymax)),
                (255, 255, 255),
                -1,
            )  # thickness -1 fills the box: cover the region with white

        # Draw bounding boxes and save the image with bounding boxes
        img_with_bbox = draw_bbox(img, detections, infer_config)

        output_file_mask = os.path.join(mask_white_dir, os.path.basename(img_path))
        output_file_bbox = os.path.join(draw_box_dir, os.path.basename(img_path))
        cv2.imwrite(output_file_mask, img_with_mask)
        cv2.imwrite(output_file_bbox, img_with_bbox)

    avg_time_per_image = total_time / num_images if num_images > 0 else 0
    print(f"Total inference time for {num_images} images: {total_time:.4f} seconds")
    print(f"Average time per image: {avg_time_per_image:.4f} seconds")
    print("ErrorImgs:")
    print(err_img_list)


def predict(img_path: str, predictor, infer_config) -> List[Bbox]:
    """Detect objects in one image and return them as ``Bbox`` instances.

    Args:
        img_path: path of the image; it is handed to the preprocess pipeline.
        predictor: object exposing ``get_inputs()`` and
            ``run(output_names=..., input_feed=...)``.
        infer_config: object providing ``preprocess_infos``, ``label_list``
            and ``draw_threshold``.

    Returns:
        One ``Bbox`` per output row whose score exceeds
        ``infer_config.draw_threshold``, in model output order.
    """
    pipeline = Compose(infer_config.preprocess_infos)
    sample = pipeline(img_path)

    # Feed each model input by name; [None,] prepends a batch axis.
    input_names = [var.name for var in predictor.get_inputs()]
    feed = {}
    for name in input_names:
        feed[name] = sample[name][None,]

    detections = predictor.run(output_names=None, input_feed=feed)[0]

    boxes = []
    for row in detections:
        # Row layout: class id, score, xmin, ymin, xmax, ymax.
        label = infer_config.label_list[int(row[0])]
        confidence = row[1]
        # Only the top-left corner is clamped to be non-negative.
        left = int(max(row[2], 0))
        top = int(max(row[3], 0))
        right = int(row[4])
        bottom = int(row[5])
        if not confidence > infer_config.draw_threshold:
            continue
        # Positional order is (xmin, ymin, ymax - ymin, xmax - xmin, label,
        # score) - presumably x, y, height, width; confirm against Bbox.
        boxes.append(Bbox(left, top, bottom - top, right - left, label, confidence))

    return boxes
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`import os`
Update inference.py 增加了计时功能 2024-05-09 00:20:32 +08:00			`import time`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`import yaml`
			`import numpy as np`
			`import cv2`

			`from tqdm import tqdm`
			`from typing import List`
			`from .preprocess import Compose`
			`from .Bbox import Bbox`


			`# Global dictionary`
			`SUPPORT_MODELS = {`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`'YOLO',`
			`'PPYOLOE',`
			`'RCNN',`
			`'SSD',`
			`'Face',`
			`'FCOS',`
			`'SOLOv2',`
			`'TTFNet',`
			`'S2ANet',`
			`'JDE',`
			`'FairMOT',`
			`'DeepSORT',`
			`'GFL',`
			`'PicoDet',`
			`'CenterNet',`
			`'TOOD',`
			`'RetinaNet',`
			`'StrongBaseline',`
			`'STGCN',`
			`'YOLOX',`
			`'HRNet',`
			`'DETR',`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`}`


			`class PredictConfig(object):`
			`"""set config of preprocess, postprocess and visualize`
			`Args:`
			`infer_config (str): path of infer_cfg.yml`
			`"""`

			`def __init__(self, infer_config):`
			`# parsing Yaml config for Preprocess`
			`with open(infer_config) as f:`
			`yml_conf = yaml.safe_load(f)`
			`self.check_model(yml_conf)`
			`self.arch = yml_conf['arch']`
			`self.preprocess_infos = yml_conf['Preprocess']`
			`self.min_subgraph_size = yml_conf['min_subgraph_size']`
			`self.label_list = yml_conf['label_list']`
			`self.use_dynamic_shape = yml_conf['use_dynamic_shape']`
			`self.draw_threshold = yml_conf.get("draw_threshold", 0.5)`
			`self.mask = yml_conf.get("mask", False)`
			`self.tracker = yml_conf.get("tracker", None)`
			`self.nms = yml_conf.get("NMS", None)`
			`self.fpn_stride = yml_conf.get("fpn_stride", None)`

			`color_pool = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)]`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`self.colors = {`
			`label: color_pool[i % len(color_pool)] for i, label in enumerate(self.label_list)`
			`}`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00
			`if self.arch == 'RCNN' and yml_conf.get('export_onnx', False):`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`print('The RCNN export model is used for ONNX and it only supports batch_size = 1')`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`self.print_config()`

			`def check_model(self, yml_conf):`
			`"""`
			`Raises:`
			`ValueError: loaded model not in supported model type`
			`"""`
			`for support_model in SUPPORT_MODELS:`
			`if support_model in yml_conf['arch']:`
			`return True`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf['arch'], SUPPORT_MODELS))`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00
			`def print_config(self):`
			`print('----------- Model Configuration -----------')`
			`print('%s: %s' % ('Model Arch', self.arch))`
			`print('%s: ' % ('Transform Order'))`
			`for op_info in self.preprocess_infos:`
			`print('--%s: %s' % ('transform op', op_info['type']))`
			`print('--------------------------------------------')`


			`def draw_bbox(image, outputs, infer_config):`
			`for output in outputs:`
			`cls_id, score, xmin, ymin, xmax, ymax = output`
			`if score > infer_config.draw_threshold:`
			`label = infer_config.label_list[int(cls_id)]`
			`color = infer_config.colors[label]`
			`cv2.rectangle(image, (int(xmin), int(ymin)), (int(xmax), int(ymax)), color, 2)`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`cv2.putText(`
			`image,`
			`"{}: {:.2f}".format(label, score),`
			`(int(xmin), int(ymin - 5)),`
			`cv2.FONT_HERSHEY_SIMPLEX,`
			`0.5,`
			`color,`
			`2,`
			`)`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`return image`


			`def predict_image(imgsave_dir, infer_config, predictor, img_list):`
			`# load preprocess transforms`
			`transforms = Compose(infer_config.preprocess_infos)`
			`errImgList = []`

			`# Check and create subimg_save_dir if not exist`
			`subimg_save_dir = os.path.join(imgsave_dir, 'subimages')`
			`os.makedirs(subimg_save_dir, exist_ok=True)`

Update inference.py 增加了计时功能 2024-05-09 00:20:32 +08:00			`first_image_skipped = False`
			`total_time = 0`
			`num_images = 0`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`# predict image`
			`for img_path in tqdm(img_list):`
			`img = cv2.imread(img_path)`
			`if img is None:`
			`print(f"Warning: Could not read image {img_path}. Skipping...")`
			`errImgList.append(img_path)`
			`continue`

			`inputs = transforms(img_path)`
			`inputs_name = [var.name for var in predictor.get_inputs()]`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`inputs = {k: inputs[k][None,] for k in inputs_name}`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00
Update inference.py 增加了计时功能 2024-05-09 00:20:32 +08:00			`# Start timing`
			`start_time = time.time()`

1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`outputs = predictor.run(output_names=None, input_feed=inputs)`

Update inference.py 增加了计时功能 2024-05-09 00:20:32 +08:00			`# Stop timing`
			`end_time = time.time()`
			`inference_time = end_time - start_time`
			`if not first_image_skipped:`
			`first_image_skipped = True`
			`else:`
			`total_time += inference_time`
			`num_images += 1`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`print(`
			`f"ONNXRuntime predict time for {os.path.basename(img_path)}: {inference_time:.4f} seconds"`
			`)`
Update inference.py 增加了计时功能 2024-05-09 00:20:32 +08:00
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`print("ONNXRuntime predict: ")`
			`if infer_config.arch in ["HRNet"]:`
			`print(np.array(outputs[0]))`
			`else:`
			`bboxes = np.array(outputs[0])`
			`for bbox in bboxes:`
			`if bbox[0] > -1 and bbox[1] > infer_config.draw_threshold:`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`print(f"{int(bbox[0])} {bbox[1]} " f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}")`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00
			`# Save the subimages (crop from the original image)`
			`subimg_counter = 1`
			`for output in np.array(outputs[0]):`
			`cls_id, score, xmin, ymin, xmax, ymax = output`
			`if score > infer_config.draw_threshold:`
			`label = infer_config.label_list[int(cls_id)]`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`subimg = img[int(max(ymin, 0)) : int(ymax), int(max(xmin, 0)) : int(xmax)]`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`if len(subimg) == 0:`
			`continue`

			`subimg_filename = f"{os.path.splitext(os.path.basename(img_path))[0]}_{label}_{xmin:.2f}_{ymin:.2f}_{xmax:.2f}_{ymax:.2f}.jpg"`
			`subimg_path = os.path.join(subimg_save_dir, subimg_filename)`
			`cv2.imwrite(subimg_path, subimg)`
			`subimg_counter += 1`

			`# Draw bounding boxes and save the image with bounding boxes`
Update inference.py 增加了计时功能 2024-05-09 00:20:32 +08:00			`img_with_mask = img.copy()`
			`for output in np.array(outputs[0]):`
			`cls_id, score, xmin, ymin, xmax, ymax = output`
			`if score > infer_config.draw_threshold:`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`cv2.rectangle(`
			`img_with_mask,`
			`(int(xmin), int(ymin)),`
			`(int(xmax), int(ymax)),`
			`(255, 255, 255),`
			`-1,`
			`) # 盖白`

1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`img_with_bbox = draw_bbox(img, np.array(outputs[0]), infer_config)`
Update inference.py 增加了计时功能 2024-05-09 00:20:32 +08:00
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`output_dir = imgsave_dir`
			`os.makedirs(output_dir, exist_ok=True)`
Update inference.py 增加了计时功能 2024-05-09 00:20:32 +08:00			`draw_box_dir = os.path.join(output_dir, 'draw_box')`
			`mask_white_dir = os.path.join(output_dir, 'mask_white')`
			`os.makedirs(draw_box_dir, exist_ok=True)`
			`os.makedirs(mask_white_dir, exist_ok=True)`

			`output_file_mask = os.path.join(mask_white_dir, os.path.basename(img_path))`
			`output_file_bbox = os.path.join(draw_box_dir, os.path.basename(img_path))`
			`cv2.imwrite(output_file_mask, img_with_mask)`
			`cv2.imwrite(output_file_bbox, img_with_bbox)`

			`avg_time_per_image = total_time / num_images if num_images > 0 else 0`
			`print(f"Total inference time for {num_images} images: {total_time:.4f} seconds")`
			`print(f"Average time per image: {avg_time_per_image:.4f} seconds")`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00			`print("ErrorImgs:")`
			`print(errImgList)`


			`def predict(img_path: str, predictor, infer_config) -> List[Bbox]:`
			`transforms = Compose(infer_config.preprocess_infos)`
			`inputs = transforms(img_path)`
			`inputs_name = [var.name for var in predictor.get_inputs()]`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`inputs = {k: inputs[k][None,] for k in inputs_name}`
1) 实现了文本-公式混排识别; 2) 重构了项目结构 2024-04-21 00:05:14 +08:00
			`outputs = predictor.run(output_names=None, input_feed=inputs)[0]`
			`res = []`
			`for output in outputs:`
			`cls_name = infer_config.label_list[int(output[0])]`
			`score = output[1]`
			`xmin = int(max(output[2], 0))`
			`ymin = int(max(output[3], 0))`
			`xmax = int(output[4])`
			`ymax = int(output[5])`
			`if score > infer_config.draw_threshold:`
			`res.append(Bbox(xmin, ymin, ymax - ymin, xmax - xmin, cls_name, score))`

			`return res`