Added code for PaddleOCR inference
src/models/thrid_party/paddleocr/infer/CTCLabelDecode.py (new file, 215 lines)
@@ -0,0 +1,215 @@
import re
import numpy as np
import os
from pathlib import Path


class BaseRecLabelDecode(object):
    """Convert between text-label and text-index."""

    def __init__(self, character_dict_path=None, use_space_char=False):
        # The character dictionary ships next to this script, so the
        # character_dict_path argument is overridden with that local copy.
        cur_path = os.getcwd()
        script_dir = Path(__file__).resolve().parent
        os.chdir(script_dir)
        character_dict_path = str(script_dir / "ppocr_keys_v1.txt")

        self.beg_str = "sos"
        self.end_str = "eos"
        self.reverse = False
        self.character_str = []

        if character_dict_path is None:
            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
            dict_character = list(self.character_str)
        else:
            with open(character_dict_path, "rb") as fin:
                lines = fin.readlines()
                for line in lines:
                    line = line.decode("utf-8").strip("\n").strip("\r\n")
                    self.character_str.append(line)
            if use_space_char:
                self.character_str.append(" ")
            dict_character = list(self.character_str)
            if "arabic" in character_dict_path:
                self.reverse = True

        dict_character = self.add_special_char(dict_character)
        self.dict = {}
        for i, char in enumerate(dict_character):
            self.dict[char] = i
        self.character = dict_character
        os.chdir(cur_path)

    def pred_reverse(self, pred):
        # Reverse the prediction for right-to-left scripts, keeping runs of
        # Latin characters and digits in their original order.
        pred_re = []
        c_current = ""
        for c in pred:
            if not bool(re.search("[a-zA-Z0-9 :*./%+-]", c)):
                if c_current != "":
                    pred_re.append(c_current)
                pred_re.append(c)
                c_current = ""
            else:
                c_current += c
        if c_current != "":
            pred_re.append(c_current)

        return "".join(pred_re[::-1])

    def add_special_char(self, dict_character):
        return dict_character

    def get_word_info(self, text, selection):
        """
        Group the decoded characters and record the corresponding decoded positions.

        Args:
            text: the decoded text
            selection: the bool array identifying which feature columns were
                decoded as non-separator characters
        Returns:
            word_list: list of the grouped words
            word_col_list: list of decoding positions corresponding to each character in the grouped word
            state_list: list of markers identifying the grouping type; two types exist:
                - 'cn': continuous Chinese characters (e.g., 你好啊)
                - 'en&num': continuous English characters (e.g., hello), numbers (e.g., 123, 1.123),
                  or a mix of them connected by '-' (e.g., VGG-16)
            The remaining characters in text are treated as separators between groups
            (e.g., space, '(', ')', etc.).
        """
        state = None
        word_content = []
        word_col_content = []
        word_list = []
        word_col_list = []
        state_list = []
        valid_col = np.where(selection)[0]

        for c_i, char in enumerate(text):
            if "\u4e00" <= char <= "\u9fff":
                c_state = "cn"
            elif bool(re.search("[a-zA-Z0-9]", char)):
                c_state = "en&num"
            else:
                c_state = "splitter"

            if (
                char == "."
                and state == "en&num"
                and c_i + 1 < len(text)
                and bool(re.search("[0-9]", text[c_i + 1]))
            ):  # keep the decimal point inside a floating-point number
                c_state = "en&num"
            if (
                char == "-" and state == "en&num"
            ):  # keep words joined by '-', such as 'state-of-the-art'
                c_state = "en&num"

            if state is None:
                state = c_state

            if state != c_state:
                if len(word_content) != 0:
                    word_list.append(word_content)
                    word_col_list.append(word_col_content)
                    state_list.append(state)
                    word_content = []
                    word_col_content = []
                state = c_state

            if state != "splitter":
                word_content.append(char)
                word_col_content.append(valid_col[c_i])

        if len(word_content) != 0:
            word_list.append(word_content)
            word_col_list.append(word_col_content)
            state_list.append(state)

        return word_list, word_col_list, state_list

    def decode(
        self,
        text_index,
        text_prob=None,
        is_remove_duplicate=False,
        return_word_box=False,
    ):
        """Convert text-index into text-label."""
        result_list = []
        ignored_tokens = self.get_ignored_tokens()
        batch_size = len(text_index)
        for batch_idx in range(batch_size):
            selection = np.ones(len(text_index[batch_idx]), dtype=bool)
            if is_remove_duplicate:
                selection[1:] = text_index[batch_idx][1:] != text_index[batch_idx][:-1]
            for ignored_token in ignored_tokens:
                selection &= text_index[batch_idx] != ignored_token

            char_list = [
                self.character[text_id] for text_id in text_index[batch_idx][selection]
            ]
            if text_prob is not None:
                conf_list = text_prob[batch_idx][selection]
            else:
                conf_list = [1] * len(selection)
            if len(conf_list) == 0:
                conf_list = [0]

            text = "".join(char_list)

            if self.reverse:  # for arabic rec
                text = self.pred_reverse(text)

            if return_word_box:
                word_list, word_col_list, state_list = self.get_word_info(
                    text, selection
                )
                result_list.append(
                    (
                        text,
                        np.mean(conf_list).tolist(),
                        [
                            len(text_index[batch_idx]),
                            word_list,
                            word_col_list,
                            state_list,
                        ],
                    )
                )
            else:
                result_list.append((text, np.mean(conf_list).tolist()))
        return result_list

    def get_ignored_tokens(self):
        return [0]  # for ctc blank


class CTCLabelDecode(BaseRecLabelDecode):
    """Convert between text-label and text-index."""

    def __init__(self, character_dict_path=None, use_space_char=False, **kwargs):
        super(CTCLabelDecode, self).__init__(character_dict_path, use_space_char)

    def __call__(self, preds, label=None, return_word_box=False, *args, **kwargs):
        if isinstance(preds, (tuple, list)):
            preds = preds[-1]
        assert isinstance(preds, np.ndarray)
        preds_idx = preds.argmax(axis=2)
        preds_prob = preds.max(axis=2)
        text = self.decode(
            preds_idx,
            preds_prob,
            is_remove_duplicate=True,
            return_word_box=return_word_box,
        )
        if return_word_box:
            for rec_idx, rec in enumerate(text):
                wh_ratio = kwargs["wh_ratio_list"][rec_idx]
                max_wh_ratio = kwargs["max_wh_ratio"]
                rec[2][0] = rec[2][0] * (wh_ratio / max_wh_ratio)
        if label is None:
            return text
        label = self.decode(label)
        return text, label

    def add_special_char(self, dict_character):
        dict_character = ["blank"] + dict_character
        return dict_character
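A minimal decoding sketch (illustrative, not part of the commit): it assumes ppocr_keys_v1.txt sits next to CTCLabelDecode.py, as the constructor requires, and uses random logits in place of a real recognizer's output.

import numpy as np
from CTCLabelDecode import CTCLabelDecode

decoder = CTCLabelDecode(use_space_char=True)   # dict path is resolved next to the script
num_classes = len(decoder.character)            # "blank" + dictionary entries
preds = np.random.rand(1, 40, num_classes).astype(np.float32)  # (batch, time, classes), dummy logits
print(decoder(preds))                           # -> [(text, mean_confidence)]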
src/models/thrid_party/paddleocr/infer/DBPostProcess.py (new file, 229 lines)
@@ -0,0 +1,229 @@
import numpy as np
import cv2

from shapely.geometry import Polygon
import pyclipper


class DBPostProcess(object):
    """
    The post-process for Differentiable Binarization (DB).
    """

    def __init__(
        self,
        thresh=0.3,
        box_thresh=0.7,
        max_candidates=1000,
        unclip_ratio=2.0,
        use_dilation=False,
        score_mode="fast",
        box_type="quad",
        **kwargs
    ):
        self.thresh = thresh
        self.box_thresh = box_thresh
        self.max_candidates = max_candidates
        self.unclip_ratio = unclip_ratio
        self.min_size = 3
        self.score_mode = score_mode
        self.box_type = box_type
        assert score_mode in [
            "slow",
            "fast",
        ], "Score mode must be in [slow, fast] but got: {}".format(score_mode)

        self.dilation_kernel = None if not use_dilation else np.array([[1, 1], [1, 1]])

    def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        """
        _bitmap: single map with shape (1, H, W),
            whose values are binarized as {0, 1}
        """

        bitmap = _bitmap
        height, width = bitmap.shape

        boxes = []
        scores = []

        contours, _ = cv2.findContours(
            (bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
        )

        for contour in contours[: self.max_candidates]:
            epsilon = 0.002 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)
            points = approx.reshape((-1, 2))
            if points.shape[0] < 4:
                continue

            score = self.box_score_fast(pred, points.reshape(-1, 2))
            if self.box_thresh > score:
                continue

            if points.shape[0] > 2:
                box = self.unclip(points, self.unclip_ratio)
                if len(box) > 1:
                    continue
            else:
                continue
            box = box.reshape(-1, 2)

            _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
            if sside < self.min_size + 2:
                continue

            box = np.array(box)
            box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height
            )
            boxes.append(box.tolist())
            scores.append(score)
        return boxes, scores

    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        """
        _bitmap: single map with shape (1, H, W),
            whose values are binarized as {0, 1}
        """

        bitmap = _bitmap
        height, width = bitmap.shape

        outs = cv2.findContours(
            (bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
        )
        if len(outs) == 3:  # OpenCV 3.x returns (image, contours, hierarchy)
            img, contours, _ = outs[0], outs[1], outs[2]
        elif len(outs) == 2:  # OpenCV 4.x returns (contours, hierarchy)
            contours, _ = outs[0], outs[1]

        num_contours = min(len(contours), self.max_candidates)

        boxes = []
        scores = []
        for index in range(num_contours):
            contour = contours[index]
            points, sside = self.get_mini_boxes(contour)
            if sside < self.min_size:
                continue
            points = np.array(points)
            if self.score_mode == "fast":
                score = self.box_score_fast(pred, points.reshape(-1, 2))
            else:
                score = self.box_score_slow(pred, contour)
            if self.box_thresh > score:
                continue

            box = self.unclip(points, self.unclip_ratio).reshape(-1, 1, 2)
            box, sside = self.get_mini_boxes(box)
            if sside < self.min_size + 2:
                continue
            box = np.array(box)

            box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height
            )
            boxes.append(box.astype("int32"))
            scores.append(score)
        return np.array(boxes, dtype="int32"), scores

    def unclip(self, box, unclip_ratio):
        # Expand the detected polygon outward; the offset distance follows
        # the DB paper's area/perimeter heuristic.
        poly = Polygon(box)
        distance = poly.area * unclip_ratio / poly.length
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        expanded = np.array(offset.Execute(distance))
        return expanded

    def get_mini_boxes(self, contour):
        bounding_box = cv2.minAreaRect(contour)
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        index_1, index_2, index_3, index_4 = 0, 1, 2, 3
        if points[1][1] > points[0][1]:
            index_1 = 0
            index_4 = 1
        else:
            index_1 = 1
            index_4 = 0
        if points[3][1] > points[2][1]:
            index_2 = 2
            index_3 = 3
        else:
            index_2 = 3
            index_3 = 2

        box = [points[index_1], points[index_2], points[index_3], points[index_4]]
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        """
        box_score_fast: use the axis-aligned bounding-box mean as the box score.
        """
        h, w = bitmap.shape[:2]
        box = _box.copy()
        xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype("int32"), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype("int32"), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype("int32"), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1)
        return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]

    def box_score_slow(self, bitmap, contour):
        """
        box_score_slow: use the polygon mean as the box score.
        """
        h, w = bitmap.shape[:2]
        contour = contour.copy()
        contour = np.reshape(contour, (-1, 2))

        xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
        xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
        ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
        ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)

        contour[:, 0] = contour[:, 0] - xmin
        contour[:, 1] = contour[:, 1] - ymin

        cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1)
        return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]

    def __call__(self, outs_dict, shape_list):
        pred = outs_dict["maps"]
        assert isinstance(pred, np.ndarray)
        pred = pred[:, 0, :, :]
        segmentation = pred > self.thresh

        boxes_batch = []
        for batch_index in range(pred.shape[0]):
            src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
            if self.dilation_kernel is not None:
                mask = cv2.dilate(
                    np.array(segmentation[batch_index]).astype(np.uint8),
                    self.dilation_kernel,
                )
            else:
                mask = segmentation[batch_index]
            if self.box_type == "poly":
                boxes, scores = self.polygons_from_bitmap(
                    pred[batch_index], mask, src_w, src_h
                )
            elif self.box_type == "quad":
                boxes, scores = self.boxes_from_bitmap(
                    pred[batch_index], mask, src_w, src_h
                )
            else:
                raise ValueError("box_type can only be one of ['quad', 'poly']")

            boxes_batch.append({"points": boxes})
        return boxes_batch
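A usage sketch for the post-processor (illustrative only; the probability map below is random noise standing in for a real DB head output):

import numpy as np
from DBPostProcess import DBPostProcess

post = DBPostProcess(thresh=0.3, box_thresh=0.6, unclip_ratio=1.5, box_type="quad")
prob_map = np.random.rand(1, 1, 640, 640).astype(np.float32)  # (batch, 1, H, W)
# each shape_list row is (src_h, src_w, ratio_h, ratio_w) for one image
shape_list = np.array([[480.0, 640.0, 640 / 480, 1.0]])
boxes_batch = post({"maps": prob_map}, shape_list)
print(len(boxes_batch[0]["points"]), "quad boxes")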
src/models/thrid_party/paddleocr/infer/operators.py (new file, 186 lines)
@@ -0,0 +1,186 @@
import numpy as np
import cv2
import math
import sys


class DetResizeForTest(object):
    def __init__(self, **kwargs):
        super(DetResizeForTest, self).__init__()
        self.resize_type = 0
        self.keep_ratio = False
        if "image_shape" in kwargs:
            self.image_shape = kwargs["image_shape"]
            self.resize_type = 1
            if "keep_ratio" in kwargs:
                self.keep_ratio = kwargs["keep_ratio"]
        elif "limit_side_len" in kwargs:
            self.limit_side_len = kwargs["limit_side_len"]
            self.limit_type = kwargs.get("limit_type", "min")
        elif "resize_long" in kwargs:
            self.resize_type = 2
            self.resize_long = kwargs.get("resize_long", 960)
        else:
            self.limit_side_len = 736
            self.limit_type = "min"

    def __call__(self, data):
        img = data["image"]
        src_h, src_w, _ = img.shape
        if sum([src_h, src_w]) < 64:
            img = self.image_padding(img)

        if self.resize_type == 0:
            img, [ratio_h, ratio_w] = self.resize_image_type0(img)
        elif self.resize_type == 2:
            img, [ratio_h, ratio_w] = self.resize_image_type2(img)
        else:
            img, [ratio_h, ratio_w] = self.resize_image_type1(img)
        data["image"] = img
        data["shape"] = np.array([src_h, src_w, ratio_h, ratio_w])
        return data

    def image_padding(self, im, value=0):
        h, w, c = im.shape
        im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value
        im_pad[:h, :w, :] = im
        return im_pad

    def resize_image_type1(self, img):
        resize_h, resize_w = self.image_shape
        ori_h, ori_w = img.shape[:2]  # (h, w, c)
        if self.keep_ratio is True:
            resize_w = ori_w * resize_h / ori_h
            N = math.ceil(resize_w / 32)
            resize_w = N * 32
        ratio_h = float(resize_h) / ori_h
        ratio_w = float(resize_w) / ori_w
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        return img, [ratio_h, ratio_w]

    def resize_image_type0(self, img):
        """
        Resize the image to a size that is a multiple of 32, as the network requires.
        args:
            img(array): array with shape [h, w, c]
        return(tuple):
            img, (ratio_h, ratio_w)
        """
        limit_side_len = self.limit_side_len
        h, w, c = img.shape

        # limit the max side
        if self.limit_type == "max":
            if max(h, w) > limit_side_len:
                if h > w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.0
        elif self.limit_type == "min":
            if min(h, w) < limit_side_len:
                if h < w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.0
        elif self.limit_type == "resize_long":
            ratio = float(limit_side_len) / max(h, w)
        else:
            raise ValueError("Unsupported limit_type: {}".format(self.limit_type))
        resize_h = int(h * ratio)
        resize_w = int(w * ratio)

        resize_h = max(int(round(resize_h / 32) * 32), 32)
        resize_w = max(int(round(resize_w / 32) * 32), 32)

        try:
            if int(resize_w) <= 0 or int(resize_h) <= 0:
                return None, (None, None)
            img = cv2.resize(img, (int(resize_w), int(resize_h)))
        except Exception:
            print(img.shape, resize_w, resize_h)
            sys.exit(0)
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return img, [ratio_h, ratio_w]

    def resize_image_type2(self, img):
        h, w, _ = img.shape

        resize_w = w
        resize_h = h

        if resize_h > resize_w:
            ratio = float(self.resize_long) / resize_h
        else:
            ratio = float(self.resize_long) / resize_w

        resize_h = int(resize_h * ratio)
        resize_w = int(resize_w * ratio)

        max_stride = 128
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)

        return img, [ratio_h, ratio_w]


class NormalizeImage(object):
    """Normalize the image: subtract the mean and divide by the std."""

    def __init__(self, scale=None, mean=None, std=None, order="chw", **kwargs):
        if isinstance(scale, str):
            # scale may arrive as a string expression such as "1./255."
            scale = eval(scale)
        self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
        mean = mean if mean is not None else [0.485, 0.456, 0.406]
        std = std if std is not None else [0.229, 0.224, 0.225]

        shape = (3, 1, 1) if order == "chw" else (1, 1, 3)
        self.mean = np.array(mean).reshape(shape).astype("float32")
        self.std = np.array(std).reshape(shape).astype("float32")

    def __call__(self, data):
        img = data["image"]
        from PIL import Image

        if isinstance(img, Image.Image):
            img = np.array(img)
        assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage"
        data["image"] = (img.astype("float32") * self.scale - self.mean) / self.std
        return data


class ToCHWImage(object):
    """Convert an HWC image to CHW."""

    def __init__(self, **kwargs):
        pass

    def __call__(self, data):
        img = data["image"]
        from PIL import Image

        if isinstance(img, Image.Image):
            img = np.array(img)
        data["image"] = img.transpose((2, 0, 1))
        return data


class KeepKeys(object):
    def __init__(self, keep_keys, **kwargs):
        self.keep_keys = keep_keys

    def __call__(self, data):
        data_list = []
        for key in self.keep_keys:
            data_list.append(data[key])
        return data_list
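These operators are designed to be chained, which is how predict_det.py uses them; a sketch with a dummy image follows (the parameter values mirror the detector's defaults but are only assumptions here):

import numpy as np
from operators import DetResizeForTest, NormalizeImage, ToCHWImage, KeepKeys

ops = [
    DetResizeForTest(limit_side_len=960, limit_type="max"),  # resize to a multiple of 32
    NormalizeImage(order="hwc"),                             # ImageNet mean/std by default
    ToCHWImage(),                                            # HWC -> CHW
    KeepKeys(keep_keys=["image", "shape"]),                  # keep only what the model needs
]
data = {"image": np.zeros((480, 640, 3), dtype=np.uint8)}
for op in ops:
    data = op(data)
img, shape = data  # CHW float32 tensor and (src_h, src_w, ratio_h, ratio_w)
print(img.shape, shape)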
src/models/thrid_party/paddleocr/infer/ppocr_keys_v1.txt (new file, 6623 lines)
File diff suppressed because it is too large.
src/models/thrid_party/paddleocr/infer/predict_det.py (new executable file, 298 lines)
@@ -0,0 +1,298 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys

__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, "../..")))

os.environ["FLAGS_allocator_strategy"] = "auto_growth"

import cv2
import numpy as np
import time

import utility
from utility import get_logger

from DBPostProcess import DBPostProcess
from operators import DetResizeForTest, KeepKeys, NormalizeImage, ToCHWImage


def transform(data, ops=None):
    """Apply each preprocessing op in sequence; stop early on failure."""
    if ops is None:
        ops = []
    for op in ops:
        data = op(data)
        if data is None:
            return None
    return data


logger = get_logger()


class TextDetector(object):
    def __init__(self, args):
        self.args = args
        self.det_algorithm = args.det_algorithm
        self.use_onnx = args.use_onnx
        postprocess_params = {}
        assert self.det_algorithm == "DB"  # this port only supports DB detection
        postprocess_params["name"] = "DBPostProcess"
        postprocess_params["thresh"] = args.det_db_thresh
        postprocess_params["box_thresh"] = args.det_db_box_thresh
        postprocess_params["max_candidates"] = 1000
        postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
        postprocess_params["use_dilation"] = args.use_dilation
        postprocess_params["score_mode"] = args.det_db_score_mode
        postprocess_params["box_type"] = args.det_box_type

        self.preprocess_op = [
            DetResizeForTest(
                limit_side_len=args.det_limit_side_len,
                limit_type=args.det_limit_type,
            ),
            NormalizeImage(
                std=[0.229, 0.224, 0.225],
                mean=[0.485, 0.456, 0.406],
                scale=1.0 / 255.0,
                order="hwc",
            ),
            ToCHWImage(),
            KeepKeys(keep_keys=["image", "shape"]),
        ]
        self.postprocess_op = DBPostProcess(**postprocess_params)
        (
            self.predictor,
            self.input_tensor,
            self.output_tensors,
            self.config,
        ) = utility.create_predictor(args, "det", logger)

        assert self.use_onnx
        if self.use_onnx:
            img_h, img_w = self.input_tensor.shape[2:]
            if isinstance(img_h, str) or isinstance(img_w, str):
                pass  # dynamic input shape; keep the default resize op
            elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0:
                self.preprocess_op[0] = DetResizeForTest(image_shape=[img_h, img_w])

    def order_points_clockwise(self, pts):
        rect = np.zeros((4, 2), dtype="float32")
        s = pts.sum(axis=1)
        rect[0] = pts[np.argmin(s)]
        rect[2] = pts[np.argmax(s)]
        tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
        diff = np.diff(np.array(tmp), axis=1)
        rect[1] = tmp[np.argmin(diff)]
        rect[3] = tmp[np.argmax(diff)]
        return rect

    def clip_det_res(self, points, img_height, img_width):
        for pno in range(points.shape[0]):
            points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
            points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
        return points

    def filter_tag_det_res(self, dt_boxes, image_shape):
        img_height, img_width = image_shape[0:2]
        dt_boxes_new = []
        for box in dt_boxes:
            if type(box) is list:
                box = np.array(box)
            box = self.order_points_clockwise(box)
            box = self.clip_det_res(box, img_height, img_width)
            rect_width = int(np.linalg.norm(box[0] - box[1]))
            rect_height = int(np.linalg.norm(box[0] - box[3]))
            if rect_width <= 3 or rect_height <= 3:
                continue
            dt_boxes_new.append(box)
        dt_boxes = np.array(dt_boxes_new)
        return dt_boxes

    def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
        img_height, img_width = image_shape[0:2]
        dt_boxes_new = []
        for box in dt_boxes:
            if type(box) is list:
                box = np.array(box)
            box = self.clip_det_res(box, img_height, img_width)
            dt_boxes_new.append(box)
        dt_boxes = np.array(dt_boxes_new)
        return dt_boxes

    def predict(self, img):
        ori_im = img.copy()
        data = {"image": img}

        st = time.time()

        if self.args.benchmark:
            self.autolog.times.start()

        data = transform(data, self.preprocess_op)
        img, shape_list = data
        if img is None:
            return None, 0
        img = np.expand_dims(img, axis=0)
        shape_list = np.expand_dims(shape_list, axis=0)
        img = img.copy()

        if self.args.benchmark:
            self.autolog.times.stamp()
        if self.use_onnx:
            input_dict = {}
            input_dict[self.input_tensor.name] = img
            outputs = self.predictor.run(self.output_tensors, input_dict)
        else:
            self.input_tensor.copy_from_cpu(img)
            self.predictor.run()
            outputs = []
            for output_tensor in self.output_tensors:
                output = output_tensor.copy_to_cpu()
                outputs.append(output)
            if self.args.benchmark:
                self.autolog.times.stamp()

        preds = {}
        if self.det_algorithm == "EAST":
            preds["f_geo"] = outputs[0]
            preds["f_score"] = outputs[1]
        elif self.det_algorithm == "SAST":
            preds["f_border"] = outputs[0]
            preds["f_score"] = outputs[1]
            preds["f_tco"] = outputs[2]
            preds["f_tvo"] = outputs[3]
        elif self.det_algorithm in ["DB", "PSE", "DB++"]:
            preds["maps"] = outputs[0]
        elif self.det_algorithm == "FCE":
            for i, output in enumerate(outputs):
                preds["level_{}".format(i)] = output
        elif self.det_algorithm == "CT":
            preds["maps"] = outputs[0]
            preds["score"] = outputs[1]
        else:
            raise NotImplementedError

        post_result = self.postprocess_op(preds, shape_list)
        dt_boxes = post_result[0]["points"]

        if self.args.det_box_type == "poly":
            dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape)
        else:
            dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)

        if self.args.benchmark:
            self.autolog.times.end(stamp=True)
        et = time.time()
        return dt_boxes, et - st

    def __call__(self, img):
        # For images like posters, with one side much longer than the other,
        # split recursively and process with overlap to improve accuracy.
        MIN_BOUND_DISTANCE = 50
        dt_boxes = np.zeros((0, 4, 2), dtype=np.float32)
        elapse = 0
        if (
            img.shape[0] / img.shape[1] > 2
            and img.shape[0] > self.args.det_limit_side_len
        ):
            start_h = 0
            end_h = 0
            while end_h <= img.shape[0]:
                end_h = start_h + img.shape[1] * 3 // 4
                subimg = img[start_h:end_h, :]
                if len(subimg) == 0:
                    break
                sub_dt_boxes, sub_elapse = self.predict(subimg)
                offset = start_h
                # To prevent text blocks from being cut off, roll back by a buffer area.
                if (
                    len(sub_dt_boxes) == 0
                    or img.shape[1] - max([x[-1][1] for x in sub_dt_boxes])
                    > MIN_BOUND_DISTANCE
                ):
                    start_h = end_h
                else:
                    sorted_indices = np.argsort(sub_dt_boxes[:, 2, 1])
                    sub_dt_boxes = sub_dt_boxes[sorted_indices]
                    bottom_line = (
                        0
                        if len(sub_dt_boxes) <= 1
                        else int(np.max(sub_dt_boxes[:-1, 2, 1]))
                    )
                    if bottom_line > 0:
                        start_h += bottom_line
                        sub_dt_boxes = sub_dt_boxes[
                            sub_dt_boxes[:, 2, 1] <= bottom_line
                        ]
                    else:
                        start_h = end_h
                if len(sub_dt_boxes) > 0:
                    if dt_boxes.shape[0] == 0:
                        dt_boxes = sub_dt_boxes + np.array(
                            [0, offset], dtype=np.float32
                        )
                    else:
                        dt_boxes = np.append(
                            dt_boxes,
                            sub_dt_boxes + np.array([0, offset], dtype=np.float32),
                            axis=0,
                        )
                elapse += sub_elapse
        elif (
            img.shape[1] / img.shape[0] > 3
            and img.shape[1] > self.args.det_limit_side_len * 3
        ):
            start_w = 0
            end_w = 0
            while end_w <= img.shape[1]:
                end_w = start_w + img.shape[0] * 3 // 4
                subimg = img[:, start_w:end_w]
                if len(subimg) == 0:
                    break
                sub_dt_boxes, sub_elapse = self.predict(subimg)
                offset = start_w
                if (
                    len(sub_dt_boxes) == 0
                    or img.shape[0] - max([x[-1][0] for x in sub_dt_boxes])
                    > MIN_BOUND_DISTANCE
                ):
                    start_w = end_w
                else:
                    sorted_indices = np.argsort(sub_dt_boxes[:, 2, 0])
                    sub_dt_boxes = sub_dt_boxes[sorted_indices]
                    right_line = (
                        0
                        if len(sub_dt_boxes) <= 1
                        else int(np.max(sub_dt_boxes[:-1, 1, 0]))
                    )
                    if right_line > 0:
                        start_w += right_line
                        sub_dt_boxes = sub_dt_boxes[sub_dt_boxes[:, 1, 0] <= right_line]
                    else:
                        start_w = end_w
                if len(sub_dt_boxes) > 0:
                    if dt_boxes.shape[0] == 0:
                        dt_boxes = sub_dt_boxes + np.array(
                            [offset, 0], dtype=np.float32
                        )
                    else:
                        dt_boxes = np.append(
                            dt_boxes,
                            sub_dt_boxes + np.array([offset, 0], dtype=np.float32),
                            axis=0,
                        )
                elapse += sub_elapse
        else:
            dt_boxes, elapse = self.predict(img)
        return dt_boxes, elapse
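An end-to-end detection sketch ("det.onnx" and "sample.jpg" are placeholder paths, not files added by this commit; only the ONNX path is supported, hence use_onnx=True):

import cv2
import utility
from predict_det import TextDetector

args = utility.parse_args()      # argparse defaults, no CLI parsing
args.use_onnx = True
args.use_gpu = False
args.det_model_dir = "det.onnx"  # placeholder: exported DB detection model

detector = TextDetector(args)
img = cv2.imread("sample.jpg")   # placeholder input image (BGR)
dt_boxes, elapse = detector(img)
print(len(dt_boxes), "boxes in", elapse, "s")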
src/models/thrid_party/paddleocr/infer/predict_rec.py (new executable file, 383 lines)
@@ -0,0 +1,383 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from PIL import Image

__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, "../..")))

os.environ["FLAGS_allocator_strategy"] = "auto_growth"

import cv2
import numpy as np
import math
import time

import utility
from utility import get_logger

from CTCLabelDecode import CTCLabelDecode

logger = get_logger()


class TextRecognizer(object):
    def __init__(self, args):
        self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
        self.rec_batch_num = args.rec_batch_num
        self.rec_algorithm = args.rec_algorithm
        self.postprocess_op = CTCLabelDecode(
            character_dict_path=args.rec_char_dict_path,
            use_space_char=args.use_space_char,
        )
        (
            self.predictor,
            self.input_tensor,
            self.output_tensors,
            self.config,
        ) = utility.create_predictor(args, "rec", logger)
        self.benchmark = args.benchmark
        self.use_onnx = args.use_onnx
        self.return_word_box = args.return_word_box

    def resize_norm_img(self, img, max_wh_ratio):
        imgC, imgH, imgW = self.rec_image_shape
        if self.rec_algorithm == "NRTR" or self.rec_algorithm == "ViTSTR":
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            image_pil = Image.fromarray(np.uint8(img))
            if self.rec_algorithm == "ViTSTR":
                img = image_pil.resize([imgW, imgH], Image.BICUBIC)
            else:
                img = image_pil.resize([imgW, imgH], Image.Resampling.LANCZOS)
            img = np.array(img)
            norm_img = np.expand_dims(img, -1)
            norm_img = norm_img.transpose((2, 0, 1))
            if self.rec_algorithm == "ViTSTR":
                norm_img = norm_img.astype(np.float32) / 255.0
            else:
                norm_img = norm_img.astype(np.float32) / 128.0 - 1.0
            return norm_img
        elif self.rec_algorithm == "RFL":
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_CUBIC)
            resized_image = resized_image.astype("float32")
            resized_image = resized_image / 255
            resized_image = resized_image[np.newaxis, :]
            resized_image -= 0.5
            resized_image /= 0.5
            return resized_image

        assert imgC == img.shape[2]
        imgW = int(imgH * max_wh_ratio)
        if self.use_onnx:
            w = self.input_tensor.shape[3:][0]
            if isinstance(w, str):
                pass  # dynamic width; keep the ratio-derived imgW
            elif w is not None and w > 0:
                imgW = w
        h, w = img.shape[:2]
        ratio = w / float(h)
        if math.ceil(imgH * ratio) > imgW:
            resized_w = imgW
        else:
            resized_w = int(math.ceil(imgH * ratio))
        if self.rec_algorithm == "RARE":
            if resized_w > self.rec_image_shape[2]:
                resized_w = self.rec_image_shape[2]
            imgW = self.rec_image_shape[2]
        resized_image = cv2.resize(img, (resized_w, imgH))
        resized_image = resized_image.astype("float32")
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im

    def resize_norm_img_vl(self, img, image_shape):
        imgC, imgH, imgW = image_shape
        img = img[:, :, ::-1]  # bgr2rgb
        resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
        resized_image = resized_image.astype("float32")
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        return resized_image

    def resize_norm_img_srn(self, img, image_shape):
        imgC, imgH, imgW = image_shape

        img_black = np.zeros((imgH, imgW))
        im_hei = img.shape[0]
        im_wid = img.shape[1]

        if im_wid <= im_hei * 1:
            img_new = cv2.resize(img, (imgH * 1, imgH))
        elif im_wid <= im_hei * 2:
            img_new = cv2.resize(img, (imgH * 2, imgH))
        elif im_wid <= im_hei * 3:
            img_new = cv2.resize(img, (imgH * 3, imgH))
        else:
            img_new = cv2.resize(img, (imgW, imgH))

        img_np = np.asarray(img_new)
        img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
        img_black[:, 0 : img_np.shape[1]] = img_np
        img_black = img_black[:, :, np.newaxis]

        row, col, c = img_black.shape
        c = 1

        return np.reshape(img_black, (c, row, col)).astype(np.float32)

    def srn_other_inputs(self, image_shape, num_heads, max_text_length):
        imgC, imgH, imgW = image_shape
        feature_dim = int((imgH / 8) * (imgW / 8))

        encoder_word_pos = (
            np.array(range(0, feature_dim)).reshape((feature_dim, 1)).astype("int64")
        )
        gsrm_word_pos = (
            np.array(range(0, max_text_length))
            .reshape((max_text_length, 1))
            .astype("int64")
        )

        gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
        gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
            [-1, 1, max_text_length, max_text_length]
        )
        gsrm_slf_attn_bias1 = np.tile(gsrm_slf_attn_bias1, [1, num_heads, 1, 1]).astype(
            "float32"
        ) * [-1e9]

        gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
            [-1, 1, max_text_length, max_text_length]
        )
        gsrm_slf_attn_bias2 = np.tile(gsrm_slf_attn_bias2, [1, num_heads, 1, 1]).astype(
            "float32"
        ) * [-1e9]

        encoder_word_pos = encoder_word_pos[np.newaxis, :]
        gsrm_word_pos = gsrm_word_pos[np.newaxis, :]

        return [
            encoder_word_pos,
            gsrm_word_pos,
            gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2,
        ]

    def process_image_srn(self, img, image_shape, num_heads, max_text_length):
        norm_img = self.resize_norm_img_srn(img, image_shape)
        norm_img = norm_img[np.newaxis, :]

        [
            encoder_word_pos,
            gsrm_word_pos,
            gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2,
        ] = self.srn_other_inputs(image_shape, num_heads, max_text_length)

        gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
        gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
        encoder_word_pos = encoder_word_pos.astype(np.int64)
        gsrm_word_pos = gsrm_word_pos.astype(np.int64)

        return (
            norm_img,
            encoder_word_pos,
            gsrm_word_pos,
            gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2,
        )

    def resize_norm_img_sar(self, img, image_shape, width_downsample_ratio=0.25):
        imgC, imgH, imgW_min, imgW_max = image_shape
        h = img.shape[0]
        w = img.shape[1]
        valid_ratio = 1.0
        # make sure new_width is an integral multiple of width_divisor.
        width_divisor = int(1 / width_downsample_ratio)
        # resize
        ratio = w / float(h)
        resize_w = math.ceil(imgH * ratio)
        if resize_w % width_divisor != 0:
            resize_w = round(resize_w / width_divisor) * width_divisor
        if imgW_min is not None:
            resize_w = max(imgW_min, resize_w)
        if imgW_max is not None:
            valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
            resize_w = min(imgW_max, resize_w)
        resized_image = cv2.resize(img, (resize_w, imgH))
        resized_image = resized_image.astype("float32")
        # norm
        if image_shape[0] == 1:
            resized_image = resized_image / 255
            resized_image = resized_image[np.newaxis, :]
        else:
            resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        resize_shape = resized_image.shape
        padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
        padding_im[:, :, 0:resize_w] = resized_image
        pad_shape = padding_im.shape

        return padding_im, resize_shape, pad_shape, valid_ratio

    def resize_norm_img_spin(self, img):
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img = cv2.resize(img, tuple([100, 32]), cv2.INTER_CUBIC)
        img = np.array(img, np.float32)
        img = np.expand_dims(img, -1)
        img = img.transpose((2, 0, 1))
        mean = [127.5]
        std = [127.5]
        mean = np.array(mean, dtype=np.float32)
        std = np.array(std, dtype=np.float32)
        mean = np.float32(mean.reshape(1, -1))
        stdinv = 1 / np.float32(std.reshape(1, -1))
        img -= mean
        img *= stdinv
        return img

    def resize_norm_img_svtr(self, img, image_shape):
        imgC, imgH, imgW = image_shape
        resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
        resized_image = resized_image.astype("float32")
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        return resized_image

    def resize_norm_img_cppd_padding(
        self, img, image_shape, padding=True, interpolation=cv2.INTER_LINEAR
    ):
        imgC, imgH, imgW = image_shape
        h = img.shape[0]
        w = img.shape[1]
        if not padding:
            resized_image = cv2.resize(img, (imgW, imgH), interpolation=interpolation)
            resized_w = imgW
        else:
            ratio = w / float(h)
            if math.ceil(imgH * ratio) > imgW:
                resized_w = imgW
            else:
                resized_w = int(math.ceil(imgH * ratio))
            resized_image = cv2.resize(img, (resized_w, imgH))
        resized_image = resized_image.astype("float32")
        if image_shape[0] == 1:
            resized_image = resized_image / 255
            resized_image = resized_image[np.newaxis, :]
        else:
            resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
        padding_im[:, :, 0:resized_w] = resized_image

        return padding_im

    def resize_norm_img_abinet(self, img, image_shape):
        imgC, imgH, imgW = image_shape

        resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
        resized_image = resized_image.astype("float32")
        resized_image = resized_image / 255.0

        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        resized_image = (resized_image - mean[None, None, ...]) / std[None, None, ...]
        resized_image = resized_image.transpose((2, 0, 1))
        resized_image = resized_image.astype("float32")

        return resized_image

    def norm_img_can(self, img, image_shape):
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # CAN only predicts gray-scale images

        if self.inverse:  # relies on the caller configuring self.inverse for CAN
            img = 255 - img

        if self.rec_image_shape[0] == 1:
            h, w = img.shape
            _, imgH, imgW = self.rec_image_shape
            if h < imgH or w < imgW:
                padding_h = max(imgH - h, 0)
                padding_w = max(imgW - w, 0)
                img_padded = np.pad(
                    img,
                    ((0, padding_h), (0, padding_w)),
                    "constant",
                    constant_values=(255),
                )
                img = img_padded

        img = np.expand_dims(img, 0) / 255.0  # h,w -> c,h,w
        img = img.astype("float32")

        return img

    def __call__(self, img_list):
        img_num = len(img_list)
        # Calculate the aspect ratio of all text bars.
        width_list = []
        for img in img_list:
            width_list.append(img.shape[1] / float(img.shape[0]))
        # Sorting by aspect ratio can speed up the recognition process.
        indices = np.argsort(np.array(width_list))
        rec_res = [["", 0.0]] * img_num
        batch_num = self.rec_batch_num
        st = time.time()
        if self.benchmark:
            self.autolog.times.start()
        for beg_img_no in range(0, img_num, batch_num):
            end_img_no = min(img_num, beg_img_no + batch_num)
            norm_img_batch = []
            imgC, imgH, imgW = self.rec_image_shape[:3]
            max_wh_ratio = imgW / imgH
            wh_ratio_list = []
            for ino in range(beg_img_no, end_img_no):
                h, w = img_list[indices[ino]].shape[0:2]
                wh_ratio = w * 1.0 / h
                max_wh_ratio = max(max_wh_ratio, wh_ratio)
                wh_ratio_list.append(wh_ratio)
            for ino in range(beg_img_no, end_img_no):
                norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio)
                norm_img = norm_img[np.newaxis, :]
                norm_img_batch.append(norm_img)
            norm_img_batch = np.concatenate(norm_img_batch)
            norm_img_batch = norm_img_batch.copy()
            if self.benchmark:
                self.autolog.times.stamp()

            assert self.use_onnx
            input_dict = {}
            input_dict[self.input_tensor.name] = norm_img_batch
            outputs = self.predictor.run(self.output_tensors, input_dict)
            preds = outputs[0]
            rec_result = self.postprocess_op(
                preds,
                return_word_box=self.return_word_box,
                wh_ratio_list=wh_ratio_list,
                max_wh_ratio=max_wh_ratio,
            )
            for rno in range(len(rec_result)):
                rec_res[indices[beg_img_no + rno]] = rec_result[rno]
        if self.benchmark:
            self.autolog.times.end(stamp=True)
        return rec_res, time.time() - st
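The matching recognition sketch ("rec.onnx" and the crop path are placeholders; the crops would normally come from TextDetector's boxes):

import cv2
import utility
from predict_rec import TextRecognizer

args = utility.parse_args()
args.use_onnx = True
args.use_gpu = False
args.rec_model_dir = "rec.onnx"   # placeholder: exported CTC recognition model

recognizer = TextRecognizer(args)
crops = [cv2.imread("word_crop.jpg")]  # placeholder text-line crops
rec_res, elapse = recognizer(crops)
print(rec_res)                          # [(text, confidence), ...]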
src/models/thrid_party/paddleocr/infer/utility.py (new file, 713 lines)
@@ -0,0 +1,713 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import functools
|
||||
import logging
|
||||
import cv2
|
||||
import numpy as np
|
||||
import PIL
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
import math
|
||||
import random
|
||||
|
||||
|
||||
def str2bool(v):
|
||||
return v.lower() in ("true", "yes", "t", "y", "1")
|
||||
|
||||
|
||||
def str2int_tuple(v):
|
||||
return tuple([int(i.strip()) for i in v.split(",")])
|
||||
|
||||
|
||||
def init_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
# params for prediction engine
|
||||
parser.add_argument("--use_gpu", type=str2bool, default=True)
|
||||
parser.add_argument("--use_xpu", type=str2bool, default=False)
|
||||
parser.add_argument("--use_npu", type=str2bool, default=False)
|
||||
parser.add_argument("--use_mlu", type=str2bool, default=False)
|
||||
parser.add_argument("--ir_optim", type=str2bool, default=True)
|
||||
parser.add_argument("--use_tensorrt", type=str2bool, default=False)
|
||||
parser.add_argument("--min_subgraph_size", type=int, default=15)
|
||||
parser.add_argument("--precision", type=str, default="fp32")
|
||||
parser.add_argument("--gpu_mem", type=int, default=500)
|
||||
parser.add_argument("--gpu_id", type=int, default=0)
|
||||
|
||||
# params for text detector
|
||||
parser.add_argument("--image_dir", type=str)
|
||||
parser.add_argument("--page_num", type=int, default=0)
|
||||
parser.add_argument("--det_algorithm", type=str, default="DB")
|
||||
parser.add_argument("--det_model_dir", type=str)
|
||||
parser.add_argument("--det_limit_side_len", type=float, default=960)
|
||||
parser.add_argument("--det_limit_type", type=str, default="max")
|
||||
parser.add_argument("--det_box_type", type=str, default="quad")
|
||||
|
||||
# DB parmas
|
||||
parser.add_argument("--det_db_thresh", type=float, default=0.3)
|
||||
parser.add_argument("--det_db_box_thresh", type=float, default=0.6)
|
||||
parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5)
|
||||
parser.add_argument("--max_batch_size", type=int, default=10)
|
||||
parser.add_argument("--use_dilation", type=str2bool, default=False)
|
||||
parser.add_argument("--det_db_score_mode", type=str, default="fast")
|
||||
|
||||
# EAST parmas
|
||||
parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
|
||||
parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)
|
||||
parser.add_argument("--det_east_nms_thresh", type=float, default=0.2)
|
||||
|
||||
# SAST parmas
|
||||
parser.add_argument("--det_sast_score_thresh", type=float, default=0.5)
|
||||
parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2)
|
||||
|
||||
# PSE parmas
|
||||
parser.add_argument("--det_pse_thresh", type=float, default=0)
|
||||
parser.add_argument("--det_pse_box_thresh", type=float, default=0.85)
|
||||
parser.add_argument("--det_pse_min_area", type=float, default=16)
|
||||
parser.add_argument("--det_pse_scale", type=int, default=1)
|
||||
|
||||
# FCE parmas
|
||||
parser.add_argument("--scales", type=list, default=[8, 16, 32])
|
||||
parser.add_argument("--alpha", type=float, default=1.0)
|
||||
parser.add_argument("--beta", type=float, default=1.0)
|
||||
parser.add_argument("--fourier_degree", type=int, default=5)
|
||||
|
||||
# params for text recognizer
|
||||
parser.add_argument("--rec_algorithm", type=str, default="SVTR_LCNet")
|
||||
parser.add_argument("--rec_model_dir", type=str)
|
||||
parser.add_argument("--rec_image_inverse", type=str2bool, default=True)
|
||||
parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320")
|
||||
parser.add_argument("--rec_batch_num", type=int, default=6)
|
||||
parser.add_argument("--max_text_length", type=int, default=25)
|
||||
parser.add_argument(
|
||||
"--rec_char_dict_path", type=str, default="./ppocr_keys_v1.txt"
|
||||
)
|
||||
parser.add_argument("--use_space_char", type=str2bool, default=True)
|
||||
parser.add_argument("--vis_font_path", type=str, default="./doc/fonts/simfang.ttf")
|
||||
parser.add_argument("--drop_score", type=float, default=0.5)
|
||||
|
||||
# params for e2e
|
||||
parser.add_argument("--e2e_algorithm", type=str, default="PGNet")
|
||||
parser.add_argument("--e2e_model_dir", type=str)
|
||||
parser.add_argument("--e2e_limit_side_len", type=float, default=768)
|
||||
parser.add_argument("--e2e_limit_type", type=str, default="max")
|
||||
|
||||
# PGNet parmas
|
||||
parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5)
|
||||
parser.add_argument(
|
||||
"--e2e_char_dict_path", type=str, default="./ppocr/utils/ic15_dict.txt"
|
||||
)
|
||||
parser.add_argument("--e2e_pgnet_valid_set", type=str, default="totaltext")
|
||||
parser.add_argument("--e2e_pgnet_mode", type=str, default="fast")
|
||||
|
||||
# params for text classifier
|
||||
parser.add_argument("--use_angle_cls", type=str2bool, default=False)
|
||||
parser.add_argument("--cls_model_dir", type=str)
|
||||
parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192")
|
||||
parser.add_argument("--label_list", type=list, default=["0", "180"])
|
||||
parser.add_argument("--cls_batch_num", type=int, default=6)
|
||||
parser.add_argument("--cls_thresh", type=float, default=0.9)
|
||||
|
||||
parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
|
||||
parser.add_argument("--cpu_threads", type=int, default=10)
|
||||
parser.add_argument("--use_pdserving", type=str2bool, default=False)
|
||||
parser.add_argument("--warmup", type=str2bool, default=False)
|
||||
|
||||
# SR parmas
|
||||
parser.add_argument("--sr_model_dir", type=str)
|
||||
parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128")
|
||||
parser.add_argument("--sr_batch_num", type=int, default=1)
|
||||
|
||||
#
|
||||
parser.add_argument("--draw_img_save_dir", type=str, default="./inference_results")
|
||||
parser.add_argument("--save_crop_res", type=str2bool, default=False)
|
||||
parser.add_argument("--crop_res_save_dir", type=str, default="./output")
|
||||
|
||||
# multi-process
|
||||
parser.add_argument("--use_mp", type=str2bool, default=False)
|
||||
parser.add_argument("--total_process_num", type=int, default=1)
|
||||
parser.add_argument("--process_id", type=int, default=0)
|
||||
|
||||
parser.add_argument("--benchmark", type=str2bool, default=False)
|
||||
parser.add_argument("--save_log_path", type=str, default="./log_output/")
|
||||
|
||||
parser.add_argument("--show_log", type=str2bool, default=True)
|
||||
parser.add_argument("--use_onnx", type=str2bool, default=False)
|
||||
|
||||
# extended function
|
||||
parser.add_argument(
|
||||
"--return_word_box",
|
||||
type=str2bool,
|
||||
default=False,
|
||||
help="Whether return the bbox of each word (split by space) or chinese character. Only used in ppstructure for layout recovery",
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = init_args()
|
||||
return parser.parse_args([])
|
||||
|
||||
|
||||
def create_predictor(args, mode, logger):
|
||||
if mode == "det":
|
||||
model_dir = args.det_model_dir
|
||||
elif mode == "cls":
|
||||
model_dir = args.cls_model_dir
|
||||
elif mode == "rec":
|
||||
model_dir = args.rec_model_dir
|
||||
elif mode == "table":
|
||||
model_dir = args.table_model_dir
|
||||
elif mode == "ser":
|
||||
model_dir = args.ser_model_dir
|
||||
elif mode == "re":
|
||||
model_dir = args.re_model_dir
|
||||
elif mode == "sr":
|
||||
model_dir = args.sr_model_dir
|
||||
elif mode == "layout":
|
||||
model_dir = args.layout_model_dir
|
||||
else:
|
||||
model_dir = args.e2e_model_dir
|
||||
|
||||
if model_dir is None:
|
||||
logger.info("not find {} model file path {}".format(mode, model_dir))
|
||||
sys.exit(0)
|
||||
assert args.use_onnx
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
model_file_path = model_dir
|
||||
if not os.path.exists(model_file_path):
|
||||
raise ValueError("not find model file path {}".format(model_file_path))
|
||||
if args.use_gpu:
|
||||
sess = ort.InferenceSession(
|
||||
model_file_path, providers=["CUDAExecutionProvider"]
|
||||
)
|
||||
else:
|
||||
sess = ort.InferenceSession(model_file_path)
|
||||
return sess, sess.get_inputs()[0], None, None
|
||||
|
||||
|
||||
|
||||
def get_output_tensors(args, mode, predictor):
|
||||
output_names = predictor.get_output_names()
|
||||
output_tensors = []
|
||||
if mode == "rec" and args.rec_algorithm in ["CRNN", "SVTR_LCNet", "SVTR_HGNet"]:
|
||||
output_name = "softmax_0.tmp_0"
|
||||
if output_name in output_names:
|
||||
return [predictor.get_output_handle(output_name)]
|
||||
else:
|
||||
for output_name in output_names:
|
||||
output_tensor = predictor.get_output_handle(output_name)
|
||||
output_tensors.append(output_tensor)
|
||||
else:
|
||||
for output_name in output_names:
|
||||
output_tensor = predictor.get_output_handle(output_name)
|
||||
output_tensors.append(output_tensor)
|
||||
return output_tensors
|
||||
|
||||
|
||||
def draw_e2e_res(dt_boxes, strs, img_path):
|
||||
src_im = cv2.imread(img_path)
|
||||
for box, str in zip(dt_boxes, strs):
|
||||
box = box.astype(np.int32).reshape((-1, 1, 2))
|
||||
cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
|
||||
cv2.putText(
|
||||
src_im,
|
||||
str,
|
||||
org=(int(box[0, 0, 0]), int(box[0, 0, 1])),
|
||||
fontFace=cv2.FONT_HERSHEY_COMPLEX,
|
||||
fontScale=0.7,
|
||||
color=(0, 255, 0),
|
||||
thickness=1,
|
||||
)
|
||||
return src_im
|
||||
|
||||
|
||||
def draw_text_det_res(dt_boxes, img):
|
||||
for box in dt_boxes:
|
||||
box = np.array(box).astype(np.int32).reshape(-1, 2)
|
||||
cv2.polylines(img, [box], True, color=(255, 255, 0), thickness=2)
|
||||
return img
|
||||
|
||||
|
||||
def resize_img(img, input_size=600):
    """
    Resize img, limiting the longest side of the image to input_size.
    """
    img = np.array(img)
    im_shape = img.shape
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(input_size) / float(im_size_max)
    img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale)
    return img

def draw_ocr(
    image,
    boxes,
    txts=None,
    scores=None,
    drop_score=0.5,
    font_path="./doc/fonts/simfang.ttf",
):
    """
    Visualize the results of OCR detection and recognition.
    args:
        image(Image|array): RGB image
        boxes(list): boxes with shape (N, 4, 2)
        txts(list): the recognized texts
        scores(list): scores corresponding to txts
        drop_score(float): only boxes with scores greater than drop_score will be visualized
        font_path: the path of the font used to draw text
    return(array):
        the visualized image
    """
    if scores is None:
        scores = [1] * len(boxes)
    box_num = len(boxes)
    for i in range(box_num):
        if scores[i] < drop_score or math.isnan(scores[i]):
            continue
        box = np.reshape(np.array(boxes[i]), [-1, 1, 2]).astype(np.int64)
        image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2)
    if txts is not None:
        img = np.array(resize_img(image, input_size=600))
        txt_img = text_visual(
            txts,
            scores,
            img_h=img.shape[0],
            img_w=600,
            threshold=drop_score,
            font_path=font_path,
        )
        img = np.concatenate([np.array(img), np.array(txt_img)], axis=1)
        return img
    return image

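# Usage sketch (illustrative; the file names, boxes, and texts below are made
# up to show the expected shapes, not part of the original code):
#
#   from PIL import Image
#   image = Image.open("./doc/imgs/demo.jpg").convert("RGB")  # hypothetical image
#   boxes = [[[10, 10], [120, 10], [120, 40], [10, 40]]]      # (N, 4, 2) quads
#   txts, scores = ["hello"], [0.93]
#   vis = draw_ocr(image, boxes, txts, scores, drop_score=0.5)
#   cv2.imwrite("./vis.jpg", vis[:, :, ::-1])  # RGB -> BGR for OpenCV
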
def draw_ocr_box_txt(
    image,
    boxes,
    txts=None,
    scores=None,
    drop_score=0.5,
    font_path="./doc/fonts/simfang.ttf",
):
    h, w = image.height, image.width
    img_left = image.copy()
    img_right = np.ones((h, w, 3), dtype=np.uint8) * 255
    random.seed(0)

    draw_left = ImageDraw.Draw(img_left)
    if txts is None or len(txts) != len(boxes):
        txts = [None] * len(boxes)
    for idx, (box, txt) in enumerate(zip(boxes, txts)):
        if scores is not None and scores[idx] < drop_score:
            continue
        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
        draw_left.polygon(box, fill=color)
        img_right_text = draw_box_txt_fine((w, h), box, txt, font_path)
        pts = np.array(box, np.int32).reshape((-1, 1, 2))
        cv2.polylines(img_right_text, [pts], True, color, 1)
        img_right = cv2.bitwise_and(img_right, img_right_text)
    img_left = Image.blend(image, img_left, 0.5)
    img_show = Image.new("RGB", (w * 2, h), (255, 255, 255))
    img_show.paste(img_left, (0, 0, w, h))
    img_show.paste(Image.fromarray(img_right), (w, 0, w * 2, h))
    return np.array(img_show)

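# Side-by-side visualization sketch (hypothetical inputs; `image` must be a
# PIL.Image here, since the function reads .height/.width and uses Image.blend):
#
#   image = Image.open("./doc/imgs/demo.jpg").convert("RGB")  # hypothetical path
#   boxes = [[(10, 10), (120, 10), (120, 40), (10, 40)]]
#   canvas = draw_ocr_box_txt(image, boxes, txts=["hello"], scores=[0.9])
#   Image.fromarray(canvas).save("./vis_box_txt.jpg")
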
def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"):
    box_height = int(
        math.sqrt((box[0][0] - box[3][0]) ** 2 + (box[0][1] - box[3][1]) ** 2)
    )
    box_width = int(
        math.sqrt((box[0][0] - box[1][0]) ** 2 + (box[0][1] - box[1][1]) ** 2)
    )

    if box_height > 2 * box_width and box_height > 30:
        # Vertical text box: render the text horizontally, then rotate it.
        img_text = Image.new("RGB", (box_height, box_width), (255, 255, 255))
        draw_text = ImageDraw.Draw(img_text)
        if txt:
            font = create_font(txt, (box_height, box_width), font_path)
            draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font)
        img_text = img_text.transpose(Image.ROTATE_270)
    else:
        img_text = Image.new("RGB", (box_width, box_height), (255, 255, 255))
        draw_text = ImageDraw.Draw(img_text)
        if txt:
            font = create_font(txt, (box_width, box_height), font_path)
            draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font)

    # Warp the rendered text into the original quadrilateral.
    pts1 = np.float32(
        [[0, 0], [box_width, 0], [box_width, box_height], [0, box_height]]
    )
    pts2 = np.array(box, dtype=np.float32)
    M = cv2.getPerspectiveTransform(pts1, pts2)

    img_text = np.array(img_text, dtype=np.uint8)
    img_right_text = cv2.warpPerspective(
        img_text,
        M,
        img_size,
        flags=cv2.INTER_NEAREST,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=(255, 255, 255),
    )
    return img_right_text

def create_font(txt, sz, font_path="./doc/fonts/simfang.ttf"):
    font_size = int(sz[1] * 0.99)
    font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
    # ImageFont.getsize was removed in Pillow 10; use getlength there instead.
    if int(PIL.__version__.split(".")[0]) < 10:
        length = font.getsize(txt)[0]
    else:
        length = font.getlength(txt)

    # Shrink the font until the text fits the box width.
    if length > sz[0]:
        font_size = int(font_size * sz[0] / length)
        font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
    return font

def str_count(s):
    """
    Count the display length of a string: each Chinese character counts as one
    unit, while a single English character or digit counts as half the width
    of a Chinese character.
    args:
        s(string): the input string
    return(int):
        the display length, in Chinese-character widths
    """
    import string

    count_zh = count_pu = 0
    s_len = len(s)
    en_dg_count = 0
    for c in s:
        if c in string.ascii_letters or c.isdigit() or c.isspace():
            en_dg_count += 1
        elif c.isalpha():
            count_zh += 1
        else:
            count_pu += 1
    return s_len - math.ceil(en_dg_count / 2)

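# Worked examples (values follow directly from the definition above):
#
#   str_count("你好ab1")  # 5 chars, 3 ASCII -> 5 - ceil(3 / 2) = 3
#   str_count("hello")    # 5 ASCII chars    -> 5 - ceil(5 / 2) = 2
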
def text_visual(
    texts, scores, img_h=400, img_w=600, threshold=0.0, font_path="./doc/simfang.ttf"
):
    """
    Create a new blank img and draw txt on it.
    args:
        texts(list): the texts to be drawn
        scores(list|None): corresponding score of each txt
        img_h(int): the height of blank img
        img_w(int): the width of blank img
        threshold(float): texts with a lower score are skipped
        font_path: the path of the font used to draw text
    return(array):
        the drawn text image
    """
    if scores is not None:
        assert len(texts) == len(
            scores
        ), "The number of txts and corresponding scores must match"
    else:
        # A None score list would crash the drawing loop below; treat every
        # text as fully confident instead.
        scores = [1.0] * len(texts)

    def create_blank_img():
        # uint8, not int8: the value 255 would overflow a signed byte.
        blank_img = np.ones(shape=[img_h, img_w], dtype=np.uint8) * 255
        blank_img[:, img_w - 1 :] = 0
        blank_img = Image.fromarray(blank_img).convert("RGB")
        draw_txt = ImageDraw.Draw(blank_img)
        return blank_img, draw_txt

    blank_img, draw_txt = create_blank_img()

    font_size = 20
    txt_color = (0, 0, 0)
    font = ImageFont.truetype(font_path, font_size, encoding="utf-8")

    gap = font_size + 5
    txt_img_list = []
    count, index = 1, 0
    for idx, txt in enumerate(texts):
        index += 1
        if scores[idx] < threshold or math.isnan(scores[idx]):
            index -= 1
            continue
        first_line = True
        # Wrap long texts across several lines.
        while str_count(txt) >= img_w // font_size - 4:
            tmp = txt
            txt = tmp[: img_w // font_size - 4]
            if first_line:
                new_txt = str(index) + ": " + txt
                first_line = False
            else:
                new_txt = " " + txt
            draw_txt.text((0, gap * count), new_txt, txt_color, font=font)
            txt = tmp[img_w // font_size - 4 :]
            if count >= img_h // gap - 1:
                txt_img_list.append(np.array(blank_img))
                blank_img, draw_txt = create_blank_img()
                count = 0
            count += 1
        if first_line:
            new_txt = str(index) + ": " + txt + " " + "%.3f" % (scores[idx])
        else:
            new_txt = " " + txt + " " + "%.3f" % (scores[idx])
        draw_txt.text((0, gap * count), new_txt, txt_color, font=font)
        # whether to start a new blank img or not
        if count >= img_h // gap - 1 and idx + 1 < len(texts):
            txt_img_list.append(np.array(blank_img))
            blank_img, draw_txt = create_blank_img()
            count = 0
        count += 1
    txt_img_list.append(np.array(blank_img))
    if len(txt_img_list) == 1:
        blank_img = np.array(txt_img_list[0])
    else:
        blank_img = np.concatenate(txt_img_list, axis=1)
    return np.array(blank_img)

def base64_to_cv2(b64str):
    import base64

    data = base64.b64decode(b64str.encode("utf8"))
    data = np.frombuffer(data, np.uint8)
    data = cv2.imdecode(data, cv2.IMREAD_COLOR)
    return data

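# Round-trip sketch (the file name is hypothetical; cv2.imencode and
# base64.b64encode are standard OpenCV/stdlib calls, used here only to
# illustrate the expected input):
#
#   import base64
#   ok, buf = cv2.imencode(".jpg", cv2.imread("./doc/imgs/demo.jpg"))
#   b64str = base64.b64encode(buf.tobytes()).decode("utf8")
#   img = base64_to_cv2(b64str)  # BGR ndarray, same content as the input
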
def draw_boxes(image, boxes, scores=None, drop_score=0.5):
    if scores is None:
        scores = [1] * len(boxes)
    for box, score in zip(boxes, scores):
        if score < drop_score:
            continue
        box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64)
        image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2)
    return image

def get_rotate_crop_image(img, points):
    """
    Perspective-crop the quadrilateral region described by `points` (4x2,
    ordered clockwise from the top-left corner); crops that are much taller
    than they are wide are rotated by 90 degrees.
    """
    # Unused axis-aligned crop, kept from the original source for reference:
    # img_height, img_width = img.shape[0:2]
    # left = int(np.min(points[:, 0]))
    # right = int(np.max(points[:, 0]))
    # top = int(np.min(points[:, 1]))
    # bottom = int(np.max(points[:, 1]))
    # img_crop = img[top:bottom, left:right, :].copy()
    # points[:, 0] = points[:, 0] - left
    # points[:, 1] = points[:, 1] - top
    assert len(points) == 4, "shape of points must be 4*2"
    img_crop_width = int(
        max(
            np.linalg.norm(points[0] - points[1]), np.linalg.norm(points[2] - points[3])
        )
    )
    img_crop_height = int(
        max(
            np.linalg.norm(points[0] - points[3]), np.linalg.norm(points[1] - points[2])
        )
    )
    pts_std = np.float32(
        [
            [0, 0],
            [img_crop_width, 0],
            [img_crop_width, img_crop_height],
            [0, img_crop_height],
        ]
    )
    M = cv2.getPerspectiveTransform(points, pts_std)
    dst_img = cv2.warpPerspective(
        img,
        M,
        (img_crop_width, img_crop_height),
        borderMode=cv2.BORDER_REPLICATE,
        flags=cv2.INTER_CUBIC,
    )
    dst_img_height, dst_img_width = dst_img.shape[0:2]
    if dst_img_height * 1.0 / dst_img_width >= 1.5:
        # Tall, narrow crops are usually vertical text; rotate to horizontal.
        dst_img = np.rot90(dst_img)
    return dst_img

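# Usage sketch (the image path and quad coordinates are made up; `points`
# must be a float32 (4, 2) array for cv2.getPerspectiveTransform):
#
#   img = cv2.imread("./doc/imgs/demo.jpg")  # hypothetical path
#   quad = np.float32([[10, 10], [200, 12], [198, 60], [8, 58]])
#   crop = get_rotate_crop_image(img, quad)
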
def get_minarea_rect_crop(img, points):
    bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32))
    points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

    # Order the rectangle corners as top-left, top-right, bottom-right,
    # bottom-left before cropping.
    index_a, index_b, index_c, index_d = 0, 1, 2, 3
    if points[1][1] > points[0][1]:
        index_a = 0
        index_d = 1
    else:
        index_a = 1
        index_d = 0
    if points[3][1] > points[2][1]:
        index_b = 2
        index_c = 3
    else:
        index_b = 3
        index_c = 2

    box = [points[index_a], points[index_b], points[index_c], points[index_d]]
    crop_img = get_rotate_crop_image(img, np.array(box))
    return crop_img

# def check_gpu(use_gpu):
#     if use_gpu and (
#         not paddle.is_compiled_with_cuda() or paddle.device.get_device() == "cpu"
#     ):
#         use_gpu = False
#     return use_gpu

def _check_image_file(path):
    img_end = {"jpg", "bmp", "png", "jpeg", "rgb", "tif", "tiff", "gif", "pdf"}
    return any(path.lower().endswith(e) for e in img_end)

def get_image_file_list(img_file, infer_list=None):
    # infer_list is accepted for API compatibility but unused here.
    imgs_lists = []
    if img_file is None or not os.path.exists(img_file):
        raise Exception("no image files found in {}".format(img_file))

    if os.path.isfile(img_file) and _check_image_file(img_file):
        imgs_lists.append(img_file)
    elif os.path.isdir(img_file):
        for single_file in os.listdir(img_file):
            file_path = os.path.join(img_file, single_file)
            if os.path.isfile(file_path) and _check_image_file(file_path):
                imgs_lists.append(file_path)

    if len(imgs_lists) == 0:
        raise Exception("no image files found in {}".format(img_file))
    imgs_lists = sorted(imgs_lists)
    return imgs_lists

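# Usage sketch (the directory is hypothetical):
#
#   for image_path in get_image_file_list("./doc/imgs"):
#       img = cv2.imread(image_path)
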
logger_initialized = {}


@functools.lru_cache()
def get_logger(name="ppocr", log_file=None, log_level=logging.DEBUG):
    """Initialize and get a logger by name.
    If the logger has not been initialized, this method will initialize the
    logger by adding one or two handlers, otherwise the initialized logger will
    be directly returned. During initialization, a StreamHandler will always be
    added. If `log_file` is specified, a FileHandler will also be added.
    Args:
        name (str): Logger name.
        log_file (str | None): The log filename. If specified, a FileHandler
            will be added to the logger.
        log_level (int): The logger level.
    Returns:
        logging.Logger: The expected logger.
    """
    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger
    for logger_name in logger_initialized:
        if name.startswith(logger_name):
            return logger

    formatter = logging.Formatter(
        "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%Y/%m/%d %H:%M:%S"
    )

    stream_handler = logging.StreamHandler(stream=sys.stdout)
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)
    # Honor log_file and log_level, as the docstring promises.
    if log_file is not None:
        file_handler = logging.FileHandler(log_file, "a")
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    logger.setLevel(log_level)
    logger_initialized[name] = True
    logger.propagate = False
    return logger

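# Example (writes to stdout only; pass log_file to also log to a file, noting
# that its directory must already exist):
#
#   logger = get_logger(log_level=logging.INFO)
#   logger.info("predictor ready")
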
if __name__ == "__main__":
    pass