2024-03-25 16:35:34 +08:00
|
|
|
|
import torch
|
|
|
|
|
|
import random
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
import cv2
|
|
|
|
|
|
|
|
|
|
|
|
from torchvision.transforms import v2
|
|
|
|
|
|
from typing import List
|
|
|
|
|
|
from PIL import Image
|
|
|
|
|
|
|
2024-02-08 13:48:34 +00:00
|
|
|
|
from ...globals import (
|
2024-03-28 13:44:32 +00:00
|
|
|
|
IMG_CHANNELS,
|
2024-03-25 16:35:34 +08:00
|
|
|
|
FIXED_IMG_SIZE,
|
|
|
|
|
|
IMAGE_MEAN, IMAGE_STD,
|
|
|
|
|
|
MAX_RESIZE_RATIO, MIN_RESIZE_RATIO
|
|
|
|
|
|
)
|
2024-03-27 04:54:49 +00:00
|
|
|
|
from .ocr_aug import ocr_augmentation_pipeline
|
2024-01-31 10:11:07 +00:00
|
|
|
|
|
2024-03-27 04:54:49 +00:00
|
|
|
|
# Alternative: a generic scan-style augraphy pipeline (kept for reference).
# train_pipeline = default_augraphy_pipeline(scan_only=True)

# Module-level OCR augmentation pipeline, built once at import time and
# applied per-image inside ocr_aug().
train_pipeline = ocr_augmentation_pipeline()
|
2024-03-25 16:35:34 +08:00
|
|
|
|
|
|
|
|
|
|
# Shared (train + inference) transform: uint8 RGB ndarray/PIL image ->
# normalized float32 grayscale tensor resized into a FIXED_IMG_SIZE square.
general_transform_pipeline = v2.Compose([
    v2.ToImage(),  # Convert to tensor, only needed if you had a PIL image
    # NOTE: when called with a list of images, the pipeline returns a list of
    # torchvision Images whose length is the batch size — the batch dimension
    # stays as the list, it is NOT stacked into a single torchvision Image.

    v2.ToDtype(torch.uint8, scale=True),  # optional, most input are already uint8 at this point
    v2.Grayscale(),  # convert to grayscale (task-specific choice)

    v2.Resize(  # resize onto a fixed square canvas
        size=FIXED_IMG_SIZE - 1,  # size must be strictly smaller than max_size
        interpolation=v2.InterpolationMode.BICUBIC,
        max_size=FIXED_IMG_SIZE,
        antialias=True
    ),

    v2.ToDtype(torch.float32, scale=True),  # Normalize expects float input
    v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),

    # v2.ToPILImage()  # uncomment to visually inspect the result (debug only)
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def trim_white_border(image: np.ndarray) -> np.ndarray:
    """Crop away the uniform white border surrounding an image's content.

    Args:
        image: RGB image as a 3-D ``[H, W, C]`` uint8 ndarray (channel last).

    Returns:
        The image cropped to the bounding box of all non-white pixels.
        If the image is entirely white, ``cv2.boundingRect`` yields a
        zero-sized rectangle and an empty crop is returned.

    Raises:
        ValueError: if the image is not 3-channel channels-last, or not uint8.
    """
    # Validate RGB layout: 3 dimensions with the channel axis in third place.
    if len(image.shape) != 3 or image.shape[2] != 3:
        raise ValueError("Image is not in RGB format or channel is not in third dimension")

    # Validate dtype: the 255-white comparison below assumes 0-255 values.
    if image.dtype != np.uint8:
        raise ValueError("Image should be stored in uint8")

    # Pure-white background of the same size, used as the comparison target.
    h, w = image.shape[:2]
    bg = np.full((h, w, 3), 255, dtype=np.uint8)

    # Per-pixel absolute difference from pure white.
    diff = cv2.absdiff(image, bg)

    # Any difference greater than 1 counts as content (mapped to 255).
    _, diff = cv2.threshold(diff, 1, 255, cv2.THRESH_BINARY)

    # Collapse the 3-channel difference into a single-channel mask.
    gray_diff = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)

    # Minimal axis-aligned bounding box containing all non-zero pixels.
    x, y, w, h = cv2.boundingRect(gray_diff)

    # Crop to the content bounding box.
    trimmed_image = image[y:y + h, x:x + w]

    return trimmed_image
|
|
|
|
|
|
|
|
|
|
|
|
|
2024-03-04 05:35:59 +00:00
|
|
|
|
def add_white_border(image: np.ndarray, max_size: int, min_dim: int = 30) -> np.ndarray:
    """Pad an image with a random white border on each of its four sides.

    Each side gets an independent random padding drawn from ``[0, max_size]``.
    If the padded height or width would still be below ``min_dim`` pixels,
    the vertical/horizontal paddings are enlarged symmetrically so the
    result ends up strictly larger than ``min_dim``.

    Args:
        image: RGB image as a ``[H, W, C]`` uint8 ndarray.
        max_size: Inclusive upper bound for each side's random padding.
        min_dim: Minimum height/width of the padded output (defaults to 30,
            the previously hard-coded value, so existing callers are unchanged).

    Returns:
        A channel-first ``[C, H, W]`` torch tensor padded with white (255).
    """
    # Padding order matches v2.functional.pad: [left, top, right, bottom].
    randi = [random.randint(0, max_size) for _ in range(4)]

    pad_height_size = randi[1] + randi[3]
    pad_width_size = randi[0] + randi[2]

    # Enforce a minimum output height by growing top/bottom symmetrically.
    if pad_height_size + image.shape[0] < min_dim:
        compensate_height = int((min_dim - (pad_height_size + image.shape[0])) * 0.5) + 1
        randi[1] += compensate_height
        randi[3] += compensate_height

    # Enforce a minimum output width by growing left/right symmetrically.
    if pad_width_size + image.shape[1] < min_dim:
        compensate_width = int((min_dim - (pad_width_size + image.shape[1])) * 0.5) + 1
        randi[0] += compensate_width
        randi[2] += compensate_width

    # permute(2, 0, 1) converts [H, W, C] to the channel-first layout pad expects.
    return v2.functional.pad(
        torch.from_numpy(image).permute(2, 0, 1),
        padding=randi,
        padding_mode='constant',
        fill=(255, 255, 255)
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def padding(images: List[torch.Tensor], required_size: int) -> List[torch.Tensor]:
    """Pad each ``[C, H, W]`` tensor on its right and bottom edges so that
    every image becomes a ``required_size`` x ``required_size`` square.

    Padding uses v2.functional.pad's defaults (constant fill of 0), so the
    added region is black; the image content stays anchored top-left.
    """
    padded = []
    for img in images:
        # Amount needed on the right (width) and bottom (height) edges.
        extra_right = required_size - img.shape[2]
        extra_bottom = required_size - img.shape[1]
        padded.append(
            v2.functional.pad(img, padding=[0, 0, extra_right, extra_bottom])
        )
    return padded
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def random_resize(
    images: List[np.ndarray],
    minr: float,
    maxr: float
) -> List[np.ndarray]:
    """Rescale every image by an independent random ratio from [minr, maxr].

    Each ndarray must be a 3-D RGB image laid out as ``[H, W, C]`` (channel
    axis last); only the first image is validated, matching prior behavior.

    Raises:
        ValueError: if the first image is not 3-channel channels-last.
    """
    first = images[0]
    if len(first.shape) != 3 or first.shape[2] != 3:
        raise ValueError("Image is not in RGB format or channel is not in third dimension")

    resized = []
    for img in images:
        ratio = random.uniform(minr, maxr)
        new_w = int(img.shape[1] * ratio)
        new_h = int(img.shape[0] * ratio)
        # LANCZOS4 resampling for anti-aliased output.
        resized.append(cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4))
    return resized
|
|
|
|
|
|
|
|
|
|
|
|
|
2024-03-28 10:19:40 +00:00
|
|
|
|
def rotate(image: np.ndarray, min_angle: int, max_angle: int) -> np.ndarray:
    """Rotate an image by a random angle in [min_angle, max_angle] degrees.

    The output canvas is enlarged so no content is clipped; the regions
    exposed by the rotation are filled with white.
    """
    # Rotation pivot: the image centre, in (x, y) order.
    center = tuple(np.array(image.shape[1::-1]) / 2)

    # Pick the rotation angle uniformly from the allowed range.
    angle = random.randint(min_angle, max_angle)

    # 2x3 affine matrix rotating about the centre with no scaling.
    rot_mat = cv2.getRotationMatrix2D(center, angle, 1.0)

    # Size of the axis-aligned bounding box of the rotated image.
    abs_cos = np.abs(rot_mat[0, 0])
    abs_sin = np.abs(rot_mat[0, 1])
    out_w = int((image.shape[0] * abs_sin) + (image.shape[1] * abs_cos))
    out_h = int((image.shape[0] * abs_cos) + (image.shape[1] * abs_sin))

    # Translate so the rotated content sits centred in the enlarged canvas.
    rot_mat[0, 2] += (out_w / 2) - center[0]
    rot_mat[1, 2] += (out_h / 2) - center[1]

    # Apply the warp, filling uncovered corners with white.
    return cv2.warpAffine(image, rot_mat, (out_w, out_h), borderValue=(255, 255, 255))
|
|
|
|
|
|
|
|
|
|
|
|
|
2024-03-27 04:55:00 +00:00
|
|
|
|
def ocr_aug(image: np.ndarray) -> np.ndarray:
    """Run the OCR training-time augmentation chain on one [H, W, C] image."""
    # With 20% probability, apply a small random rotation (within +/-5 degrees).
    if random.random() < 0.2:
        image = rotate(image, -5, 5)

    # Add a random white border; add_white_border returns a channel-first
    # tensor, so convert back to [H, W, C] numpy layout afterwards.
    bordered = add_white_border(image, max_size=25)
    image = bordered.permute(1, 2, 0).numpy()

    # Finally run the module-level noise/scan augmentation pipeline.
    return train_pipeline(image)
|
2024-03-25 16:35:34 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def train_transform(images: List[Image.Image]) -> List[torch.Tensor]:
    """Training-time preprocessing: PIL images -> augmented, padded tensors.

    Steps: RGB conversion, random scale jitter, white-border trim, OCR
    augmentation, the shared transform pipeline, and fixed-size padding.
    """
    assert IMG_CHANNELS == 1, "Only support grayscale images for now"

    # PIL -> RGB uint8 ndarrays laid out [H, W, C].
    arrays = [np.array(img.convert('RGB')) for img in images]

    # Random scale jitter comes first, before any cropping.
    arrays = random_resize(arrays, MIN_RESIZE_RATIO, MAX_RESIZE_RATIO)

    # Trim away the surrounding white margins.
    arrays = [trim_white_border(arr) for arr in arrays]

    # OCR-specific augmentation (rotation, white border, document noise).
    arrays = [ocr_aug(arr) for arr in arrays]

    # Shared grayscale/resize/normalize pipeline.
    tensors = [general_transform_pipeline(arr) for arr in arrays]

    # Pad every tensor out to the fixed square size.
    return padding(tensors, FIXED_IMG_SIZE)
|
2024-03-25 16:35:34 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def inference_transform(images: List[np.ndarray]) -> List[torch.Tensor]:
    """Inference-time preprocessing: trim, transform, and pad — no augmentation."""
    assert IMG_CHANNELS == 1, "Only support grayscale images for now"

    # Trim away the surrounding white margins.
    trimmed = [trim_white_border(img) for img in images]

    # Shared grayscale/resize/normalize pipeline.
    tensors = [general_transform_pipeline(img) for img in trimmed]

    # Pad every tensor out to the fixed square size.
    return padding(tensors, FIXED_IMG_SIZE)
|
2024-01-31 10:11:07 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    from pathlib import Path
    from .helpers import convert2rgb

    # Ad-hoc smoke test for random_resize on a few local sample images.
    base_dir = Path('/home/lhy/code/TeXify/src/models/ocr_model/model')
    imgs_path = [str(base_dir / f'{i}.jpg') for i in range(1, 8)]

    imgs = convert2rgb(imgs_path)
    res = random_resize(imgs, 0.5, 1.5)
    pause = 1  # breakpoint anchor for interactive inspection
|
|
|
|
|
|
|