# texteller/models/ocr_model/utils/transforms.py

import torch
import random
import numpy as np
import cv2

from torchvision.transforms import v2
from typing import List, Union
from PIL import Image
from collections import Counter

from ...globals import (
    IMG_CHANNELS,
    FIXED_IMG_SIZE,
    IMAGE_MEAN,
    IMAGE_STD,
    MAX_RESIZE_RATIO,
    MIN_RESIZE_RATIO,
)
from .ocr_aug import ocr_augmentation_pipeline

# Augmentation pipeline applied to training images only (used by ocr_aug).
# Earlier alternative, kept for reference:
# train_pipeline = default_augraphy_pipeline(scan_only=True)
train_pipeline = ocr_augmentation_pipeline()

# Preprocessing shared by train_transform and inference_transform.
# The steps run in order: convert to a tensor image, force uint8, collapse to
# grayscale, resize, convert to float and normalize with IMAGE_MEAN / IMAGE_STD.
general_transform_pipeline = v2.Compose(
    [
        v2.ToImage(),
        v2.ToDtype(torch.uint8, scale=True),  # optional, most inputs are already uint8 at this point
        v2.Grayscale(),
        # With an int `size`, torchvision matches the shorter edge to `size` and
        # then caps the longer edge at `max_size`. torchvision requires
        # max_size > size, which is presumably why `size` is FIXED_IMG_SIZE - 1.
        v2.Resize(
            size=FIXED_IMG_SIZE - 1,
            interpolation=v2.InterpolationMode.BICUBIC,
            max_size=FIXED_IMG_SIZE,
            antialias=True,
        ),
        v2.ToDtype(torch.float32, scale=True),  # Normalize expects float input
        v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),
        # Disabled debugging step, kept for reference:
        # v2.ToPILImage()
    ]
)


def trim_white_border(image: np.ndarray) -> np.ndarray:
    """Crop away the uniform background margin that surrounds the content.

    The background colour is taken to be the most common colour among the
    four corner pixels. Every pixel whose grayscale difference from that
    colour exceeds a fixed threshold counts as content, and the image is
    cropped to the bounding box of the content pixels.

    Args:
        image: ``(H, W, 3)`` array of dtype ``uint8``.

    Returns:
        A slice (view) of ``image`` limited to the content bounding box.
        ``image`` itself is returned unchanged when there is nothing to
        crop: the image has no pixels, or no pixel differs enough from the
        background (for example a blank page).

    Raises:
        ValueError: If ``image`` is not three-dimensional with three
            channels in the last axis, or is not ``uint8``.
    """
    if image.ndim != 3 or image.shape[2] != 3:
        raise ValueError("Image is not in RGB format or channel is not in third dimension")

    if image.dtype != np.uint8:
        raise ValueError("Image should stored in uint8")

    height, width = image.shape[:2]
    if height == 0 or width == 0:
        # No corner pixels to sample; indexing image[0, 0] would raise IndexError.
        return image

    # Majority vote over the corners tolerates content touching one corner.
    corners = [tuple(image[0, 0]), tuple(image[0, -1]), tuple(image[-1, 0]), tuple(image[-1, -1])]
    bg_color = Counter(corners).most_common(1)[0][0]
    background = np.full((height, width, 3), np.array(bg_color, dtype=np.uint8), dtype=np.uint8)

    # NOTE(review): BGR2GRAY is applied although the check above speaks of
    # RGB; the two conversions only swap the red/blue weights. Left as-is so
    # that cropping stays identical to what existing callers already get.
    gray_diff = cv2.cvtColor(cv2.absdiff(image, background), cv2.COLOR_BGR2GRAY)

    threshold = 15
    _, content_mask = cv2.threshold(gray_diff, threshold, 255, cv2.THRESH_BINARY)

    x, y, box_w, box_h = cv2.boundingRect(content_mask)
    if box_w == 0 or box_h == 0:
        # Empty mask: cropping would yield a 0x0 image that later stages
        # (resize, padding) cannot process, so keep the input instead.
        return image

    return image[y : y + box_h, x : x + box_w]


def add_white_border(image: np.ndarray, max_size: int, min_size: int = 30) -> torch.Tensor:
    """Pad an image with a random-width white border on each side.

    Each side gets an independent random border of ``0..max_size`` pixels.
    If the padded height or width would still be below ``min_size``, the two
    borders on that axis are both enlarged by the same amount so the result
    reaches at least ``min_size`` pixels.

    Args:
        image: Array indexed as ``(height, width, channel)``.
        max_size: Inclusive upper bound of the random border width per side.
        min_size: Minimum height and width of the padded result. Defaults to
            30, the value that used to be hard-coded here.

    Returns:
        The padded image as a channel-first ``torch.Tensor`` (the input is
        permuted from ``(H, W, C)`` to ``(C, H, W)`` before padding). Callers
        that need a NumPy array must convert it back themselves.
    """
    # Draw order matters for reproducibility under a fixed seed:
    # left, top, right, bottom - the order torchvision's pad expects.
    left, top, right, bottom = (random.randint(0, max_size) for _ in range(4))

    height_deficit = min_size - (top + bottom + image.shape[0])
    if height_deficit > 0:
        # Split the shortfall over both sides; the +1 covers odd deficits.
        extra = height_deficit // 2 + 1
        top += extra
        bottom += extra

    width_deficit = min_size - (left + right + image.shape[1])
    if width_deficit > 0:
        extra = width_deficit // 2 + 1
        left += extra
        right += extra

    return v2.functional.pad(
        torch.from_numpy(image).permute(2, 0, 1),
        padding=[left, top, right, bottom],
        padding_mode='constant',
        fill=(255, 255, 255),
    )


def padding(images: List[torch.Tensor], required_size: int) -> List[torch.Tensor]:
    """Pad every tensor on its right and bottom edges up to ``required_size``.

    Args:
        images: Tensors whose dimension 1 is treated as height and dimension
            2 as width.
        required_size: Target height and width after padding.

    Returns:
        A new list with one padded tensor per input, in the same order.
    """
    padded = []
    for tensor in images:
        pad_right = required_size - tensor.shape[2]
        pad_bottom = required_size - tensor.shape[1]
        # Padding order is [left, top, right, bottom]; content stays top-left.
        padded.append(v2.functional.pad(tensor, padding=[0, 0, pad_right, pad_bottom]))
    return padded


def random_resize(images: List[np.ndarray], minr: float, maxr: float) -> List[np.ndarray]:
    """Rescale each image by its own random factor drawn from ``[minr, maxr]``.

    Args:
        images: Arrays indexed as ``(height, width, 3)``. May be empty.
        minr: Lower bound of the random scale factor.
        maxr: Upper bound of the random scale factor.

    Returns:
        A new list with the resized images, in input order. An empty input
        yields an empty list.

    Raises:
        ValueError: If any image is not three-dimensional with three
            channels in the last axis.
    """
    if not images:
        return []

    # Validate the whole batch up front, not just the first image, so a bad
    # entry is reported here rather than failing somewhere downstream.
    for img in images:
        if img.ndim != 3 or img.shape[2] != 3:
            raise ValueError("Image is not in RGB format or channel is not in third dimension")

    # All ratios are drawn before any resizing, one per image.
    ratios = [random.uniform(minr, maxr) for _ in images]
    return [
        cv2.resize(
            img,
            # cv2 takes (width, height); clamp to 1 px because truncation can
            # otherwise produce a zero-sized target for very small images.
            (max(1, int(img.shape[1] * r)), max(1, int(img.shape[0] * r))),
            interpolation=cv2.INTER_LANCZOS4,
        )  # anti-aliasing
        for img, r in zip(images, ratios)
    ]


def rotate(image: np.ndarray, min_angle: int, max_angle: int) -> np.ndarray:
    """Rotate an image by a random whole-degree angle without clipping it.

    Args:
        image: Array whose first two dimensions are height and width.
        min_angle: Smallest angle in degrees that may be drawn (inclusive).
        max_angle: Largest angle in degrees that may be drawn (inclusive).

    Returns:
        The rotated image on a canvas enlarged to hold the whole rotated
        rectangle; uncovered areas are filled with white.
    """
    height, width = image.shape[:2]
    # OpenCV expects the pivot as (x, y).
    pivot = (width / 2, height / 2)

    degrees = random.randint(min_angle, max_angle)
    matrix = cv2.getRotationMatrix2D(pivot, degrees, 1.0)

    # The first row of the matrix holds cos and sin of the angle; their
    # absolute values give the extent of the rotated bounding rectangle.
    abs_cos = np.abs(matrix[0, 0])
    abs_sin = np.abs(matrix[0, 1])
    canvas_w = int((height * abs_sin) + (width * abs_cos))
    canvas_h = int((height * abs_cos) + (width * abs_sin))

    # Shift the result so the rotated content is centred on the new canvas.
    matrix[0, 2] += (canvas_w / 2) - pivot[0]
    matrix[1, 2] += (canvas_h / 2) - pivot[1]

    return cv2.warpAffine(image, matrix, (canvas_w, canvas_h), borderValue=(255, 255, 255))


def ocr_aug(image: np.ndarray) -> np.ndarray:
    """Apply the training-time augmentations to one image.

    With probability 0.2 the image is rotated by a random angle between -5
    and 5 degrees. It then always receives a random white border (up to 25
    pixels per side) and is passed through ``train_pipeline``.

    Args:
        image: Array indexed as ``(height, width, channel)``.

    Returns:
        Whatever ``train_pipeline`` returns for the bordered image.
    """
    should_rotate = random.random() < 0.2
    if should_rotate:
        image = rotate(image, -5, 5)

    # add_white_border hands back a channel-first tensor; restore the
    # (H, W, C) NumPy layout before running the augmentation pipeline.
    bordered = add_white_border(image, max_size=25)
    hwc_array = bordered.permute(1, 2, 0).numpy()
    return train_pipeline(hwc_array)


def train_transform(images: List[Image.Image]) -> List[torch.Tensor]:
    """Turn a batch of PIL images into augmented, fixed-size training tensors.

    Stages, each applied to the whole batch before the next one starts:
    RGB conversion, random rescaling, background trimming, OCR augmentation,
    the shared preprocessing pipeline, and padding to ``FIXED_IMG_SIZE``.

    Args:
        images: PIL images to transform.

    Returns:
        One tensor per input image, in input order.
    """
    assert IMG_CHANNELS == 1, "Only support grayscale images for now"

    rgb_arrays = [np.array(pil_img.convert('RGB')) for pil_img in images]

    # Rescale before trimming so the border is removed at the final scale.
    rescaled = random_resize(rgb_arrays, MIN_RESIZE_RATIO, MAX_RESIZE_RATIO)
    trimmed = list(map(trim_white_border, rescaled))

    augmented = list(map(ocr_aug, trimmed))

    tensors = [general_transform_pipeline(arr) for arr in augmented]
    return padding(tensors, FIXED_IMG_SIZE)


def inference_transform(images: List[Union[np.ndarray, Image.Image]]) -> List[torch.Tensor]:
    """Turn a batch of images into fixed-size tensors without augmentation.

    PIL images are converted to RGB arrays; NumPy arrays are used as given.
    Every image then has its background trimmed, goes through the shared
    preprocessing pipeline and is padded to ``FIXED_IMG_SIZE``.

    Args:
        images: PIL images and/or NumPy arrays.

    Returns:
        One tensor per input image, in input order.
    """
    assert IMG_CHANNELS == 1, "Only support grayscale images for now"

    arrays = []
    for img in images:
        if isinstance(img, Image.Image):
            img = np.array(img.convert('RGB'))
        arrays.append(img)

    trimmed = [trim_white_border(arr) for arr in arrays]
    tensors = [general_transform_pipeline(arr) for arr in trimmed]
    return padding(tensors, FIXED_IMG_SIZE)
Update README_zh.md 2024-03-25 16:35:34 +08:00			`import torch`
			`import random`
			`import numpy as np`
			`import cv2`

			`from torchvision.transforms import v2`
修改了transforms.py中inference_transform的bug: 在训练的eval阶段没有把png图片转化为np.ndarray 2024-04-10 17:06:44 +00:00			`from typing import List, Union`
Update README_zh.md 2024-03-25 16:35:34 +08:00			`from PIL import Image`
优化了transform.py中的trim_white_border 2024-04-10 16:09:13 +00:00			`from collections import Counter`
Update README_zh.md 2024-03-25 16:35:34 +08:00
完成了web，ray server，重构了代码 2024-02-08 13:48:34 +00:00			`from ...globals import (`
merge v3_nature_scence 2024-03-28 13:44:32 +00:00			`IMG_CHANNELS,`
Update README_zh.md 2024-03-25 16:35:34 +08:00			`FIXED_IMG_SIZE,`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`IMAGE_MEAN,`
			`IMAGE_STD,`
			`MAX_RESIZE_RATIO,`
			`MIN_RESIZE_RATIO,`
Update README_zh.md 2024-03-25 16:35:34 +08:00			`)`
初步修改完成，但仍然有问题 2024-03-27 04:54:49 +00:00			`from .ocr_aug import ocr_augmentation_pipeline`
Initial Commit 2024-01-31 10:11:07 +00:00
初步修改完成，但仍然有问题 2024-03-27 04:54:49 +00:00			`# train_pipeline = default_augraphy_pipeline(scan_only=True)`
			`train_pipeline = ocr_augmentation_pipeline()`
Update README_zh.md 2024-03-25 16:35:34 +08:00
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`general_transform_pipeline = v2.Compose(`
			`[`
			`v2.ToImage(),`
			`v2.ToDtype(torch.uint8, scale=True), # optional, most input are already uint8 at this point`
			`v2.Grayscale(),`
			`v2.Resize(`
			`size=FIXED_IMG_SIZE - 1,`
			`interpolation=v2.InterpolationMode.BICUBIC,`
			`max_size=FIXED_IMG_SIZE,`
			`antialias=True,`
			`),`
			`v2.ToDtype(torch.float32, scale=True), # Normalize expects float input`
			`v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),`
			`# v2.ToPILImage()`
			`]`
			`)`
Update README_zh.md 2024-03-25 16:35:34 +08:00

			`def trim_white_border(image: np.ndarray):`
			`if len(image.shape) != 3 or image.shape[2] != 3:`
			`raise ValueError("Image is not in RGB format or channel is not in third dimension")`

			`if image.dtype != np.uint8:`
			`raise ValueError(f"Image should stored in uint8")`

[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`corners = [tuple(image[0, 0]), tuple(image[0, -1]), tuple(image[-1, 0]), tuple(image[-1, -1])]`
优化了transform.py中的trim_white_border 2024-04-10 16:09:13 +00:00			`bg_color = Counter(corners).most_common(1)[0][0]`
			`bg_color_np = np.array(bg_color, dtype=np.uint8)`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00
Update README_zh.md 2024-03-25 16:35:34 +08:00			`h, w = image.shape[:2]`
优化了transform.py中的trim_white_border 2024-04-10 16:09:13 +00:00			`bg = np.full((h, w, 3), bg_color_np, dtype=np.uint8)`
Initial Commit 2024-01-31 10:11:07 +00:00
Update README_zh.md 2024-03-25 16:35:34 +08:00			`diff = cv2.absdiff(image, bg)`
优化了transform.py中的trim_white_border 2024-04-10 16:09:13 +00:00			`mask = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)`
Update README_zh.md 2024-03-25 16:35:34 +08:00
merge dev后调整了项目结构 2024-04-21 00:48:24 +08:00			`threshold = 15`
优化了transform.py中的trim_white_border 2024-04-10 16:09:13 +00:00			`_, diff = cv2.threshold(mask, threshold, 255, cv2.THRESH_BINARY)`
Initial Commit 2024-01-31 10:11:07 +00:00
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`x, y, w, h = cv2.boundingRect(diff)`
Update README_zh.md 2024-03-25 16:35:34 +08:00
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`trimmed_image = image[y : y + h, x : x + w]`
Initial Commit 2024-01-31 10:11:07 +00:00
Update README_zh.md 2024-03-25 16:35:34 +08:00			`return trimmed_image`


修改好了训练，加入了数据增强 2024-03-04 05:35:59 +00:00			`def add_white_border(image: np.ndarray, max_size: int) -> np.ndarray:`
			`randi = [random.randint(0, max_size) for _ in range(4)]`
初步修改完成，但仍然有问题 2024-03-27 04:54:49 +00:00			`pad_height_size = randi[1] + randi[3]`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`pad_width_size = randi[0] + randi[2]`
			`if pad_height_size + image.shape[0] < 30:`
初步修改完成，但仍然有问题 2024-03-27 04:54:49 +00:00			`compensate_height = int((30 - (pad_height_size + image.shape[0])) * 0.5) + 1`
			`randi[1] += compensate_height`
			`randi[3] += compensate_height`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`if pad_width_size + image.shape[1] < 30:`
初步修改完成，但仍然有问题 2024-03-27 04:54:49 +00:00			`compensate_width = int((30 - (pad_width_size + image.shape[1])) * 0.5) + 1`
			`randi[0] += compensate_width`
			`randi[2] += compensate_width`
修改好了训练，加入了数据增强 2024-03-04 05:35:59 +00:00			`return v2.functional.pad(`
初步修改完成，但仍然有问题 2024-03-27 04:54:49 +00:00			`torch.from_numpy(image).permute(2, 0, 1),`
			`padding=randi,`
			`padding_mode='constant',`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`fill=(255, 255, 255),`
修改好了训练，加入了数据增强 2024-03-04 05:35:59 +00:00			`)`


			`def padding(images: List[torch.Tensor], required_size: int) -> List[torch.Tensor]:`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`images = [`
Update README_zh.md 2024-03-25 16:35:34 +08:00			`v2.functional.pad(`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`img, padding=[0, 0, required_size - img.shape[2], required_size - img.shape[1]]`
Update README_zh.md 2024-03-25 16:35:34 +08:00			`)`
			`for img in images`
			`]`
			`return images`


[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`def random_resize(images: List[np.ndarray], minr: float, maxr: float) -> List[np.ndarray]:`
Update README_zh.md 2024-03-25 16:35:34 +08:00			`if len(images[0].shape) != 3 or images[0].shape[2] != 3:`
			`raise ValueError("Image is not in RGB format or channel is not in third dimension")`

			`ratios = [random.uniform(minr, maxr) for _ in range(len(images))]`
			`return [`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`cv2.resize(`
			`img, (int(img.shape[1] * r), int(img.shape[0] * r)), interpolation=cv2.INTER_LANCZOS4`
			`) # 抗锯齿`
Update README_zh.md 2024-03-25 16:35:34 +08:00			`for img, r in zip(images, ratios)`
			`]`


写好了v3版本的训练代码(v3版本加入了自然场景训练增强) 2024-03-28 10:19:40 +00:00			`def rotate(image: np.ndarray, min_angle: int, max_angle: int) -> np.ndarray:`
			`# Get the center of the image to define the point of rotation`
			`image_center = tuple(np.array(image.shape[1::-1]) / 2)`

			`# Generate a random angle within the specified range`
			`angle = random.randint(min_angle, max_angle)`

			`# Get the rotation matrix for rotating the image around its center`
			`rotation_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)`

			`# Determine the size of the rotated image`
			`cos = np.abs(rotation_mat[0, 0])`
			`sin = np.abs(rotation_mat[0, 1])`
			`new_width = int((image.shape[0] * sin) + (image.shape[1] * cos))`
			`new_height = int((image.shape[0] * cos) + (image.shape[1] * sin))`

			`# Adjust the rotation matrix to take into account translation`
			`rotation_mat[0, 2] += (new_width / 2) - image_center[0]`
			`rotation_mat[1, 2] += (new_height / 2) - image_center[1]`

			`# Rotate the image with the specified border color (white in this case)`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`rotated_image = cv2.warpAffine(`
			`image, rotation_mat, (new_width, new_height), borderValue=(255, 255, 255)`
			`)`
写好了v3版本的训练代码(v3版本加入了自然场景训练增强) 2024-03-28 10:19:40 +00:00
			`return rotated_image`


初步修改完成，但仍然有问题 2024-03-27 04:55:00 +00:00			`def ocr_aug(image: np.ndarray) -> np.ndarray:`
写好了v3版本的训练代码(v3版本加入了自然场景训练增强) 2024-03-28 10:19:40 +00:00			`if random.random() < 0.2:`
			`image = rotate(image, -5, 5)`
			`image = add_white_border(image, max_size=25).permute(1, 2, 0).numpy()`
初步修改完成，但仍然有问题 2024-03-27 04:55:00 +00:00			`image = train_pipeline(image)`
			`return image`
Update README_zh.md 2024-03-25 16:35:34 +08:00

			`def train_transform(images: List[Image.Image]) -> List[torch.Tensor]:`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`assert IMG_CHANNELS == 1, "Only support grayscale images for now"`
修改好了训练，加入了数据增强 2024-03-04 05:35:59 +00:00
Update README_zh.md 2024-03-25 16:35:34 +08:00			`images = [np.array(img.convert('RGB')) for img in images]`
修改好了训练，加入了数据增强 2024-03-04 05:35:59 +00:00			`# random resize first`
Update README_zh.md 2024-03-25 16:35:34 +08:00			`images = random_resize(images, MIN_RESIZE_RATIO, MAX_RESIZE_RATIO)`
Initial Commit 2024-01-31 10:11:07 +00:00			`images = [trim_white_border(image) for image in images]`
初步修改完成，但仍然有问题 2024-03-27 04:55:00 +00:00
写好了v3版本的训练代码(v3版本加入了自然场景训练增强) 2024-03-28 10:19:40 +00:00			`# OCR augmentation`
初步修改完成，但仍然有问题 2024-03-27 04:55:00 +00:00			`images = [ocr_aug(image) for image in images]`

Initial Commit 2024-01-31 10:11:07 +00:00			`# general transform pipeline`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`images = [general_transform_pipeline(image) for image in images]`
Initial Commit 2024-01-31 10:11:07 +00:00			`# padding to fixed size`
merge v3_nature_scence 2024-03-28 13:44:32 +00:00			`images = padding(images, FIXED_IMG_SIZE)`
Initial Commit 2024-01-31 10:11:07 +00:00			`return images`
Update README_zh.md 2024-03-25 16:35:34 +08:00

修改了transforms.py中inference_transform的bug: 在训练的eval阶段没有把png图片转化为np.ndarray 2024-04-10 17:06:44 +00:00			`def inference_transform(images: List[Union[np.ndarray, Image.Image]]) -> List[torch.Tensor]:`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`assert IMG_CHANNELS == 1, "Only support grayscale images for now"`
			`images = [`
			`np.array(img.convert('RGB')) if isinstance(img, Image.Image) else img for img in images`
			`]`
修改好了训练，加入了数据增强 2024-03-04 05:35:59 +00:00			`images = [trim_white_border(image) for image in images]`
			`# general transform pipeline`
[chore] exclude paddleocr directory from pre-commit hooks 2025-02-28 19:56:49 +08:00			`images = [general_transform_pipeline(image) for image in images] # imgs: List[PIL.Image.Image]`
修改好了训练，加入了数据增强 2024-03-04 05:35:59 +00:00			`# padding to fixed size`
merge v3_nature_scence 2024-03-28 13:44:32 +00:00			`images = padding(images, FIXED_IMG_SIZE)`
Initial Commit 2024-01-31 10:11:07 +00:00
修改好了训练，加入了数据增强 2024-03-04 05:35:59 +00:00			`return images`