Initial commit
This commit is contained in:
46
src/models/ocr_model/utils/functional.py
Normal file
46
src/models/ocr_model/utils/functional.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from transformers import DataCollatorForLanguageModeling
|
||||
from typing import List, Dict, Any
|
||||
from .transforms import train_transform
|
||||
|
||||
|
||||
def left_move(x: torch.Tensor, pad_val):
    """Shift a 2-D tensor one position to the left along dim 1.

    Each row drops its first element, and the vacated last column is
    filled with ``pad_val``. A new tensor is returned; ``x`` is untouched.
    """
    assert x.dim() == 2, 'x should be 2-dimensional'
    shifted = torch.empty_like(x)
    shifted[:, :-1] = x[:, 1:]   # drop column 0, slide everything left
    shifted[:, -1] = pad_val     # fill the freed last column
    return shifted
|
||||
|
||||
|
||||
def tokenize_fn(samples: Dict[str, List[Any]], tokenizer=None) -> Dict[str, List[Any]]:
    """Tokenize a batch of LaTeX formulas and carry the raw images along.

    Runs ``tokenizer`` over ``samples['latex_formula']`` (requesting the
    special-tokens mask) and stores ``samples['image']`` in the result
    under the ``pixel_values`` key.
    """
    assert tokenizer is not None, 'tokenizer should not be None'
    batch = tokenizer(samples['latex_formula'], return_special_tokens_mask=True)
    batch['pixel_values'] = samples['image']
    return batch
|
||||
|
||||
|
||||
def collate_fn(samples: List[Dict[str, Any]], tokenizer=None) -> Dict[str, List[Any]]:
    """Collate tokenized samples into one batch for the encoder-decoder model.

    Pops ``pixel_values`` out of every sample dict (mutating the dicts),
    lets the causal-LM collator pad the remaining token fields, renames
    ``input_ids``/``attention_mask`` to their ``decoder_*`` counterparts,
    left-shifts ``labels`` (padding with -100, which the loss ignores),
    and finally stacks the images into a single (B, C, H, W) tensor.
    """
    assert tokenizer is not None, 'tokenizer should not be None'
    images = [sample.pop('pixel_values') for sample in samples]

    # mlm=False -> plain causal-LM collation (labels mirror the inputs).
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    batch = collator(samples)

    batch['pixel_values'] = images
    batch['decoder_input_ids'] = batch.pop('input_ids')
    batch['decoder_attention_mask'] = batch.pop('attention_mask')

    # Shift labels one step left so position t is supervised by token t+1;
    # the -100 fill marks positions the loss should skip.
    batch['labels'] = left_move(batch['labels'], -100)

    # List of per-image tensors -> one (B, C, H, W) tensor.
    batch['pixel_values'] = torch.stack(batch['pixel_values'], dim=0)
    return batch
|
||||
|
||||
|
||||
def img_transform_fn(samples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
    """Apply the training image pipeline to a batch, in place.

    Replaces ``samples['pixel_values']`` with the output of
    ``train_transform`` and returns the (mutated) mapping.
    """
    samples['pixel_values'] = train_transform(samples['pixel_values'])
    return samples
|
||||
26
src/models/ocr_model/utils/helpers.py
Normal file
26
src/models/ocr_model/utils/helpers.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from typing import List
|
||||
|
||||
|
||||
def convert2rgb(image_paths: List[str]) -> List[np.ndarray]:
    """Load image files and return them as 8-bit RGB ndarrays.

    Files that cannot be read are skipped with a message. 16-bit images
    are down-converted to 8-bit first. Grayscale, BGR and BGRA inputs are
    all mapped to 3-channel RGB; other channel counts pass through as-is.
    """
    results: List[np.ndarray] = []
    for path in image_paths:
        img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        if img is None:
            # Best-effort loader: report and move on instead of raising.
            print(f"Image at {path} could not be read.")
            continue

        if img.dtype == np.uint16:
            print(f'Converting {path} to 8-bit, image may be lossy.')
            img = cv2.convertScaleAbs(img, alpha=(255.0/65535.0))

        num_channels = 1 if img.ndim == 2 else img.shape[2]
        if num_channels == 4:
            img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGB)
        elif num_channels == 1:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        elif num_channels == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        results.append(img)

    return results
|
||||
38
src/models/ocr_model/utils/inference.py
Normal file
38
src/models/ocr_model/utils/inference.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import torch
|
||||
|
||||
from transformers import RobertaTokenizerFast, GenerationConfig
|
||||
from typing import List
|
||||
|
||||
from models.ocr_model.model.TexTeller import TexTeller
|
||||
from models.ocr_model.utils.transforms import inference_transform
|
||||
from models.ocr_model.utils.helpers import convert2rgb
|
||||
from models.globals import MAX_TOKEN_SIZE
|
||||
|
||||
|
||||
def inference(
    model: TexTeller,
    tokenizer: RobertaTokenizerFast,
    imgs_path: List[str],
    use_cuda: bool,
    num_beams: int = 1,
) -> List[str]:
    """Run OCR on a batch of image files and return decoded LaTeX strings.

    Loads the images as RGB, applies the inference transform, stacks them
    into one batch, optionally moves model and inputs to CUDA, then decodes
    greedily (or with ``num_beams`` beams) up to MAX_TOKEN_SIZE new tokens.
    """
    model.eval()
    transformed = inference_transform(convert2rgb(imgs_path))
    pixel_values = torch.stack(transformed)

    if use_cuda:
        model = model.to('cuda')
        pixel_values = pixel_values.to('cuda')

    # Deterministic decoding: no sampling, beam count controlled by caller.
    config = GenerationConfig(
        max_new_tokens=MAX_TOKEN_SIZE,
        num_beams=num_beams,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
    )
    token_ids = model.generate(pixel_values, generation_config=config)
    return tokenizer.batch_decode(token_ids, skip_special_tokens=True)
|
||||
23
src/models/ocr_model/utils/metrics.py
Normal file
23
src/models/ocr_model/utils/metrics.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import evaluate
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
from transformers import EvalPrediction, RobertaTokenizer
|
||||
|
||||
|
||||
def bleu_metric(eval_preds: EvalPrediction, tokenizer: RobertaTokenizer) -> Dict:
    """Compute the Google-BLEU score for a batch of predictions.

    Temporarily switches the working directory to this file's directory so
    ``evaluate`` resolves/caches the metric there, then decodes predictions
    and labels and returns the dict produced by ``metric.compute``.

    Fix: the chdir is now wrapped in try/finally so the caller's working
    directory is restored even when loading the metric raises.
    """
    cur_dir = Path(os.getcwd())
    os.chdir(Path(__file__).resolve().parent)
    try:
        # Will download the metric from huggingface if not already downloaded
        metric = evaluate.load('google_bleu')
    finally:
        # Always restore the caller's cwd, even if evaluate.load fails.
        os.chdir(cur_dir)

    logits, labels = eval_preds.predictions, eval_preds.label_ids
    preds = logits

    # -100 marks ignored label positions; replace with token id 1 so
    # batch_decode works — presumably the pad id; verify against tokenizer.
    labels = np.where(labels == -100, 1, labels)

    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return metric.compute(predictions=preds, references=labels)
|
||||
90
src/models/ocr_model/utils/transforms.py
Normal file
90
src/models/ocr_model/utils/transforms.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import torch
|
||||
import random
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
from torchvision.transforms import v2
|
||||
from typing import List
|
||||
from PIL import Image
|
||||
|
||||
from models.globals import (
|
||||
FIXED_IMG_SIZE,
|
||||
IMAGE_MEAN, IMAGE_STD,
|
||||
MAX_RESIZE_RATIO, MIN_RESIZE_RATIO
|
||||
)
|
||||
|
||||
# Shared preprocessing pipeline applied at both train and inference time:
# to-tensor, grayscale, resize, normalize. NOTE(review): size=FIXED_IMG_SIZE - 1
# together with max_size=FIXED_IMG_SIZE targets the shorter edge at
# FIXED_IMG_SIZE - 1 while capping the longer edge at FIXED_IMG_SIZE
# (Resize requires max_size > size) — presumably intentional; confirm.
general_transform_pipeline = v2.Compose([
    v2.ToImage(),                          # PIL.Image / ndarray -> tv image tensor
    v2.ToDtype(torch.uint8, scale=True),   # ensure uint8 before resizing
    v2.Grayscale(),                        # collapse to a single channel
    v2.Resize(
        size=FIXED_IMG_SIZE - 1,           # shorter-edge target
        interpolation=v2.InterpolationMode.BICUBIC,
        max_size=FIXED_IMG_SIZE,           # hard cap on the longer edge
        antialias=True
    ),
    v2.ToDtype(torch.float32, scale=True), # uint8 [0, 255] -> float32 [0, 1]
    v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),
])
|
||||
|
||||
|
||||
def trim_white_border(image: np.ndarray):
    """Crop away the uniform white margin around an RGB image.

    Computes the bounding box of every pixel that differs from pure white
    (255, 255, 255) and returns that crop of ``image``.

    Args:
        image: HxWx3 uint8 array in RGB order.

    Returns:
        The cropped image. Fix: an entirely white image is now returned
        unchanged instead of being cropped to a 0-sized array.

    Raises:
        ValueError: if the image is not 3-channel or not uint8.
    """
    if len(image.shape) != 3 or image.shape[2] != 3:
        raise ValueError("Image is not in RGB format or channel is not in third dimension")

    if image.dtype != np.uint8:
        # Fixed message: dropped the no-op f-prefix and the grammar error.
        raise ValueError("Image should be stored in uint8")

    h, w = image.shape[:2]
    bg = np.full((h, w, 3), 255, dtype=np.uint8)
    diff = cv2.absdiff(image, bg)

    # Any nonzero difference from white -> 255, then collapse to one channel
    # so boundingRect sees a single-channel mask of "content" pixels.
    _, diff = cv2.threshold(diff, 1, 255, cv2.THRESH_BINARY)
    gray_diff = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)
    x, y, w, h = cv2.boundingRect(gray_diff)

    # All-white input: boundingRect yields an empty (0x0) box; slicing with
    # it would produce an empty array, so hand back the original instead.
    if w == 0 or h == 0:
        return image

    trimmed_image = image[y:y+h, x:x+w]
    return trimmed_image
|
||||
|
||||
|
||||
def padding(images: List[torch.Tensor], required_size: int):
    """Pad each CxHxW tensor on its right and bottom edges so that every
    image becomes a ``required_size`` x ``required_size`` square."""
    padded = []
    for img in images:
        pad_right = required_size - img.shape[2]
        pad_bottom = required_size - img.shape[1]
        # padding order is [left, top, right, bottom]; only grow right/bottom.
        padded.append(v2.functional.pad(img, padding=[0, 0, pad_right, pad_bottom]))
    return padded
|
||||
|
||||
|
||||
def random_resize(
    images: List[np.ndarray],
    minr: float,
    maxr: float
) -> List[np.ndarray]:
    """Rescale each image by an independent uniform-random factor in [minr, maxr].

    Args:
        images: list of HxWx3 RGB uint8 arrays.
        minr: minimum resize ratio.
        maxr: maximum resize ratio.

    Returns:
        A new list of resized images (Lanczos interpolation for anti-aliasing).

    Raises:
        ValueError: if the first image is not 3-channel RGB.
    """
    # Fix: an empty batch used to crash on images[0]; return it unchanged.
    if not images:
        return []
    if len(images[0].shape) != 3 or images[0].shape[2] != 3:
        raise ValueError("Image is not in RGB format or channel is not in third dimension")

    ratios = [random.uniform(minr, maxr) for _ in range(len(images))]
    return [
        cv2.resize(img, (int(img.shape[1] * r), int(img.shape[0] * r)),
                   interpolation=cv2.INTER_LANCZOS4)  # Lanczos: anti-aliasing
        for img, r in zip(images, ratios)
    ]
|
||||
|
||||
|
||||
def general_transform(images: List[np.ndarray]) -> List[torch.Tensor]:
    """Trim white borders, run the shared preprocessing pipeline, then pad
    every result up to a FIXED_IMG_SIZE square tensor."""
    trimmed = [trim_white_border(img) for img in images]
    transformed = general_transform_pipeline(trimmed)
    return padding(transformed, FIXED_IMG_SIZE)
|
||||
|
||||
|
||||
def train_transform(images: List[Image.Image]) -> List[torch.Tensor]:
    """Training-time preprocessing: PIL -> RGB ndarray, random rescale in
    [MIN_RESIZE_RATIO, MAX_RESIZE_RATIO], then the shared
    trim/resize/normalize/pad pipeline."""
    rgb_arrays = [np.array(img.convert('RGB')) for img in images]
    resized = random_resize(rgb_arrays, MIN_RESIZE_RATIO, MAX_RESIZE_RATIO)
    return general_transform(resized)
|
||||
|
||||
|
||||
def inference_transform(images: List[np.ndarray]) -> List[torch.Tensor]:
    """Inference-time preprocessing: no random augmentation, just the shared
    trim/resize/normalize/pad pipeline applied to RGB ndarrays."""
    return general_transform(images)
|
||||
Reference in New Issue
Block a user