写完了模型代码、Tokenizer、数据预处理、训练脚本,但目前的训练脚本没有配置generate(评估仅能看loss)

This commit is contained in:
三洋三洋
2024-01-28 06:19:23 +00:00
parent 9d27ee0585
commit c6d5c91955
18 changed files with 80058 additions and 78 deletions

View File

@@ -1,28 +0,0 @@
from ....globals import VOCAB_SIZE
from typing import (
Tuple
)
from transformers import (
RobertaConfig,
RobertaModel,
RobertaTokenizerFast
)
def get_encoder():
    """Build and return the OCR image encoder. Placeholder — not yet implemented."""
    ...
def get_tokenizer() -> RobertaTokenizerFast:
    """Build and return the formula tokenizer. Placeholder — not yet implemented."""
    ...
def get_decoder() -> RobertaModel:
    """Create a randomly initialized RoBERTa network configured as a decoder.

    The vocabulary size comes from the project-wide ``VOCAB_SIZE`` constant,
    and ``is_decoder=True`` marks the model as the decoder side of an
    encoder-decoder setup.

    Returns:
        A fresh (untrained) ``RobertaModel`` instance.
    """
    decoder_config = RobertaConfig(
        vocab_size=VOCAB_SIZE,
        is_decoder=True
    )
    return RobertaModel(decoder_config)

View File

@@ -0,0 +1,79 @@
import torch
import datasets
from datasets import load_dataset
from functools import partial
from transformers import DataCollatorForLanguageModeling
from typing import List, Dict, Any
from ...ocr_model.model.TexTeller import TexTeller
from .transforms import train_transform
def left_move(x: torch.Tensor, pad_val):
    """Shift every row of a 2-D tensor one position to the left.

    Column 0 is dropped, the remaining columns move one step left, and the
    freed last column is filled with ``pad_val`` (used to align next-token
    labels/masks with decoder inputs).

    Args:
        x: 2-D tensor of shape (batch, seq_len).
        pad_val: value written into the last column after the shift.

    Returns:
        A new tensor with the same shape and dtype as ``x``.
    """
    assert len(x.shape) == 2, 'x should be 2-dimensional'
    shifted = torch.ones_like(x)
    shifted[:, -1] = pad_val
    shifted[:, :-1] = x[:, 1:]
    return shifted
def tokenize_fn(samples: Dict[str, List[Any]], tokenizer=None) -> Dict[str, List[Any]]:
    """Tokenize a batch of LaTeX formulas and carry the images along.

    Args:
        samples: batched dataset columns; must contain ``latex_formula``
            (list of strings) and ``image`` (list of images).
        tokenizer: callable tokenizer applied to the formulas; required.

    Returns:
        The tokenizer output (with special-tokens mask) plus a
        ``pixel_values`` entry holding the untouched images.
    """
    assert tokenizer is not None, 'tokenizer should not be None'
    encoded = tokenizer(samples['latex_formula'], return_special_tokens_mask=True)
    encoded['pixel_values'] = samples['image']
    return encoded
def collate_fn(samples: List[Dict[str, Any]], tokenizer=None) -> Dict[str, List[Any]]:
    """Collate tokenized samples into a padded batch for an encoder-decoder model.

    Text fields are padded by HuggingFace's causal-LM collator, then renamed
    to the ``decoder_*`` keys the model expects; ``labels`` and
    ``decoder_attention_mask`` are shifted one step left so position ``i``
    targets token ``i+1``.

    Args:
        samples: per-sample dicts from ``tokenize_fn`` (tokenizer fields plus
            a ``pixel_values`` image tensor).
        tokenizer: tokenizer the collator uses for padding; required.

    Returns:
        Dict with ``pixel_values`` (stacked to (B, C, H, W)),
        ``decoder_input_ids``, ``decoder_attention_mask`` and ``labels``.
    """
    assert tokenizer is not None, 'tokenizer should not be None'
    # Pull the images out first: the LM collator must only see text fields.
    images = [sample.pop('pixel_values') for sample in samples]
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    batch = collator(samples)
    batch['pixel_values'] = images
    # Rename to the decoder-side keys of an encoder-decoder architecture.
    batch['decoder_input_ids'] = batch.pop('input_ids')
    batch['decoder_attention_mask'] = batch.pop('attention_mask')
    # Left-shift labels and decoder_attention_mask for next-token prediction;
    # -100 is the index ignored by the loss, 0 masks the vacated position.
    batch['labels'] = left_move(batch['labels'], -100)
    batch['decoder_attention_mask'] = left_move(batch['decoder_attention_mask'], 0)
    # Stack the list of per-sample images into a single (B, C, H, W) tensor.
    batch['pixel_values'] = torch.stack(batch['pixel_values'], dim=0)
    return batch
def img_preprocess(samples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
    """Run the training image transform over the batch's ``pixel_values`` in place."""
    samples['pixel_values'] = train_transform(samples['pixel_values'])
    return samples
if __name__ == '__main__':
    # Smoke test: push a tiny batch through tokenization, image preprocessing,
    # collation and one model forward pass (loss-only; no generate yet).
    dataset = load_dataset(
        '/home/lhy/code/TeXify/src/models/ocr_model/train/dataset/latex-formulas/latex-formulas.py',
        'cleaned_formulas'
    )['train'].select(range(20))
    tokenizer = TexTeller.get_tokenizer('/home/lhy/code/TeXify/src/models/tokenizer/roberta-tokenizer-550Kformulas')

    map_fn = partial(tokenize_fn, tokenizer=tokenizer)
    collate_fn_with_tokenizer = partial(collate_fn, tokenizer=tokenizer)

    tokenized_formula = dataset.map(map_fn, batched=True, remove_columns=dataset.column_names)
    # Bug fix: the previous version converted the Dataset to a plain dict via
    # .to_dict() before calling .with_transform(), which only exists on
    # datasets.Dataset — keep it a Dataset so the lazy transform applies.
    tokenized_dataset = tokenized_formula.with_transform(img_preprocess)

    # Turn the column-oriented slice into a list of per-sample dicts, the
    # shape a DataLoader would hand to collate_fn.
    dataset_dict = tokenized_dataset[:]
    dataset_list = [dict(zip(dataset_dict.keys(), x)) for x in zip(*dataset_dict.values())]
    batch = collate_fn_with_tokenizer(dataset_list)

    # TexTeller is already imported at module top; the old duplicate relative
    # import here was redundant and has been removed.
    model = TexTeller()
    out = model(**batch)
    pause = 1  # breakpoint anchor for inspecting `out`

View File

@@ -0,0 +1,71 @@
import torch
import torchvision
from torchvision.transforms import v2
from PIL import ImageChops, Image
from typing import Any, Dict, List
from ....globals import OCR_IMG_CHANNELS, OCR_IMG_SIZE, OCR_FIX_SIZE, IMAGE_MEAN, IMAGE_STD
def trim_white_border(image: Image.Image):
    """Crop away the uniform white border surrounding a formula image.

    Args:
        image: a PIL image in 'RGB', 'RGBA' or 'L' mode.

    Returns:
        The cropped image, or the original image unchanged when no
        non-white content is found.

    Raises:
        ValueError: for any other image mode.
    """
    if image.mode == 'RGB':
        bg_color = (255, 255, 255)
    elif image.mode == 'RGBA':
        bg_color = (255, 255, 255, 255)
    elif image.mode == 'L':
        bg_color = 255
    else:
        raise ValueError("Unsupported image mode")

    # A solid white canvas the same size as the input; border pixels that
    # match it become black in the difference image.
    bg = Image.new(image.mode, image.size, bg_color)
    diff = ImageChops.difference(image, bg)
    # Boost contrast (scale 2.0, offset -100) so near-white noise does not
    # inflate the bounding box; parameters may need tuning per dataset.
    diff = ImageChops.add(diff, diff, 2.0, -100)
    # Bounding box of the non-black (i.e. non-background) region.
    bbox = diff.getbbox()
    if bbox:
        return image.crop(bbox)
    # Bug fix: an all-white image has no bbox; previously this fell off the
    # end and returned None, crashing downstream transforms. Return the
    # image unchanged instead.
    return image
def train_transform(images: List[Image.Image]) -> List[torch.Tensor]:
    """Preprocess a batch of formula images for training.

    Pipeline: trim white borders -> tensor -> grayscale -> resize so the
    longer side fits within OCR_IMG_SIZE -> normalize -> pad bottom/right
    to a fixed OCR_IMG_SIZE x OCR_IMG_SIZE square.

    Args:
        images: list of PIL images (one per sample).

    Returns:
        List of float32 tensors, each of shape (1, OCR_IMG_SIZE, OCR_IMG_SIZE).
    """
    assert OCR_IMG_CHANNELS == 1 , "Only support grayscale images for now"
    assert OCR_FIX_SIZE == True, "Only support fixed size images for now"

    trimmed = [trim_white_border(img) for img in images]

    # NOTE: v2.ToImage on a list returns a list of torchvision Images (one
    # per sample), so every stage of this Compose consumes and produces a
    # list — the batch dimension is never stacked inside the pipeline.
    pipeline = v2.Compose([
        v2.ToImage(),
        v2.ToDtype(torch.uint8, scale=True),  # most inputs are already uint8 here
        v2.Grayscale(),                       # collapse to a single channel
        # Resize toward a fixed square; `size` must be strictly smaller than
        # `max_size`, hence OCR_IMG_SIZE - 1.
        v2.Resize(
            size=OCR_IMG_SIZE - 1,
            interpolation=v2.InterpolationMode.BICUBIC,
            max_size=OCR_IMG_SIZE,
            antialias=True
        ),
        v2.ToDtype(torch.float32, scale=True),  # Normalize expects float input
        v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),
        # v2.ToPILImage() can be appended here to eyeball results when debugging
    ])
    resized = pipeline(trimmed)

    # Pad each image on the right/bottom up to the fixed square size.
    padded = []
    for img in resized:
        pad_right = OCR_IMG_SIZE - img.shape[2]
        pad_bottom = OCR_IMG_SIZE - img.shape[1]
        padded.append(v2.functional.pad(img, padding=[0, 0, pad_right, pad_bottom]))
    return padded
def inference_transform(images: List[Image.Image]) -> List[torch.Tensor]:
    """Preprocess images for inference.

    Inference currently uses the exact same pipeline as training, so this
    validates the global settings and delegates to :func:`train_transform`.
    """
    assert OCR_IMG_CHANNELS == 1 , "Only support grayscale images for now"
    assert OCR_FIX_SIZE == True, "Only support fixed size images for now"
    return train_transform(images)