写完了模型代码、Tokenizer、数据预处理、训练脚本,但目前的训练脚本没有配置generate(评估仅能看loss)

This commit is contained in:
三洋三洋
2024-01-28 06:19:23 +00:00
parent 9d27ee0585
commit c6d5c91955
18 changed files with 80058 additions and 78 deletions

View File

@@ -1,28 +0,0 @@
from ....globals import VOCAB_SIZE
from typing import (
Tuple
)
from transformers import (
RobertaConfig,
RobertaModel,
RobertaTokenizerFast
)
def get_encoder():
    """Build and return the OCR image encoder. Placeholder — not yet implemented."""
    ...
def get_tokenizer() -> RobertaTokenizerFast:
    """Build and return the formula tokenizer. Placeholder — not yet implemented."""
    ...
def get_decoder() -> RobertaModel:
    """Create a randomly initialized RoBERTa network configured as a decoder.

    The vocabulary size comes from the project-wide ``VOCAB_SIZE`` constant,
    and ``is_decoder=True`` marks the model as the decoder side of an
    encoder-decoder setup.

    Returns:
        A fresh (untrained) ``RobertaModel`` instance.
    """
    decoder_config = RobertaConfig(
        vocab_size=VOCAB_SIZE,
        is_decoder=True
    )
    return RobertaModel(decoder_config)

View File

@@ -0,0 +1,79 @@
import torch
import datasets
from datasets import load_dataset
from functools import partial
from transformers import DataCollatorForLanguageModeling
from typing import List, Dict, Any
from ...ocr_model.model.TexTeller import TexTeller
from .transforms import train_transform
def left_move(x: torch.Tensor, pad_val):
    """Shift every row of a 2-D tensor one position to the left.

    Column 0 is dropped, the remaining columns move one step left, and the
    freed last column is filled with ``pad_val`` (used to align next-token
    labels/masks with decoder inputs).

    Args:
        x: 2-D tensor of shape (batch, seq_len).
        pad_val: value written into the last column after the shift.

    Returns:
        A new tensor with the same shape and dtype as ``x``.
    """
    assert len(x.shape) == 2, 'x should be 2-dimensional'
    shifted = torch.ones_like(x)
    shifted[:, -1] = pad_val
    shifted[:, :-1] = x[:, 1:]
    return shifted
def tokenize_fn(samples: Dict[str, List[Any]], tokenizer=None) -> Dict[str, List[Any]]:
    """Tokenize a batch of LaTeX formulas and carry the images along.

    Args:
        samples: batched dataset columns; must contain ``latex_formula``
            (list of strings) and ``image`` (list of images).
        tokenizer: callable tokenizer applied to the formulas; required.

    Returns:
        The tokenizer output (with special-tokens mask) plus a
        ``pixel_values`` entry holding the untouched images.
    """
    assert tokenizer is not None, 'tokenizer should not be None'
    encoded = tokenizer(samples['latex_formula'], return_special_tokens_mask=True)
    encoded['pixel_values'] = samples['image']
    return encoded
def collate_fn(samples: List[Dict[str, Any]], tokenizer=None) -> Dict[str, List[Any]]:
    """Collate tokenized samples into a padded batch for an encoder-decoder model.

    Text fields are padded by HuggingFace's causal-LM collator, then renamed
    to the ``decoder_*`` keys the model expects; ``labels`` and
    ``decoder_attention_mask`` are shifted one step left so position ``i``
    targets token ``i+1``.

    Args:
        samples: per-sample dicts from ``tokenize_fn`` (tokenizer fields plus
            a ``pixel_values`` image tensor).
        tokenizer: tokenizer the collator uses for padding; required.

    Returns:
        Dict with ``pixel_values`` (stacked to (B, C, H, W)),
        ``decoder_input_ids``, ``decoder_attention_mask`` and ``labels``.
    """
    assert tokenizer is not None, 'tokenizer should not be None'
    # Pull the images out first: the LM collator must only see text fields.
    images = [sample.pop('pixel_values') for sample in samples]
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    batch = collator(samples)
    batch['pixel_values'] = images
    # Rename to the decoder-side keys of an encoder-decoder architecture.
    batch['decoder_input_ids'] = batch.pop('input_ids')
    batch['decoder_attention_mask'] = batch.pop('attention_mask')
    # Left-shift labels and decoder_attention_mask for next-token prediction;
    # -100 is the index ignored by the loss, 0 masks the vacated position.
    batch['labels'] = left_move(batch['labels'], -100)
    batch['decoder_attention_mask'] = left_move(batch['decoder_attention_mask'], 0)
    # Stack the list of per-sample images into a single (B, C, H, W) tensor.
    batch['pixel_values'] = torch.stack(batch['pixel_values'], dim=0)
    return batch
def img_preprocess(samples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
    """Run the training image transform over the batch's ``pixel_values`` in place."""
    samples['pixel_values'] = train_transform(samples['pixel_values'])
    return samples
if __name__ == '__main__':
    # Smoke test: push a tiny batch through tokenization, image preprocessing,
    # collation and one model forward pass (loss-only; no generate yet).
    dataset = load_dataset(
        '/home/lhy/code/TeXify/src/models/ocr_model/train/dataset/latex-formulas/latex-formulas.py',
        'cleaned_formulas'
    )['train'].select(range(20))
    tokenizer = TexTeller.get_tokenizer('/home/lhy/code/TeXify/src/models/tokenizer/roberta-tokenizer-550Kformulas')

    map_fn = partial(tokenize_fn, tokenizer=tokenizer)
    collate_fn_with_tokenizer = partial(collate_fn, tokenizer=tokenizer)

    tokenized_formula = dataset.map(map_fn, batched=True, remove_columns=dataset.column_names)
    # Bug fix: the previous version converted the Dataset to a plain dict via
    # .to_dict() before calling .with_transform(), which only exists on
    # datasets.Dataset — keep it a Dataset so the lazy transform applies.
    tokenized_dataset = tokenized_formula.with_transform(img_preprocess)

    # Turn the column-oriented slice into a list of per-sample dicts, the
    # shape a DataLoader would hand to collate_fn.
    dataset_dict = tokenized_dataset[:]
    dataset_list = [dict(zip(dataset_dict.keys(), x)) for x in zip(*dataset_dict.values())]
    batch = collate_fn_with_tokenizer(dataset_list)

    # TexTeller is already imported at module top; the old duplicate relative
    # import here was redundant and has been removed.
    model = TexTeller()
    out = model(**batch)
    pause = 1  # breakpoint anchor for inspecting `out`

View File

@@ -0,0 +1,71 @@
import torch
import torchvision
from torchvision.transforms import v2
from PIL import ImageChops, Image
from typing import Any, Dict, List
from ....globals import OCR_IMG_CHANNELS, OCR_IMG_SIZE, OCR_FIX_SIZE, IMAGE_MEAN, IMAGE_STD
def trim_white_border(image: Image.Image):
    """Crop away the uniform white border surrounding a formula image.

    Args:
        image: a PIL image in 'RGB', 'RGBA' or 'L' mode.

    Returns:
        The cropped image, or the original image unchanged when no
        non-white content is found.

    Raises:
        ValueError: for any other image mode.
    """
    if image.mode == 'RGB':
        bg_color = (255, 255, 255)
    elif image.mode == 'RGBA':
        bg_color = (255, 255, 255, 255)
    elif image.mode == 'L':
        bg_color = 255
    else:
        raise ValueError("Unsupported image mode")

    # A solid white canvas the same size as the input; border pixels that
    # match it become black in the difference image.
    bg = Image.new(image.mode, image.size, bg_color)
    diff = ImageChops.difference(image, bg)
    # Boost contrast (scale 2.0, offset -100) so near-white noise does not
    # inflate the bounding box; parameters may need tuning per dataset.
    diff = ImageChops.add(diff, diff, 2.0, -100)
    # Bounding box of the non-black (i.e. non-background) region.
    bbox = diff.getbbox()
    if bbox:
        return image.crop(bbox)
    # Bug fix: an all-white image has no bbox; previously this fell off the
    # end and returned None, crashing downstream transforms. Return the
    # image unchanged instead.
    return image
def train_transform(images: List[Image.Image]) -> List[torch.Tensor]:
    """Preprocess a batch of formula images for training.

    Pipeline: trim white borders -> tensor -> grayscale -> resize so the
    longer side fits within OCR_IMG_SIZE -> normalize -> pad bottom/right
    to a fixed OCR_IMG_SIZE x OCR_IMG_SIZE square.

    Args:
        images: list of PIL images (one per sample).

    Returns:
        List of float32 tensors, each of shape (1, OCR_IMG_SIZE, OCR_IMG_SIZE).
    """
    assert OCR_IMG_CHANNELS == 1 , "Only support grayscale images for now"
    assert OCR_FIX_SIZE == True, "Only support fixed size images for now"

    trimmed = [trim_white_border(img) for img in images]

    # NOTE: v2.ToImage on a list returns a list of torchvision Images (one
    # per sample), so every stage of this Compose consumes and produces a
    # list — the batch dimension is never stacked inside the pipeline.
    pipeline = v2.Compose([
        v2.ToImage(),
        v2.ToDtype(torch.uint8, scale=True),  # most inputs are already uint8 here
        v2.Grayscale(),                       # collapse to a single channel
        # Resize toward a fixed square; `size` must be strictly smaller than
        # `max_size`, hence OCR_IMG_SIZE - 1.
        v2.Resize(
            size=OCR_IMG_SIZE - 1,
            interpolation=v2.InterpolationMode.BICUBIC,
            max_size=OCR_IMG_SIZE,
            antialias=True
        ),
        v2.ToDtype(torch.float32, scale=True),  # Normalize expects float input
        v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),
        # v2.ToPILImage() can be appended here to eyeball results when debugging
    ])
    resized = pipeline(trimmed)

    # Pad each image on the right/bottom up to the fixed square size.
    padded = []
    for img in resized:
        pad_right = OCR_IMG_SIZE - img.shape[2]
        pad_bottom = OCR_IMG_SIZE - img.shape[1]
        padded.append(v2.functional.pad(img, padding=[0, 0, pad_right, pad_bottom]))
    return padded
def inference_transform(images: List[Image.Image]) -> List[torch.Tensor]:
    """Preprocess images for inference.

    Inference currently uses the exact same pipeline as training, so this
    validates the global settings and delegates to :func:`train_transform`.
    """
    assert OCR_IMG_CHANNELS == 1 , "Only support grayscale images for now"
    assert OCR_FIX_SIZE == True, "Only support fixed size images for now"
    return train_transform(images)