写完了模型代码、Tokenizer、数据预处理、训练脚本,但目前的训练脚本没有配置generate(评估仅能看loss)

This commit is contained in:
三洋三洋
2024-01-28 06:19:23 +00:00
parent 9d27ee0585
commit c6d5c91955
18 changed files with 80058 additions and 78 deletions

View File

@@ -4,60 +4,43 @@ from ....globals import (
OCR_IMG_CHANNELS
)
from typing import (
Tuple
)
from transformers import (
DeiTConfig,
DeiTModel,
ViTConfig,
ViTModel,
TrOCRConfig,
TrOCRForCausalLM,
RobertaConfig,
RobertaModel,
RobertaTokenizerFast,
VisionEncoderDecoderConfig,
VisionEncoderDecoderModel
)
class TexTeller:
    """Bundle a tokenizer and a Hugging Face vision encoder-decoder model.

    Attributes set by ``__init__``:
        tokenizer: result of ``self.get_tokenizer(tokenizer_path)``.
        model: a ``VisionEncoderDecoderModel`` (DeiT encoder + RoBERTa decoder
            when built from scratch).
    """

    def __init__(self, encoder_path=None, decoder_path=None, tokenizer_path=None):
        """Build the model from fresh configs or from two pretrained checkpoints.

        Args:
            encoder_path: pretrained encoder checkpoint, or None to build from config.
            decoder_path: pretrained decoder checkpoint, or None to build from config.
            tokenizer_path: forwarded to ``self.get_tokenizer``.

        Raises:
            ValueError: if exactly one of ``encoder_path`` / ``decoder_path`` is given.
        """
        # NOTE(review): get_tokenizer is not defined inside this block; it is
        # assumed to be provided elsewhere on the class -- confirm.
        self.tokenizer = self.get_tokenizer(tokenizer_path)

        # Both paths must be given together or omitted together. Raised
        # explicitly because `assert` is stripped when Python runs with -O.
        if (encoder_path is None) != (decoder_path is None):
            raise ValueError(
                "encoder_path and decoder_path must both be provided or both be None"
            )

        if encoder_path is None:
            encoder_config = DeiTConfig(
                # The config field is `image_size`; an unknown `img_size` kwarg
                # would be stored as an extra attribute and the default image
                # size used instead.
                image_size=OCR_IMG_SIZE,
                num_channels=OCR_IMG_CHANNELS,
            )
            decoder_config = RobertaConfig(
                vocab_size=VOCAB_SIZE,
                is_decoder=True,
            )
            model_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(
                encoder_config,
                decoder_config,
            )
            self.model = VisionEncoderDecoderModel(model_config)
        else:
            # Two separate checkpoints must be combined with
            # from_encoder_decoder_pretrained; from_pretrained expects a single
            # combined checkpoint and would pass decoder_path on as a model arg.
            self.model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_path,
                decoder_path,
            )
class TexTeller(VisionEncoderDecoderModel):
    """Vision encoder-decoder OCR model: ViT encoder + TrOCR causal-LM decoder.

    Both sub-models are constructed from configs (no pretrained weights are
    loaded in ``__init__``); use ``from_pretrained`` to load a saved model.
    """

    def __init__(self, decoder_path=None, tokenizer_path=None):
        """Build the encoder and decoder from configs and wire them together.

        Args:
            decoder_path: accepted for interface compatibility; not used here.
            tokenizer_path: accepted for interface compatibility; not used here.
        """
        encoder = ViTModel(ViTConfig(
            image_size=OCR_IMG_SIZE,
            num_channels=OCR_IMG_CHANNELS,
        ))
        decoder = TrOCRForCausalLM(TrOCRConfig(
            vocab_size=VOCAB_SIZE,
        ))
        super().__init__(encoder=encoder, decoder=decoder)

    @classmethod
    def from_pretrained(cls, model_path: str):
        """Load a saved model from ``model_path``.

        Note: this delegates to ``VisionEncoderDecoderModel.from_pretrained``
        directly, so the returned object is a plain
        ``VisionEncoderDecoderModel`` rather than a ``TexTeller`` instance
        (``cls`` is not used).
        """
        return VisionEncoderDecoderModel.from_pretrained(model_path)

    @classmethod
    def get_tokenizer(cls, tokenizer_path: str) -> RobertaTokenizerFast:
        """Load the fast RoBERTa tokenizer stored at ``tokenizer_path``.

        Defined once, as a proper classmethod taking ``cls``, so that both
        ``TexTeller.get_tokenizer(path)`` and ``instance.get_tokenizer(path)``
        work.
        """
        return RobertaTokenizerFast.from_pretrained(tokenizer_path)
if __name__ == "__main__":
    # Manual smoke test: build the model, load the tokenizer and encode two
    # plain-English sentences so the result can be inspected in a debugger.
    texteller = TexTeller()
    tokenizer = texteller.get_tokenizer(
        '/home/lhy/code/TeXify/src/models/tokenizer/roberta-tokenizer-550Kformulas'
    )

    foo = [
        "Hello, my name is LHY.",
        "I am a researcher at the University of Science and Technology of China.",
    ]
    bar = tokenizer(foo, return_special_tokens_mask=True)

    # Convenient line to set a debugger breakpoint on.
    pause = 1

View File

@@ -0,0 +1,158 @@
import os
from functools import partial
from pathlib import Path
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, Seq2SeqTrainer
from ..model.TexTeller import TexTeller
from ..utils.preprocess import tokenize_fn, collate_fn, img_preprocess
# Hugging Face TrainingArguments for the OCR model. Keyword arguments are
# grouped by concern; the notes are the original author's, translated.
training_args = TrainingArguments(
    # ---- reproducibility / device -------------------------------------
    seed=42,        # random seed, so experiments are repeatable
    use_cpu=False,  # whether to run on CPU; running on CPU first makes early debugging easier
    # data_seed=42,             # also fix the data sampler's sampling
    # full_determinism=True,    # make training fully deterministic; harms training, debug only

    # ---- run mode -------------------------------------------------------
    do_train=True,  # whether to train (usually toggled for debugging)
    do_eval=True,   # whether to evaluate (usually toggled for debugging)
    num_train_epochs=10,  # total number of training epochs
    # max_steps=3,              # max number of training steps; if set, num_train_epochs
    #                           # is ignored (usually used for debugging)
    # label_names=['your_label_name'],  # label name(s) in the data loader; defaults to 'labels'

    # ---- output ---------------------------------------------------------
    output_dir="train_result",   # output directory
    overwrite_output_dir=False,  # if the output directory exists, keep its contents
    push_to_hub=False,  # whether to upload to the hub after training; requires a prior
                        # `huggingface-cli login` (credentials are stored in the cache folder)

    # ---- logging --------------------------------------------------------
    report_to=["tensorboard"],  # log to TensorBoard; view with `tensorboard --logdir ./logs`
    logging_dir=None,           # TensorBoard log directory (use the default)
    log_level="info",           # alternatives, low to high: debug, info, warning, error, critical
    logging_strategy="steps",   # log every fixed number of steps
    logging_steps=500,          # logging interval; an int, or a float in (0, 1) meaning a ratio
                                # of total training steps (e.g. 1.0 / 2000); usually equal to eval_steps
    logging_nan_inf_filter=False,  # do record losses that are nan or inf

    # ---- batching / data loading ---------------------------------------
    per_device_train_batch_size=128,  # training batch size per GPU
    per_device_eval_batch_size=16,    # evaluation batch size per GPU
    auto_find_batch_size=True,        # automatically search for a workable batch size (exponential decay)
    gradient_accumulation_steps=2,    # gradient accumulation; emulates a larger batch when
                                      # the batch size cannot be raised
    dataloader_pin_memory=True,  # can speed up CPU <-> GPU data transfer
    dataloader_num_workers=16,   # no multiprocessing by default; usually 4 * number of GPUs
    dataloader_drop_last=True,   # drop the last mini-batch to keep training gradients stable
    remove_unused_columns=False,  # whether to drop unused columns (default True).
    # Column removal matches the dataset's column names against the parameter
    # names of the model's forward method and deletes every column that has no
    # matching parameter. If columns are renamed inside dataset.with_transform(..),
    # the original column is therefore deleted before the transform runs.
    # Example: the dataset stores images under "images" while forward expects
    # "pixel_values"; a transform that reads "images" and renames it to
    # "pixel_values" would find "images" already gone, leaving an empty dataset
    # that breaks on slicing. Good practice: rename such columns up front with
    # dataset.rename_column.

    # ---- optimisation ---------------------------------------------------
    optim="adamw_torch",  # many AdamW variants are available, some more efficient than the
                          # classic one; with optim set, no optimizer needs to be passed to Trainer
    learning_rate=5e-5,   # learning rate
    weight_decay=0,       # weight decay
    lr_scheduler_type="cosine",  # learning-rate scheduler
    warmup_ratio=0.1,     # fraction of total steps used for warmup (e.g. with 1000 steps the
                          # lr rises from 0 to the configured value over the first 100)
    # warmup_steps=500,         # warmup steps; conflicts with warmup_ratio
    max_grad_norm=1.0,    # gradient clipping: keep the gradient norm at or below 1.0 (default 1.0)
    label_smoothing_factor=0.0,  # soft labels; 0 means disabled

    # ---- precision / memory / compilation -------------------------------
    fp16=False,  # 16-bit float training; generally not recommended, the loss blows up easily
    bf16=False,  # bfloat16 training; recommended when the hardware supports it
    gradient_checkpointing=False,  # when True, drops some intermediates in forward and recomputes
                                   # them for backward: less memory, slower forward
    # debug='underflow_overflow',  # check for overflow during training and warn; debug only
    jit_mode_eval=True,  # use PyTorch jit trace during eval; faster, but the model must be
                         # static or it raises an error
    torch_compile=True,  # compile the model with torch.compile for better training and
                         # inference performance; requires torch > 2.0, enable once the model runs
    # deepspeed='your_json_path',  # train with DeepSpeed (path to ds_config.json). Make sure
    #                              # ds_config.json agrees with the Trainer settings (learning
    #                              # rate, batch size, gradient accumulation, ...); a mismatch
    #                              # causes odd bugs that are hard to spot.

    # ---- evaluation / checkpointing -------------------------------------
    evaluation_strategy="steps",  # evaluation strategy: "steps" or "epoch"
    eval_steps=500,   # used when evaluation_strategy="steps"; defaults to logging_steps;
                      # an int, or a float in (0, 1) meaning a ratio of total training steps
    save_strategy="steps",  # checkpoint saving strategy
    save_steps=500,   # checkpoint interval; an int, or a float in (0, 1) meaning a ratio
                      # of total training steps
    save_total_limit=5,  # max number of checkpoints kept; the oldest is deleted beyond this
    load_best_model_at_end=True,  # load the best model when training ends. When True, the
                                  # checkpoint with the best evaluation result is kept,
                                  # evaluation_strategy must equal save_strategy, and
                                  # save_steps must be a multiple of eval_steps
    metric_for_best_model="eval_loss",  # metric used to pick the best model (goes with
                                        # load_best_model_at_end); may be any key of the dict
                                        # returned by compute_metrics -- note that Trainer
                                        # prefixes those keys, by default with "eval_"
    greater_is_better=False,  # smaller metric values are better (goes with metric_for_best_model)
)
def main(
    dataset_script='/home/lhy/code/TeXify/src/models/ocr_model/train/dataset/latex-formulas/latex-formulas.py',
    dataset_config='cleaned_formulas',
    tokenizer_path='/home/lhy/code/TeXify/src/models/tokenizer/roberta-tokenizer-550Kformulas',
):
    """Train TexTeller on the LaTeX-formula dataset with the module-level ``training_args``.

    Args:
        dataset_script: path of the dataset loading script given to ``load_dataset``.
        dataset_config: dataset configuration name given to ``load_dataset``.
        tokenizer_path: directory of the pretrained tokenizer.

    The defaults reproduce the previously hard-coded locations, so a bare
    ``main()`` call behaves as before.
    """
    # For a quick smoke test, append `.select(range(500))` to the split below.
    dataset = load_dataset(dataset_script, dataset_config)['train']

    tokenizer = TexTeller.get_tokenizer(tokenizer_path)
    map_fn = partial(tokenize_fn, tokenizer=tokenizer)
    collate_fn_with_tokenizer = partial(collate_fn, tokenizer=tokenizer)

    # Tokenize once up front; the original columns are dropped and replaced by
    # whatever tokenize_fn returns.
    tokenized_dataset = dataset.map(
        map_fn,
        batched=True,
        remove_columns=dataset.column_names,
        num_proc=8,
    )
    # Image preprocessing is attached as an on-access transform.
    tokenized_dataset = tokenized_dataset.with_transform(img_preprocess)

    split_dataset = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
    train_dataset, eval_dataset = split_dataset['train'], split_dataset['test']

    model = TexTeller()
    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=collate_fn_with_tokenizer,
    )
    trainer.train(resume_from_checkpoint=None)

    # Example of a metric function that could be passed to the Trainer:
    #
    #     metric = evaluate.load("accuracy")
    #
    #     def compute_metrics(eval_pred):
    #         logits, labels = eval_pred
    #         predictions = np.argmax(logits, axis=-1)
    #         return metric.compute(predictions=predictions, references=labels)
    #
    # Generation-based evaluation is not wired up yet. The former trailing
    # `model.generate()` call passed no inputs and failed once training had
    # finished, so it was removed.
if __name__ == '__main__':
    # Run with the script's own directory as the working directory, then
    # restore the caller's directory -- also when main() raises.
    cur_path = os.getcwd()
    script_dirpath = Path(__file__).resolve().parent
    os.chdir(script_dirpath)
    try:
        main()
    finally:
        os.chdir(cur_path)

View File

@@ -1,28 +0,0 @@
from ....globals import VOCAB_SIZE
from typing import (
Tuple
)
from transformers import (
RobertaConfig,
RobertaModel,
RobertaTokenizerFast
)
def get_encoder():
    """Placeholder for encoder construction; not implemented, returns None."""
    return None
def get_tokenizer() -> RobertaTokenizerFast:
    """Placeholder for tokenizer construction; not implemented, returns None."""
    return None
def get_decoder() -> RobertaModel:
    """Return a new RobertaModel built from a decoder-mode config.

    The config uses ``VOCAB_SIZE`` as vocabulary size and ``is_decoder=True``;
    the model is created from that config, not loaded from a checkpoint.
    """
    return RobertaModel(RobertaConfig(vocab_size=VOCAB_SIZE, is_decoder=True))

View File

@@ -0,0 +1,79 @@
import torch
import datasets
from datasets import load_dataset
from functools import partial
from transformers import DataCollatorForLanguageModeling
from typing import List, Dict, Any
from ...ocr_model.model.TexTeller import TexTeller
from .transforms import train_transform
def left_move(x: torch.Tensor, pad_val):
    """Shift every row of a 2-D tensor one position to the left.

    The first column is dropped and the last column is filled with
    ``pad_val``. The input is not modified.

    Args:
        x: tensor with exactly two dimensions.
        pad_val: value written into the last column of the result.

    Returns:
        A new tensor with the same shape and dtype as ``x``.

    Raises:
        ValueError: if ``x`` is not 2-dimensional.
    """
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if x.dim() != 2:
        raise ValueError('x should be 2-dimensional')

    lefted_x = torch.empty_like(x)
    if x.shape[1] == 0:
        # Zero-width rows: nothing to shift, and indexing column -1 would fail.
        return lefted_x

    lefted_x[:, :-1] = x[:, 1:]
    lefted_x[:, -1] = pad_val
    return lefted_x
def tokenize_fn(samples: Dict[str, List[Any]], tokenizer=None) -> Dict[str, List[Any]]:
    """Tokenize a batch of formulas and carry the images along.

    Args:
        samples: batch with a ``'latex_formula'`` column (passed to the
            tokenizer) and an ``'image'`` column.
        tokenizer: callable applied to the formulas with
            ``return_special_tokens_mask=True``; required despite the default.

    Returns:
        The tokenizer's output with the batch's images stored under
        ``'pixel_values'``.
    """
    assert tokenizer is not None, 'tokenizer should not be None'

    formulas = samples['latex_formula']
    encoded = tokenizer(formulas, return_special_tokens_mask=True)
    # Rename on the way out: the image column leaves as 'pixel_values'.
    encoded['pixel_values'] = samples['image']
    return encoded
def collate_fn(samples: List[Dict[str, Any]], tokenizer=None) -> Dict[str, List[Any]]:
    """Collate tokenized samples into a training batch for the encoder-decoder.

    Text features are padded and turned into causal-LM labels by
    ``DataCollatorForLanguageModeling(mlm=False)``; the images are stacked
    separately. The caller's sample dicts are left untouched.

    Args:
        samples: one dict per example; each must contain ``'pixel_values'``
            (a tensor) next to the tokenizer outputs.
        tokenizer: tokenizer handed to the language-modeling collator;
            required despite the default.

    Returns:
        The collator's batch with ``decoder_input_ids``,
        ``decoder_attention_mask``, ``labels`` and ``pixel_values``.
    """
    assert tokenizer is not None, 'tokenizer should not be None'

    # Separate images from text features without popping from the input
    # dicts, so the same samples can be collated again.
    pixel_values = [sample['pixel_values'] for sample in samples]
    text_features = [
        {key: value for key, value in sample.items() if key != 'pixel_values'}
        for sample in samples
    ]

    clm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    batch = clm_collator(text_features)

    batch['decoder_input_ids'] = batch.pop('input_ids')
    attention_mask = batch.pop('attention_mask')

    # Shift labels and the attention mask one step left so that position t is
    # paired with token t+1; the vacated last column becomes -100 / 0.
    batch['labels'] = left_move(batch['labels'], -100)
    batch['decoder_attention_mask'] = left_move(attention_mask, 0)

    # Stack the per-sample image tensors along a new leading batch dimension
    # (the original note describes the result as (B, C, H, W)).
    batch['pixel_values'] = torch.stack(pixel_values, dim=0)
    return batch
def img_preprocess(samples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
    """Run ``train_transform`` over the batch's ``'pixel_values'`` column.

    The batch dict is updated in place and also returned.
    """
    samples['pixel_values'] = train_transform(samples['pixel_values'])
    return samples
if __name__ == '__main__':
    # Manual end-to-end check on 20 samples: tokenize, preprocess the images,
    # collate one batch and run a single forward pass.
    dataset = load_dataset(
        '/home/lhy/code/TeXify/src/models/ocr_model/train/dataset/latex-formulas/latex-formulas.py',
        'cleaned_formulas'
    )['train'].select(range(20))
    tokenizer = TexTeller.get_tokenizer('/home/lhy/code/TeXify/src/models/tokenizer/roberta-tokenizer-550Kformulas')

    map_fn = partial(tokenize_fn, tokenizer=tokenizer)
    collate_fn_with_tokenizer = partial(collate_fn, tokenizer=tokenizer)

    # Keep the mapped result as a Dataset: converting it with .to_dict() first
    # would make the following .with_transform call fail, since a plain dict
    # has no such method.
    tokenized_formula = dataset.map(map_fn, batched=True, remove_columns=dataset.column_names)
    tokenized_dataset = tokenized_formula.with_transform(img_preprocess)

    # Slice everything (column name -> list of values), then regroup the
    # columns into one dict per sample, which is what collate_fn expects.
    dataset_dict = tokenized_dataset[:]
    dataset_list = [dict(zip(dataset_dict.keys(), x)) for x in zip(*dataset_dict.values())]
    batch = collate_fn_with_tokenizer(dataset_list)

    # TexTeller is the name already used above for get_tokenizer, so no
    # second import is needed here.
    model = TexTeller()
    out = model(**batch)

    # Convenient line to set a debugger breakpoint on.
    pause = 1

View File

@@ -0,0 +1,71 @@
import torch
import torchvision
from torchvision.transforms import v2
from PIL import ImageChops, Image
from typing import Any, Dict, List
from ....globals import OCR_IMG_CHANNELS, OCR_IMG_SIZE, OCR_FIX_SIZE, IMAGE_MEAN, IMAGE_STD
def trim_white_border(image: Image.Image) -> Image.Image:
    """Crop away the white margin around the content of an image.

    Args:
        image: PIL image in mode ``'RGB'``, ``'RGBA'`` or ``'L'``.

    Returns:
        The image cropped to the bounding box of its non-white content, or the
        image unchanged when no such box is found (for example a blank page).

    Raises:
        ValueError: if the image mode is not one of the supported modes.
    """
    white_by_mode = {
        'RGB': (255, 255, 255),
        'RGBA': (255, 255, 255, 255),
        'L': 255,
    }
    if image.mode not in white_by_mode:
        raise ValueError("Unsupported image mode")

    # Solid white canvas with the same mode and size as the input.
    bg = Image.new(image.mode, image.size, white_by_mode[image.mode])

    # Per-pixel absolute difference from white: white areas come out black.
    diff = ImageChops.difference(image, bg)

    # add(a, b, scale, offset) computes (a + b) / scale + offset, i.e.
    # diff - 100 here, so near-white pixels are also treated as background.
    # The threshold may need tuning for particular images.
    diff = ImageChops.add(diff, diff, 2.0, -100)

    # Bounding box of the non-black region of the difference image.
    bbox = diff.getbbox()
    if bbox:
        return image.crop(bbox)

    # Nothing to crop to: hand back the original rather than None, which
    # callers would otherwise pass on into the transform pipeline.
    return image
def train_transform(images: List[Image.Image]) -> List[torch.Tensor]:
    """Turn PIL images into normalized grayscale tensors padded to a square.

    Each image is trimmed of its white border, converted to a grayscale
    tensor, resized so that it fits within ``OCR_IMG_SIZE``, scaled to float,
    normalized with ``IMAGE_MEAN`` / ``IMAGE_STD`` and finally padded on the
    right and bottom up to ``OCR_IMG_SIZE`` x ``OCR_IMG_SIZE``.

    Args:
        images: list of PIL images.

    Returns:
        One tensor per input image.
    """
    assert OCR_IMG_CHANNELS == 1 , "Only support grayscale images for now"
    assert OCR_FIX_SIZE == True, "Only support fixed size images for now"

    trimmed = [trim_white_border(picture) for picture in images]

    # `size` has to be smaller than `max_size`, hence the "- 1".
    resize = v2.Resize(
        size=OCR_IMG_SIZE - 1,
        interpolation=v2.InterpolationMode.BICUBIC,
        max_size=OCR_IMG_SIZE,
        antialias=True,
    )
    pipeline = v2.Compose([
        # PIL -> torchvision image. Author's note: given a list, the pipeline
        # returns a list with one image per element (length == batch size),
        # not a single batched image.
        v2.ToImage(),
        v2.ToDtype(torch.uint8, scale=True),    # optional; most inputs are already uint8 here
        v2.Grayscale(),                         # grayscale (task dependent)
        resize,
        v2.ToDtype(torch.float32, scale=True),  # Normalize expects float input
        v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),
        # v2.ToPILImage()  # debug only: inspect the transformed result
    ])
    tensors = pipeline(trimmed)

    # Pad right and bottom only; padding order is [left, top, right, bottom],
    # and shape[2] / shape[1] are used as width / height.
    padded = []
    for tensor in tensors:
        pad_right = OCR_IMG_SIZE - tensor.shape[2]
        pad_bottom = OCR_IMG_SIZE - tensor.shape[1]
        padded.append(v2.functional.pad(tensor, padding=[0, 0, pad_right, pad_bottom]))
    return padded
def inference_transform(images: List[Image.Image]) -> List[torch.Tensor]:
    """Preprocess images for inference.

    Checks the same grayscale / fixed-size preconditions as training and then
    applies exactly the training pipeline (``train_transform``).
    """
    assert OCR_IMG_CHANNELS == 1 , "Only support grayscale images for now"
    assert OCR_FIX_SIZE == True, "Only support fixed size images for now"

    processed = train_transform(images)
    return processed