From ab1a05bf32463d3d3fdd049f8d0b999f8a6ffbeb Mon Sep 17 00:00:00 2001
From: 三洋三洋 <1258009915@qq.com>
Date: Wed, 31 Jan 2024 15:27:35 +0000
Subject: [PATCH] Completed all the code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/globals.py                                |  3 +-
 ...t.tfevents.1706528656.ubuntu-xyp.1184426.0 | Bin 349 -> 0 bytes
 ...t.tfevents.1706528694.ubuntu-xyp.1185240.0 | Bin 349 -> 0 bytes
 ...t.tfevents.1706595465.ubuntu-xyp.1434641.0 | Bin 349 -> 0 bytes
 ...t.tfevents.1706595552.ubuntu-xyp.1435357.0 | Bin 349 -> 0 bytes
 ...t.tfevents.1706615305.ubuntu-xyp.1506187.0 | Bin 349 -> 0 bytes
 src/models/ocr_model/train/train.py           | 71 +++--------------
 src/models/ocr_model/train/training_args.py   |  6 +-
 src/models/ocr_model/utils/functional.py      | 46 ------------
 src/models/ocr_model/utils/transforms.py      |  9 +--
 10 files changed, 19 insertions(+), 116 deletions(-)
 delete mode 100644 src/models/ocr_model/train/debug_dir/runs/Jan29_11-44-00_ubuntu-xyp/events.out.tfevents.1706528656.ubuntu-xyp.1184426.0
 delete mode 100644 src/models/ocr_model/train/debug_dir/runs/Jan29_11-44-43_ubuntu-xyp/events.out.tfevents.1706528694.ubuntu-xyp.1185240.0
 delete mode 100644 src/models/ocr_model/train/debug_dir/runs/Jan30_06-17-24_ubuntu-xyp/events.out.tfevents.1706595465.ubuntu-xyp.1434641.0
 delete mode 100644 src/models/ocr_model/train/debug_dir/runs/Jan30_06-18-27_ubuntu-xyp/events.out.tfevents.1706595552.ubuntu-xyp.1435357.0
 delete mode 100644 src/models/ocr_model/train/debug_dir/runs/Jan30_11-48-17_ubuntu-xyp/events.out.tfevents.1706615305.ubuntu-xyp.1506187.0

diff --git a/src/globals.py b/src/globals.py
index 4ede0ca..96afec9 100644
--- a/src/globals.py
+++ b/src/globals.py
@@ -30,7 +30,8 @@ OCR_IMG_MAX_WIDTH = 768
 OCR_IMG_CHANNELS = 1  # grayscale image

 # Maximum number of tokens in the OCR model's training dataset
-MAX_TOKEN_SIZE = 600
+MAX_TOKEN_SIZE = 512  # the model's maximum embedding length is set to 512, so this must be 512
+# MAX_TOKEN_SIZE = 600

 # Random resize ratio used when training the OCR model
 MAX_RESIZE_RATIO = 1.15
diff --git a/src/models/ocr_model/train/debug_dir/runs/Jan29_11-44-00_ubuntu-xyp/events.out.tfevents.1706528656.ubuntu-xyp.1184426.0 b/src/models/ocr_model/train/debug_dir/runs/Jan29_11-44-00_ubuntu-xyp/events.out.tfevents.1706528656.ubuntu-xyp.1184426.0
deleted file mode 100644
index f9a268a128ea5fd44498979f420b696182738074..0000000000000000000000000000000000000000
GIT binary patch
(base85-encoded binary data for the deleted TensorBoard event file omitted)
diff --git a/src/models/ocr_model/train/debug_dir/runs/Jan29_11-44-43_ubuntu-xyp/events.out.tfevents.1706528694.ubuntu-xyp.1185240.0 b/src/models/ocr_model/train/debug_dir/runs/Jan29_11-44-43_ubuntu-xyp/events.out.tfevents.1706528694.ubuntu-xyp.1185240.0
deleted file mode 100644
index 4695c5ca8d5e66790afa5b8e32610d3821f1db24..0000000000000000000000000000000000000000
GIT binary patch
(base85-encoded binary data for the deleted TensorBoard event file omitted)
(binary patch data for the three remaining deleted Jan30 TensorBoard event files omitted)
diff --git a/src/models/ocr_model/train/train.py b/src/models/ocr_model/train/train.py
index 05e7ec2..01d6b6e 100644
--- a/src/models/ocr_model/train/train.py
+++ b/src/models/ocr_model/train/train.py
@@ -32,6 +32,7 @@ def train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokeniz

 def evaluate(model, tokenizer, eval_dataset, collate_fn):
     eval_config = CONFIG.copy()
+    eval_config['predict_with_generate'] = True
     generate_config = GenerationConfig(
         max_new_tokens=MAX_TOKEN_SIZE,
         num_beams=1,
@@ -40,106 +41,54 @@ def evaluate(model, tokenizer, eval_dataset, collate_fn):
         eos_token_id=tokenizer.eos_token_id,
         bos_token_id=tokenizer.bos_token_id,
     )
-    # eval_config['use_cpu'] = True
-    eval_config['output_dir'] = 'debug_dir'
-    eval_config['predict_with_generate'] = True
-    eval_config['predict_with_generate'] = True
-    eval_config['dataloader_num_workers'] = 1
-    eval_config['jit_mode_eval'] = False
-    eval_config['torch_compile'] = False
-    eval_config['auto_find_batch_size'] = False
     eval_config['generation_config'] = generate_config
+    eval_config['auto_find_batch_size'] = False
     seq2seq_config = Seq2SeqTrainingArguments(**eval_config)
     trainer = Seq2SeqTrainer(
         model,
         seq2seq_config,
-        eval_dataset=eval_dataset.select(range(100)),
+        eval_dataset=eval_dataset,
         tokenizer=tokenizer,
         data_collator=collate_fn,
         compute_metrics=partial(bleu_metric, tokenizer=tokenizer)
     )
     res = trainer.evaluate()
-    pause = 1
-    ...
+    print(res)

-
 if __name__ == '__main__':
     cur_path = os.getcwd()
     script_dirpath = Path(__file__).resolve().parent
     os.chdir(script_dirpath)
-
     dataset = load_dataset(
         '/home/lhy/code/TeXify/src/models/ocr_model/train/dataset/latex-formulas/latex-formulas.py',
         'cleaned_formulas'
     )['train']
-    # dataset = load_dataset(
-    #     '/home/lhy/code/TeXify/src/models/ocr_model/train/dataset/latex-formulas/latex-formulas.py',
-    #     'cleaned_formulas'
-    # )['train'].select(range(1000))
+    dataset = dataset.shuffle(seed=42)
+    dataset = dataset.flatten_indices()

     tokenizer = TexTeller.get_tokenizer('/home/lhy/code/TeXify/src/models/tokenizer/roberta-tokenizer-550Kformulas')

     map_fn = partial(tokenize_fn, tokenizer=tokenizer)
     tokenized_dataset = dataset.map(map_fn, batched=True, remove_columns=dataset.column_names, num_proc=8, load_from_cache_file=True)
-    # tokenized_dataset = dataset.map(map_fn, batched=True, remove_columns=dataset.column_names, num_proc=1)
     tokenized_dataset = tokenized_dataset.with_transform(img_transform_fn)

     split_dataset = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
     train_dataset, eval_dataset = split_dataset['train'], split_dataset['test']
     collate_fn_with_tokenizer = partial(collate_fn, tokenizer=tokenizer)
     # model = TexTeller()
-    model = TexTeller.from_pretrained('/home/lhy/code/TeXify/src/models/ocr_model/train/train_result/checkpoint-80500')
+    model = TexTeller.from_pretrained('/home/lhy/code/TeXify/src/models/ocr_model/train/train_result/bugy_train_without_random_resize/checkpoint-82000')

-    enable_train = False
-    enable_evaluate = True
+    enable_train = True
+    enable_evaluate = False
     if enable_train:
         train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokenizer)
     if enable_evaluate:
         evaluate(model, tokenizer, eval_dataset, collate_fn_with_tokenizer)
-    os.chdir(cur_path)
-
-
-'''
-if __name__ == '__main__':
-    cur_path = os.getcwd()
-    script_dirpath = Path(__file__).resolve().parent
-    os.chdir(script_dirpath)
-
-
-    dataset = load_dataset(
-        '/home/lhy/code/TeXify/src/models/ocr_model/train/dataset/latex-formulas/latex-formulas.py',
-        'cleaned_formulas'
-    )['train']
-
-    pause = dataset[0]['image']
-    tokenizer = TexTeller.get_tokenizer('/home/lhy/code/TeXify/src/models/tokenizer/roberta-tokenizer-550Kformulas')
-
-    map_fn = partial(tokenize_fn, tokenizer=tokenizer)
-    tokenized_dataset = dataset.map(map_fn, batched=True, remove_columns=dataset.column_names, num_proc=8)
-    tokenized_dataset = tokenized_dataset.with_transform(img_preprocess)
-
-    split_dataset = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
-    train_dataset, eval_dataset = split_dataset['train'], split_dataset['test']
-    collate_fn_with_tokenizer = partial(collate_fn, tokenizer=tokenizer)
-    # model = TexTeller()
-    model = TexTeller.from_pretrained('/home/lhy/code/TeXify/src/models/ocr_model/train/train_result/checkpoint-81000')
-
-    enable_train = False
-    enable_evaluate = True
-    if enable_train:
-        train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokenizer)
-    if enable_evaluate:
-        evaluate(model, tokenizer, eval_dataset, collate_fn_with_tokenizer)
-
-
-    os.chdir(cur_path)
-
-
-'''
\ No newline at end of file
+    os.chdir(cur_path)
\ No newline at end of file
diff --git a/src/models/ocr_model/train/training_args.py b/src/models/ocr_model/train/training_args.py
index ddec056..4d818d9 100644
--- a/src/models/ocr_model/train/training_args.py
+++ b/src/models/ocr_model/train/training_args.py
@@ -4,13 +4,13 @@ CONFIG = {
     # "data_seed": 42,              # also fix the data sampler's sampling
     # "full_determinism": True,     # make the whole training run fully deterministic (this setting hurts training, debug only)

-    "output_dir": "train_result",   # output directory
+    "output_dir": "train_result/train_with_random_resize",  # output directory
     "overwrite_output_dir": False,  # if the output directory already exists, do not delete its contents
     "report_to": ["tensorboard"],   # log to TensorBoard,
     #+ view the logs by running: tensorboard --logdir ./logs
     "logging_dir": None,            # storage directory for the TensorBoard log files (use the default)
-    "log_level": "info",            # other options: 'debug', 'info', 'warning', 'error' and 'critical' (from lowest to highest level)
+    "log_level": "warning",         # other options: 'debug', 'info', 'warning', 'error' and 'critical' (from lowest to highest level)
     "logging_strategy": "steps",    # log once every fixed number of steps
     "logging_steps": 500,           # step interval for logging; can be an int or a float in (0, 1), where a float means a ratio of the total training steps (e.g. 1.0 / 2000)
     #+ usually kept consistent with eval_steps
@@ -22,7 +22,7 @@ CONFIG = {

     # "label_names": ['your_label_name'],  # name of the labels in the data_loader; defaults to 'labels' if not specified

-    "per_device_train_batch_size": 128,  # batch size per GPU
+    "per_device_train_batch_size": 64,   # batch size per GPU
     "per_device_eval_batch_size": 16,    # evaluation batch size per GPU
     "auto_find_batch_size": True,        # automatically search for a suitable batch size (exponential decay)
diff --git a/src/models/ocr_model/utils/functional.py b/src/models/ocr_model/utils/functional.py
index 748f42c..92f40c0 100644
--- a/src/models/ocr_model/utils/functional.py
+++ b/src/models/ocr_model/utils/functional.py
@@ -38,7 +38,6 @@ def collate_fn(samples: List[Dict[str, Any]], tokenizer=None) -> Dict[str, List[
     # left-shift labels and decoder_attention_mask
     batch['labels'] = left_move(batch['labels'], -100)
-    # batch['decoder_attention_mask'] = left_move(batch['decoder_attention_mask'], 0)

     # convert the list of Images into a single tensor with shape (B, C, H, W)
     batch['pixel_values'] = torch.stack(batch['pixel_values'], dim=0)
@@ -76,48 +75,3 @@ if __name__ == '__main__':
     out = model(**batch)

     pause = 1
-
-
-'''
-def left_move(x: torch.Tensor, pad_val):
-    assert len(x.shape) == 2, 'x should be 2-dimensional'
-    lefted_x = torch.ones_like(x)
-    lefted_x[:, :-1] = x[:, 1:]
-    lefted_x[:, -1] = pad_val
-    return lefted_x
-
-
-def tokenize_fn(samples: Dict[str, List[Any]], tokenizer=None) -> Dict[str, List[Any]]:
-    assert tokenizer is not None, 'tokenizer should not be None'
-    tokenized_formula = tokenizer(samples['latex_formula'], return_special_tokens_mask=True)
-    tokenized_formula['pixel_values'] = samples['image']
-    return tokenized_formula
-
-
-def collate_fn(samples: List[Dict[str, Any]], tokenizer=None) -> Dict[str, List[Any]]:
-    assert tokenizer is not None, 'tokenizer should not be None'
-    pixel_values = [dic.pop('pixel_values') for dic in samples]
-
-    clm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-
-    batch = clm_collator(samples)
-    batch['pixel_values'] = pixel_values
-    batch['decoder_input_ids'] = batch.pop('input_ids')
-    batch['decoder_attention_mask'] = batch.pop('attention_mask')
-
-    # left-shift labels and decoder_attention_mask
-    batch['labels'] = left_move(batch['labels'], -100)
-    batch['decoder_attention_mask'] = left_move(batch['decoder_attention_mask'], 0)
-
-    # convert the list of Images into a single tensor with shape (B, C, H, W)
-    batch['pixel_values'] = torch.stack(batch['pixel_values'], dim=0)
-    return batch
-
-
-def img_preprocess(samples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-    processed_img = train_transform(samples['pixel_values'])
-    samples['pixel_values'] = processed_img
-    return samples
-
-'''
-
diff --git a/src/models/ocr_model/utils/transforms.py b/src/models/ocr_model/utils/transforms.py
index 9f744e4..b5b833f 100644
--- a/src/models/ocr_model/utils/transforms.py
+++ b/src/models/ocr_model/utils/transforms.py
@@ -4,7 +4,6 @@ import numpy as np
 import cv2

 from torchvision.transforms import v2
-from PIL import ImageChops, Image
 from typing import List, Union

 from ....globals import (
@@ -107,7 +106,7 @@ def random_resize(
     ]


-def general_transform(images: List[Image.Image]) -> List[torch.Tensor]:
+def general_transform(images: List[np.ndarray]) -> List[torch.Tensor]:
     # trim the white borders
     images = [trim_white_border(image) for image in images]
     # general transform pipeline
@@ -117,16 +116,16 @@ def general_transform(images: List[Image.Image]) -> List[torch.Tensor]:
     return images


-def train_transform(images: List[np.ndarray]) -> List[torch.Tensor]:
+def train_transform(images: List[List[List[List]]]) -> List[torch.Tensor]:
     assert OCR_IMG_CHANNELS == 1 , "Only support grayscale images for now"
     assert OCR_FIX_SIZE == True, "Only support fixed size images for now"

     # random resize first
-    # images = random_resize(images, MIN_RESIZE_RATIO, MAX_RESIZE_RATIO)
+    images = random_resize(images, MIN_RESIZE_RATIO, MAX_RESIZE_RATIO)

     return general_transform(images)


-def inference_transform(images: List[Image.Image]) -> List[torch.Tensor]:
+def inference_transform(images: List[np.ndarray]) -> List[torch.Tensor]:
     assert OCR_IMG_CHANNELS == 1 , "Only support grayscale images for now"
     assert OCR_FIX_SIZE == True, "Only support fixed size images for now"
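
Note on the evaluate() change in train.py (not part of the patch itself): after this patch the evaluation overrides shrink to predict_with_generate, generation_config, and auto_find_batch_size. The sketch below shows how such a dict is turned into Seq2SeqTrainingArguments with the transformers API; the CONFIG stand-in and the decoding values are placeholders, not the repository's actual settings.

from transformers import GenerationConfig, Seq2SeqTrainingArguments

# Stand-in for the CONFIG dict from training_args.py (placeholder values).
CONFIG = {"output_dir": "train_result", "per_device_eval_batch_size": 16}

# Decoding settings used during evaluation; 512 matches MAX_TOKEN_SIZE after this patch.
generate_config = GenerationConfig(max_new_tokens=512, num_beams=1, do_sample=False)

eval_config = CONFIG.copy()
eval_config["predict_with_generate"] = True         # run generate() during evaluation so text metrics such as BLEU can be computed
eval_config["generation_config"] = generate_config  # hand the decoding settings to the trainer
eval_config["auto_find_batch_size"] = False         # keep the evaluation batch size fixed

seq2seq_args = Seq2SeqTrainingArguments(**eval_config)  # these args are then passed to Seq2SeqTrainer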
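
Note on the functional.py hunk (not part of the patch itself): collate_fn still left-shifts labels but no longer shifts decoder_attention_mask. The helper below is copied from the commented-out block that this patch deletes; it shifts every row one position to the left, which lines the labels up one step ahead of decoder_input_ids (the usual next-token target alignment). The token ids in the example are made up.

import torch

def left_move(x: torch.Tensor, pad_val):
    # Shift each row one position to the left and fill the freed last column with pad_val.
    assert len(x.shape) == 2, 'x should be 2-dimensional'
    lefted_x = torch.ones_like(x)
    lefted_x[:, :-1] = x[:, 1:]
    lefted_x[:, -1] = pad_val
    return lefted_x

labels = torch.tensor([[0, 11, 12, 2, -100]])  # <bos> tok1 tok2 <eos> pad
print(left_move(labels, -100))                 # tensor([[  11,   12,    2, -100, -100]])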