完成了 1) loader.py, 以及 2) 部分代码的loader加载路径的更改

This commit is contained in:
三洋三洋
2024-03-03 15:59:15 +00:00
parent 69b10eccc7
commit 38877d90b8
5 changed files with 9 additions and 7 deletions

View File

@@ -5,7 +5,7 @@ from ...globals import VOCAB_SIZE
# Script entry point: retrain a tokenizer on the 'latex_formula' column of a
# dataset's 'train' split and save the result to disk.
# NOTE(review): this fragment is shown as a diff hunk with indentation stripped;
# the statements below presumably form the body of this `if` — confirm against
# the actual file.
if __name__ == '__main__':
# Get the starting tokenizer from a hard-coded local path.
tokenizer = TexTeller.get_tokenizer('/home/lhy/code/TeXify/src/models/tokenizer/roberta-tokenizer-raw')
# NOTE(review): the next two lines both assign `dataset`. Given the hunk header
# (-5,7 +5,7), the first is presumably the removed (pre-commit) line and the
# second its replacement — confirm; if both ran, the second would overwrite
# the first.
dataset = load_dataset("/home/lhy/code/TeXify/src/models/ocr_model/train/dataset/latex-formulas/latex-formulas.py", "cleaned_formulas")['train']
dataset = load_dataset("/home/lhy/code/TexTeller/src/models/ocr_model/train/data/loader.py")['train']
# Train a new tokenizer from the 'latex_formula' column, with the vocabulary
# size taken from VOCAB_SIZE.
new_tokenizer = tokenizer.train_new_from_iterator(text_iterator=dataset['latex_formula'], vocab_size=VOCAB_SIZE)
# Save the trained tokenizer to a hard-coded output directory.
new_tokenizer.save_pretrained('/home/lhy/code/TeXify/src/models/tokenizer/roberta-tokenizer-550Kformulas')
# NOTE(review): `pause` is not read anywhere in the visible code; presumably a
# convenient line for a debugger breakpoint — confirm before removing.
pause = 1