TexTeller/texteller/models/tokenizer/train.py
import os
from pathlib import Path
from datasets import load_dataset
from ..ocr_model.model.TexTeller import TexTeller
from ..globals import VOCAB_SIZE
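
# NOTE: this script uses relative imports, so it presumably has to be run as a
# module from the package root rather than invoked directly, e.g.
#   python -m texteller.models.tokenizer.train
# (module path inferred from the file location; adjust if the layout differs)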
if __name__ == '__main__':
    script_dirpath = Path(__file__).resolve().parent
    os.chdir(script_dirpath)

    tokenizer = TexTeller.get_tokenizer()
    # Don't forget to configure your dataset path in loader.py
    dataset = load_dataset('../ocr_model/train/dataset/loader.py')['train']

    new_tokenizer = tokenizer.train_new_from_iterator(
        text_iterator=dataset['latex_formula'],
        # To use a different vocab size, change VOCAB_SIZE in globals.py
        vocab_size=VOCAB_SIZE,
    )

    # Save the new tokenizer for later training and inference
    new_tokenizer.save_pretrained('./your_dir_name')
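
Once saved, the retrained tokenizer can be reloaded for later training or inference. A minimal sketch, assuming the placeholder directory ./your_dir_name used above and that the tokenizer is a standard Hugging Face fast tokenizer (which the use of train_new_from_iterator and save_pretrained suggests):

from transformers import AutoTokenizer

# Reload the tokenizer from the directory written by save_pretrained()
tokenizer = AutoTokenizer.from_pretrained('./your_dir_name')

# Sanity check: round-trip a LaTeX formula through the new vocabulary
ids = tokenizer.encode(r'\frac{a}{b}')
print(tokenizer.decode(ids, skip_special_tokens=True))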