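"""Train a new TexTeller tokenizer from a dataset of LaTeX formulas.

Loads the dataset produced by loader.py, retrains the existing TexTeller
tokenizer's vocabulary on the `latex_formula` column, and saves the result
for later training and inference.
"""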
import os
from pathlib import Path

from datasets import load_dataset

from ..ocr_model.model.TexTeller import TexTeller
from ..globals import VOCAB_SIZE


if __name__ == '__main__':
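    # Work from the script's directory so the relative dataset path below resolves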
    script_dirpath = Path(__file__).resolve().parent
    os.chdir(script_dirpath)
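
    # Load the current TexTeller tokenizer; train_new_from_iterator below uses
    # it as the template (same algorithm and special tokens, new vocabulary)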
    tokenizer = TexTeller.get_tokenizer()

    # Don't forget to configure your dataset path in loader.py
    dataset = load_dataset('../ocr_model/train/dataset/loader.py')['train']
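
    # Retrain the tokenizer on every formula in the corpus; this learns a new
    # vocabulary of VOCAB_SIZE tokens tailored to LaTeX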
    new_tokenizer = tokenizer.train_new_from_iterator(
        text_iterator=dataset['latex_formula'],
        # To use a different vocab size, change VOCAB_SIZE in globals.py
        vocab_size=VOCAB_SIZE,
    )

    # Save the new tokenizer for later training and inference
    new_tokenizer.save_pretrained('./your_dir_name')