Update README_zh.md
src/models/tokenizer/train.py (Normal file, +25 lines)
@@ -0,0 +1,25 @@
import os
from pathlib import Path

from datasets import load_dataset

from ..ocr_model.model.TexTeller import TexTeller
from ..globals import VOCAB_SIZE


if __name__ == '__main__':
    # Work from this script's directory so the relative paths below resolve
    script_dirpath = Path(__file__).resolve().parent
    os.chdir(script_dirpath)

    tokenizer = TexTeller.get_tokenizer()

    # Don't forget to configure your dataset path in loader.py (sketched below)
    dataset = load_dataset('../ocr_model/train/dataset/loader.py')['train']

    # Retrain the tokenizer on the corpus of LaTeX formulas
    new_tokenizer = tokenizer.train_new_from_iterator(
        text_iterator=dataset['latex_formula'],
        # To use a different vocab size, change VOCAB_SIZE in globals.py
        vocab_size=VOCAB_SIZE
    )

    # Save the new tokenizer for later training and inference
    new_tokenizer.save_pretrained('./your_dir_name')
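
The loader.py referenced above is not part of this diff. Purely as a hypothetical illustration, a minimal Hugging Face datasets loading script exposing a 'latex_formula' column might look like the following; the class name, DATASET_PATH, and the one-formula-per-line file format are all assumptions, not the repo's actual loader:

import datasets

DATASET_PATH = '/path/to/formulas.txt'  # assumption: point this at your corpus

class LatexFormulas(datasets.GeneratorBasedBuilder):
    def _info(self):
        # One string column, matching dataset['latex_formula'] in train.py
        return datasets.DatasetInfo(
            features=datasets.Features({'latex_formula': datasets.Value('string')})
        )

    def _split_generators(self, dl_manager):
        return [datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={'filepath': DATASET_PATH},
        )]

    def _generate_examples(self, filepath):
        # Assumption: one LaTeX formula per line
        with open(filepath, encoding='utf-8') as f:
            for idx, line in enumerate(f):
                yield idx, {'latex_formula': line.strip()}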
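Once saved, the new tokenizer can be loaded back for training or inference. A minimal sketch, assuming the standard transformers AutoTokenizer API; './your_dir_name' and the sample formula are placeholders:

from transformers import AutoTokenizer

# Load the retrained tokenizer written by save_pretrained above
tokenizer = AutoTokenizer.from_pretrained('./your_dir_name')

ids = tokenizer.encode(r'\frac{a}{b} + \sqrt{x^{2} + y^{2}}')
print(ids)                    # token ids under the new vocabulary
print(tokenizer.decode(ids))  # should round-trip back to the formula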