merge v3_nature_scence
This commit is contained in:
9740
src/models/tokenizer/roberta-tokenizer-550K/merges.txt
Normal file
9740
src/models/tokenizer/roberta-tokenizer-550K/merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"bos_token": "<s>",
|
||||
"cls_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
"mask_token": {
|
||||
"content": "<mask>",
|
||||
"lstrip": true,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": "<pad>",
|
||||
"sep_token": "</s>",
|
||||
"unk_token": "<unk>"
|
||||
}
|
||||
19830
src/models/tokenizer/roberta-tokenizer-550K/tokenizer.json
Normal file
19830
src/models/tokenizer/roberta-tokenizer-550K/tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,57 @@
|
||||
{
|
||||
"add_prefix_space": false,
|
||||
"added_tokens_decoder": {
|
||||
"0": {
|
||||
"content": "<s>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"1": {
|
||||
"content": "<pad>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"2": {
|
||||
"content": "</s>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"3": {
|
||||
"content": "<unk>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"4": {
|
||||
"content": "<mask>",
|
||||
"lstrip": true,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"bos_token": "<s>",
|
||||
"clean_up_tokenization_spaces": true,
|
||||
"cls_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
"errors": "replace",
|
||||
"mask_token": "<mask>",
|
||||
"model_max_length": 1000000000000000019884624838656,
|
||||
"pad_token": "<pad>",
|
||||
"sep_token": "</s>",
|
||||
"tokenizer_class": "RobertaTokenizer",
|
||||
"trim_offsets": true,
|
||||
"unk_token": "<unk>"
|
||||
}
|
||||
1
src/models/tokenizer/roberta-tokenizer-550K/vocab.json
Normal file
1
src/models/tokenizer/roberta-tokenizer-550K/vocab.json
Normal file
File diff suppressed because one or more lines are too long
14740
src/models/tokenizer/roberta-tokenizer-7Mformulas/merges.txt
Normal file
14740
src/models/tokenizer/roberta-tokenizer-7Mformulas/merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"bos_token": "<s>",
|
||||
"cls_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
"mask_token": {
|
||||
"content": "<mask>",
|
||||
"lstrip": true,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": "<pad>",
|
||||
"sep_token": "</s>",
|
||||
"unk_token": "<unk>"
|
||||
}
|
||||
29830
src/models/tokenizer/roberta-tokenizer-7Mformulas/tokenizer.json
Normal file
29830
src/models/tokenizer/roberta-tokenizer-7Mformulas/tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,57 @@
|
||||
{
|
||||
"add_prefix_space": false,
|
||||
"added_tokens_decoder": {
|
||||
"0": {
|
||||
"content": "<s>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"1": {
|
||||
"content": "<pad>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"2": {
|
||||
"content": "</s>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"3": {
|
||||
"content": "<unk>",
|
||||
"lstrip": false,
|
||||
"normalized": true,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"4": {
|
||||
"content": "<mask>",
|
||||
"lstrip": true,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"bos_token": "<s>",
|
||||
"clean_up_tokenization_spaces": true,
|
||||
"cls_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
"errors": "replace",
|
||||
"mask_token": "<mask>",
|
||||
"model_max_length": 1000000000000000019884624838656,
|
||||
"pad_token": "<pad>",
|
||||
"sep_token": "</s>",
|
||||
"tokenizer_class": "RobertaTokenizer",
|
||||
"trim_offsets": true,
|
||||
"unk_token": "<unk>"
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
21
src/models/tokenizer/roberta-tokenizer-raw/config.json
Normal file
21
src/models/tokenizer/roberta-tokenizer-raw/config.json
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"architectures": [
|
||||
"RobertaForMaskedLM"
|
||||
],
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"bos_token_id": 0,
|
||||
"eos_token_id": 2,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 768,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 3072,
|
||||
"layer_norm_eps": 1e-05,
|
||||
"max_position_embeddings": 514,
|
||||
"model_type": "roberta",
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 12,
|
||||
"pad_token_id": 1,
|
||||
"type_vocab_size": 1,
|
||||
"vocab_size": 50265
|
||||
}
|
||||
50001
src/models/tokenizer/roberta-tokenizer-raw/merges.txt
Normal file
50001
src/models/tokenizer/roberta-tokenizer-raw/merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
1
src/models/tokenizer/roberta-tokenizer-raw/vocab.json
Normal file
1
src/models/tokenizer/roberta-tokenizer-raw/vocab.json
Normal file
File diff suppressed because one or more lines are too long
31
src/models/tokenizer/test_long_formulas.txt
Normal file
31
src/models/tokenizer/test_long_formulas.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
\begin{aligned}
|
||||
&\begin{aligned}(\tau\lambda)\psi(a)(\lambda^{-1}\tau)(X,Y,\xi,\eta)=(\tau\lambda)\psi(a)(-\tau Y,\tau X,-\tau\eta,\tau\xi)\end{aligned} \\
|
||||
&=(\tau\lambda)\bigg(\begin{pmatrix}-a\tau\eta_1&-\tau y_3&-\tau\overline{y}_2\\-\tau\overline{y}_3&-a^{-1}\tau\eta_2&-a^{-1}\tau y_1\\-\tau y_2&-a^{-1}\tau\overline{y}_1&-a^{-1}\tau\eta_3\end{pmatrix},\begin{pmatrix}a^{-1}\tau\xi_1&\tau x_3&\tau\overline{x}_2\\\tau\overline{x}_3&a\tau\xi_2&a\tau x_1\\\tau x_2&a\tau\overline{x}_1&a\tau\xi_3\end{pmatrix},-a\tau\eta,a^{-1}\tau\xi\bigg) \\
|
||||
&\left.=\left(\begin{pmatrix}\tau a^{-1}\xi_1&x_3&\overline{x}_2\\\overline{x}_3&\tau a\xi_2&\tau ax_1\\x_2&\tau a\overline{x}_1&\tau a\xi_3\end{pmatrix}\right.,\begin{pmatrix}\tau a\eta_1&y_3&\overline{y}_2\\\overline{y}_3&\tau a^{-1}\eta_2&\tau a^{-1}y_1\\y_2&\tau a^{-1}\overline{y}_1&\tau a^{-1}\eta_3\end{pmatrix},\tau a^{-1}\xi,\tau a\eta\right) \\
|
||||
&=\psi(\tau a^{-1}).
|
||||
\end{aligned}
|
||||
|
||||
\begin{aligned}
|
||||
&\begin{aligned}-L_{X_{13}}&=\left(\frac{1}{2}\sin\alpha\cos\beta\sin2\gamma+\cos\alpha\tan\beta\sin^2\gamma-\frac{1}{2}\sin\alpha\sin\beta\tan\beta\sin2\gamma\right)\frac{\partial}{\partial\alpha}\end{aligned} \\
|
||||
&\begin{aligned}+\left(\frac12\cos\alpha\sin\beta\sin2\gamma-\sin\alpha\sin^2\beta\cos^2\gamma-\sin\alpha\cos^2\beta\sin^2\gamma\right)\frac\partial{\partial\beta}\end{aligned} \\
|
||||
&\begin{aligned}+\left(\frac14\sin\alpha\sin2\beta\sin2\gamma-\frac12\sin\alpha\tan\beta\sin2\gamma+\cos\alpha\sec\beta\sin^2\gamma\right)\frac{\partial}{\partial\gamma}\end{aligned} \\
|
||||
&+\left(\left(\frac12\sin\alpha\sin2\beta\cos^2\gamma+\frac12\sin\alpha\sin2\beta-\frac12\cos\alpha\cos\beta\sin2\gamma\right)z_{12}\right. \\
|
||||
&+(\sin\alpha\cos2\beta\cos\gamma+\cos\alpha\sin\beta\sin\gamma)\biggr)\frac{\partial}{\partial z_{12}} \\
|
||||
&+\left(\left(\frac12\sin\alpha\sin2\beta\cos2\gamma-\cos\alpha\cos\beta\sin2\gamma\right)z_{13}+(\sin\alpha\cos2\beta\cos\gamma\right. \\
|
||||
&\left.\left.+\cos\alpha\sin\beta\sin\gamma\right)z_{23}+\left(\frac12\sin\alpha\sin2\beta\sin2\gamma+\cos\alpha\cos\beta\cos2\gamma\right)\right)\frac{\partial}{\partial z_{13}} \\
|
||||
&+\left(\left(-\frac12\sin\alpha\sin2\beta-\frac12\sin\alpha\sin2\beta\sin^2\gamma-\frac12\cos\alpha\cos\beta\sin2\gamma\right)z_{23}\right. \\
|
||||
&+(\sin\alpha\cos2\beta\sin\gamma-\cos\alpha\sin\beta\cos\gamma)\Bigg)\frac{\partial}{\partial z_{23}}.
|
||||
\end{aligned}
|
||||
|
||||
\begin{aligned}
|
||||
&\sum_S(-1)^{|S|}\frac{1-\prod_{i\notin S}\left(\frac{X_i(1+X_i)}{Q+X_i}\right)^{m+1}}{1-\prod_{i\notin S}\frac{X_i(1+X_i)}{Q+X_i}}\prod_iX_i \\
|
||||
&\times\prod_{i\in S}X_{i}^{m+n-1}(1+X_{i})^{m+1}(Q+X_{i})^{-m}(X_{i}+r+Q)^{n-1} \\
|
||||
&\times\prod_{i\notin S}(1+X_i)(Q+rX_i+QX_i)^{n-1} \\
|
||||
&&\times\prod_{1\leq i<j\leq n,\{i,j\}\cap S\neq\emptyset}\left(\frac{Y_j(1+Y_j)}{Q+rY_j+QY_j}-\frac{Y_i(1+Y_i)}{Q+rY_i+QY_i}\right) \\
|
||||
&&&\times\sum_{k\notin S}(Q-X_{k}^{2})X_{k}^{-1}(1+X_{k})^{-1} \\
|
||||
&&&\times\prod_{\overset{1\leq i\leq k-1}{i\notin S}}\frac{(Q+(Q+r)X_k+X_i+X_iX_k)(X_iX_k-Q)}{(Q+rX_k+QX_k)(Q+rX_i+QX_i)} \\
|
||||
&&&\times\prod_{\overset{k+1\leq i\leq n}{i\notin S}}\frac{(Q+(Q+r)X_k+X_i+X_iX_k)(Q-X_iX_k)}{(Q+rX_k+QX_k)(Q+rX_i+QX_i)} \\
|
||||
&&&&\times\prod_{1\leq i<j\leq n,i,j\notin S\cup\{k\}}\left(\frac{X_j(1+X_j)}{Q+rX_j+QX_j}-\frac{X_i(1+X_i)}{Q+rX_i+QX_i}\right).
|
||||
\end{aligned}
|
||||
|
||||
\[w_{\mathbb{A}}\left(\begin{bmatrix}T_{1}&T_{2}&T_{3}\\ T_{2}&T_{3}&iT_{1}\\ T_{3}&iT_{1}&iT_{2}\end{bmatrix}\right)=w_{\mathbb{A}}\left(\mathbb{V}^{\#_{ \mathbb{A}}}\begin{bmatrix}T_{1}&T_{2}&T_{3}\\ T_{2}&T_{3}&iT_{1}\\ T_{3}&iT_{1}&iT_{2}\end{bmatrix}\mathbb{V}\right)\] \[=\frac{1}{2}w_{\mathbb{A}}\left(\begin{bmatrix}T_{1}^{\#_{A}}- iT_{2}^{\#_{A}}&-i\sqrt{2}(T_{1}^{\#_{A}}+T_{2}^{\#_{A}})&2T_{3}^{\#_{A}}- iT_{1}^{\#_{A}}+T_{2}^{\#_{A}}\\ i\sqrt{2}(T_{2}^{\#_{A}}-T_{1}^{\#_{A}})&2T_{3}^{\#_{A}}&\sqrt{2}(T_{1}^{\#_{A} }+T_{2}^{\#_{A}})\\ 2T_{3}^{\#_{A}}-(-iT_{1}^{\#_{A}}+T_{2}^{\#_{A}})&\sqrt{2}(T_{2}^{\#_{A}}-T_{ 1}^{\#_{A}})&T_{1}^{\#_{A}}-iT_{2}^{\#_{A}}\end{bmatrix}\right)\] \[\leq w_{\mathbb{A}}\left(\begin{bmatrix}O&O&T_{3}\\ O&T_{3}&O\\ T_{3}&O&O\end{bmatrix}^{\#_{\mathbb{A}}}\right)+\frac{1}{2}w_{\mathbb{A}}\left( \begin{bmatrix}T_{1}+iT_{2}&O&-(iT_{1}+T_{2})\\ O&O&O\\ iT_{1}+T_{2}&O&T_{1}+iT_{2}\end{bmatrix}^{\#_{\mathbb{A}}}\right)\] \[+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix}O&-i(T_{2} -T_{1})&O\\ i(T_{1}+T_{2})&O&O\\ O&O&O\end{bmatrix}^{\#_{\mathbb{A}}}\right)+\frac{1}{\sqrt{2}}w_{\mathbb{A}} \left(\begin{bmatrix}O&O&O\\ O&O&(T_{2}-T_{1})\\ O&T_{1}+T_{2}&O\end{bmatrix}^{\#_{\mathbb{A}}}\right)\] \[=w_{\mathbb{A}}\left(\begin{bmatrix}O&O&T_{3}\\ O&T_{3}&O\\ T_{3}&O&O\end{bmatrix}\right)+\frac{1}{2}w_{\mathbb{A}}\left(\begin{bmatrix}T_{ 1}+iT_{2}&O&-(iT_{1}+T_{2})\\ O&O&O\\ iT_{1}+T_{2}&O&T_{1}+iT_{2}\end{bmatrix}\right)\] \[+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix}O&-i(T_{2} -T_{1})&O\\ i(T_{1}+T_{2})&O&O\\ O&O&O\end{bmatrix}\right)+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix} O&O&O\\ O&O&(T_{2}-T_{1})\\ O&T_{1}+T_{2}&O\end{bmatrix}\right)\] \[\leq w_{A}(T_{3})+\max\{w_{A}(T_{1}),w_{A}(T_{2})\}+\frac{1}{ \sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix}O&-i(T_{2}-T_{1})&O\\ O&O&O\\ O&O&O\end{bmatrix}\right)+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix} O&O&O\\ i(T_{1}+T_{2})&O&O\\ O&O&O\end{bmatrix}\right)\] 
\[+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix}O&O&O\\ O&O&(T_{2}-T_{1})\\ O&O&O\end{bmatrix}\right)+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix} O&O&O\\ O&O&O\\ O&T_{1}+T_{2}&O\end{bmatrix}\right)\] \[=w_{A}(T_{3})+\max\{w_{A}(T_{1}),w_{A}(T_{2})\}+\frac{1}{\sqrt{2 }}\left(\|T_{1}-T_{2}\|_{A}+\|T_{1}+T_{2}\|_{A}\right),\]
|
||||
11
src/models/tokenizer/train/train.py
Normal file
11
src/models/tokenizer/train/train.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from datasets import load_dataset

from ...ocr_model.model.TexTeller import TexTeller
from ...globals import VOCAB_SIZE

# Paths are machine-specific; adjust (or make configurable) when running elsewhere.
# NOTE(review): consider deriving these from this file's location instead of
# hard-coding an absolute home directory.
RAW_TOKENIZER_DIR = '/home/lhy/code/TexTeller/src/models/tokenizer/roberta-tokenizer-raw'
DATASET_LOADER = '/home/lhy/code/TexTeller/src/models/ocr_model/train/data/loader.py'
OUTPUT_TOKENIZER_DIR = '/home/lhy/code/TexTeller/src/models/tokenizer/roberta-tokenizer-7Mformulas'


if __name__ == '__main__':
    # Start from the stock RoBERTa tokenizer and retrain its BPE vocabulary
    # on the project's LaTeX-formula corpus.
    tokenizer = TexTeller.get_tokenizer(RAW_TOKENIZER_DIR)
    dataset = load_dataset(DATASET_LOADER)['train']
    new_tokenizer = tokenizer.train_new_from_iterator(
        text_iterator=dataset['latex_formula'],
        vocab_size=VOCAB_SIZE,
    )
    new_tokenizer.save_pretrained(OUTPUT_TOKENIZER_DIR)
|
||||
Reference in New Issue
Block a user