Adjusted the project structure after merging dev

This commit is contained in:
三洋三洋
2024-04-21 00:48:24 +08:00
parent eac7f455d6
commit 9b7e392c66
66 changed files with 190 additions and 124855 deletions

4
.gitignore vendored
View File

@@ -7,9 +7,9 @@
**/*cache
**/.cache
**/tmp
**/log
**/data
**/log
**/logs
**/tmp*
**/data
@@ -18,4 +18,4 @@
**/*.bin
**/*.safetensor
**/*.onnx
**/*.onnx

View File

@@ -1,3 +1,5 @@
import os
from pathlib import Path
from PIL import Image, ImageDraw
from typing import List
@@ -63,6 +65,10 @@ class Bbox:
def draw_bboxes(img: Image.Image, bboxes: List[Bbox], name="annotated_image.png"):
curr_work_dir = Path(os.getcwd())
log_dir = curr_work_dir / "logs"
log_dir.mkdir(exist_ok=True)
drawer = ImageDraw.Draw(img)
for bbox in bboxes:
# Calculate the coordinates for the rectangle to be drawn
@@ -82,4 +88,4 @@ def draw_bboxes(img: Image.Image, bboxes: List[Bbox], name="annotated_image.png"
drawer.text((left, bottom - 10), bbox.content[:10], fill="red")
# Save the image with drawn rectangles
img.save(name)
img.save(name)

View File

@@ -0,0 +1,35 @@
{"img_name": "0.png", "formula": "\\[\\mathbb{C}^{4}\\stackrel{{\\pi_{1}}}{{\\longleftarrow}}\\mathcal{ F}\\stackrel{{\\pi_{2}}}{{\\rightarrow}}\\mathcal{PT},\\]"}
{"img_name": "1.png", "formula": "\\[W^{*}_{Z}(x_{1},x_{2})=W_{f\\lrcorner Z}(y_{1},y_{2})=\\mathcal{P}\\exp\\left( \\int_{\\gamma}A_{\\mu}dx^{\\mu}\\right).\\]"}
{"img_name": "2.png", "formula": "\\[G=W^{*}_{Z}(q,p)=\\tilde{H}H^{-1}\\]"}
{"img_name": "3.png", "formula": "\\[H=W^{*}_{Z}(p,x),\\ \\ \\tilde{H}=W^{*}_{Z}(q,x).\\]"}
{"img_name": "4.png", "formula": "\\[v\\cdot f^{*}A|_{x}=(f\\lrcorner Z)_{*}v\\cdot A|_{f\\lrcorner Z(x)},\\quad x\\in Z, \\ v\\in T_{x}Z.\\]"}
{"img_name": "5.png", "formula": "\\[(f\\lrcorner Z)_{*}v\\cdot A|_{f\\lrcorner Z(x)}=v^{\\alpha\\dot{\\alpha}}\\Big{(} \\frac{\\partial y^{\\beta\\dot{\\beta}}}{\\partial x^{\\alpha\\dot{\\alpha}}}A_{\\beta \\dot{\\beta}}\\Big{)}\\Big{|}_{f\\lrcorner Z(x)},\\ x\\in Z,\\ v\\in T_{x}Z,\\]"}
{"img_name": "6.png", "formula": "\\[\\{T_{i},T_{j}\\}=\\{\\tilde{T}^{i},\\tilde{T}^{j}\\}=0,\\ \\ \\{T_{i},\\tilde{T}^{j}\\}=2i \\delta^{j}_{i}D,\\]"}
{"img_name": "7.png", "formula": "\\[(\\partial_{s},q_{i},\\tilde{q}^{k})\\rightarrow(D,M^{j}_{i}T_{j},\\tilde{M}^{k}_ {l}\\tilde{T}^{l}),\\]"}
{"img_name": "8.png", "formula": "\\[M^{i}_{j}\\tilde{M}^{j}_{k}=\\delta^{i}_{k}.\\]"}
{"img_name": "9.png", "formula": "\\[Q_{i\\alpha}=q_{i\\alpha}+\\omega_{i\\alpha},\\ \\tilde{Q}^{i}_{\\dot{\\alpha}}=q^{i}_{ \\dot{\\alpha}}+\\tilde{\\omega}^{i}_{\\dot{\\alpha}},\\ D_{\\alpha\\dot{\\alpha}}= \\partial_{\\alpha\\dot{\\alpha}}+A_{\\alpha\\dot{\\alpha}}.\\]"}
{"img_name": "10.png", "formula": "\\[\\hat{f}(g,\\theta^{i\\alpha},\\tilde{\\theta}^{\\dot{\\alpha}}_{j})=(f(g),[V^{-1}]^ {\\alpha}_{\\beta}\\theta^{i\\beta},[\\tilde{V}^{-1}]^{\\dot{\\alpha}}_{\\dot{\\beta}} \\tilde{\\theta}^{\\dot{\\beta}}_{j}),\\ g\\in{\\cal G},\\]"}
{"img_name": "11.png", "formula": "\\[v^{\\beta\\dot{\\beta}}V^{\\alpha}_{\\beta}\\tilde{V}^{\\dot{\\alpha}}_{\\dot{\\beta}} =((f\\lrcorner L_{0})_{*}v)^{\\alpha\\dot{\\alpha}},\\]"}
{"img_name": "12.png", "formula": "\\[\\omega_{i\\alpha}=\\tilde{\\theta}^{\\dot{\\alpha}}_{i}h_{\\alpha\\dot{\\alpha}}(x^{ \\beta\\dot{\\beta}},\\tau^{\\beta\\dot{\\beta}}),\\ \\ \\tilde{\\omega}^{i}_{\\alpha}=\\theta^{i\\alpha}\\tilde{h}_{\\alpha\\dot{\\alpha}}(x^{ \\beta\\dot{\\beta}},\\tau^{\\beta\\dot{\\beta}}),\\]"}
{"img_name": "13.png", "formula": "\\[\\begin{split}&\\lambda^{\\alpha}\\hat{f}^{*}\\omega_{i\\alpha}(z)= \\tilde{\\theta}^{\\dot{\\beta}}_{i}\\lambda^{\\alpha}\\left(V^{\\beta}_{\\alpha}h_{ \\beta\\dot{\\beta}}(x^{\\prime},\\tau^{\\prime})\\right),\\\\ &\\tilde{\\lambda}^{\\dot{\\alpha}}\\hat{f}^{*}\\tilde{\\omega}^{i}_{ \\dot{\\alpha}}(z)=\\theta^{i\\beta}\\tilde{\\lambda}^{\\dot{\\alpha}}\\left(\\tilde{V}^ {\\dot{\\beta}}_{\\dot{\\alpha}}\\tilde{h}_{\\beta\\dot{\\beta}}(x^{\\prime},\\tau^{ \\prime})\\right),\\end{split}\\]"}
{"img_name": "14.png", "formula": "\\[A_{\\alpha\\dot{\\alpha}}=A_{\\alpha\\dot{\\alpha}}(x^{\\beta\\dot{\\beta}},\\tau^{ \\beta\\dot{\\beta}})\\]"}
{"img_name": "15.png", "formula": "\\[D=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}D_{\\alpha\\dot{\\alpha}}\\]"}
{"img_name": "16.png", "formula": "\\[D=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}\\partial_{\\alpha\\dot{\\alpha}}\\]"}
{"img_name": "17.png", "formula": "\\[[v_{1}\\cdot D^{*},v_{2}\\cdot D^{*}]=0\\]"}
{"img_name": "18.png", "formula": "\\[\\Phi_{A}=(\\omega_{i\\alpha},\\tilde{\\omega}^{i}_{\\dot{\\alpha}},A_{\\alpha\\dot{ \\alpha}})\\]"}
{"img_name": "19.png", "formula": "\\[\\hat{f}:{\\cal F}^{6|4N}\\rightarrow{\\cal F}^{6|4N}\\]"}
{"img_name": "20.png", "formula": "\\[\\sigma=(s,\\xi^{i},\\tilde{\\xi}_{j})\\in\\mathbb{C}^{1|2N}\\]"}
{"img_name": "21.png", "formula": "\\[\\tau^{\\alpha\\dot{\\alpha}}(h_{\\alpha\\dot{\\alpha}}+\\tilde{h}_{\\alpha\\dot{\\alpha} })=0\\]"}
{"img_name": "22.png", "formula": "\\[\\tau^{\\alpha\\dot{\\alpha}}\\rightarrow[V^{-1}]^{\\alpha}_{\\beta}[\\tilde{V}^{-1}]^{ \\dot{\\alpha}}_{\\dot{\\beta}}\\tau^{\\beta\\dot{\\beta}}\\]"}
{"img_name": "23.png", "formula": "\\[\\tau^{\\beta\\dot{\\beta}}=\\sum_{i}\\theta^{i\\beta}\\tilde{\\theta}^{\\dot{\\beta}}_{i}\\]"}
{"img_name": "24.png", "formula": "\\[\\theta^{i\\alpha}\\omega_{i\\alpha}+\\tilde{\\theta}^{i}_{\\dot{\\alpha}}\\tilde{ \\omega}^{\\dot{\\alpha}}_{i}=0\\]"}
{"img_name": "25.png", "formula": "\\[\\tilde{T}^{i}=\\tilde{\\lambda}^{\\dot{\\alpha}}\\tilde{Q}^{i}_{\\dot{\\alpha}}\\]"}
{"img_name": "26.png", "formula": "\\[\\tilde{T}^{i}=\\tilde{\\lambda}^{\\dot{\\alpha}}\\tilde{q}^{i}_{\\dot{\\alpha}}\\]"}
{"img_name": "27.png", "formula": "\\[\\tilde{\\lambda}^{\\dot{\\alpha}}f^{*}A_{\\alpha\\dot{\\alpha}}=H^{-1}\\tilde{ \\lambda}^{\\dot{\\alpha}}\\partial_{\\alpha\\dot{\\alpha}}H\\]"}
{"img_name": "28.png", "formula": "\\[\\tilde{q}^{i}=\\partial_{\\tilde{\\xi}_{i}}+i\\xi^{i}\\partial_{s}\\]"}
{"img_name": "29.png", "formula": "\\[\\tilde{q}^{i}_{\\dot{\\alpha}}=\\frac{\\partial}{\\partial\\tilde{\\theta}^{\\dot{ \\alpha}}_{i}}+i\\theta^{i\\alpha}\\frac{\\partial}{\\partial x^{\\alpha\\dot{\\alpha}}}\\]"}
{"img_name": "30.png", "formula": "\\[f\\lrcorner L(z)=\\pi_{1}\\circ f(z,\\lambda,\\tilde{\\lambda})\\ \\forall z\\in L\\]"}
{"img_name": "31.png", "formula": "\\[q_{i\\alpha}=\\frac{\\partial}{\\partial\\theta^{i\\alpha}}+i\\tilde{\\theta}^{\\dot{ \\alpha}}_{i}\\frac{\\partial}{\\partial x^{\\alpha\\dot{\\alpha}}}\\]"}
{"img_name": "32.png", "formula": "\\[q_{i}=\\partial_{\\xi^{i}}+i\\tilde{\\xi}_{i}\\partial_{s}\\]"}
{"img_name": "33.png", "formula": "\\[v^{\\alpha\\dot{\\alpha}}=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}\\]"}
{"img_name": "34.png", "formula": "\\[z^{A}=(x^{\\alpha\\dot{\\alpha}},\\theta^{i\\alpha},\\tilde{\\theta}^{\\dot{\\alpha}}_{ j})\\]"}

35 binary image files added (not shown; sizes range from 1.8 KiB to 12 KiB).

View File

@@ -0,0 +1,50 @@
from PIL import Image
from pathlib import Path
import datasets
import json
DIR_URL = Path('absolute/path/to/dataset/directory')
# e.g. DIR_URL = Path('/home/OleehyO/TeXTeller/src/models/ocr_model/train/dataset')
class LatexFormulas(datasets.GeneratorBasedBuilder):
BUILDER_CONFIGS = []
def _info(self):
return datasets.DatasetInfo(
features=datasets.Features({
"image": datasets.Image(),
"latex_formula": datasets.Value("string")
})
)
def _split_generators(self, dl_manager: datasets.DownloadManager):
dir_path = Path(dl_manager.download(str(DIR_URL)))
assert dir_path.is_dir()
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
'dir_path': dir_path,
}
)
]
def _generate_examples(self, dir_path: Path):
images_path = dir_path / 'images'
formulas_path = dir_path / 'formulas.jsonl'
img2formula = {}
with formulas_path.open('r', encoding='utf-8') as f:
for line in f:
single_json = json.loads(line)
img2formula[single_json['img_name']] = single_json['formula']
for img_path in images_path.iterdir():
if img_path.suffix not in ['.jpg', '.png']:
continue
yield str(img_path), {
"image": Image.open(img_path),
"latex_formula": img2formula[img_path.name]
}
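
For orientation, a minimal sketch of how this loader would be consumed (the path is a placeholder; as _generate_examples above requires, the directory must contain an images/ folder plus a formulas.jsonl whose img_name fields match the image filenames):

from datasets import load_dataset

# Hypothetical path; DIR_URL inside loader.py must point at the same dataset directory.
dataset = load_dataset('/path/to/dataset/loader.py')['train']
sample = dataset[0]
print(sample['latex_formula'])  # the LaTeX string paired with sample['image']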

View File

@@ -1,14 +0,0 @@
Congratulations on your download of this fine Rotodesign brand font product. We hope it will bring you many hours of typesetting pleasure and riches beyond your wildest dreams. We DO NOT, however, guarantee either of these things. Your mileage may vary.
This font is freeware, and is provided with no warranties as to its quality or its utility. After all, how much did you pay? Anyway, this font can be copied and used as you wish provided all copies include this readme file. Don't lie to your friends and tell 'em you made it yourself. You only cheat yourself when you do that. In the unlikely event you use this font to design something really cool or that makes you a ton of cash money, that's okay with me, just send me a copy or two of the finished item, and remember me when you get rich and famous. Enjoy!
©2006
Patrick Broderick
Rotodesign
http://www.rotodesign.com
roto@rotodesign.net
Rotodesign
1288 Columbus Ave. #176
San Francisco, CA 94133

View File

@@ -1,168 +0,0 @@
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Google BLEU (aka GLEU) metric. """
from typing import Dict, List
import datasets
from nltk.translate import gleu_score
import evaluate
from evaluate import MetricInfo
from .tokenizer_13a import Tokenizer13a
_CITATION = """\
@misc{wu2016googles,
title={Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},
author={Yonghui Wu and Mike Schuster and Zhifeng Chen and Quoc V. Le and Mohammad Norouzi and Wolfgang Macherey
and Maxim Krikun and Yuan Cao and Qin Gao and Klaus Macherey and Jeff Klingner and Apurva Shah and Melvin
Johnson and Xiaobing Liu and Łukasz Kaiser and Stephan Gouws and Yoshikiyo Kato and Taku Kudo and Hideto
Kazawa and Keith Stevens and George Kurian and Nishant Patil and Wei Wang and Cliff Young and
Jason Smith and Jason Riesa and Alex Rudnick and Oriol Vinyals and Greg Corrado and Macduff Hughes
and Jeffrey Dean},
year={2016},
eprint={1609.08144},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_DESCRIPTION = """\
The BLEU score has some undesirable properties when used for single
sentences, as it was designed to be a corpus measure. We therefore
use a slightly different score for our RL experiments which we call
the 'GLEU score'. For the GLEU score, we record all sub-sequences of
1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
compute a recall, which is the ratio of the number of matching n-grams
to the number of total n-grams in the target (ground truth) sequence,
and a precision, which is the ratio of the number of matching n-grams
to the number of total n-grams in the generated output sequence. Then
GLEU score is simply the minimum of recall and precision. This GLEU
score's range is always between 0 (no matches) and 1 (all match) and
it is symmetrical when switching output and target. According to
our experiments, GLEU score correlates quite well with the BLEU
metric on a corpus level but does not have its drawbacks for our per
sentence reward objective.
"""
_KWARGS_DESCRIPTION = """\
Computes corpus-level Google BLEU (GLEU) score of translated segments against one or more references.
Instead of averaging the sentence level GLEU scores (i.e. macro-average precision), Wu et al. (2016) sum up the matching
tokens and the max of hypothesis and reference tokens for each sentence, then compute using the aggregate values.
Args:
predictions (list of str): list of translations to score.
references (list of list of str): list of lists of references for each translation.
tokenizer : approach used for tokenizing `predictions` and `references`.
The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
This can be replaced by any function that takes a string as input and returns a list of tokens as output.
min_len (int): The minimum order of n-gram this function should extract. Defaults to 1.
max_len (int): The maximum order of n-gram this function should extract. Defaults to 4.
Returns:
'google_bleu': google_bleu score
Examples:
Example 1:
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat'], \
['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions, references=references)
>>> print(round(results["google_bleu"], 2))
0.44
Example 2:
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', \
'It is a guide to action that ensures that the rubber duck will never heed the cat commands', \
'It is the practical guide for the rubber duck army never to heed the directions of the cat'], \
['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions, references=references)
>>> print(round(results["google_bleu"], 2))
0.61
Example 3:
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', \
'It is a guide to action that ensures that the rubber duck will never heed the cat commands', \
'It is the practical guide for the rubber duck army never to heed the directions of the cat'], \
['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions, references=references, min_len=2)
>>> print(round(results["google_bleu"], 2))
0.53
Example 4:
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', \
'It is a guide to action that ensures that the rubber duck will never heed the cat commands', \
'It is the practical guide for the rubber duck army never to heed the directions of the cat'], \
['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions,references=references, min_len=2, max_len=6)
>>> print(round(results["google_bleu"], 2))
0.4
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class GoogleBleu(evaluate.Metric):
def _info(self) -> MetricInfo:
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=[
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
}
),
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
],
)
def _compute(
self,
predictions: List[str],
references: List[List[str]],
tokenizer=Tokenizer13a(),
min_len: int = 1,
max_len: int = 4,
) -> Dict[str, float]:
# if only one reference is provided make sure we still use list of lists
if isinstance(references[0], str):
references = [[ref] for ref in references]
references = [[tokenizer(r) for r in ref] for ref in references]
predictions = [tokenizer(p) for p in predictions]
return {
"google_bleu": gleu_score.corpus_gleu(
list_of_references=references, hypotheses=predictions, min_len=min_len, max_len=max_len
)
}
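
For reference, once this vendored copy is removed, the same metric can be loaded straight from the Hub (network access is needed on first load, as noted in the updated metrics.py below); a minimal sketch:

import evaluate

google_bleu = evaluate.load("google_bleu")
result = google_bleu.compute(predictions=["hello there"], references=[["hello there"]])
print(result)  # identical hypothesis and reference give {'google_bleu': 1.0}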

View File

@@ -1,100 +0,0 @@
# Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
# Copyright 2020 SacreBLEU Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from functools import lru_cache
class BaseTokenizer:
"""A base dummy tokenizer to derive from."""
def signature(self):
"""
Returns a signature for the tokenizer.
:return: signature string
"""
return "none"
def __call__(self, line):
"""
Tokenizes an input line with the tokenizer.
:param line: a segment to tokenize
:return: the tokenized line
"""
return line
class TokenizerRegexp(BaseTokenizer):
def signature(self):
return "re"
def __init__(self):
self._re = [
# language-dependent part (assuming Western languages)
(re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
# tokenize period and comma unless preceded by a digit
(re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
# tokenize period and comma unless followed by a digit
(re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
# tokenize dash when preceded by a digit
(re.compile(r"([0-9])(-)"), r"\1 \2 "),
# one space only between words
# NOTE: Doing this in Python (below) is faster
# (re.compile(r'\s+'), r' '),
]
@lru_cache(maxsize=2**16)
def __call__(self, line):
"""Common post-processing tokenizer for `13a` and `zh` tokenizers.
:param line: a segment to tokenize
:return: the tokenized line
"""
for (_re, repl) in self._re:
line = _re.sub(repl, line)
# no leading or trailing spaces, single space within words
# return ' '.join(line.split())
# This line is changed with regards to the original tokenizer (seen above) to return individual words
return line.split()
class Tokenizer13a(BaseTokenizer):
def signature(self):
return "13a"
def __init__(self):
self._post_tokenizer = TokenizerRegexp()
@lru_cache(maxsize=2**16)
def __call__(self, line):
"""Tokenizes an input line using a relatively minimal tokenization
that is however equivalent to mteval-v13a, used by WMT.
:param line: a segment to tokenize
:return: the tokenized line
"""
# language-independent part:
line = line.replace("<skipped>", "")
line = line.replace("-\n", "")
line = line.replace("\n", " ")
if "&" in line:
line = line.replace("&quot;", '"')
line = line.replace("&amp;", "&")
line = line.replace("&lt;", "<")
line = line.replace("&gt;", ">")
return self._post_tokenizer(f" {line} ")
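
A quick sanity check of the rules above, as a sketch (the expected output follows from the punctuation character class and the digit-aware period/comma rules):

tok = Tokenizer13a()
print(tok("The cost is $5.50, right?"))
# expected: ['The', 'cost', 'is', '$', '5.50', ',', 'right', '?']
# '5.50' survives intact because '.' and ',' are only split off when not
# adjacent to a digit, while '$' and '?' fall inside the punctuation class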

View File

@@ -4,28 +4,23 @@ from functools import partial
from pathlib import Path
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments, GenerationConfig
from transformers import (
Trainer,
TrainingArguments,
Seq2SeqTrainer,
Seq2SeqTrainingArguments,
GenerationConfig
)
from .training_args import CONFIG
from ..model.TexTeller import TexTeller
from ..utils.functional import tokenize_fn, collate_fn, img_train_transform, img_inf_transform, filter_fn
from ..utils.functional import tokenize_fn, collate_fn, img_transform_fn
from ..utils.metrics import bleu_metric
from ...globals import MAX_TOKEN_SIZE
from ...globals import MAX_TOKEN_SIZE, MIN_WIDTH, MIN_HEIGHT
def train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokenizer):
training_args = TrainingArguments(**CONFIG)
debug_mode = False
if debug_mode:
training_args.auto_find_batch_size = False
training_args.num_train_epochs = 2
# training_args.per_device_train_batch_size = 3
training_args.per_device_train_batch_size = 2
training_args.per_device_eval_batch_size = 2 * training_args.per_device_train_batch_size
training_args.jit_mode_eval = False
training_args.torch_compile = False
training_args.dataloader_num_workers = 1
trainer = Trainer(
model,
training_args,
@@ -37,15 +32,14 @@ def train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokeniz
data_collator=collate_fn_with_tokenizer,
)
# trainer.train(resume_from_checkpoint=None)
trainer.train(resume_from_checkpoint='/home/lhy/code/TexTeller/src/models/ocr_model/train/train_result/TexTellerv3/checkpoint-644000')
trainer.train(resume_from_checkpoint=None)
def evaluate(model, tokenizer, eval_dataset, collate_fn):
eval_config = CONFIG.copy()
eval_config['predict_with_generate'] = True
generate_config = GenerationConfig(
max_length=MAX_TOKEN_SIZE-100,
max_new_tokens=MAX_TOKEN_SIZE,
num_beams=1,
do_sample=False,
pad_token_id=tokenizer.pad_token_id,
@@ -53,7 +47,6 @@ def evaluate(model, tokenizer, eval_dataset, collate_fn):
bos_token_id=tokenizer.bos_token_id,
)
eval_config['generation_config'] = generate_config
eval_config['auto_find_batch_size'] = False
seq2seq_config = Seq2SeqTrainingArguments(**eval_config)
trainer = Seq2SeqTrainer(
@@ -66,48 +59,45 @@ def evaluate(model, tokenizer, eval_dataset, collate_fn):
compute_metrics=partial(bleu_metric, tokenizer=tokenizer)
)
res = trainer.evaluate()
print(res)
eval_res = trainer.evaluate()
print(eval_res)
if __name__ == '__main__':
cur_path = os.getcwd()
script_dirpath = Path(__file__).resolve().parent
os.chdir(script_dirpath)
dataset = load_dataset(
'/home/lhy/code/TexTeller/src/models/ocr_model/train/data/loader.py'
)['train']
tokenizer = TexTeller.get_tokenizer('/home/lhy/code/TexTeller/src/models/tokenizer/roberta-tokenizer-7Mformulas')
filter_fn_with_tokenizer = partial(filter_fn, tokenizer=tokenizer)
dataset = dataset.filter(filter_fn_with_tokenizer, num_proc=16)
dataset = load_dataset(str(Path('./dataset/loader.py').resolve()))['train']
dataset = dataset.filter(lambda x: x['image'].height > MIN_HEIGHT and x['image'].width > MIN_WIDTH)
dataset = dataset.shuffle(seed=42)
dataset = dataset.flatten_indices()
tokenizer = TexTeller.get_tokenizer()
# If you want to use your own tokenizer, please modify the path to your tokenizer
#+tokenizer = TexTeller.get_tokenizer('/path/to/your/tokenizer')
map_fn = partial(tokenize_fn, tokenizer=tokenizer)
tokenized_dataset = dataset.map(map_fn, batched=True, remove_columns=dataset.column_names, num_proc=8, load_from_cache_file=True)
tokenized_dataset = dataset.map(map_fn, batched=True, remove_columns=dataset.column_names, num_proc=8)
tokenized_dataset = tokenized_dataset.with_transform(img_transform_fn)
split_dataset = tokenized_dataset.train_test_split(test_size=0.005, seed=42)
# Split dataset into train and eval, ratio 9:1
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split_dataset['train'], split_dataset['test']
train_dataset = train_dataset.with_transform(img_train_transform)
eval_dataset = eval_dataset.with_transform(img_inf_transform)
collate_fn_with_tokenizer = partial(collate_fn, tokenizer=tokenizer)
# model = TexTeller()
model = TexTeller.from_pretrained('/home/lhy/code/TexTeller/src/models/ocr_model/train/train_result/TexTellerv3/checkpoint-644000')
# ================= debug =======================
# foo = train_dataset[:50]
# bar = eval_dataset[:50]
# ================= debug =======================
# Train from scratch
model = TexTeller()
# or train from TexTeller pre-trained model: model = TexTeller.from_pretrained()
enable_train = True
enable_evaluate = True
# If you want to train from a pre-trained model, please modify the path to your pre-trained checkpoint
#+e.g.
#+model = TexTeller.from_pretrained(
#+ '/path/to/your/model_checkpoint'
#+)
enable_train = True
enable_evaluate = False
if enable_train:
train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokenizer)
if enable_evaluate:
train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokenizer)
if enable_evaluate and len(eval_dataset) > 0:
evaluate(model, tokenizer, eval_dataset, collate_fn_with_tokenizer)
os.chdir(cur_path)

View File

@@ -1,84 +1,38 @@
CONFIG = {
"seed": 42, # 随机种子,用于确保实验的可重复性
"use_cpu": False, # 是否使用cpu刚开始测试代码的时候先用cpu跑会更容易debug
# "data_seed": 42, # data sampler的采样也固定
# "full_determinism": True, # 使整个训练完全固定这个设置会有害于模型训练只用于debug
"seed": 42, # Random seed for reproducibility
"use_cpu": False, # Whether to use CPU (it's easier to debug with CPU when starting to test the code)
"learning_rate": 5e-5, # Learning rate
"num_train_epochs": 10, # Total number of training epochs
"per_device_train_batch_size": 4, # Batch size per GPU for training
"per_device_eval_batch_size": 8, # Batch size per GPU for evaluation
"output_dir": "train_result/TexTellerv3", # 输出目录
"overwrite_output_dir": False, # 如果输出目录存在,不删除原先的内容
"report_to": ["tensorboard"], # 输出日志到TensorBoard
#+通过在命令行tensorboard --logdir ./logs 来查看日志
"output_dir": "train_result", # Output directory
"overwrite_output_dir": False, # If the output directory exists, do not delete its content
"report_to": ["tensorboard"], # Report logs to TensorBoard
"logging_dir": None, # TensorBoard日志文件的存储目录(使用默认值)
"log_level": "warning", # 其他可选:debug, info, warning, error and critical由低级别到高级别
"logging_strategy": "steps", # 每隔一定步数记录一次日志
"logging_steps": 4000, # 记录日志的步数间隔可以是int也可以是(0~1)的float当是float时表示总的训练步数的ratio(比方说可以设置成1.0 / 2000)
#+通常与eval_steps一致
"logging_nan_inf_filter": False, # 对loss=nan或inf进行记录
"save_strategy": "steps", # Strategy to save checkpoints
"save_steps": 500, # Interval of steps to save checkpoints, can be int or a float (0~1), when float it represents the ratio of total training steps (e.g., can set to 1.0 / 2000)
"save_total_limit": 5, # Maximum number of models to save. The oldest models will be deleted if this number is exceeded
"num_train_epochs": 4, # 总的训练轮数
# "max_steps": 3, # 训练的最大步骤数。如果设置了这个参数,
#+那么num_train_epochs将被忽略通常用于调试
"logging_strategy": "steps", # Log every certain number of steps
"logging_steps": 500, # Number of steps between each log
"logging_nan_inf_filter": False, # Record logs for loss=nan or inf
# "label_names": ['your_label_name'], # 指定data_loader中的标签名如果不指定则默认为'labels'
"optim": "adamw_torch", # Optimizer
"lr_scheduler_type": "cosine", # Learning rate scheduler
"warmup_ratio": 0.1, # Ratio of warmup steps in total training steps (e.g., for 1000 steps, the first 100 steps gradually increase lr from 0 to the set lr)
"max_grad_norm": 1.0, # For gradient clipping, ensure the norm of the gradients does not exceed 1.0 (default 1.0)
"fp16": False, # Whether to use 16-bit floating point for training (generally not recommended, as loss can easily explode)
"bf16": False, # Whether to use Brain Floating Point (bfloat16) for training (recommended if architecture supports it)
"gradient_accumulation_steps": 1, # Gradient accumulation steps, consider this parameter to achieve large batch size effects when batch size cannot be large
"jit_mode_eval": False, # Whether to use PyTorch jit trace during eval (can speed up the model, but the model must be static, otherwise will throw errors)
"torch_compile": False, # Whether to use torch.compile to compile the model (for better training and inference performance)
"per_device_train_batch_size": 3, # 每个GPU的batch size
"per_device_eval_batch_size": 6, # 每个GPU的evaluation batch size
# "auto_find_batch_size": True, # 自动搜索合适的batch size指数decay
"auto_find_batch_size": False, # 自动搜索合适的batch size指数decay
"dataloader_pin_memory": True, # Can speed up data transfer between CPU and GPU
"dataloader_num_workers": 1, # Default is not to use multiprocessing for data loading, usually set to 4*number of GPUs used
"optim": "adamw_torch", # 还提供了很多AdamW的变体相较于经典的AdamW更加高效
#+当设置了optim后就不需要在Trainer中传入optimizer
"lr_scheduler_type": "cosine", # 设置lr_scheduler
"warmup_ratio": 0.1, # warmup占整个训练steps的比例(假如训练1000步那么前100步就是从lr=0慢慢长到参数设定的lr)
# "warmup_steps": 500, # 预热步数, 这个参数与warmup_ratio是矛盾的
"weight_decay": 0, # 权重衰减
"learning_rate": 5e-5, # 学习率
"max_grad_norm": 1.0, # 用于梯度裁剪确保梯度的范数不超过1.0默认1.0
"fp16": False, # 是否使用16位浮点数进行训练一般不推荐loss很容易炸
"bf16": False, # 是否使用16位宽浮点数进行训练如果架构支持的话推荐使用
"gradient_accumulation_steps": 2, # 梯度累积步数当batch size无法开很大时可以考虑这个参数来实现大batch size的效果
"gradient_checkpointing": False, # 当为True时会在forward时适当丢弃一些中间量用于backward从而减轻显存压力但会增加forward的时间
"label_smoothing_factor": 0.0, # softlabel等于0时表示未开启
# "debug": "underflow_overflow", # 训练时检查溢出如果发生则会发出警告。该模式通常用于debug
"jit_mode_eval": True, # 是否在eval的时候使用PyTorch jit trace可以加速模型但模型必须是静态的否则会报错
"torch_compile": True, # 是否使用torch.compile来编译模型从而获得更好的训练和推理性能
#+ 要求torch > 2.0,这个功能很好使,当模型跑通的时候可以开起来
# "deepspeed": "your_json_path", # 使用deepspeed来训练需要指定ds_config.json的路径
#+ 在Trainer中使用Deepspeed时一定要注意ds_config.json中的配置是否与Trainer的一致如学习率batch size梯度累积步数等
#+ 如果不一致会出现很奇怪的bug而且一般还很难发现
"evaluation_strategy": "steps", # Evaluation strategy, can be "steps" or "epoch"
"eval_steps": 500, # If evaluation_strategy="step"
"dataloader_pin_memory": True, # 可以加快数据在cpu和gpu之间转移的速度
"dataloader_num_workers": 16, # 默认不会使用多进程来加载数据通常设成4*所用的显卡数
"dataloader_drop_last": True, # 丢掉最后一个minibatch保证训练的梯度稳定
"evaluation_strategy": "steps", # 评估策略,可以是"steps"或"epoch"
"eval_steps": 4000, # if evaluation_strategy="step"
#+默认情况下与logging_steps一样可以是int也可以是(0~1)的float当是float时表示总的训练步数的ratio(比方说可以设置成1.0 / 2000)
"save_strategy": "steps", # 保存checkpoint的策略
"save_steps": 4000, # checkpoint保存的步数间隔可以是int也可以是(0~1)的float当是float时表示总的训练步数的ratio(比方说可以设置成1.0 / 2000)
"save_total_limit": 10, # 保存的模型的最大数量。如果超过这个数量,最旧的模型将被删除
"load_best_model_at_end": True, # 训练结束时是否加载最佳模型
#+当设置True时会保存训练时评估结果最好的checkpoint
#+当设置True时evaluation_strategy必须与save_strategy一样并且save_steps必须是eval_steps的整数倍
"metric_for_best_model": "eval_loss", # 用于选择最佳模型的指标(必须与load_best_model_at_end一起用)
#+可以使用compute_metrics输出的evaluation的结果中一个字典的某个值
#+注意Trainer会在compute_metrics输出的字典的键前面加上一个prefix默认就是“eval_”
"greater_is_better": False, # 指标值越小越好(必须与metric_for_best_model一起用)
"do_train": True, # 是否进行训练,通常用于调试
"do_eval": True, # 是否进行评估,通常用于调试
"remove_unused_columns": False, # 是否删除没有用到的列特征默认为True
#+当删除了没用到的列后making it easier to unpack inputs into the models call function
#+注意remove_unused_columns去除列的操作会把传入的dataset的columns_names与模型forward方法中的参数名进行配对对于不存在forward方法中的列名就会直接删掉整个feature
#+因此如果在dataset.with_transform(..)中给数据进行改名那么这个remove操作会直接把原始的数据直接删掉从而导致之后会拿到一个空的dataset导致在对dataset进行切片取值时出问题
#+例如读进来的dataset图片对应的feature name叫"images"而模型forward方法中对应的参数名叫“pixel_values”
#+此时如果是在data.withtransfrom(..)中根据这个"images"生成其他模型forward方法中需要的参数然后再把"images"改名成“pixel_values”那么整个过程就会出问题
#+因为设置了remove_unused_columns=True后会先给dataset进行列名检查然后“images”这个feature会直接被删掉导致with_transform的transform_fn拿不到“images”这个feature
#+所以一个good practice就是对于要改名的特征先提前使用dataset.rename_column进行改名
"push_to_hub": False, # 是否训练完后上传hub需要先在命令行huggingface-cli login进行登录认证的配置配置完后认证信息会存到cache文件夹里
}
"remove_unused_columns": False, # Don't change this unless you really know what you are doing.
}

View File

@@ -1,17 +1,23 @@
import evaluate
import numpy as np
from transformers import EvalPrediction, RobertaTokenizer
from typing import Dict
import os
def bleu_metric(eval_preds:EvalPrediction, tokenizer:RobertaTokenizer) -> Dict:
metric = evaluate.load('/home/lhy/code/TexTeller/src/models/ocr_model/train/google_bleu') # This needs network access, so it can hang here
from pathlib import Path
from typing import Dict
from transformers import EvalPrediction, RobertaTokenizer
def bleu_metric(eval_preds: EvalPrediction, tokenizer: RobertaTokenizer) -> Dict:
cur_dir = Path(os.getcwd())
os.chdir(Path(__file__).resolve().parent)
metric = evaluate.load('google_bleu') # Will download the metric from huggingface if not already downloaded
os.chdir(cur_dir)
logits, labels = eval_preds.predictions, eval_preds.label_ids
preds = logits
# preds = np.argmax(logits, axis=1) # Convert logits into the corresponding predicted labels
labels = np.where(labels == -100, 1, labels)
preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
return metric.compute(predictions=preds, references=labels)
return metric.compute(predictions=preds, references=labels)
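
A hedged sketch of exercising bleu_metric outside the Trainer, with made-up token ids (EvalPrediction and the -100 label-padding convention are the standard transformers ones; tokenizer is assumed to be the TexTeller RoBERTa tokenizer loaded elsewhere):

import numpy as np
from transformers import EvalPrediction

preds = np.array([[0, 17, 35, 2]])         # hypothetical predicted token ids
labels = np.array([[0, 17, 35, 2, -100]])  # -100 marks padded label positions
# bleu_metric rewrites -100 to token id 1 (RoBERTa's <pad>), then decodes both
# sides with skip_special_tokens=True before computing Google BLEU
res = bleu_metric(EvalPrediction(predictions=preds, label_ids=labels), tokenizer=tokenizer)
# res -> {'google_bleu': <float in [0, 1]>}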

View File

@@ -156,4 +156,4 @@ def ocr_augmentation_pipeline():
log=False
)
return pipeline
return pipeline

View File

@@ -20,15 +20,12 @@ from .ocr_aug import ocr_augmentation_pipeline
train_pipeline = ocr_augmentation_pipeline()
general_transform_pipeline = v2.Compose([
v2.ToImage(), # Convert to tensor, only needed if you had a PIL image
#+returns a List of torchvision.Image whose length is the batch_size,
#+so the final output of the whole Compose pipeline is also a List of torchvision.Image
#+note that it does not return one big torchvision.Image; the batch_size dimension is kept outside
v2.ToImage(),
v2.ToDtype(torch.uint8, scale=True), # optional, most inputs are already uint8 at this point
v2.Grayscale(), # Convert to grayscale (depends on the task)
v2.Grayscale(),
v2.Resize( # Resize to a fixed square
size=FIXED_IMG_SIZE - 1, # size must be smaller than max_size
v2.Resize(
size=FIXED_IMG_SIZE - 1,
interpolation=v2.InterpolationMode.BICUBIC,
max_size=FIXED_IMG_SIZE,
antialias=True
@@ -37,22 +34,14 @@ general_transform_pipeline = v2.Compose([
v2.ToDtype(torch.float32, scale=True), # Normalize expects float input
v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),
# v2.ToPILImage() # For checking whether the transformed result is correct (debug only)
# v2.ToPILImage()
])
def trim_white_border(image: np.ndarray):
# image is a 3-D ndarray in RGB format with layout [H, W, C] (channel dimension third)
# # Check whether the first element of images is a nested list structure
# if isinstance(image, list):
# image = np.array(image, dtype=np.uint8)
# Check that the image is in RGB format and that the channel dimension is third
if len(image.shape) != 3 or image.shape[2] != 3:
raise ValueError("Image is not in RGB format or channel is not in third dimension")
# Check that the image is stored as uint8
if image.dtype != np.uint8:
raise ValueError("Image should be stored in uint8")
@@ -61,21 +50,17 @@ def trim_white_border(image: np.ndarray):
bg_color = Counter(corners).most_common(1)[0][0]
bg_color_np = np.array(bg_color, dtype=np.uint8)
# Create a background image of the same size as the original, filled with the background color
h, w = image.shape[:2]
bg = np.full((h, w, 3), bg_color_np, dtype=np.uint8)
# Compute the difference
diff = cv2.absdiff(image, bg)
mask = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
threshold = 15 # Pixels close to the background color are trimmed as well
threshold = 15
_, diff = cv2.threshold(mask, threshold, 255, cv2.THRESH_BINARY)
# Compute the minimal bounding rectangle of the non-zero pixels in the image
x, y, w, h = cv2.boundingRect(diff)
# Crop the image
trimmed_image = image[y:y+h, x:x+w]
return trimmed_image
@@ -117,13 +102,6 @@ def random_resize(
minr: float,
maxr: float
) -> List[np.ndarray]:
# np.ndarray format: 3-D, RGB, with layout [H, W, C] (channel dimension third)
# # Check whether the first element of images is a nested list structure
# if isinstance(images[0], list):
# # Convert the nested list structure to np.ndarray
# images = [np.array(img, dtype=np.uint8) for img in images]
if len(images[0].shape) != 3 or images[0].shape[2] != 3:
raise ValueError("Image is not in RGB format or channel is not in third dimension")
@@ -161,12 +139,9 @@ def rotate(image: np.ndarray, min_angle: int, max_angle: int) -> np.ndarray:
def ocr_aug(image: np.ndarray) -> np.ndarray:
# Random rotation with 20% probability
if random.random() < 0.2:
image = rotate(image, -5, 5)
# Add a white border
image = add_white_border(image, max_size=25).permute(1, 2, 0).numpy()
# Data augmentation
image = train_pipeline(image)
return image
@@ -177,7 +152,6 @@ def train_transform(images: List[Image.Image]) -> List[torch.Tensor]:
images = [np.array(img.convert('RGB')) for img in images]
# random resize first
images = random_resize(images, MIN_RESIZE_RATIO, MAX_RESIZE_RATIO)
# Trim the white border
images = [trim_white_border(image) for image in images]
# OCR augmentation
@@ -193,7 +167,6 @@ def train_transform(images: List[Image.Image]) -> List[torch.Tensor]:
def inference_transform(images: List[Union[np.ndarray, Image.Image]]) -> List[torch.Tensor]:
assert IMG_CHANNELS == 1, "Only grayscale images are supported for now"
images = [np.array(img.convert('RGB')) if isinstance(img, Image.Image) else img for img in images]
# Trim the white border
images = [trim_white_border(image) for image in images]
# general transform pipeline
images = [general_transform_pipeline(image) for image in images] # imgs: List[PIL.Image.Image]
@@ -201,23 +174,3 @@ def inference_transform(images: List[Union[np.ndarray, Image.Image]]) -> List[to
images = padding(images, FIXED_IMG_SIZE)
return images
if __name__ == '__main__':
from pathlib import Path
from .helpers import convert2rgb
base_dir = Path('/home/lhy/code/TeXify/src/models/ocr_model/model')
imgs_path = [
base_dir / '1.jpg',
base_dir / '2.jpg',
base_dir / '3.jpg',
base_dir / '4.jpg',
base_dir / '5.jpg',
base_dir / '6.jpg',
base_dir / '7.jpg',
]
imgs_path = [str(img_path) for img_path in imgs_path]
imgs = convert2rgb(imgs_path)
res = random_resize(imgs, 0.5, 1.5)
pause = 1

File diff suppressed because it is too large

View File

@@ -1,15 +0,0 @@
{
"bos_token": "<s>",
"cls_token": "<s>",
"eos_token": "</s>",
"mask_token": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": "<pad>",
"sep_token": "</s>",
"unk_token": "<unk>"
}

File diff suppressed because it is too large

View File

@@ -1,57 +0,0 @@
{
"add_prefix_space": false,
"added_tokens_decoder": {
"0": {
"content": "<s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<pad>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"3": {
"content": "<unk>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"4": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"bos_token": "<s>",
"clean_up_tokenization_spaces": true,
"cls_token": "<s>",
"eos_token": "</s>",
"errors": "replace",
"mask_token": "<mask>",
"model_max_length": 1000000000000000019884624838656,
"pad_token": "<pad>",
"sep_token": "</s>",
"tokenizer_class": "RobertaTokenizer",
"trim_offsets": true,
"unk_token": "<unk>"
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -1,15 +0,0 @@
{
"bos_token": "<s>",
"cls_token": "<s>",
"eos_token": "</s>",
"mask_token": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": "<pad>",
"sep_token": "</s>",
"unk_token": "<unk>"
}

File diff suppressed because it is too large

View File

@@ -1,57 +0,0 @@
{
"add_prefix_space": false,
"added_tokens_decoder": {
"0": {
"content": "<s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<pad>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"3": {
"content": "<unk>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false,
"special": true
},
"4": {
"content": "<mask>",
"lstrip": true,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"bos_token": "<s>",
"clean_up_tokenization_spaces": true,
"cls_token": "<s>",
"eos_token": "</s>",
"errors": "replace",
"mask_token": "<mask>",
"model_max_length": 1000000000000000019884624838656,
"pad_token": "<pad>",
"sep_token": "</s>",
"tokenizer_class": "RobertaTokenizer",
"trim_offsets": true,
"unk_token": "<unk>"
}

File diff suppressed because one or more lines are too long

View File

@@ -1,21 +0,0 @@
{
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"type_vocab_size": 1,
"vocab_size": 50265
}

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,31 +0,0 @@
\begin{aligned}
&\begin{aligned}(\tau\lambda)\psi(a)(\lambda^{-1}\tau)(X,Y,\xi,\eta)=(\tau\lambda)\psi(a)(-\tau Y,\tau X,-\tau\eta,\tau\xi)\end{aligned} \\
&=(\tau\lambda)\bigg(\begin{pmatrix}-a\tau\eta_1&-\tau y_3&-\tau\overline{y}_2\\-\tau\overline{y}_3&-a^{-1}\tau\eta_2&-a^{-1}\tau y_1\\-\tau y_2&-a^{-1}\tau\overline{y}_1&-a^{-1}\tau\eta_3\end{pmatrix},\begin{pmatrix}a^{-1}\tau\xi_1&\tau x_3&\tau\overline{x}_2\\\tau\overline{x}_3&a\tau\xi_2&a\tau x_1\\\tau x_2&a\tau\overline{x}_1&a\tau\xi_3\end{pmatrix},-a\tau\eta,a^{-1}\tau\xi\bigg) \\
&\left.=\left(\begin{pmatrix}\tau a^{-1}\xi_1&x_3&\overline{x}_2\\\overline{x}_3&\tau a\xi_2&\tau ax_1\\x_2&\tau a\overline{x}_1&\tau a\xi_3\end{pmatrix}\right.,\begin{pmatrix}\tau a\eta_1&y_3&\overline{y}_2\\\overline{y}_3&\tau a^{-1}\eta_2&\tau a^{-1}y_1\\y_2&\tau a^{-1}\overline{y}_1&\tau a^{-1}\eta_3\end{pmatrix},\tau a^{-1}\xi,\tau a\eta\right) \\
&=\psi(\tau a^{-1}).
\end{aligned}
\begin{aligned}
&\begin{aligned}-L_{X_{13}}&=\left(\frac{1}{2}\sin\alpha\cos\beta\sin2\gamma+\cos\alpha\tan\beta\sin^2\gamma-\frac{1}{2}\sin\alpha\sin\beta\tan\beta\sin2\gamma\right)\frac{\partial}{\partial\alpha}\end{aligned} \\
&\begin{aligned}+\left(\frac12\cos\alpha\sin\beta\sin2\gamma-\sin\alpha\sin^2\beta\cos^2\gamma-\sin\alpha\cos^2\beta\sin^2\gamma\right)\frac\partial{\partial\beta}\end{aligned} \\
&\begin{aligned}+\left(\frac14\sin\alpha\sin2\beta\sin2\gamma-\frac12\sin\alpha\tan\beta\sin2\gamma+\cos\alpha\sec\beta\sin^2\gamma\right)\frac{\partial}{\partial\gamma}\end{aligned} \\
&+\left(\left(\frac12\sin\alpha\sin2\beta\cos^2\gamma+\frac12\sin\alpha\sin2\beta-\frac12\cos\alpha\cos\beta\sin2\gamma\right)z_{12}\right. \\
&+(\sin\alpha\cos2\beta\cos\gamma+\cos\alpha\sin\beta\sin\gamma)\biggr)\frac{\partial}{\partial z_{12}} \\
&+\left(\left(\frac12\sin\alpha\sin2\beta\cos2\gamma-\cos\alpha\cos\beta\sin2\gamma\right)z_{13}+(\sin\alpha\cos2\beta\cos\gamma\right. \\
&\left.\left.+\cos\alpha\sin\beta\sin\gamma\right)z_{23}+\left(\frac12\sin\alpha\sin2\beta\sin2\gamma+\cos\alpha\cos\beta\cos2\gamma\right)\right)\frac{\partial}{\partial z_{13}} \\
&+\left(\left(-\frac12\sin\alpha\sin2\beta-\frac12\sin\alpha\sin2\beta\sin^2\gamma-\frac12\cos\alpha\cos\beta\sin2\gamma\right)z_{23}\right. \\
&+(\sin\alpha\cos2\beta\sin\gamma-\cos\alpha\sin\beta\cos\gamma)\Bigg)\frac{\partial}{\partial z_{23}}.
\end{aligned}
\begin{aligned}
&\sum_S(-1)^{|S|}\frac{1-\prod_{i\notin S}\left(\frac{X_i(1+X_i)}{Q+X_i}\right)^{m+1}}{1-\prod_{i\notin S}\frac{X_i(1+X_i)}{Q+X_i}}\prod_iX_i \\
&\times\prod_{i\in S}X_{i}^{m+n-1}(1+X_{i})^{m+1}(Q+X_{i})^{-m}(X_{i}+r+Q)^{n-1} \\
&\times\prod_{i\notin S}(1+X_i)(Q+rX_i+QX_i)^{n-1} \\
&&\times\prod_{1\leq i<j\leq n,\{i,j\}\cap S\neq\emptyset}\left(\frac{Y_j(1+Y_j)}{Q+rY_j+QY_j}-\frac{Y_i(1+Y_i)}{Q+rY_i+QY_i}\right) \\
&&&\times\sum_{k\notin S}(Q-X_{k}^{2})X_{k}^{-1}(1+X_{k})^{-1} \\
&&&\times\prod_{\overset{1\leq i\leq k-1}{i\notin S}}\frac{(Q+(Q+r)X_k+X_i+X_iX_k)(X_iX_k-Q)}{(Q+rX_k+QX_k)(Q+rX_i+QX_i)} \\
&&&\times\prod_{\overset{k+1\leq i\leq n}{i\notin S}}\frac{(Q+(Q+r)X_k+X_i+X_iX_k)(Q-X_iX_k)}{(Q+rX_k+QX_k)(Q+rX_i+QX_i)} \\
&&&&\times\prod_{1\leq i<j\leq n,i,j\notin S\cup\{k\}}\left(\frac{X_j(1+X_j)}{Q+rX_j+QX_j}-\frac{X_i(1+X_i)}{Q+rX_i+QX_i}\right).
\end{aligned}
\[w_{\mathbb{A}}\left(\begin{bmatrix}T_{1}&T_{2}&T_{3}\\ T_{2}&T_{3}&iT_{1}\\ T_{3}&iT_{1}&iT_{2}\end{bmatrix}\right)=w_{\mathbb{A}}\left(\mathbb{V}^{\#_{ \mathbb{A}}}\begin{bmatrix}T_{1}&T_{2}&T_{3}\\ T_{2}&T_{3}&iT_{1}\\ T_{3}&iT_{1}&iT_{2}\end{bmatrix}\mathbb{V}\right)\] \[=\frac{1}{2}w_{\mathbb{A}}\left(\begin{bmatrix}T_{1}^{\#_{A}}- iT_{2}^{\#_{A}}&-i\sqrt{2}(T_{1}^{\#_{A}}+T_{2}^{\#_{A}})&2T_{3}^{\#_{A}}- iT_{1}^{\#_{A}}+T_{2}^{\#_{A}}\\ i\sqrt{2}(T_{2}^{\#_{A}}-T_{1}^{\#_{A}})&2T_{3}^{\#_{A}}&\sqrt{2}(T_{1}^{\#_{A} }+T_{2}^{\#_{A}})\\ 2T_{3}^{\#_{A}}-(-iT_{1}^{\#_{A}}+T_{2}^{\#_{A}})&\sqrt{2}(T_{2}^{\#_{A}}-T_{ 1}^{\#_{A}})&T_{1}^{\#_{A}}-iT_{2}^{\#_{A}}\end{bmatrix}\right)\] \[\leq w_{\mathbb{A}}\left(\begin{bmatrix}O&O&T_{3}\\ O&T_{3}&O\\ T_{3}&O&O\end{bmatrix}^{\#_{\mathbb{A}}}\right)+\frac{1}{2}w_{\mathbb{A}}\left( \begin{bmatrix}T_{1}+iT_{2}&O&-(iT_{1}+T_{2})\\ O&O&O\\ iT_{1}+T_{2}&O&T_{1}+iT_{2}\end{bmatrix}^{\#_{\mathbb{A}}}\right)\] \[+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix}O&-i(T_{2} -T_{1})&O\\ i(T_{1}+T_{2})&O&O\\ O&O&O\end{bmatrix}^{\#_{\mathbb{A}}}\right)+\frac{1}{\sqrt{2}}w_{\mathbb{A}} \left(\begin{bmatrix}O&O&O\\ O&O&(T_{2}-T_{1})\\ O&T_{1}+T_{2}&O\end{bmatrix}^{\#_{\mathbb{A}}}\right)\] \[=w_{\mathbb{A}}\left(\begin{bmatrix}O&O&T_{3}\\ O&T_{3}&O\\ T_{3}&O&O\end{bmatrix}\right)+\frac{1}{2}w_{\mathbb{A}}\left(\begin{bmatrix}T_{ 1}+iT_{2}&O&-(iT_{1}+T_{2})\\ O&O&O\\ iT_{1}+T_{2}&O&T_{1}+iT_{2}\end{bmatrix}\right)\] \[+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix}O&-i(T_{2} -T_{1})&O\\ i(T_{1}+T_{2})&O&O\\ O&O&O\end{bmatrix}\right)+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix} O&O&O\\ O&O&(T_{2}-T_{1})\\ O&T_{1}+T_{2}&O\end{bmatrix}\right)\] \[\leq w_{A}(T_{3})+\max\{w_{A}(T_{1}),w_{A}(T_{2})\}+\frac{1}{ \sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix}O&-i(T_{2}-T_{1})&O\\ O&O&O\\ O&O&O\end{bmatrix}\right)+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix} O&O&O\\ i(T_{1}+T_{2})&O&O\\ O&O&O\end{bmatrix}\right)\] \[+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix}O&O&O\\ O&O&(T_{2}-T_{1})\\ O&O&O\end{bmatrix}\right)+\frac{1}{\sqrt{2}}w_{\mathbb{A}}\left(\begin{bmatrix} O&O&O\\ O&O&O\\ O&T_{1}+T_{2}&O\end{bmatrix}\right)\] \[=w_{A}(T_{3})+\max\{w_{A}(T_{1}),w_{A}(T_{2})\}+\frac{1}{\sqrt{2 }}\left(\|T_{1}-T_{2}\|_{A}+\|T_{1}+T_{2}\|_{A}\right),\]

View File

@@ -1,11 +0,0 @@
from datasets import load_dataset
from ...ocr_model.model.TexTeller import TexTeller
from ...globals import VOCAB_SIZE
if __name__ == '__main__':
tokenizer = TexTeller.get_tokenizer('/home/lhy/code/TexTeller/src/models/tokenizer/roberta-tokenizer-raw')
dataset = load_dataset("/home/lhy/code/TexTeller/src/models/ocr_model/train/data/loader.py")['train']
new_tokenizer = tokenizer.train_new_from_iterator(text_iterator=dataset['latex_formula'], vocab_size=VOCAB_SIZE)
new_tokenizer.save_pretrained('/home/lhy/code/TexTeller/src/models/tokenizer/roberta-tokenizer-7Mformulas')
pause = 1

View File

@@ -3,24 +3,17 @@ import heapq
import cv2
import numpy as np
from onnxruntime import InferenceSession
from collections import Counter
from typing import List
from PIL import Image
from surya.ocr import run_ocr
from surya.detection import batch_text_detection
from surya.input.processing import slice_polys_from_image, slice_bboxes_from_image
from surya.recognition import batch_recognition
from surya.model.detection import segformer
from surya.model.recognition.model import load_model
from surya.model.recognition.processor import load_processor
from ..det_model.inference import PredictConfig
from surya.detection import batch_text_detection
from surya.input.processing import slice_polys_from_image
from surya.recognition import batch_recognition
from ..det_model.inference import predict as latex_det_predict
from ..det_model.Bbox import Bbox, draw_bboxes
from ..ocr_model.model.TexTeller import TexTeller
from ..ocr_model.utils.inference import inference as latex_rec_predict
from ..ocr_model.utils.to_katex import to_katex
@@ -59,11 +52,10 @@ def split_conflict(ocr_bboxes: List[Bbox], latex_bboxes: List[Bbox]) -> List[Bbo
bboxes = sorted(ocr_bboxes + latex_bboxes)
######## debug #########
# log results
for idx, bbox in enumerate(bboxes):
bbox.content = str(idx)
draw_bboxes(Image.fromarray(img), bboxes, name="before_split_confict.png")
######## debug ###########
assert len(bboxes) > 1
@@ -125,11 +117,12 @@ def split_conflict(ocr_bboxes: List[Bbox], latex_bboxes: List[Bbox]) -> List[Bbo
assert False
res.append(candidate)
res.append(curr)
######## debug #########
# log results
for idx, bbox in enumerate(res):
bbox.content = str(idx)
draw_bboxes(Image.fromarray(img), res, name="after_split_confict.png")
######## debug ###########
return res
@@ -156,14 +149,17 @@ def mix_inference(
latex_bboxes = latex_det_predict(img_path, latex_det_model, infer_config)
latex_bboxes = sorted(latex_bboxes)
# log results
draw_bboxes(Image.fromarray(img), latex_bboxes, name="latex_bboxes(unmerged).png")
latex_bboxes = bbox_merge(latex_bboxes)
# log results
draw_bboxes(Image.fromarray(img), latex_bboxes, name="latex_bboxes(merged).png")
masked_img = mask_img(img, latex_bboxes, bg_color)
det_model, det_processor, rec_model, rec_processor = lang_ocr_models
images = [Image.fromarray(masked_img)]
det_prediction = batch_text_detection(images, det_model, det_processor)[0]
# log results
draw_bboxes(Image.fromarray(img), latex_bboxes, name="ocr_bboxes(unmerged).png")
lang = [language]
@@ -222,7 +218,6 @@ def mix_inference(
md = ""
prev = Bbox(bboxes[0].p.x, bboxes[0].p.y, -1, -1, label="guard")
# prev = bboxes[0]
for curr in bboxes:
if not prev.same_row(curr):
md += "\n"
@@ -235,23 +230,3 @@ def mix_inference(
md += '\n'
prev = curr
return md
if __name__ == '__main__':
img_path = "/Users/Leehy/Code/TexTeller/test3.png"
# latex_det_model = InferenceSession("/Users/Leehy/Code/TexTeller/src/models/det_model/model/rtdetr_r50vd_6x_coco_trained_on_IBEM_en_papers.onnx")
latex_det_model = InferenceSession("/Users/Leehy/Code/TexTeller/src/models/det_model/model/rtdetr_r50vd_6x_coco.onnx")
infer_config = PredictConfig("/Users/Leehy/Code/TexTeller/src/models/det_model/model/infer_cfg.yml")
det_processor, det_model = segformer.load_processor(), segformer.load_model()
rec_model, rec_processor = load_model(), load_processor()
lang_ocr_models = (det_model, det_processor, rec_model, rec_processor)
texteller = TexTeller.from_pretrained()
tokenizer = TexTeller.get_tokenizer()
latex_rec_models = (texteller, tokenizer)
res = mix_inference(img_path, "zh", infer_config, latex_det_model, lang_ocr_models, latex_rec_models)
print(res)
pause = 1