diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ac95090..6ada4fd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,8 +4,10 @@ repos: hooks: - id: ruff args: [--fix, --respect-gitignore, --config=pyproject.toml] + exclude: ^texteller/models/thrid_party/paddleocr/ - id: ruff-format args: [--config=pyproject.toml] + exclude: ^texteller/models/thrid_party/paddleocr/ - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 diff --git a/src/models/ocr_model/train/dataset/formulas.jsonl b/src/models/ocr_model/train/dataset/formulas.jsonl deleted file mode 100644 index 5a07425..0000000 --- a/src/models/ocr_model/train/dataset/formulas.jsonl +++ /dev/null @@ -1,35 +0,0 @@ -{"img_name": "0.png", "formula": "\\[\\mathbb{C}^{4}\\stackrel{{\\pi_{1}}}{{\\longleftarrow}}\\mathcal{ F}\\stackrel{{\\pi_{2}}}{{\\rightarrow}}\\mathcal{PT},\\]"} -{"img_name": "1.png", "formula": "\\[W^{*}_{Z}(x_{1},x_{2})=W_{f\\lrcorner Z}(y_{1},y_{2})=\\mathcal{P}\\exp\\left( \\int_{\\gamma}A_{\\mu}dx^{\\mu}\\right).\\]"} -{"img_name": "2.png", "formula": "\\[G=W^{*}_{Z}(q,p)=\\tilde{H}H^{-1}\\]"} -{"img_name": "3.png", "formula": "\\[H=W^{*}_{Z}(p,x),\\ \\ \\tilde{H}=W^{*}_{Z}(q,x).\\]"} -{"img_name": "4.png", "formula": "\\[v\\cdot f^{*}A|_{x}=(f\\lrcorner Z)_{*}v\\cdot A|_{f\\lrcorner Z(x)},\\quad x\\in Z, \\ v\\in T_{x}Z.\\]"} -{"img_name": "5.png", "formula": "\\[(f\\lrcorner Z)_{*}v\\cdot A|_{f\\lrcorner Z(x)}=v^{\\alpha\\dot{\\alpha}}\\Big{(} \\frac{\\partial y^{\\beta\\dot{\\beta}}}{\\partial x^{\\alpha\\dot{\\alpha}}}A_{\\beta \\dot{\\beta}}\\Big{)}\\Big{|}_{f\\lrcorner Z(x)},\\ x\\in Z,\\ v\\in T_{x}Z,\\]"} -{"img_name": "6.png", "formula": "\\[\\{T_{i},T_{j}\\}=\\{\\tilde{T}^{i},\\tilde{T}^{j}\\}=0,\\ \\ \\{T_{i},\\tilde{T}^{j}\\}=2i \\delta^{j}_{i}D,\\]"} -{"img_name": "7.png", "formula": "\\[(\\partial_{s},q_{i},\\tilde{q}^{k})\\rightarrow(D,M^{j}_{i}T_{j},\\tilde{M}^{k}_ {l}\\tilde{T}^{l}),\\]"} -{"img_name": "8.png", "formula": "\\[M^{i}_{j}\\tilde{M}^{j}_{k}=\\delta^{i}_{k}.\\]"} -{"img_name": "9.png", "formula": "\\[Q_{i\\alpha}=q_{i\\alpha}+\\omega_{i\\alpha},\\ \\tilde{Q}^{i}_{\\dot{\\alpha}}=q^{i}_{ \\dot{\\alpha}}+\\tilde{\\omega}^{i}_{\\dot{\\alpha}},\\ D_{\\alpha\\dot{\\alpha}}= \\partial_{\\alpha\\dot{\\alpha}}+A_{\\alpha\\dot{\\alpha}}.\\]"} -{"img_name": "10.png", "formula": "\\[\\hat{f}(g,\\theta^{i\\alpha},\\tilde{\\theta}^{\\dot{\\alpha}}_{j})=(f(g),[V^{-1}]^ {\\alpha}_{\\beta}\\theta^{i\\beta},[\\tilde{V}^{-1}]^{\\dot{\\alpha}}_{\\dot{\\beta}} \\tilde{\\theta}^{\\dot{\\beta}}_{j}),\\ g\\in{\\cal G},\\]"} -{"img_name": "11.png", "formula": "\\[v^{\\beta\\dot{\\beta}}V^{\\alpha}_{\\beta}\\tilde{V}^{\\dot{\\alpha}}_{\\dot{\\beta}} =((f\\lrcorner L_{0})_{*}v)^{\\alpha\\dot{\\alpha}},\\]"} -{"img_name": "12.png", "formula": "\\[\\omega_{i\\alpha}=\\tilde{\\theta}^{\\dot{\\alpha}}_{i}h_{\\alpha\\dot{\\alpha}}(x^{ \\beta\\dot{\\beta}},\\tau^{\\beta\\dot{\\beta}}),\\ \\ \\tilde{\\omega}^{i}_{\\alpha}=\\theta^{i\\alpha}\\tilde{h}_{\\alpha\\dot{\\alpha}}(x^{ \\beta\\dot{\\beta}},\\tau^{\\beta\\dot{\\beta}}),\\]"} -{"img_name": "13.png", "formula": "\\[\\begin{split}&\\lambda^{\\alpha}\\hat{f}^{*}\\omega_{i\\alpha}(z)= \\tilde{\\theta}^{\\dot{\\beta}}_{i}\\lambda^{\\alpha}\\left(V^{\\beta}_{\\alpha}h_{ \\beta\\dot{\\beta}}(x^{\\prime},\\tau^{\\prime})\\right),\\\\ &\\tilde{\\lambda}^{\\dot{\\alpha}}\\hat{f}^{*}\\tilde{\\omega}^{i}_{ \\dot{\\alpha}}(z)=\\theta^{i\\beta}\\tilde{\\lambda}^{\\dot{\\alpha}}\\left(\\tilde{V}^ 
{\\dot{\\beta}}_{\\dot{\\alpha}}\\tilde{h}_{\\beta\\dot{\\beta}}(x^{\\prime},\\tau^{ \\prime})\\right),\\end{split}\\]"} -{"img_name": "14.png", "formula": "\\[A_{\\alpha\\dot{\\alpha}}=A_{\\alpha\\dot{\\alpha}}(x^{\\beta\\dot{\\beta}},\\tau^{ \\beta\\dot{\\beta}})\\]"} -{"img_name": "15.png", "formula": "\\[D=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}D_{\\alpha\\dot{\\alpha}}\\]"} -{"img_name": "16.png", "formula": "\\[D=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}\\partial_{\\alpha\\dot{\\alpha}}\\]"} -{"img_name": "17.png", "formula": "\\[[v_{1}\\cdot D^{*},v_{2}\\cdot D^{*}]=0\\]"} -{"img_name": "18.png", "formula": "\\[\\Phi_{A}=(\\omega_{i\\alpha},\\tilde{\\omega}^{i}_{\\dot{\\alpha}},A_{\\alpha\\dot{ \\alpha}})\\]"} -{"img_name": "19.png", "formula": "\\[\\hat{f}:{\\cal F}^{6|4N}\\rightarrow{\\cal F}^{6|4N}\\]"} -{"img_name": "20.png", "formula": "\\[\\sigma=(s,\\xi^{i},\\tilde{\\xi}_{j})\\in\\mathbb{C}^{1|2N}\\]"} -{"img_name": "21.png", "formula": "\\[\\tau^{\\alpha\\dot{\\alpha}}(h_{\\alpha\\dot{\\alpha}}+\\tilde{h}_{\\alpha\\dot{\\alpha} })=0\\]"} -{"img_name": "22.png", "formula": "\\[\\tau^{\\alpha\\dot{\\alpha}}\\rightarrow[V^{-1}]^{\\alpha}_{\\beta}[\\tilde{V}^{-1}]^{ \\dot{\\alpha}}_{\\dot{\\beta}}\\tau^{\\beta\\dot{\\beta}}\\]"} -{"img_name": "23.png", "formula": "\\[\\tau^{\\beta\\dot{\\beta}}=\\sum_{i}\\theta^{i\\beta}\\tilde{\\theta}^{\\dot{\\beta}}_{i}\\]"} -{"img_name": "24.png", "formula": "\\[\\theta^{i\\alpha}\\omega_{i\\alpha}+\\tilde{\\theta}^{i}_{\\dot{\\alpha}}\\tilde{ \\omega}^{\\dot{\\alpha}}_{i}=0\\]"} -{"img_name": "25.png", "formula": "\\[\\tilde{T}^{i}=\\tilde{\\lambda}^{\\dot{\\alpha}}\\tilde{Q}^{i}_{\\dot{\\alpha}}\\]"} -{"img_name": "26.png", "formula": "\\[\\tilde{T}^{i}=\\tilde{\\lambda}^{\\dot{\\alpha}}\\tilde{q}^{i}_{\\dot{\\alpha}}\\]"} -{"img_name": "27.png", "formula": "\\[\\tilde{\\lambda}^{\\dot{\\alpha}}f^{*}A_{\\alpha\\dot{\\alpha}}=H^{-1}\\tilde{ \\lambda}^{\\dot{\\alpha}}\\partial_{\\alpha\\dot{\\alpha}}H\\]"} -{"img_name": "28.png", "formula": "\\[\\tilde{q}^{i}=\\partial_{\\tilde{\\xi}_{i}}+i\\xi^{i}\\partial_{s}\\]"} -{"img_name": "29.png", "formula": "\\[\\tilde{q}^{i}_{\\dot{\\alpha}}=\\frac{\\partial}{\\partial\\tilde{\\theta}^{\\dot{ \\alpha}}_{i}}+i\\theta^{i\\alpha}\\frac{\\partial}{\\partial x^{\\alpha\\dot{\\alpha}}}\\]"} -{"img_name": "30.png", "formula": "\\[f\\lrcorner L(z)=\\pi_{1}\\circ f(z,\\lambda,\\tilde{\\lambda})\\ \\forall z\\in L\\]"} -{"img_name": "31.png", "formula": "\\[q_{i\\alpha}=\\frac{\\partial}{\\partial\\theta^{i\\alpha}}+i\\tilde{\\theta}^{\\dot{ \\alpha}}_{i}\\frac{\\partial}{\\partial x^{\\alpha\\dot{\\alpha}}}\\]"} -{"img_name": "32.png", "formula": "\\[q_{i}=\\partial_{\\xi^{i}}+i\\tilde{\\xi}_{i}\\partial_{s}\\]"} -{"img_name": "33.png", "formula": "\\[v^{\\alpha\\dot{\\alpha}}=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}\\]"} -{"img_name": "34.png", "formula": "\\[z^{A}=(x^{\\alpha\\dot{\\alpha}},\\theta^{i\\alpha},\\tilde{\\theta}^{\\dot{\\alpha}}_{ j})\\]"} diff --git a/src/models/ocr_model/train/dataset/loader.py b/src/models/ocr_model/train/dataset/loader.py deleted file mode 100644 index f782f36..0000000 --- a/src/models/ocr_model/train/dataset/loader.py +++ /dev/null @@ -1,50 +0,0 @@ -from PIL import Image -from pathlib import Path -import datasets -import json - -DIR_URL = Path('absolute/path/to/dataset/directory') -# e.g. 
DIR_URL = Path('/home/OleehyO/TeXTeller/src/models/ocr_model/train/dataset') - - -class LatexFormulas(datasets.GeneratorBasedBuilder): - BUILDER_CONFIGS = [] - - def _info(self): - return datasets.DatasetInfo( - features=datasets.Features({ - "image": datasets.Image(), - "latex_formula": datasets.Value("string") - }) - ) - - def _split_generators(self, dl_manager: datasets.DownloadManager): - dir_path = Path(dl_manager.download(str(DIR_URL))) - assert dir_path.is_dir() - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - 'dir_path': dir_path, - } - ) - ] - - def _generate_examples(self, dir_path: Path): - images_path = dir_path / 'images' - formulas_path = dir_path / 'formulas.jsonl' - - img2formula = {} - with formulas_path.open('r', encoding='utf-8') as f: - for line in f: - single_json = json.loads(line) - img2formula[single_json['img_name']] = single_json['formula'] - - for img_path in images_path.iterdir(): - if img_path.suffix not in ['.jpg', '.png']: - continue - yield str(img_path), { - "image": Image.open(img_path), - "latex_formula": img2formula[img_path.name] - } diff --git a/src/models/ocr_model/train/training_args.py b/src/models/ocr_model/train/training_args.py deleted file mode 100644 index 07334fa..0000000 --- a/src/models/ocr_model/train/training_args.py +++ /dev/null @@ -1,38 +0,0 @@ -CONFIG = { - "seed": 42, # Random seed for reproducibility - "use_cpu": False, # Whether to use CPU (it's easier to debug with CPU when starting to test the code) - "learning_rate": 5e-5, # Learning rate - "num_train_epochs": 10, # Total number of training epochs - "per_device_train_batch_size": 4, # Batch size per GPU for training - "per_device_eval_batch_size": 8, # Batch size per GPU for evaluation - - "output_dir": "train_result", # Output directory - "overwrite_output_dir": False, # If the output directory exists, do not delete its content - "report_to": ["tensorboard"], # Report logs to TensorBoard - - "save_strategy": "steps", # Strategy to save checkpoints - "save_steps": 500, # Interval of steps to save checkpoints, can be int or a float (0~1), when float it represents the ratio of total training steps (e.g., can set to 1.0 / 2000) - "save_total_limit": 5, # Maximum number of models to save. 
The oldest models will be deleted if this number is exceeded - - "logging_strategy": "steps", # Log every certain number of steps - "logging_steps": 500, # Number of steps between each log - "logging_nan_inf_filter": False, # Record logs for loss=nan or inf - - "optim": "adamw_torch", # Optimizer - "lr_scheduler_type": "cosine", # Learning rate scheduler - "warmup_ratio": 0.1, # Ratio of warmup steps in total training steps (e.g., for 1000 steps, the first 100 steps gradually increase lr from 0 to the set lr) - "max_grad_norm": 1.0, # For gradient clipping, ensure the norm of the gradients does not exceed 1.0 (default 1.0) - "fp16": False, # Whether to use 16-bit floating point for training (generally not recommended, as loss can easily explode) - "bf16": False, # Whether to use Brain Floating Point (bfloat16) for training (recommended if architecture supports it) - "gradient_accumulation_steps": 1, # Gradient accumulation steps, consider this parameter to achieve large batch size effects when batch size cannot be large - "jit_mode_eval": False, # Whether to use PyTorch jit trace during eval (can speed up the model, but the model must be static, otherwise will throw errors) - "torch_compile": False, # Whether to use torch.compile to compile the model (for better training and inference performance) - - "dataloader_pin_memory": True, # Can speed up data transfer between CPU and GPU - "dataloader_num_workers": 1, # Default is not to use multiprocessing for data loading, usually set to 4*number of GPUs used - - "evaluation_strategy": "steps", # Evaluation strategy, can be "steps" or "epoch" - "eval_steps": 500, # If evaluation_strategy="step" - - "remove_unused_columns": False, # Don't change this unless you really know what you are doing. -} diff --git a/src/models/utils/__init__.py b/src/models/utils/__init__.py deleted file mode 100644 index 775dc11..0000000 --- a/src/models/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .mix_inference import mix_inference \ No newline at end of file diff --git a/src/rec_infer_from_crop_imgs.py b/src/rec_infer_from_crop_imgs.py deleted file mode 100644 index 89bef18..0000000 --- a/src/rec_infer_from_crop_imgs.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import argparse -import cv2 as cv -from pathlib import Path -from models.ocr_model.utils.to_katex import to_katex -from models.ocr_model.utils.inference import inference as latex_inference -from models.ocr_model.model.TexTeller import TexTeller - - -if __name__ == '__main__': - os.chdir(Path(__file__).resolve().parent) - parser = argparse.ArgumentParser() - parser.add_argument( - '-img_dir', - type=str, - help='path to the input image', - default='./detect_results/subimages' - ) - parser.add_argument( - '-output_dir', - type=str, - help='path to the output dir', - default='./rec_results' - ) - parser.add_argument( - '--inference-mode', - type=str, - default='cpu', - help='Inference mode, select one of cpu, cuda, or mps' - ) - parser.add_argument( - '--num-beam', - type=int, - default=1, - help='number of beam search for decoding' - ) - - args = parser.parse_args() - - print('Loading model and tokenizer...') - latex_rec_model = TexTeller.from_pretrained() - tokenizer = TexTeller.get_tokenizer() - print('Model and tokenizer loaded.') - - # Create the output directory if it doesn't exist - os.makedirs(args.output_dir, exist_ok=True) - - # Loop through all images in the input directory - for filename in os.listdir(args.img_dir): - img_path = os.path.join(args.img_dir, filename) - img = cv.imread(img_path) 
- - if img is not None: - print(f'Inference for {filename}...') - res = latex_inference(latex_rec_model, tokenizer, [img], accelerator=args.inference_mode, num_beams=args.num_beam) - res = to_katex(res[0]) - - # Save the recognition result to a text file - output_file = os.path.join(args.output_dir, os.path.splitext(filename)[0] + '.txt') - with open(output_file, 'w') as f: - f.write(res) - - print(f'Result saved to {output_file}') - else: - print(f"Warning: Could not read image {img_path}. Skipping...") diff --git a/src/client_demo.py b/texteller/client_demo.py similarity index 100% rename from src/client_demo.py rename to texteller/client_demo.py diff --git a/src/infer_det.py b/texteller/infer_det.py similarity index 65% rename from src/infer_det.py rename to texteller/infer_det.py index 00baf9e..2250ae3 100644 --- a/src/infer_det.py +++ b/texteller/infer_det.py @@ -1,85 +1,96 @@ -import os -import argparse -import glob -import subprocess - -import onnxruntime -from pathlib import Path - -from models.det_model.inference import PredictConfig, predict_image - - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument("--infer_cfg", type=str, help="infer_cfg.yml", - default="./models/det_model/model/infer_cfg.yml") -parser.add_argument('--onnx_file', type=str, help="onnx model file path", - default="./models/det_model/model/rtdetr_r50vd_6x_coco.onnx") -parser.add_argument("--image_dir", type=str, default='./testImgs') -parser.add_argument("--image_file", type=str) -parser.add_argument("--imgsave_dir", type=str, default="./detect_results") -parser.add_argument('--use_gpu', action='store_true', help='Whether to use GPU for inference', default=True) - - -def get_test_images(infer_dir, infer_img): - """ - Get image path list in TEST mode - """ - assert infer_img is not None or infer_dir is not None, \ - "--image_file or --image_dir should be set" - assert infer_img is None or os.path.isfile(infer_img), \ - "{} is not a file".format(infer_img) - assert infer_dir is None or os.path.isdir(infer_dir), \ - "{} is not a directory".format(infer_dir) - - # infer_img has a higher priority - if infer_img and os.path.isfile(infer_img): - return [infer_img] - - images = set() - infer_dir = os.path.abspath(infer_dir) - assert os.path.isdir(infer_dir), \ - "infer_dir {} is not a directory".format(infer_dir) - exts = ['jpg', 'jpeg', 'png', 'bmp'] - exts += [ext.upper() for ext in exts] - for ext in exts: - images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) - images = list(images) - - assert len(images) > 0, "no image found in {}".format(infer_dir) - print("Found {} inference images in total.".format(len(images))) - - return images - -def download_file(url, filename): - print(f"Downloading {filename}...") - subprocess.run(["wget", "-q", "--show-progress", "-O", filename, url], check=True) - print("Download complete.") - -if __name__ == '__main__': - cur_path = os.getcwd() - script_dirpath = Path(__file__).resolve().parent - os.chdir(script_dirpath) - - FLAGS = parser.parse_args() - - if not os.path.exists(FLAGS.infer_cfg): - infer_cfg_url = "https://huggingface.co/TonyLee1256/texteller_det/resolve/main/infer_cfg.yml?download=true" - download_file(infer_cfg_url, FLAGS.infer_cfg) - - if not os.path.exists(FLAGS.onnx_file): - onnx_file_url = "https://huggingface.co/TonyLee1256/texteller_det/resolve/main/rtdetr_r50vd_6x_coco.onnx?download=true" - download_file(onnx_file_url, FLAGS.onnx_file) - - # load image list - img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) - - if 
FLAGS.use_gpu: - predictor = onnxruntime.InferenceSession(FLAGS.onnx_file, providers=['CUDAExecutionProvider']) - else: - predictor = onnxruntime.InferenceSession(FLAGS.onnx_file, providers=['CPUExecutionProvider']) - # load infer config - infer_config = PredictConfig(FLAGS.infer_cfg) - - predict_image(FLAGS.imgsave_dir, infer_config, predictor, img_list) - - os.chdir(cur_path) +import os +import argparse +import glob +import subprocess + +import onnxruntime +from pathlib import Path + +from models.det_model.inference import PredictConfig, predict_image + + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--infer_cfg", type=str, help="infer_cfg.yml", default="./models/det_model/model/infer_cfg.yml" +) +parser.add_argument( + '--onnx_file', + type=str, + help="onnx model file path", + default="./models/det_model/model/rtdetr_r50vd_6x_coco.onnx", +) +parser.add_argument("--image_dir", type=str, default='./testImgs') +parser.add_argument("--image_file", type=str) +parser.add_argument("--imgsave_dir", type=str, default="./detect_results") +parser.add_argument( + '--use_gpu', action='store_true', help='Whether to use GPU for inference', default=True +) + + +def get_test_images(infer_dir, infer_img): + """ + Get image path list in TEST mode + """ + assert ( + infer_img is not None or infer_dir is not None + ), "--image_file or --image_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), "{} is not a directory".format(infer_dir) + + # infer_img has a higher priority + if infer_img and os.path.isfile(infer_img): + return [infer_img] + + images = set() + infer_dir = os.path.abspath(infer_dir) + assert os.path.isdir(infer_dir), "infer_dir {} is not a directory".format(infer_dir) + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) + images = list(images) + + assert len(images) > 0, "no image found in {}".format(infer_dir) + print("Found {} inference images in total.".format(len(images))) + + return images + + +def download_file(url, filename): + print(f"Downloading {filename}...") + subprocess.run(["wget", "-q", "--show-progress", "-O", filename, url], check=True) + print("Download complete.") + + +if __name__ == '__main__': + cur_path = os.getcwd() + script_dirpath = Path(__file__).resolve().parent + os.chdir(script_dirpath) + + FLAGS = parser.parse_args() + + if not os.path.exists(FLAGS.infer_cfg): + infer_cfg_url = "https://huggingface.co/TonyLee1256/texteller_det/resolve/main/infer_cfg.yml?download=true" + download_file(infer_cfg_url, FLAGS.infer_cfg) + + if not os.path.exists(FLAGS.onnx_file): + onnx_file_url = "https://huggingface.co/TonyLee1256/texteller_det/resolve/main/rtdetr_r50vd_6x_coco.onnx?download=true" + download_file(onnx_file_url, FLAGS.onnx_file) + + # load image list + img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) + + if FLAGS.use_gpu: + predictor = onnxruntime.InferenceSession( + FLAGS.onnx_file, providers=['CUDAExecutionProvider'] + ) + else: + predictor = onnxruntime.InferenceSession( + FLAGS.onnx_file, providers=['CPUExecutionProvider'] + ) + # load infer config + infer_config = PredictConfig(FLAGS.infer_cfg) + + predict_image(FLAGS.imgsave_dir, infer_config, predictor, img_list) + + os.chdir(cur_path) diff --git a/src/inference.py b/texteller/inference.py similarity index 74% rename from src/inference.py rename 
to texteller/inference.py index 07a0cae..f6cfe5b 100644 --- a/src/inference.py +++ b/texteller/inference.py @@ -18,32 +18,20 @@ from models.det_model.inference import PredictConfig if __name__ == '__main__': os.chdir(Path(__file__).resolve().parent) parser = argparse.ArgumentParser() + parser.add_argument('-img', type=str, required=True, help='path to the input image') parser.add_argument( - '-img', - type=str, - required=True, - help='path to the input image' - ) - parser.add_argument( - '--inference-mode', + '--inference-mode', type=str, default='cpu', - help='Inference mode, select one of cpu, cuda, or mps' + help='Inference mode, select one of cpu, cuda, or mps', ) parser.add_argument( - '--num-beam', - type=int, - default=1, - help='number of beam search for decoding' + '--num-beam', type=int, default=1, help='number of beam search for decoding' ) - parser.add_argument( - '-mix', - action='store_true', - help='use mix mode' - ) - + parser.add_argument('-mix', action='store_true', help='use mix mode') + args = parser.parse_args() - + # You can use your own checkpoint and tokenizer path. print('Loading model and tokenizer...') latex_rec_model = TexTeller.from_pretrained() @@ -63,8 +51,8 @@ if __name__ == '__main__': use_gpu = args.inference_mode == 'cuda' SIZE_LIMIT = 20 * 1024 * 1024 - det_model_dir = "./models/thrid_party/paddleocr/checkpoints/det/default_model.onnx" - rec_model_dir = "./models/thrid_party/paddleocr/checkpoints/rec/default_model.onnx" + det_model_dir = "./models/thrid_party/paddleocr/checkpoints/det/default_model.onnx" + rec_model_dir = "./models/thrid_party/paddleocr/checkpoints/rec/default_model.onnx" # The CPU inference of the detection model will be faster than the GPU inference (in onnxruntime) det_use_gpu = False rec_use_gpu = use_gpu and not (os.path.getsize(rec_model_dir) < SIZE_LIMIT) @@ -78,8 +66,16 @@ if __name__ == '__main__': detector = predict_det.TextDetector(paddleocr_args) paddleocr_args.use_gpu = rec_use_gpu recognizer = predict_rec.TextRecognizer(paddleocr_args) - + lang_ocr_models = [detector, recognizer] latex_rec_models = [latex_rec_model, tokenizer] - res = mix_inference(img_path, infer_config, latex_det_model, lang_ocr_models, latex_rec_models, args.inference_mode, args.num_beam) + res = mix_inference( + img_path, + infer_config, + latex_det_model, + lang_ocr_models, + latex_rec_models, + args.inference_mode, + args.num_beam, + ) print(res) diff --git a/texteller/models/__pycache__/globals.cpython-310.pyc b/texteller/models/__pycache__/globals.cpython-310.pyc new file mode 100644 index 0000000..48b23dd Binary files /dev/null and b/texteller/models/__pycache__/globals.cpython-310.pyc differ diff --git a/src/models/det_model/Bbox.py b/texteller/models/det_model/Bbox.py similarity index 90% rename from src/models/det_model/Bbox.py rename to texteller/models/det_model/Bbox.py index 9784541..53d5735 100644 --- a/src/models/det_model/Bbox.py +++ b/texteller/models/det_model/Bbox.py @@ -9,7 +9,7 @@ class Point: def __init__(self, x: int, y: int): self.x = int(x) self.y = int(y) - + def __repr__(self) -> str: return f"Point(x={self.x}, y={self.y})" @@ -28,30 +28,28 @@ class Bbox: @property def ul_point(self) -> Point: return self.p - + @property def ur_point(self) -> Point: return Point(self.p.x + self.w, self.p.y) - + @property def ll_point(self) -> Point: return Point(self.p.x, self.p.y + self.h) - + @property def lr_point(self) -> Point: return Point(self.p.x + self.w, self.p.y + self.h) - - + def same_row(self, other) -> bool: - if ( - (self.p.y >= 
other.p.y and self.ll_point.y <= other.ll_point.y) - or (self.p.y <= other.p.y and self.ll_point.y >= other.ll_point.y) + if (self.p.y >= other.p.y and self.ll_point.y <= other.ll_point.y) or ( + self.p.y <= other.p.y and self.ll_point.y >= other.ll_point.y ): return True if self.ll_point.y <= other.p.y or self.p.y >= other.ll_point.y: return False return 1.0 * abs(self.p.y - other.p.y) / max(self.h, other.h) < self.THREADHOLD - + def __lt__(self, other) -> bool: ''' from top to bottom, from left to right @@ -60,7 +58,7 @@ class Bbox: return self.p.y < other.p.y else: return self.p.x < other.p.x - + def __repr__(self) -> str: return f"Bbox(upper_left_point={self.p}, h={self.h}, w={self.w}), label={self.label}, confident={self.confidence}, content={self.content})" @@ -76,16 +74,16 @@ def draw_bboxes(img: Image.Image, bboxes: List[Bbox], name="annotated_image.png" top = bbox.p.y right = bbox.p.x + bbox.w bottom = bbox.p.y + bbox.h - + # Draw the rectangle on the image drawer.rectangle([left, top, right, bottom], outline="green", width=1) - + # Optionally, add text label if it exists if bbox.label: drawer.text((left, top), bbox.label, fill="blue") - + if bbox.content: drawer.text((left, bottom - 10), bbox.content[:10], fill="red") # Save the image with drawn rectangles - img.save(log_dir / name) \ No newline at end of file + img.save(log_dir / name) diff --git a/texteller/models/det_model/__pycache__/Bbox.cpython-310.pyc b/texteller/models/det_model/__pycache__/Bbox.cpython-310.pyc new file mode 100644 index 0000000..c9e0f25 Binary files /dev/null and b/texteller/models/det_model/__pycache__/Bbox.cpython-310.pyc differ diff --git a/texteller/models/det_model/__pycache__/inference.cpython-310.pyc b/texteller/models/det_model/__pycache__/inference.cpython-310.pyc new file mode 100644 index 0000000..58073df Binary files /dev/null and b/texteller/models/det_model/__pycache__/inference.cpython-310.pyc differ diff --git a/texteller/models/det_model/__pycache__/preprocess.cpython-310.pyc b/texteller/models/det_model/__pycache__/preprocess.cpython-310.pyc new file mode 100644 index 0000000..f8d3d37 Binary files /dev/null and b/texteller/models/det_model/__pycache__/preprocess.cpython-310.pyc differ diff --git a/src/models/det_model/inference.py b/texteller/models/det_model/inference.py similarity index 81% rename from src/models/det_model/inference.py rename to texteller/models/det_model/inference.py index 5e0dd2c..c866ae7 100644 --- a/src/models/det_model/inference.py +++ b/texteller/models/det_model/inference.py @@ -12,10 +12,28 @@ from .Bbox import Bbox # Global dictionary SUPPORT_MODELS = { - 'YOLO', 'PPYOLOE', 'RCNN', 'SSD', 'Face', 'FCOS', 'SOLOv2', 'TTFNet', - 'S2ANet', 'JDE', 'FairMOT', 'DeepSORT', 'GFL', 'PicoDet', 'CenterNet', - 'TOOD', 'RetinaNet', 'StrongBaseline', 'STGCN', 'YOLOX', 'HRNet', - 'DETR' + 'YOLO', + 'PPYOLOE', + 'RCNN', + 'SSD', + 'Face', + 'FCOS', + 'SOLOv2', + 'TTFNet', + 'S2ANet', + 'JDE', + 'FairMOT', + 'DeepSORT', + 'GFL', + 'PicoDet', + 'CenterNet', + 'TOOD', + 'RetinaNet', + 'StrongBaseline', + 'STGCN', + 'YOLOX', + 'HRNet', + 'DETR', } @@ -42,12 +60,12 @@ class PredictConfig(object): self.fpn_stride = yml_conf.get("fpn_stride", None) color_pool = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)] - self.colors = {label: color_pool[i % len(color_pool)] for i, label in enumerate(self.label_list)} + self.colors = { + label: color_pool[i % len(color_pool)] for i, label in enumerate(self.label_list) + } if self.arch == 'RCNN' and 
yml_conf.get('export_onnx', False): - print( - 'The RCNN export model is used for ONNX and it only supports batch_size = 1' - ) + print('The RCNN export model is used for ONNX and it only supports batch_size = 1') self.print_config() def check_model(self, yml_conf): @@ -58,8 +76,7 @@ class PredictConfig(object): for support_model in SUPPORT_MODELS: if support_model in yml_conf['arch']: return True - raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ - 'arch'], SUPPORT_MODELS)) + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf['arch'], SUPPORT_MODELS)) def print_config(self): print('----------- Model Configuration -----------') @@ -77,8 +94,15 @@ def draw_bbox(image, outputs, infer_config): label = infer_config.label_list[int(cls_id)] color = infer_config.colors[label] cv2.rectangle(image, (int(xmin), int(ymin)), (int(xmax), int(ymax)), color, 2) - cv2.putText(image, "{}: {:.2f}".format(label, score), - (int(xmin), int(ymin - 5)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + cv2.putText( + image, + "{}: {:.2f}".format(label, score), + (int(xmin), int(ymin - 5)), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + color, + 2, + ) return image @@ -104,7 +128,7 @@ def predict_image(imgsave_dir, infer_config, predictor, img_list): inputs = transforms(img_path) inputs_name = [var.name for var in predictor.get_inputs()] - inputs = {k: inputs[k][None, ] for k in inputs_name} + inputs = {k: inputs[k][None,] for k in inputs_name} # Start timing start_time = time.time() @@ -119,7 +143,9 @@ else: total_time += inference_time num_images += 1 - print(f"ONNXRuntime predict time for {os.path.basename(img_path)}: {inference_time:.4f} seconds") + print( + f"ONNXRuntime predict time for {os.path.basename(img_path)}: {inference_time:.4f} seconds" + ) print("ONNXRuntime predict: ") if infer_config.arch in ["HRNet"]: @@ -128,8 +154,7 @@ bboxes = np.array(outputs[0]) for bbox in bboxes: if bbox[0] > -1 and bbox[1] > infer_config.draw_threshold: - print(f"{int(bbox[0])} {bbox[1]} " - f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}") + print(f"{int(bbox[0])} {bbox[1]} " f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}") # Save the subimages (crop from the original image) subimg_counter = 1 @@ -137,7 +162,7 @@ cls_id, score, xmin, ymin, xmax, ymax = output if score > infer_config.draw_threshold: label = infer_config.label_list[int(cls_id)] - subimg = img[int(max(ymin, 0)):int(ymax), int(max(xmin, 0)):int(xmax)] + subimg = img[int(max(ymin, 0)) : int(ymax), int(max(xmin, 0)) : int(xmax)] if len(subimg) == 0: continue @@ -151,8 +176,14 @@ for output in np.array(outputs[0]): cls_id, score, xmin, ymin, xmax, ymax = output if score > infer_config.draw_threshold: - cv2.rectangle(img_with_mask, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (255, 255, 255), -1) # mask with white - + cv2.rectangle( + img_with_mask, + (int(xmin), int(ymin)), + (int(xmax), int(ymax)), + (255, 255, 255), + -1, + ) # mask with white + img_with_bbox = draw_bbox(img, np.array(outputs[0]), infer_config) output_dir = imgsave_dir @@ -178,7 +209,7 @@ def predict(img_path: str, predictor, infer_config) -> List[Bbox]: transforms = Compose(infer_config.preprocess_infos) inputs = transforms(img_path) inputs_name = [var.name for var in predictor.get_inputs()] - inputs = {k: inputs[k][None, ] for k in 
inputs_name} + inputs = {k: inputs[k][None,] for k in inputs_name} outputs = predictor.run(output_names=None, input_feed=inputs)[0] res = [] diff --git a/src/models/det_model/model/infer_cfg.yml b/texteller/models/det_model/model/infer_cfg.yml similarity index 100% rename from src/models/det_model/model/infer_cfg.yml rename to texteller/models/det_model/model/infer_cfg.yml diff --git a/src/models/det_model/preprocess.py b/texteller/models/det_model/preprocess.py similarity index 86% rename from src/models/det_model/preprocess.py rename to texteller/models/det_model/preprocess.py index 6b72494..935a2ae 100644 --- a/src/models/det_model/preprocess.py +++ b/texteller/models/det_model/preprocess.py @@ -15,10 +15,8 @@ def decode_image(img_path): im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) img_info = { - "im_shape": np.array( - im.shape[:2], dtype=np.float32), - "scale_factor": np.array( - [1., 1.], dtype=np.float32) + "im_shape": np.array(im.shape[:2], dtype=np.float32), + "scale_factor": np.array([1.0, 1.0], dtype=np.float32), } return im, img_info @@ -51,16 +49,9 @@ class Resize(object): assert self.target_size[0] > 0 and self.target_size[1] > 0 im_channel = im.shape[2] im_scale_y, im_scale_x = self.generate_scale(im) - im = cv2.resize( - im, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) + im = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) im_info['im_shape'] = np.array(im.shape[:2]).astype('float32') - im_info['scale_factor'] = np.array( - [im_scale_y, im_scale_x]).astype('float32') + im_info['scale_factor'] = np.array([im_scale_y, im_scale_x]).astype('float32') return im, im_info def generate_scale(self, im): @@ -134,7 +125,9 @@ class Permute(object): channel_first (bool): whether convert HWC to CHW """ - def __init__(self, ): + def __init__( + self, + ): super(Permute, self).__init__() def __call__(self, im, im_info): @@ -151,7 +144,7 @@ class Permute(object): class PadStride(object): - """ padding image for model with FPN, instead PadBatch(pad_to_stride) in original config + """padding image for model with FPN, instead PadBatch(pad_to_stride) in original config Args: stride (bool): model with FPN need image shape % stride == 0 """ @@ -198,18 +191,16 @@ class LetterBoxResize(object): ratio_h = float(height) / shape[0] ratio_w = float(width) / shape[1] ratio = min(ratio_h, ratio_w) - new_shape = (round(shape[1] * ratio), - round(shape[0] * ratio)) # [width, height] + new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) # [width, height] padw = (width - new_shape[0]) / 2 padh = (height - new_shape[1]) / 2 top, bottom = round(padh - 0.1), round(padh + 0.1) left, right = round(padw - 0.1), round(padw + 0.1) - img = cv2.resize( - img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border + img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border img = cv2.copyMakeBorder( - img, top, bottom, left, right, cv2.BORDER_CONSTANT, - value=color) # padded rectangular + img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color + ) # padded rectangular return img, ratio, padw, padh def __call__(self, im, im_info): @@ -302,12 +293,7 @@ def _get_3rd_point(a, b): return third_pt -def get_affine_transform(center, - input_size, - rot, - output_size, - shift=(0., 0.), - inv=False): +def get_affine_transform(center, input_size, rot, output_size, shift=(0.0, 0.0), inv=False): """Get the affine transform matrix, given the 
center/scale/rot/output_size. Args: @@ -337,8 +323,8 @@ def get_affine_transform(center, dst_h = output_size[1] rot_rad = np.pi * rot / 180 - src_dir = rotate_point([0., src_w * -0.5], rot_rad) - dst_dir = np.array([0., dst_w * -0.5]) + src_dir = rotate_point([0.0, src_w * -0.5], rot_rad) + dst_dir = np.array([0.0, dst_w * -0.5]) src = np.zeros((3, 2), dtype=np.float32) src[0, :] = center + scale_tmp * shift @@ -359,16 +345,9 @@ def get_affine_transform(center, class WarpAffine(object): - """Warp affine the image - """ + """Warp affine the image""" - def __init__(self, - keep_res=False, - pad=31, - input_h=512, - input_w=512, - scale=0.4, - shift=0.1): + def __init__(self, keep_res=False, pad=31, input_h=512, input_w=512, scale=0.4, shift=0.1): self.keep_res = keep_res self.pad = pad self.input_h = input_h @@ -398,12 +377,11 @@ class WarpAffine(object): else: s = max(h, w) * 1.0 input_h, input_w = self.input_h, self.input_w - c = np.array([w / 2., h / 2.], dtype=np.float32) + c = np.array([w / 2.0, h / 2.0], dtype=np.float32) trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) img = cv2.resize(img, (w, h)) - inp = cv2.warpAffine( - img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) + inp = cv2.warpAffine(img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) return inp, im_info @@ -432,13 +410,17 @@ def get_warp_matrix(theta, size_input, size_dst, size_target): matrix[0, 0] = np.cos(theta) * scale_x matrix[0, 1] = -np.sin(theta) * scale_x matrix[0, 2] = scale_x * ( - -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * - np.sin(theta) + 0.5 * size_target[0]) + -0.5 * size_input[0] * np.cos(theta) + + 0.5 * size_input[1] * np.sin(theta) + + 0.5 * size_target[0] + ) matrix[1, 0] = np.sin(theta) * scale_y matrix[1, 1] = np.cos(theta) * scale_y matrix[1, 2] = scale_y * ( - -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * - np.cos(theta) + 0.5 * size_target[1]) + -0.5 * size_input[0] * np.sin(theta) + - 0.5 * size_input[1] * np.cos(theta) + + 0.5 * size_target[1] + ) return matrix @@ -462,22 +444,26 @@ class TopDownEvalAffine(object): def __call__(self, image, im_info): rot = 0 imshape = im_info['im_shape'][::-1] - center = im_info['center'] if 'center' in im_info else imshape / 2. 
+ center = im_info['center'] if 'center' in im_info else imshape / 2.0 scale = im_info['scale'] if 'scale' in im_info else imshape if self.use_udp: trans = get_warp_matrix( - rot, center * 2.0, - [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale) + rot, center * 2.0, [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale + ) image = cv2.warpAffine( image, - trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR) + trans, + (int(self.trainsize[0]), int(self.trainsize[1])), + flags=cv2.INTER_LINEAR, + ) else: trans = get_affine_transform(center, scale, rot, self.trainsize) image = cv2.warpAffine( image, - trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR) + trans, + (int(self.trainsize[0]), int(self.trainsize[1])), + flags=cv2.INTER_LINEAR, + ) return image, im_info diff --git a/src/models/globals.py b/texteller/models/globals.py similarity index 92% rename from src/models/globals.py rename to texteller/models/globals.py index 4d437a0..8754d67 100644 --- a/src/models/globals.py +++ b/texteller/models/globals.py @@ -1,6 +1,6 @@ # Formula image(grayscale) mean and variance IMAGE_MEAN = 0.9545467 -IMAGE_STD = 0.15394445 +IMAGE_STD = 0.15394445 # Vocabulary size for TexTeller VOCAB_SIZE = 15000 @@ -20,4 +20,4 @@ MIN_RESIZE_RATIO = 0.75 # Minimum height and width for input image for TexTeller MIN_HEIGHT = 12 -MIN_WIDTH = 30 +MIN_WIDTH = 30 diff --git a/src/models/ocr_model/model/TexTeller.py b/texteller/models/ocr_model/model/TexTeller.py similarity index 63% rename from src/models/ocr_model/model/TexTeller.py rename to texteller/models/ocr_model/model/TexTeller.py index 1f7e0ac..4f916cd 100644 --- a/src/models/ocr_model/model/TexTeller.py +++ b/texteller/models/ocr_model/model/TexTeller.py @@ -1,30 +1,24 @@ from pathlib import Path -from ...globals import ( - VOCAB_SIZE, - FIXED_IMG_SIZE, - IMG_CHANNELS, - MAX_TOKEN_SIZE -) +from ...globals import VOCAB_SIZE, FIXED_IMG_SIZE, IMG_CHANNELS, MAX_TOKEN_SIZE -from transformers import ( - RobertaTokenizerFast, - VisionEncoderDecoderModel, - VisionEncoderDecoderConfig -) +from transformers import RobertaTokenizerFast, VisionEncoderDecoderModel, VisionEncoderDecoderConfig class TexTeller(VisionEncoderDecoderModel): REPO_NAME = 'OleehyO/TexTeller' + def __init__(self): - config = VisionEncoderDecoderConfig.from_pretrained(Path(__file__).resolve().parent / "config.json") - config.encoder.image_size = FIXED_IMG_SIZE - config.encoder.num_channels = IMG_CHANNELS - config.decoder.vocab_size = VOCAB_SIZE + config = VisionEncoderDecoderConfig.from_pretrained( + Path(__file__).resolve().parent / "config.json" + ) + config.encoder.image_size = FIXED_IMG_SIZE + config.encoder.num_channels = IMG_CHANNELS + config.decoder.vocab_size = VOCAB_SIZE config.decoder.max_position_embeddings = MAX_TOKEN_SIZE super().__init__(config=config) - + @classmethod def from_pretrained(cls, model_path: str = None, use_onnx=False, onnx_provider=None): if model_path is None or model_path == 'default': @@ -32,8 +26,12 @@ class TexTeller(VisionEncoderDecoderModel): return VisionEncoderDecoderModel.from_pretrained(cls.REPO_NAME) else: from optimum.onnxruntime import ORTModelForVision2Seq + use_gpu = True if onnx_provider == 'cuda' else False - return ORTModelForVision2Seq.from_pretrained(cls.REPO_NAME, provider="CUDAExecutionProvider" if use_gpu else "CPUExecutionProvider") + return ORTModelForVision2Seq.from_pretrained( + cls.REPO_NAME, + provider="CUDAExecutionProvider" if use_gpu else "CPUExecutionProvider", + ) 
model_path = Path(model_path).resolve() return VisionEncoderDecoderModel.from_pretrained(str(model_path)) diff --git a/texteller/models/ocr_model/model/__pycache__/TexTeller.cpython-310.pyc b/texteller/models/ocr_model/model/__pycache__/TexTeller.cpython-310.pyc new file mode 100644 index 0000000..ece8c18 Binary files /dev/null and b/texteller/models/ocr_model/model/__pycache__/TexTeller.cpython-310.pyc differ diff --git a/src/models/ocr_model/model/config.json b/texteller/models/ocr_model/model/config.json similarity index 100% rename from src/models/ocr_model/model/config.json rename to texteller/models/ocr_model/model/config.json diff --git a/texteller/models/ocr_model/train/__pycache__/train.cpython-310.pyc b/texteller/models/ocr_model/train/__pycache__/train.cpython-310.pyc new file mode 100644 index 0000000..530caa5 Binary files /dev/null and b/texteller/models/ocr_model/train/__pycache__/train.cpython-310.pyc differ diff --git a/texteller/models/ocr_model/train/__pycache__/training_args.cpython-310.pyc b/texteller/models/ocr_model/train/__pycache__/training_args.cpython-310.pyc new file mode 100644 index 0000000..224449c Binary files /dev/null and b/texteller/models/ocr_model/train/__pycache__/training_args.cpython-310.pyc differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_0.png b/texteller/models/ocr_model/train/augraphy_cache/image_0.png new file mode 100644 index 0000000..a149048 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_0.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_1.png b/texteller/models/ocr_model/train/augraphy_cache/image_1.png new file mode 100644 index 0000000..10a2184 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_1.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_10.png b/texteller/models/ocr_model/train/augraphy_cache/image_10.png new file mode 100644 index 0000000..70401c2 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_10.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_11.png b/texteller/models/ocr_model/train/augraphy_cache/image_11.png new file mode 100644 index 0000000..3acda0d Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_11.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_12.png b/texteller/models/ocr_model/train/augraphy_cache/image_12.png new file mode 100644 index 0000000..b03dfb7 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_12.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_13.png b/texteller/models/ocr_model/train/augraphy_cache/image_13.png new file mode 100644 index 0000000..64b7abb Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_13.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_14.png b/texteller/models/ocr_model/train/augraphy_cache/image_14.png new file mode 100644 index 0000000..281ad58 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_14.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_15.png b/texteller/models/ocr_model/train/augraphy_cache/image_15.png new file mode 100644 index 0000000..671e70c Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_15.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_16.png 
b/texteller/models/ocr_model/train/augraphy_cache/image_16.png new file mode 100644 index 0000000..0061a0b Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_16.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_17.png b/texteller/models/ocr_model/train/augraphy_cache/image_17.png new file mode 100644 index 0000000..321af30 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_17.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_18.png b/texteller/models/ocr_model/train/augraphy_cache/image_18.png new file mode 100644 index 0000000..e9eb26b Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_18.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_19.png b/texteller/models/ocr_model/train/augraphy_cache/image_19.png new file mode 100644 index 0000000..8f9ef59 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_19.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_2.png b/texteller/models/ocr_model/train/augraphy_cache/image_2.png new file mode 100644 index 0000000..b538696 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_2.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_20.png b/texteller/models/ocr_model/train/augraphy_cache/image_20.png new file mode 100644 index 0000000..db40eb2 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_20.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_21.png b/texteller/models/ocr_model/train/augraphy_cache/image_21.png new file mode 100644 index 0000000..cc9d586 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_21.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_22.png b/texteller/models/ocr_model/train/augraphy_cache/image_22.png new file mode 100644 index 0000000..220179c Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_22.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_23.png b/texteller/models/ocr_model/train/augraphy_cache/image_23.png new file mode 100644 index 0000000..b7be139 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_23.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_24.png b/texteller/models/ocr_model/train/augraphy_cache/image_24.png new file mode 100644 index 0000000..7476b76 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_24.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_25.png b/texteller/models/ocr_model/train/augraphy_cache/image_25.png new file mode 100644 index 0000000..77b9c45 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_25.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_26.png b/texteller/models/ocr_model/train/augraphy_cache/image_26.png new file mode 100644 index 0000000..e189b32 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_26.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_27.png b/texteller/models/ocr_model/train/augraphy_cache/image_27.png new file mode 100644 index 0000000..a1d4133 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_27.png differ diff --git 
a/texteller/models/ocr_model/train/augraphy_cache/image_28.png b/texteller/models/ocr_model/train/augraphy_cache/image_28.png new file mode 100644 index 0000000..8b9a8b4 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_28.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_29.png b/texteller/models/ocr_model/train/augraphy_cache/image_29.png new file mode 100644 index 0000000..cb50df4 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_29.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_3.png b/texteller/models/ocr_model/train/augraphy_cache/image_3.png new file mode 100644 index 0000000..2d375b7 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_3.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_4.png b/texteller/models/ocr_model/train/augraphy_cache/image_4.png new file mode 100644 index 0000000..9d53ce8 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_4.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_5.png b/texteller/models/ocr_model/train/augraphy_cache/image_5.png new file mode 100644 index 0000000..43257bd Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_5.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_6.png b/texteller/models/ocr_model/train/augraphy_cache/image_6.png new file mode 100644 index 0000000..dd1e098 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_6.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_7.png b/texteller/models/ocr_model/train/augraphy_cache/image_7.png new file mode 100644 index 0000000..7baf0f4 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_7.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_8.png b/texteller/models/ocr_model/train/augraphy_cache/image_8.png new file mode 100644 index 0000000..3d94283 Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_8.png differ diff --git a/texteller/models/ocr_model/train/augraphy_cache/image_9.png b/texteller/models/ocr_model/train/augraphy_cache/image_9.png new file mode 100644 index 0000000..b42491b Binary files /dev/null and b/texteller/models/ocr_model/train/augraphy_cache/image_9.png differ diff --git a/src/models/ocr_model/train/dataset/images/0.png b/texteller/models/ocr_model/train/dataset/train/0.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/0.png rename to texteller/models/ocr_model/train/dataset/train/0.png diff --git a/src/models/ocr_model/train/dataset/images/1.png b/texteller/models/ocr_model/train/dataset/train/1.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/1.png rename to texteller/models/ocr_model/train/dataset/train/1.png diff --git a/src/models/ocr_model/train/dataset/images/10.png b/texteller/models/ocr_model/train/dataset/train/10.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/10.png rename to texteller/models/ocr_model/train/dataset/train/10.png diff --git a/src/models/ocr_model/train/dataset/images/11.png b/texteller/models/ocr_model/train/dataset/train/11.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/11.png rename to texteller/models/ocr_model/train/dataset/train/11.png diff --git 
a/src/models/ocr_model/train/dataset/images/12.png b/texteller/models/ocr_model/train/dataset/train/12.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/12.png rename to texteller/models/ocr_model/train/dataset/train/12.png diff --git a/src/models/ocr_model/train/dataset/images/13.png b/texteller/models/ocr_model/train/dataset/train/13.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/13.png rename to texteller/models/ocr_model/train/dataset/train/13.png diff --git a/src/models/ocr_model/train/dataset/images/14.png b/texteller/models/ocr_model/train/dataset/train/14.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/14.png rename to texteller/models/ocr_model/train/dataset/train/14.png diff --git a/src/models/ocr_model/train/dataset/images/15.png b/texteller/models/ocr_model/train/dataset/train/15.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/15.png rename to texteller/models/ocr_model/train/dataset/train/15.png diff --git a/src/models/ocr_model/train/dataset/images/16.png b/texteller/models/ocr_model/train/dataset/train/16.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/16.png rename to texteller/models/ocr_model/train/dataset/train/16.png diff --git a/src/models/ocr_model/train/dataset/images/17.png b/texteller/models/ocr_model/train/dataset/train/17.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/17.png rename to texteller/models/ocr_model/train/dataset/train/17.png diff --git a/src/models/ocr_model/train/dataset/images/18.png b/texteller/models/ocr_model/train/dataset/train/18.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/18.png rename to texteller/models/ocr_model/train/dataset/train/18.png diff --git a/src/models/ocr_model/train/dataset/images/19.png b/texteller/models/ocr_model/train/dataset/train/19.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/19.png rename to texteller/models/ocr_model/train/dataset/train/19.png diff --git a/src/models/ocr_model/train/dataset/images/2.png b/texteller/models/ocr_model/train/dataset/train/2.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/2.png rename to texteller/models/ocr_model/train/dataset/train/2.png diff --git a/src/models/ocr_model/train/dataset/images/20.png b/texteller/models/ocr_model/train/dataset/train/20.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/20.png rename to texteller/models/ocr_model/train/dataset/train/20.png diff --git a/src/models/ocr_model/train/dataset/images/21.png b/texteller/models/ocr_model/train/dataset/train/21.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/21.png rename to texteller/models/ocr_model/train/dataset/train/21.png diff --git a/src/models/ocr_model/train/dataset/images/22.png b/texteller/models/ocr_model/train/dataset/train/22.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/22.png rename to texteller/models/ocr_model/train/dataset/train/22.png diff --git a/src/models/ocr_model/train/dataset/images/23.png b/texteller/models/ocr_model/train/dataset/train/23.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/23.png rename to texteller/models/ocr_model/train/dataset/train/23.png diff --git a/src/models/ocr_model/train/dataset/images/24.png b/texteller/models/ocr_model/train/dataset/train/24.png 
similarity index 100% rename from src/models/ocr_model/train/dataset/images/24.png rename to texteller/models/ocr_model/train/dataset/train/24.png diff --git a/src/models/ocr_model/train/dataset/images/25.png b/texteller/models/ocr_model/train/dataset/train/25.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/25.png rename to texteller/models/ocr_model/train/dataset/train/25.png diff --git a/src/models/ocr_model/train/dataset/images/26.png b/texteller/models/ocr_model/train/dataset/train/26.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/26.png rename to texteller/models/ocr_model/train/dataset/train/26.png diff --git a/src/models/ocr_model/train/dataset/images/27.png b/texteller/models/ocr_model/train/dataset/train/27.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/27.png rename to texteller/models/ocr_model/train/dataset/train/27.png diff --git a/src/models/ocr_model/train/dataset/images/28.png b/texteller/models/ocr_model/train/dataset/train/28.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/28.png rename to texteller/models/ocr_model/train/dataset/train/28.png diff --git a/src/models/ocr_model/train/dataset/images/29.png b/texteller/models/ocr_model/train/dataset/train/29.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/29.png rename to texteller/models/ocr_model/train/dataset/train/29.png diff --git a/src/models/ocr_model/train/dataset/images/3.png b/texteller/models/ocr_model/train/dataset/train/3.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/3.png rename to texteller/models/ocr_model/train/dataset/train/3.png diff --git a/src/models/ocr_model/train/dataset/images/30.png b/texteller/models/ocr_model/train/dataset/train/30.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/30.png rename to texteller/models/ocr_model/train/dataset/train/30.png diff --git a/src/models/ocr_model/train/dataset/images/31.png b/texteller/models/ocr_model/train/dataset/train/31.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/31.png rename to texteller/models/ocr_model/train/dataset/train/31.png diff --git a/src/models/ocr_model/train/dataset/images/32.png b/texteller/models/ocr_model/train/dataset/train/32.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/32.png rename to texteller/models/ocr_model/train/dataset/train/32.png diff --git a/src/models/ocr_model/train/dataset/images/33.png b/texteller/models/ocr_model/train/dataset/train/33.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/33.png rename to texteller/models/ocr_model/train/dataset/train/33.png diff --git a/src/models/ocr_model/train/dataset/images/34.png b/texteller/models/ocr_model/train/dataset/train/34.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/34.png rename to texteller/models/ocr_model/train/dataset/train/34.png diff --git a/src/models/ocr_model/train/dataset/images/4.png b/texteller/models/ocr_model/train/dataset/train/4.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/4.png rename to texteller/models/ocr_model/train/dataset/train/4.png diff --git a/src/models/ocr_model/train/dataset/images/5.png b/texteller/models/ocr_model/train/dataset/train/5.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/5.png rename to 
texteller/models/ocr_model/train/dataset/train/5.png diff --git a/src/models/ocr_model/train/dataset/images/6.png b/texteller/models/ocr_model/train/dataset/train/6.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/6.png rename to texteller/models/ocr_model/train/dataset/train/6.png diff --git a/src/models/ocr_model/train/dataset/images/7.png b/texteller/models/ocr_model/train/dataset/train/7.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/7.png rename to texteller/models/ocr_model/train/dataset/train/7.png diff --git a/src/models/ocr_model/train/dataset/images/8.png b/texteller/models/ocr_model/train/dataset/train/8.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/8.png rename to texteller/models/ocr_model/train/dataset/train/8.png diff --git a/src/models/ocr_model/train/dataset/images/9.png b/texteller/models/ocr_model/train/dataset/train/9.png similarity index 100% rename from src/models/ocr_model/train/dataset/images/9.png rename to texteller/models/ocr_model/train/dataset/train/9.png diff --git a/texteller/models/ocr_model/train/dataset/train/metadata.jsonl b/texteller/models/ocr_model/train/dataset/train/metadata.jsonl new file mode 100644 index 0000000..23279de --- /dev/null +++ b/texteller/models/ocr_model/train/dataset/train/metadata.jsonl @@ -0,0 +1,35 @@ +{"file_name": "0.png", "latex_formula": "\\[\\mathbb{C}^{4}\\stackrel{{\\pi_{1}}}{{\\longleftarrow}}\\mathcal{ F}\\stackrel{{\\pi_{2}}}{{\\rightarrow}}\\mathcal{PT},\\]"} +{"file_name": "1.png", "latex_formula": "\\[W^{*}_{Z}(x_{1},x_{2})=W_{f\\lrcorner Z}(y_{1},y_{2})=\\mathcal{P}\\exp\\left( \\int_{\\gamma}A_{\\mu}dx^{\\mu}\\right).\\]"} +{"file_name": "2.png", "latex_formula": "\\[G=W^{*}_{Z}(q,p)=\\tilde{H}H^{-1}\\]"} +{"file_name": "3.png", "latex_formula": "\\[H=W^{*}_{Z}(p,x),\\ \\ \\tilde{H}=W^{*}_{Z}(q,x).\\]"} +{"file_name": "4.png", "latex_formula": "\\[v\\cdot f^{*}A|_{x}=(f\\lrcorner Z)_{*}v\\cdot A|_{f\\lrcorner Z(x)},\\quad x\\in Z, \\ v\\in T_{x}Z.\\]"} +{"file_name": "5.png", "latex_formula": "\\[(f\\lrcorner Z)_{*}v\\cdot A|_{f\\lrcorner Z(x)}=v^{\\alpha\\dot{\\alpha}}\\Big{(} \\frac{\\partial y^{\\beta\\dot{\\beta}}}{\\partial x^{\\alpha\\dot{\\alpha}}}A_{\\beta \\dot{\\beta}}\\Big{)}\\Big{|}_{f\\lrcorner Z(x)},\\ x\\in Z,\\ v\\in T_{x}Z,\\]"} +{"file_name": "6.png", "latex_formula": "\\[\\{T_{i},T_{j}\\}=\\{\\tilde{T}^{i},\\tilde{T}^{j}\\}=0,\\ \\ \\{T_{i},\\tilde{T}^{j}\\}=2i \\delta^{j}_{i}D,\\]"} +{"file_name": "7.png", "latex_formula": "\\[(\\partial_{s},q_{i},\\tilde{q}^{k})\\rightarrow(D,M^{j}_{i}T_{j},\\tilde{M}^{k}_ {l}\\tilde{T}^{l}),\\]"} +{"file_name": "8.png", "latex_formula": "\\[M^{i}_{j}\\tilde{M}^{j}_{k}=\\delta^{i}_{k}.\\]"} +{"file_name": "9.png", "latex_formula": "\\[Q_{i\\alpha}=q_{i\\alpha}+\\omega_{i\\alpha},\\ \\tilde{Q}^{i}_{\\dot{\\alpha}}=q^{i}_{ \\dot{\\alpha}}+\\tilde{\\omega}^{i}_{\\dot{\\alpha}},\\ D_{\\alpha\\dot{\\alpha}}= \\partial_{\\alpha\\dot{\\alpha}}+A_{\\alpha\\dot{\\alpha}}.\\]"} +{"file_name": "10.png", "latex_formula": "\\[\\hat{f}(g,\\theta^{i\\alpha},\\tilde{\\theta}^{\\dot{\\alpha}}_{j})=(f(g),[V^{-1}]^ {\\alpha}_{\\beta}\\theta^{i\\beta},[\\tilde{V}^{-1}]^{\\dot{\\alpha}}_{\\dot{\\beta}} \\tilde{\\theta}^{\\dot{\\beta}}_{j}),\\ g\\in{\\cal G},\\]"} +{"file_name": "11.png", "latex_formula": "\\[v^{\\beta\\dot{\\beta}}V^{\\alpha}_{\\beta}\\tilde{V}^{\\dot{\\alpha}}_{\\dot{\\beta}} =((f\\lrcorner L_{0})_{*}v)^{\\alpha\\dot{\\alpha}},\\]"} +{"file_name": "12.png", 
"latex_formula": "\\[\\omega_{i\\alpha}=\\tilde{\\theta}^{\\dot{\\alpha}}_{i}h_{\\alpha\\dot{\\alpha}}(x^{ \\beta\\dot{\\beta}},\\tau^{\\beta\\dot{\\beta}}),\\ \\ \\tilde{\\omega}^{i}_{\\alpha}=\\theta^{i\\alpha}\\tilde{h}_{\\alpha\\dot{\\alpha}}(x^{ \\beta\\dot{\\beta}},\\tau^{\\beta\\dot{\\beta}}),\\]"} +{"file_name": "13.png", "latex_formula": "\\[\\begin{split}&\\lambda^{\\alpha}\\hat{f}^{*}\\omega_{i\\alpha}(z)= \\tilde{\\theta}^{\\dot{\\beta}}_{i}\\lambda^{\\alpha}\\left(V^{\\beta}_{\\alpha}h_{ \\beta\\dot{\\beta}}(x^{\\prime},\\tau^{\\prime})\\right),\\\\ &\\tilde{\\lambda}^{\\dot{\\alpha}}\\hat{f}^{*}\\tilde{\\omega}^{i}_{ \\dot{\\alpha}}(z)=\\theta^{i\\beta}\\tilde{\\lambda}^{\\dot{\\alpha}}\\left(\\tilde{V}^ {\\dot{\\beta}}_{\\dot{\\alpha}}\\tilde{h}_{\\beta\\dot{\\beta}}(x^{\\prime},\\tau^{ \\prime})\\right),\\end{split}\\]"} +{"file_name": "14.png", "latex_formula": "\\[A_{\\alpha\\dot{\\alpha}}=A_{\\alpha\\dot{\\alpha}}(x^{\\beta\\dot{\\beta}},\\tau^{ \\beta\\dot{\\beta}})\\]"} +{"file_name": "15.png", "latex_formula": "\\[D=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}D_{\\alpha\\dot{\\alpha}}\\]"} +{"file_name": "16.png", "latex_formula": "\\[D=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}\\partial_{\\alpha\\dot{\\alpha}}\\]"} +{"file_name": "17.png", "latex_formula": "\\[[v_{1}\\cdot D^{*},v_{2}\\cdot D^{*}]=0\\]"} +{"file_name": "18.png", "latex_formula": "\\[\\Phi_{A}=(\\omega_{i\\alpha},\\tilde{\\omega}^{i}_{\\dot{\\alpha}},A_{\\alpha\\dot{ \\alpha}})\\]"} +{"file_name": "19.png", "latex_formula": "\\[\\hat{f}:{\\cal F}^{6|4N}\\rightarrow{\\cal F}^{6|4N}\\]"} +{"file_name": "20.png", "latex_formula": "\\[\\sigma=(s,\\xi^{i},\\tilde{\\xi}_{j})\\in\\mathbb{C}^{1|2N}\\]"} +{"file_name": "21.png", "latex_formula": "\\[\\tau^{\\alpha\\dot{\\alpha}}(h_{\\alpha\\dot{\\alpha}}+\\tilde{h}_{\\alpha\\dot{\\alpha} })=0\\]"} +{"file_name": "22.png", "latex_formula": "\\[\\tau^{\\alpha\\dot{\\alpha}}\\rightarrow[V^{-1}]^{\\alpha}_{\\beta}[\\tilde{V}^{-1}]^{ \\dot{\\alpha}}_{\\dot{\\beta}}\\tau^{\\beta\\dot{\\beta}}\\]"} +{"file_name": "23.png", "latex_formula": "\\[\\tau^{\\beta\\dot{\\beta}}=\\sum_{i}\\theta^{i\\beta}\\tilde{\\theta}^{\\dot{\\beta}}_{i}\\]"} +{"file_name": "24.png", "latex_formula": "\\[\\theta^{i\\alpha}\\omega_{i\\alpha}+\\tilde{\\theta}^{i}_{\\dot{\\alpha}}\\tilde{ \\omega}^{\\dot{\\alpha}}_{i}=0\\]"} +{"file_name": "25.png", "latex_formula": "\\[\\tilde{T}^{i}=\\tilde{\\lambda}^{\\dot{\\alpha}}\\tilde{Q}^{i}_{\\dot{\\alpha}}\\]"} +{"file_name": "26.png", "latex_formula": "\\[\\tilde{T}^{i}=\\tilde{\\lambda}^{\\dot{\\alpha}}\\tilde{q}^{i}_{\\dot{\\alpha}}\\]"} +{"file_name": "27.png", "latex_formula": "\\[\\tilde{\\lambda}^{\\dot{\\alpha}}f^{*}A_{\\alpha\\dot{\\alpha}}=H^{-1}\\tilde{ \\lambda}^{\\dot{\\alpha}}\\partial_{\\alpha\\dot{\\alpha}}H\\]"} +{"file_name": "28.png", "latex_formula": "\\[\\tilde{q}^{i}=\\partial_{\\tilde{\\xi}_{i}}+i\\xi^{i}\\partial_{s}\\]"} +{"file_name": "29.png", "latex_formula": "\\[\\tilde{q}^{i}_{\\dot{\\alpha}}=\\frac{\\partial}{\\partial\\tilde{\\theta}^{\\dot{ \\alpha}}_{i}}+i\\theta^{i\\alpha}\\frac{\\partial}{\\partial x^{\\alpha\\dot{\\alpha}}}\\]"} +{"file_name": "30.png", "latex_formula": "\\[f\\lrcorner L(z)=\\pi_{1}\\circ f(z,\\lambda,\\tilde{\\lambda})\\ \\forall z\\in L\\]"} +{"file_name": "31.png", "latex_formula": "\\[q_{i\\alpha}=\\frac{\\partial}{\\partial\\theta^{i\\alpha}}+i\\tilde{\\theta}^{\\dot{ \\alpha}}_{i}\\frac{\\partial}{\\partial x^{\\alpha\\dot{\\alpha}}}\\]"} +{"file_name": "32.png", 
"latex_formula": "\\[q_{i}=\\partial_{\\xi^{i}}+i\\tilde{\\xi}_{i}\\partial_{s}\\]"} +{"file_name": "33.png", "latex_formula": "\\[v^{\\alpha\\dot{\\alpha}}=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}\\]"} +{"file_name": "34.png", "latex_formula": "\\[z^{A}=(x^{\\alpha\\dot{\\alpha}},\\theta^{i\\alpha},\\tilde{\\theta}^{\\dot{\\alpha}}_{ j})\\]"} diff --git a/src/models/ocr_model/train/train.py b/texteller/models/ocr_model/train/train.py similarity index 71% rename from src/models/ocr_model/train/train.py rename to texteller/models/ocr_model/train/train.py index 9d37f44..80b58af 100644 --- a/src/models/ocr_model/train/train.py +++ b/texteller/models/ocr_model/train/train.py @@ -5,18 +5,24 @@ from pathlib import Path from datasets import load_dataset from transformers import ( - Trainer, - TrainingArguments, - Seq2SeqTrainer, - Seq2SeqTrainingArguments, - GenerationConfig + Trainer, + TrainingArguments, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + GenerationConfig, ) from .training_args import CONFIG from ..model.TexTeller import TexTeller -from ..utils.functional import tokenize_fn, collate_fn, img_train_transform, img_inf_transform, filter_fn +from ..utils.functional import ( + tokenize_fn, + collate_fn, + img_train_transform, + img_inf_transform, + filter_fn, +) from ..utils.metrics import bleu_metric -from ...globals import MAX_TOKEN_SIZE, MIN_WIDTH, MIN_HEIGHT +from ...globals import MAX_TOKEN_SIZE, MIN_WIDTH, MIN_HEIGHT def train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokenizer): @@ -24,11 +30,9 @@ def train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokeniz trainer = Trainer( model, training_args, - train_dataset=train_dataset, eval_dataset=eval_dataset, - - tokenizer=tokenizer, + tokenizer=tokenizer, data_collator=collate_fn_with_tokenizer, ) @@ -52,43 +56,44 @@ def evaluate(model, tokenizer, eval_dataset, collate_fn): trainer = Seq2SeqTrainer( model, seq2seq_config, - eval_dataset=eval_dataset, - tokenizer=tokenizer, + tokenizer=tokenizer, data_collator=collate_fn, - compute_metrics=partial(bleu_metric, tokenizer=tokenizer) + compute_metrics=partial(bleu_metric, tokenizer=tokenizer), ) eval_res = trainer.evaluate() print(eval_res) - + if __name__ == '__main__': script_dirpath = Path(__file__).resolve().parent os.chdir(script_dirpath) - dataset = load_dataset(str(Path('./dataset/loader.py').resolve()))['train'] - dataset = dataset.filter(lambda x: x['image'].height > MIN_HEIGHT and x['image'].width > MIN_WIDTH) + # dataset = load_dataset(str(Path('./dataset/loader.py').resolve()))['train'] + dataset = load_dataset("imagefolder", data_dir=str(script_dirpath / 'dataset'))['train'] + dataset = dataset.filter( + lambda x: x['image'].height > MIN_HEIGHT and x['image'].width > MIN_WIDTH + ) dataset = dataset.shuffle(seed=42) dataset = dataset.flatten_indices() tokenizer = TexTeller.get_tokenizer() # If you want use your own tokenizer, please modify the path to your tokenizer - #+tokenizer = TexTeller.get_tokenizer('/path/to/your/tokenizer') + # +tokenizer = TexTeller.get_tokenizer('/path/to/your/tokenizer') filter_fn_with_tokenizer = partial(filter_fn, tokenizer=tokenizer) - dataset = dataset.filter( - filter_fn_with_tokenizer, - num_proc=8 - ) + dataset = dataset.filter(filter_fn_with_tokenizer, num_proc=8) map_fn = partial(tokenize_fn, tokenizer=tokenizer) - tokenized_dataset = dataset.map(map_fn, batched=True, remove_columns=dataset.column_names, num_proc=8) + tokenized_dataset = dataset.map( + map_fn, batched=True, 
remove_columns=dataset.column_names, num_proc=8 + ) # Split dataset into train and eval, ratio 9:1 - split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42) + split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42) train_dataset, eval_dataset = split_dataset['train'], split_dataset['test'] train_dataset = train_dataset.with_transform(img_train_transform) - eval_dataset = eval_dataset.with_transform(img_inf_transform) + eval_dataset = eval_dataset.with_transform(img_inf_transform) collate_fn_with_tokenizer = partial(collate_fn, tokenizer=tokenizer) # Train from scratch @@ -96,14 +101,14 @@ if __name__ == '__main__': # or train from TexTeller pre-trained model: model = TexTeller.from_pretrained() # If you want to train from pre-trained model, please modify the path to your pre-trained checkpoint - #+e.g. - #+model = TexTeller.from_pretrained( - #+ '/path/to/your/model_checkpoint' - #+) + # +e.g. + # +model = TexTeller.from_pretrained( + # + '/path/to/your/model_checkpoint' + # +) enable_train = True enable_evaluate = False if enable_train: - train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokenizer) + train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokenizer) if enable_evaluate and len(eval_dataset) > 0: evaluate(model, tokenizer, eval_dataset, collate_fn_with_tokenizer) diff --git a/texteller/models/ocr_model/train/training_args.py b/texteller/models/ocr_model/train/training_args.py new file mode 100644 index 0000000..b377cab --- /dev/null +++ b/texteller/models/ocr_model/train/training_args.py @@ -0,0 +1,31 @@ +CONFIG = { + "seed": 42, # Random seed for reproducibility + "use_cpu": False, # Whether to use CPU (it's easier to debug with CPU when starting to test the code) + "learning_rate": 5e-5, # Learning rate + "num_train_epochs": 10, # Total number of training epochs + "per_device_train_batch_size": 4, # Batch size per GPU for training + "per_device_eval_batch_size": 8, # Batch size per GPU for evaluation + "output_dir": "train_result", # Output directory + "overwrite_output_dir": False, # If the output directory exists, do not delete its content + "report_to": ["tensorboard"], # Report logs to TensorBoard + "save_strategy": "steps", # Strategy to save checkpoints + "save_steps": 500, # Interval of steps to save checkpoints, can be int or a float (0~1), when float it represents the ratio of total training steps (e.g., can set to 1.0 / 2000) + "save_total_limit": 5, # Maximum number of models to save. 
The oldest models will be deleted if this number is exceeded + "logging_strategy": "steps", # Log every certain number of steps + "logging_steps": 500, # Number of steps between each log + "logging_nan_inf_filter": False, # Record logs for loss=nan or inf + "optim": "adamw_torch", # Optimizer + "lr_scheduler_type": "cosine", # Learning rate scheduler + "warmup_ratio": 0.1, # Ratio of warmup steps in total training steps (e.g., for 1000 steps, the first 100 steps gradually increase lr from 0 to the set lr) + "max_grad_norm": 1.0, # For gradient clipping, ensure the norm of the gradients does not exceed 1.0 (default 1.0) + "fp16": False, # Whether to use 16-bit floating point for training (generally not recommended, as loss can easily explode) + "bf16": False, # Whether to use Brain Floating Point (bfloat16) for training (recommended if architecture supports it) + "gradient_accumulation_steps": 1, # Gradient accumulation steps, consider this parameter to achieve large batch size effects when batch size cannot be large + "jit_mode_eval": False, # Whether to use PyTorch jit trace during eval (can speed up the model, but the model must be static, otherwise will throw errors) + "torch_compile": False, # Whether to use torch.compile to compile the model (for better training and inference performance) + "dataloader_pin_memory": True, # Can speed up data transfer between CPU and GPU + "dataloader_num_workers": 1, # Default is not to use multiprocessing for data loading, usually set to 4*number of GPUs used + "evaluation_strategy": "steps", # Evaluation strategy, can be "steps" or "epoch" + "eval_steps": 500, # If evaluation_strategy="step" + "remove_unused_columns": False, # Don't change this unless you really know what you are doing. +} diff --git a/texteller/models/ocr_model/utils/__pycache__/functional.cpython-310.pyc b/texteller/models/ocr_model/utils/__pycache__/functional.cpython-310.pyc new file mode 100644 index 0000000..ed9478f Binary files /dev/null and b/texteller/models/ocr_model/utils/__pycache__/functional.cpython-310.pyc differ diff --git a/texteller/models/ocr_model/utils/__pycache__/helpers.cpython-310.pyc b/texteller/models/ocr_model/utils/__pycache__/helpers.cpython-310.pyc new file mode 100644 index 0000000..604eb34 Binary files /dev/null and b/texteller/models/ocr_model/utils/__pycache__/helpers.cpython-310.pyc differ diff --git a/texteller/models/ocr_model/utils/__pycache__/inference.cpython-310.pyc b/texteller/models/ocr_model/utils/__pycache__/inference.cpython-310.pyc new file mode 100644 index 0000000..d90e698 Binary files /dev/null and b/texteller/models/ocr_model/utils/__pycache__/inference.cpython-310.pyc differ diff --git a/texteller/models/ocr_model/utils/__pycache__/metrics.cpython-310.pyc b/texteller/models/ocr_model/utils/__pycache__/metrics.cpython-310.pyc new file mode 100644 index 0000000..b6c07dc Binary files /dev/null and b/texteller/models/ocr_model/utils/__pycache__/metrics.cpython-310.pyc differ diff --git a/texteller/models/ocr_model/utils/__pycache__/ocr_aug.cpython-310.pyc b/texteller/models/ocr_model/utils/__pycache__/ocr_aug.cpython-310.pyc new file mode 100644 index 0000000..19d359b Binary files /dev/null and b/texteller/models/ocr_model/utils/__pycache__/ocr_aug.cpython-310.pyc differ diff --git a/texteller/models/ocr_model/utils/__pycache__/to_katex.cpython-310.pyc b/texteller/models/ocr_model/utils/__pycache__/to_katex.cpython-310.pyc new file mode 100644 index 0000000..97aca70 Binary files /dev/null and 
b/texteller/models/ocr_model/utils/__pycache__/to_katex.cpython-310.pyc differ diff --git a/texteller/models/ocr_model/utils/__pycache__/transforms.cpython-310.pyc b/texteller/models/ocr_model/utils/__pycache__/transforms.cpython-310.pyc new file mode 100644 index 0000000..7dd1bdb Binary files /dev/null and b/texteller/models/ocr_model/utils/__pycache__/transforms.cpython-310.pyc differ diff --git a/src/models/ocr_model/utils/functional.py b/texteller/models/ocr_model/utils/functional.py similarity index 95% rename from src/models/ocr_model/utils/functional.py rename to texteller/models/ocr_model/utils/functional.py index 9cb19ab..aa3199e 100644 --- a/src/models/ocr_model/utils/functional.py +++ b/texteller/models/ocr_model/utils/functional.py @@ -26,7 +26,7 @@ def collate_fn(samples: List[Dict[str, Any]], tokenizer=None) -> Dict[str, List[ pixel_values = [dic.pop('pixel_values') for dic in samples] clm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) - + batch = clm_collator(samples) batch['pixel_values'] = pixel_values batch['decoder_input_ids'] = batch.pop('input_ids') @@ -54,6 +54,7 @@ def img_inf_transform(samples: Dict[str, List[Any]]) -> Dict[str, List[Any]]: def filter_fn(sample, tokenizer=None) -> bool: return ( - sample['image'].height > MIN_HEIGHT and sample['image'].width > MIN_WIDTH + sample['image'].height > MIN_HEIGHT + and sample['image'].width > MIN_WIDTH and len(tokenizer(sample['latex_formula'])['input_ids']) < MAX_TOKEN_SIZE - 10 ) diff --git a/src/models/ocr_model/utils/helpers.py b/texteller/models/ocr_model/utils/helpers.py similarity index 92% rename from src/models/ocr_model/utils/helpers.py rename to texteller/models/ocr_model/utils/helpers.py index d650556..50e8bd0 100644 --- a/src/models/ocr_model/utils/helpers.py +++ b/texteller/models/ocr_model/utils/helpers.py @@ -12,7 +12,7 @@ def convert2rgb(image_paths: List[str]) -> List[np.ndarray]: continue if image.dtype == np.uint16: print(f'Converting {path} to 8-bit, image may be lossy.') - image = cv2.convertScaleAbs(image, alpha=(255.0/65535.0)) + image = cv2.convertScaleAbs(image, alpha=(255.0 / 65535.0)) channels = 1 if len(image.shape) == 2 else image.shape[2] if channels == 4: diff --git a/src/models/ocr_model/utils/inference.py b/texteller/models/ocr_model/utils/inference.py similarity index 84% rename from src/models/ocr_model/utils/inference.py rename to texteller/models/ocr_model/utils/inference.py index 7d8e4d9..d07100b 100644 --- a/src/models/ocr_model/utils/inference.py +++ b/texteller/models/ocr_model/utils/inference.py @@ -11,12 +11,12 @@ from ...globals import MAX_TOKEN_SIZE def inference( - model: TexTeller, + model: TexTeller, tokenizer: RobertaTokenizerFast, - imgs: Union[List[str], List[np.ndarray]], + imgs: Union[List[str], List[np.ndarray]], accelerator: str = 'cpu', num_beams: int = 1, - max_tokens = None + max_tokens=None, ) -> List[str]: if imgs == []: return [] @@ -24,10 +24,10 @@ def inference( # not an ONNX session, so switch the model to eval mode model.eval() if isinstance(imgs[0], str): - imgs = convert2rgb(imgs) + imgs = convert2rgb(imgs) else: # already a numpy array (RGB format) assert isinstance(imgs[0], np.ndarray) - imgs = imgs + imgs = imgs imgs = inference_transform(imgs) pixel_values = torch.stack(imgs) @@ -44,6 +44,6 @@ eos_token_id=tokenizer.eos_token_id, bos_token_id=tokenizer.bos_token_id, ) - pred = model.generate(pixel_values, generation_config=generate_config) + pred = model.generate(pixel_values.to(model.device), generation_config=generate_config)
res = tokenizer.batch_decode(pred, skip_special_tokens=True) return res diff --git a/src/models/ocr_model/utils/metrics.py b/texteller/models/ocr_model/utils/metrics.py similarity index 84% rename from src/models/ocr_model/utils/metrics.py rename to texteller/models/ocr_model/utils/metrics.py index 1dd0702..13dc972 100644 --- a/src/models/ocr_model/utils/metrics.py +++ b/texteller/models/ocr_model/utils/metrics.py @@ -10,9 +10,11 @@ from transformers import EvalPrediction, RobertaTokenizer def bleu_metric(eval_preds: EvalPrediction, tokenizer: RobertaTokenizer) -> Dict: cur_dir = Path(os.getcwd()) os.chdir(Path(__file__).resolve().parent) - metric = evaluate.load('google_bleu') # Will download the metric from huggingface if not already downloaded + metric = evaluate.load( + 'google_bleu' + ) # Will download the metric from huggingface if not already downloaded os.chdir(cur_dir) - + logits, labels = eval_preds.predictions, eval_preds.label_ids preds = logits diff --git a/src/models/ocr_model/utils/ocr_aug.py b/texteller/models/ocr_model/utils/ocr_aug.py similarity index 94% rename from src/models/ocr_model/utils/ocr_aug.py rename to texteller/models/ocr_model/utils/ocr_aug.py index 5678c61..a232735 100644 --- a/src/models/ocr_model/utils/ocr_aug.py +++ b/texteller/models/ocr_model/utils/ocr_aug.py @@ -1,9 +1,9 @@ from augraphy import * import random + def ocr_augmentation_pipeline(): - pre_phase = [ - ] + pre_phase = [] ink_phase = [ InkColorSwap( @@ -16,7 +16,7 @@ def ocr_augmentation_pipeline(): ink_swap_min_area_range=(10, 20), ink_swap_max_area_range=(400, 500), # p=0.2 - p=0.4 + p=0.4, ), LinesDegradation( line_roi=(0.0, 0.0, 1.0, 1.0), @@ -29,9 +29,8 @@ def ocr_augmentation_pipeline(): line_replacement_probability=(0.4, 0.5), line_replacement_thickness=(1, 3), # p=0.2 - p=0.4 + p=0.4, ), - # ============================ OneOf( [ @@ -46,10 +45,9 @@ def ocr_augmentation_pipeline(): ), ], # p=0.2 - p=0.4 + p=0.4, ), # ============================ - # ============================ InkShifter( text_shift_scale_range=(18, 27), @@ -59,10 +57,9 @@ def ocr_augmentation_pipeline(): blur_sigma=0, noise_type="perlin", # p=0.2 - p=0.4 + p=0.4, ), # ============================ - ] paper_phase = [ @@ -72,14 +69,14 @@ def ocr_augmentation_pipeline(): texture_width_range=(300, 500), texture_height_range=(300, 500), # p=0.2 - p=0.4 + p=0.4, ), BrightnessTexturize( # tested texturize_range=(0.9, 0.99), deviation=0.03, # p=0.2 - p=0.4 - ) + p=0.4, + ), ] post_phase = [ @@ -90,9 +87,8 @@ def ocr_augmentation_pipeline(): color_shift_brightness_range=(0.9, 1.1), color_shift_gaussian_kernel_range=(3, 3), # p=0.2 - p=0.4 + p=0.4, ), - DirtyDrum( # tested line_width_range=(1, 6), line_concentration=random.uniform(0.05, 0.15), @@ -102,9 +98,8 @@ def ocr_augmentation_pipeline(): ksize=random.choice([(3, 3), (5, 5), (7, 7)]), sigmaX=0, # p=0.2 - p=0.4 + p=0.4, ), - # ===================================== OneOf( [ @@ -127,10 +122,9 @@ def ocr_augmentation_pipeline(): ), ], # p=0.2 - p=0.4 + p=0.4, ), # ===================================== - # ===================================== OneOf( [ @@ -142,7 +136,7 @@ def ocr_augmentation_pipeline(): ), ], # p=0.2 - p=0.4 + p=0.4, ), # ===================================== ] @@ -152,7 +146,7 @@ def ocr_augmentation_pipeline(): paper_phase=paper_phase, post_phase=post_phase, pre_phase=pre_phase, - log=False + log=False, ) return pipeline diff --git a/src/models/ocr_model/utils/to_katex.py b/texteller/models/ocr_model/utils/to_katex.py similarity index 93% rename from 
src/models/ocr_model/utils/to_katex.py rename to texteller/models/ocr_model/utils/to_katex.py index b6166dc..20518a7 100644 --- a/src/models/ocr_model/utils/to_katex.py +++ b/texteller/models/ocr_model/utils/to_katex.py @@ -5,9 +5,9 @@ def change(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, ne result = "" i = 0 n = len(input_str) - + while i < n: - if input_str[i:i+len(old_inst)] == old_inst: + if input_str[i : i + len(old_inst)] == old_inst: # check if the old_inst is followed by old_surr_l start = i + len(old_inst) else: @@ -33,12 +33,12 @@ def change(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, ne count += 1 escaped = False j += 1 - + if count == 0: assert j < n assert input_str[start] == old_surr_l assert input_str[j] == old_surr_r - inner_content = input_str[start + 1:j] + inner_content = input_str[start + 1 : j] # Replace the content with new pattern result += new_inst + new_surr_l + inner_content + new_surr_r i = j + 1 @@ -53,7 +53,7 @@ def change(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, ne else: result += input_str[i:start] i = start - + if old_inst != new_inst and (old_inst + old_surr_l) in result: return change(result, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r) else: @@ -68,12 +68,12 @@ def find_substring_positions(string, substring): def rm_dollar_surr(content): pattern = re.compile(r'\\[a-zA-Z]+\$.*?\$|\$.*?\$') matches = pattern.findall(content) - + for match in matches: if not re.match(r'\\[a-zA-Z]+', match): new_match = match.strip('$') content = content.replace(match, ' ' + new_match + ' ') - + return content @@ -81,7 +81,11 @@ def change_all(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l pos = find_substring_positions(input_str, old_inst + old_surr_l) res = list(input_str) for p in pos[::-1]: - res[p:] = list(change(''.join(res[p:]), old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r)) + res[p:] = list( + change( + ''.join(res[p:]), old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r + ) + ) res = ''.join(res) return res @@ -106,7 +110,6 @@ def to_katex(formula: str) -> str: res = change_all(res, r'\raisebox', r' ', r'{', r'}', r'', r' ') res = change_all(res, r'\vbox', r' ', r'{', r'}', r'', r' ') - origin_instructions = [ r'\Huge', r'\huge', @@ -116,9 +119,9 @@ def to_katex(formula: str) -> str: r'\normalsize', r'\small', r'\footnotesize', - r'\tiny' + r'\tiny', ] - for (old_ins, new_ins) in zip(origin_instructions, origin_instructions): + for old_ins, new_ins in zip(origin_instructions, origin_instructions): res = change_all(res, old_ins, new_ins, r'$', r'$', '{', '}') res = change_all(res, r'\boldmath ', r'\bm', r'{', r'}', r'{', r'}') res = change_all(res, r'\boldmath', r'\bm', r'{', r'}', r'{', r'}') @@ -127,7 +130,7 @@ def to_katex(formula: str) -> str: res = change_all(res, r'\scriptsize', r'\scriptsize', r'$', r'$', r'{', r'}') res = change_all(res, r'\emph', r'\textit', r'{', r'}', r'{', r'}') res = change_all(res, r'\emph ', r'\textit', r'{', r'}', r'{', r'}') - + origin_instructions = [ r'\left', r'\middle', @@ -147,7 +150,7 @@ def to_katex(formula: str) -> str: r'\bigr', r'\Bigr', r'\biggr', - r'\Biggr' + r'\Biggr', ] for origin_ins in origin_instructions: res = change_all(res, origin_ins, origin_ins, r'{', r'}', r'', r'') @@ -169,6 +172,7 @@ def to_katex(formula: str) -> str: texts = match.group(0) merged_content = ''.join(re.findall(r'\\text\{([^}]*)\}', texts)) return f'\\text{{{merged_content}}}' + res = 
re.sub(r'(\\text\{[^}]*\}\s*){2,}', merge_texts, res) res = res.replace(r'\bf ', '') diff --git a/src/models/ocr_model/utils/transforms.py b/texteller/models/ocr_model/utils/transforms.py similarity index 69% rename from src/models/ocr_model/utils/transforms.py rename to texteller/models/ocr_model/utils/transforms.py index 2a1a64a..7da2de0 100644 --- a/src/models/ocr_model/utils/transforms.py +++ b/texteller/models/ocr_model/utils/transforms.py @@ -11,31 +11,32 @@ from collections import Counter from ...globals import ( IMG_CHANNELS, FIXED_IMG_SIZE, - IMAGE_MEAN, IMAGE_STD, - MAX_RESIZE_RATIO, MIN_RESIZE_RATIO + IMAGE_MEAN, + IMAGE_STD, + MAX_RESIZE_RATIO, + MIN_RESIZE_RATIO, ) from .ocr_aug import ocr_augmentation_pipeline # train_pipeline = default_augraphy_pipeline(scan_only=True) train_pipeline = ocr_augmentation_pipeline() -general_transform_pipeline = v2.Compose([ - v2.ToImage(), - v2.ToDtype(torch.uint8, scale=True), # optional, most inputs are already uint8 at this point - v2.Grayscale(), - - v2.Resize( - size=FIXED_IMG_SIZE - 1, - interpolation=v2.InterpolationMode.BICUBIC, - max_size=FIXED_IMG_SIZE, - antialias=True - ), - - v2.ToDtype(torch.float32, scale=True), # Normalize expects float input - v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]), - - # v2.ToPILImage() -]) +general_transform_pipeline = v2.Compose( + [ + v2.ToImage(), + v2.ToDtype(torch.uint8, scale=True), # optional, most inputs are already uint8 at this point + v2.Grayscale(), + v2.Resize( + size=FIXED_IMG_SIZE - 1, + interpolation=v2.InterpolationMode.BICUBIC, + max_size=FIXED_IMG_SIZE, + antialias=True, + ), + v2.ToDtype(torch.float32, scale=True), # Normalize expects float input + v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]), + # v2.ToPILImage() + ] +) def trim_white_border(image: np.ndarray): @@ -45,11 +46,10 @@ def trim_white_border(image: np.ndarray): if image.dtype != np.uint8: raise ValueError(f"Image should be stored in uint8") - corners = [tuple(image[0, 0]), tuple(image[0, -1]), - tuple(image[-1, 0]), tuple(image[-1, -1])] + corners = [tuple(image[0, 0]), tuple(image[0, -1]), tuple(image[-1, 0]), tuple(image[-1, -1])] bg_color = Counter(corners).most_common(1)[0][0] bg_color_np = np.array(bg_color, dtype=np.uint8) - + h, w = image.shape[:2] bg = np.full((h, w, 3), bg_color_np, dtype=np.uint8) @@ -59,9 +59,9 @@ def trim_white_border(image: np.ndarray): threshold = 15 _, diff = cv2.threshold(mask, threshold, 255, cv2.THRESH_BINARY) - x, y, w, h = cv2.boundingRect(diff) + x, y, w, h = cv2.boundingRect(diff) - trimmed_image = image[y:y+h, x:x+w] + trimmed_image = image[y : y + h, x : x + w] return trimmed_image @@ -69,12 +69,12 @@ def trim_white_border(image: np.ndarray): def add_white_border(image: np.ndarray, max_size: int) -> np.ndarray: randi = [random.randint(0, max_size) for _ in range(4)] pad_height_size = randi[1] + randi[3] - pad_width_size = randi[0] + randi[2] - if (pad_height_size + image.shape[0] < 30): + pad_width_size = randi[0] + randi[2] + if pad_height_size + image.shape[0] < 30: compensate_height = int((30 - (pad_height_size + image.shape[0])) * 0.5) + 1 randi[1] += compensate_height randi[3] += compensate_height - if (pad_width_size + image.shape[1] < 30): + if pad_width_size + image.shape[1] < 30: compensate_width = int((30 - (pad_width_size + image.shape[1])) * 0.5) + 1 randi[0] += compensate_width randi[2] += compensate_width @@ -82,32 +82,29 @@ def add_white_border(image: np.ndarray, max_size: int) -> np.ndarray: torch.from_numpy(image).permute(2, 0, 1), padding=randi,
padding_mode='constant', - fill=(255, 255, 255) + fill=(255, 255, 255), ) def padding(images: List[torch.Tensor], required_size: int) -> List[torch.Tensor]: - images = [ + images = [ v2.functional.pad( - img, - padding=[0, 0, required_size - img.shape[2], required_size - img.shape[1]] + img, padding=[0, 0, required_size - img.shape[2], required_size - img.shape[1]] ) for img in images ] return images -def random_resize( - images: List[np.ndarray], - minr: float, - maxr: float -) -> List[np.ndarray]: +def random_resize(images: List[np.ndarray], minr: float, maxr: float) -> List[np.ndarray]: if len(images[0].shape) != 3 or images[0].shape[2] != 3: raise ValueError("Image is not in RGB format or channels are not in the third dimension") ratios = [random.uniform(minr, maxr) for _ in range(len(images))] return [ - cv2.resize(img, (int(img.shape[1] * r), int(img.shape[0] * r)), interpolation=cv2.INTER_LANCZOS4) # anti-aliasing + cv2.resize( img, (int(img.shape[1] * r), int(img.shape[0] * r)), interpolation=cv2.INTER_LANCZOS4 ) # anti-aliasing for img, r in zip(images, ratios) ] @@ -133,7 +130,9 @@ def rotate(image: np.ndarray, min_angle: int, max_angle: int) -> np.ndarray: rotation_mat[1, 2] += (new_height / 2) - image_center[1] # Rotate the image with the specified border color (white in this case) - rotated_image = cv2.warpAffine(image, rotation_mat, (new_width, new_height), borderValue=(255, 255, 255)) + rotated_image = cv2.warpAffine( image, rotation_mat, (new_width, new_height), borderValue=(255, 255, 255) ) return rotated_image @@ -147,7 +146,7 @@ def ocr_aug(image: np.ndarray) -> np.ndarray: def train_transform(images: List[Image.Image]) -> List[torch.Tensor]: - assert IMG_CHANNELS == 1 , "Only support grayscale images for now" + assert IMG_CHANNELS == 1, "Only support grayscale images for now" images = [np.array(img.convert('RGB')) for img in images] # random resize first @@ -158,18 +157,20 @@ def train_transform(images: List[Image.Image]) -> List[torch.Tensor]: images = [ocr_aug(image) for image in images] # general transform pipeline - images = [general_transform_pipeline(image) for image in images] + images = [general_transform_pipeline(image) for image in images] # padding to fixed size images = padding(images, FIXED_IMG_SIZE) return images def inference_transform(images: List[Union[np.ndarray, Image.Image]]) -> List[torch.Tensor]: - assert IMG_CHANNELS == 1 , "Only support grayscale images for now" - images = [np.array(img.convert('RGB')) if isinstance(img, Image.Image) else img for img in images] + assert IMG_CHANNELS == 1, "Only support grayscale images for now" + images = [ + np.array(img.convert('RGB')) if isinstance(img, Image.Image) else img for img in images + ] images = [trim_white_border(image) for image in images] # general transform pipeline - images = [general_transform_pipeline(image) for image in images] # imgs: List[PIL.Image.Image] + images = [general_transform_pipeline(image) for image in images] # imgs: List[PIL.Image.Image] # padding to fixed size images = padding(images, FIXED_IMG_SIZE) diff --git a/src/models/thrid_party/paddleocr/checkpoints/det/default_model.onnx b/texteller/models/thrid_party/paddleocr/checkpoints/det/default_model.onnx similarity index 100% rename from src/models/thrid_party/paddleocr/checkpoints/det/default_model.onnx rename to texteller/models/thrid_party/paddleocr/checkpoints/det/default_model.onnx diff --git a/src/models/thrid_party/paddleocr/checkpoints/rec/default_model.onnx b/texteller/models/thrid_party/paddleocr/checkpoints/rec/default_model.onnx
similarity index 100% rename from src/models/thrid_party/paddleocr/checkpoints/rec/default_model.onnx rename to texteller/models/thrid_party/paddleocr/checkpoints/rec/default_model.onnx diff --git a/src/models/thrid_party/paddleocr/infer/CTCLabelDecode.py b/texteller/models/thrid_party/paddleocr/infer/CTCLabelDecode.py similarity index 96% rename from src/models/thrid_party/paddleocr/infer/CTCLabelDecode.py rename to texteller/models/thrid_party/paddleocr/infer/CTCLabelDecode.py index 9ee9d34..de9a275 100644 --- a/src/models/thrid_party/paddleocr/infer/CTCLabelDecode.py +++ b/texteller/models/thrid_party/paddleocr/infer/CTCLabelDecode.py @@ -1,8 +1,9 @@ -import re -import numpy as np import os +import re from pathlib import Path +import numpy as np + class BaseRecLabelDecode(object): """Convert between text-label and text-index""" @@ -102,7 +103,7 @@ class BaseRecLabelDecode(object): ): # grouping word with '-', such as 'state-of-the-art' c_state = "en&num" - if state == None: + if state is None: state = c_state if state != c_state: @@ -143,9 +144,7 @@ class BaseRecLabelDecode(object): for ignored_token in ignored_tokens: selection &= text_index[batch_idx] != ignored_token - char_list = [ - self.character[text_id] for text_id in text_index[batch_idx][selection] - ] + char_list = [self.character[text_id] for text_id in text_index[batch_idx][selection]] if text_prob is not None: conf_list = text_prob[batch_idx][selection] else: @@ -159,9 +158,7 @@ class BaseRecLabelDecode(object): text = self.pred_reverse(text) if return_word_box: - word_list, word_col_list, state_list = self.get_word_info( - text, selection - ) + word_list, word_col_list, state_list = self.get_word_info(text, selection) result_list.append( ( text, @@ -212,4 +209,4 @@ class CTCLabelDecode(BaseRecLabelDecode): def add_special_char(self, dict_character): dict_character = ["blank"] + dict_character - return dict_character \ No newline at end of file + return dict_character diff --git a/src/models/thrid_party/paddleocr/infer/DBPostProcess.py b/texteller/models/thrid_party/paddleocr/infer/DBPostProcess.py similarity index 92% rename from src/models/thrid_party/paddleocr/infer/DBPostProcess.py rename to texteller/models/thrid_party/paddleocr/infer/DBPostProcess.py index 84919e4..b8e407a 100644 --- a/src/models/thrid_party/paddleocr/infer/DBPostProcess.py +++ b/texteller/models/thrid_party/paddleocr/infer/DBPostProcess.py @@ -19,7 +19,7 @@ class DBPostProcess(object): use_dilation=False, score_mode="fast", box_type="quad", - **kwargs + **kwargs, ): self.thresh = thresh self.box_thresh = box_thresh @@ -76,9 +76,7 @@ class DBPostProcess(object): box = np.array(box) box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width) - box[:, 1] = np.clip( - np.round(box[:, 1] / height * dest_height), 0, dest_height - ) + box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height) boxes.append(box.tolist()) scores.append(score) return boxes, scores @@ -124,9 +122,7 @@ class DBPostProcess(object): box = np.array(box) box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width) - box[:, 1] = np.clip( - np.round(box[:, 1] / height * dest_height), 0, dest_height - ) + box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height) boxes.append(box.astype("int32")) scores.append(score) return np.array(boxes, dtype="int32"), scores @@ -215,15 +211,11 @@ class DBPostProcess(object): else: mask = segmentation[batch_index] if self.box_type == "poly": - boxes, scores = 
self.polygons_from_bitmap( - pred[batch_index], mask, src_w, src_h - ) + boxes, scores = self.polygons_from_bitmap(pred[batch_index], mask, src_w, src_h) elif self.box_type == "quad": - boxes, scores = self.boxes_from_bitmap( - pred[batch_index], mask, src_w, src_h - ) + boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask, src_w, src_h) else: raise ValueError("box_type can only be one of ['quad', 'poly']") boxes_batch.append({"points": boxes}) - return boxes_batch \ No newline at end of file + return boxes_batch diff --git a/texteller/models/thrid_party/paddleocr/infer/__pycache__/CTCLabelDecode.cpython-310.pyc b/texteller/models/thrid_party/paddleocr/infer/__pycache__/CTCLabelDecode.cpython-310.pyc new file mode 100644 index 0000000..e8a121a Binary files /dev/null and b/texteller/models/thrid_party/paddleocr/infer/__pycache__/CTCLabelDecode.cpython-310.pyc differ diff --git a/texteller/models/thrid_party/paddleocr/infer/__pycache__/DBPostProcess.cpython-310.pyc b/texteller/models/thrid_party/paddleocr/infer/__pycache__/DBPostProcess.cpython-310.pyc new file mode 100644 index 0000000..39f3cd1 Binary files /dev/null and b/texteller/models/thrid_party/paddleocr/infer/__pycache__/DBPostProcess.cpython-310.pyc differ diff --git a/texteller/models/thrid_party/paddleocr/infer/__pycache__/operators.cpython-310.pyc b/texteller/models/thrid_party/paddleocr/infer/__pycache__/operators.cpython-310.pyc new file mode 100644 index 0000000..6819fbf Binary files /dev/null and b/texteller/models/thrid_party/paddleocr/infer/__pycache__/operators.cpython-310.pyc differ diff --git a/texteller/models/thrid_party/paddleocr/infer/__pycache__/predict_det.cpython-310.pyc b/texteller/models/thrid_party/paddleocr/infer/__pycache__/predict_det.cpython-310.pyc new file mode 100644 index 0000000..7ce39f5 Binary files /dev/null and b/texteller/models/thrid_party/paddleocr/infer/__pycache__/predict_det.cpython-310.pyc differ diff --git a/texteller/models/thrid_party/paddleocr/infer/__pycache__/predict_rec.cpython-310.pyc b/texteller/models/thrid_party/paddleocr/infer/__pycache__/predict_rec.cpython-310.pyc new file mode 100644 index 0000000..89b47c7 Binary files /dev/null and b/texteller/models/thrid_party/paddleocr/infer/__pycache__/predict_rec.cpython-310.pyc differ diff --git a/texteller/models/thrid_party/paddleocr/infer/__pycache__/utility.cpython-310.pyc b/texteller/models/thrid_party/paddleocr/infer/__pycache__/utility.cpython-310.pyc new file mode 100644 index 0000000..477417c Binary files /dev/null and b/texteller/models/thrid_party/paddleocr/infer/__pycache__/utility.cpython-310.pyc differ diff --git a/src/models/thrid_party/paddleocr/infer/operators.py b/texteller/models/thrid_party/paddleocr/infer/operators.py similarity index 99% rename from src/models/thrid_party/paddleocr/infer/operators.py rename to texteller/models/thrid_party/paddleocr/infer/operators.py index 5b1e284..e04a6d3 100644 --- a/src/models/thrid_party/paddleocr/infer/operators.py +++ b/texteller/models/thrid_party/paddleocr/infer/operators.py @@ -103,7 +103,7 @@ class DetResizeForTest(object): if int(resize_w) <= 0 or int(resize_h) <= 0: return None, (None, None) img = cv2.resize(img, (int(resize_w), int(resize_h))) - except: + except: # noqa: E722 print(img.shape, resize_w, resize_h) sys.exit(0) ratio_h = resize_h / float(h) @@ -183,4 +183,4 @@ class KeepKeys(object): data_list = [] for key in self.keep_keys: data_list.append(data[key]) - return data_list \ No newline at end of file + return data_list diff --git 
a/src/models/thrid_party/paddleocr/infer/ppocr_keys_v1.txt b/texteller/models/thrid_party/paddleocr/infer/ppocr_keys_v1.txt similarity index 99% rename from src/models/thrid_party/paddleocr/infer/ppocr_keys_v1.txt rename to texteller/models/thrid_party/paddleocr/infer/ppocr_keys_v1.txt index 84b885d..b75af21 100644 --- a/src/models/thrid_party/paddleocr/infer/ppocr_keys_v1.txt +++ b/texteller/models/thrid_party/paddleocr/infer/ppocr_keys_v1.txt @@ -6620,4 +6620,4 @@ j 緖 續 紹 -懮 \ No newline at end of file +懮 diff --git a/src/models/thrid_party/paddleocr/infer/predict_det.py b/texteller/models/thrid_party/paddleocr/infer/predict_det.py similarity index 87% rename from src/models/thrid_party/paddleocr/infer/predict_det.py rename to texteller/models/thrid_party/paddleocr/infer/predict_det.py index 3ffdfd7..284c673 100755 --- a/src/models/thrid_party/paddleocr/infer/predict_det.py +++ b/texteller/models/thrid_party/paddleocr/infer/predict_det.py @@ -20,17 +20,17 @@ sys.path.insert(0, os.path.abspath(os.path.join(__dir__, "../.."))) os.environ["FLAGS_allocator_strategy"] = "auto_growth" +import sys +import time + import cv2 import numpy as np -import time -import sys # import tools.infer.utility as utility import utility -from utility import get_logger - from DBPostProcess import DBPostProcess from operators import DetResizeForTest, KeepKeys, NormalizeImage, ToCHWImage +from utility import get_logger def transform(data, ops=None): @@ -43,6 +43,7 @@ def transform(data, ops=None): return None return data + logger = get_logger() @@ -63,10 +64,17 @@ class TextDetector(object): postprocess_params["box_type"] = args.det_box_type self.preprocess_op = [ - DetResizeForTest(limit_side_len=args.det_limit_side_len, limit_type=args.det_limit_type), - NormalizeImage(std= [0.229, 0.224, 0.225], mean= [0.485, 0.456, 0.406], scale= 1./255., order= "hwc"), + DetResizeForTest( + limit_side_len=args.det_limit_side_len, limit_type=args.det_limit_type + ), + NormalizeImage( + std=[0.229, 0.224, 0.225], + mean=[0.485, 0.456, 0.406], + scale=1.0 / 255.0, + order="hwc", + ), ToCHWImage(), - KeepKeys(keep_keys= ["image", "shape"]) + KeepKeys(keep_keys=["image", "shape"]), ] self.postprocess_op = DBPostProcess(**postprocess_params) ( @@ -84,7 +92,6 @@ class TextDetector(object): elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0: self.preprocess_op[0] = DetResizeForTest(image_shape=[img_h, img_w]) - def order_points_clockwise(self, pts): rect = np.zeros((4, 2), dtype="float32") s = pts.sum(axis=1) @@ -201,10 +208,7 @@ class TextDetector(object): MIN_BOUND_DISTANCE = 50 dt_boxes = np.zeros((0, 4, 2), dtype=np.float32) elapse = 0 - if ( - img.shape[0] / img.shape[1] > 2 - and img.shape[0] > self.args.det_limit_side_len - ): + if img.shape[0] / img.shape[1] > 2 and img.shape[0] > self.args.det_limit_side_len: start_h = 0 end_h = 0 while end_h <= img.shape[0]: @@ -217,30 +221,23 @@ class TextDetector(object): # To prevent text blocks from being cut off, roll back a certain buffer area. 
if ( len(sub_dt_boxes) == 0 - or img.shape[1] - max([x[-1][1] for x in sub_dt_boxes]) - > MIN_BOUND_DISTANCE + or img.shape[1] - max([x[-1][1] for x in sub_dt_boxes]) > MIN_BOUND_DISTANCE ): start_h = end_h else: sorted_indices = np.argsort(sub_dt_boxes[:, 2, 1]) sub_dt_boxes = sub_dt_boxes[sorted_indices] bottom_line = ( - 0 - if len(sub_dt_boxes) <= 1 - else int(np.max(sub_dt_boxes[:-1, 2, 1])) + 0 if len(sub_dt_boxes) <= 1 else int(np.max(sub_dt_boxes[:-1, 2, 1])) ) if bottom_line > 0: start_h += bottom_line - sub_dt_boxes = sub_dt_boxes[ - sub_dt_boxes[:, 2, 1] <= bottom_line - ] + sub_dt_boxes = sub_dt_boxes[sub_dt_boxes[:, 2, 1] <= bottom_line] else: start_h = end_h if len(sub_dt_boxes) > 0: if dt_boxes.shape[0] == 0: - dt_boxes = sub_dt_boxes + np.array( - [0, offset], dtype=np.float32 - ) + dt_boxes = sub_dt_boxes + np.array([0, offset], dtype=np.float32) else: dt_boxes = np.append( dt_boxes, @@ -248,10 +245,7 @@ class TextDetector(object): axis=0, ) elapse += sub_elapse - elif ( - img.shape[1] / img.shape[0] > 3 - and img.shape[1] > self.args.det_limit_side_len * 3 - ): + elif img.shape[1] / img.shape[0] > 3 and img.shape[1] > self.args.det_limit_side_len * 3: start_w = 0 end_w = 0 while end_w <= img.shape[1]: @@ -263,17 +257,14 @@ class TextDetector(object): offset = start_w if ( len(sub_dt_boxes) == 0 - or img.shape[0] - max([x[-1][0] for x in sub_dt_boxes]) - > MIN_BOUND_DISTANCE + or img.shape[0] - max([x[-1][0] for x in sub_dt_boxes]) > MIN_BOUND_DISTANCE ): start_w = end_w else: sorted_indices = np.argsort(sub_dt_boxes[:, 2, 0]) sub_dt_boxes = sub_dt_boxes[sorted_indices] right_line = ( - 0 - if len(sub_dt_boxes) <= 1 - else int(np.max(sub_dt_boxes[:-1, 1, 0])) + 0 if len(sub_dt_boxes) <= 1 else int(np.max(sub_dt_boxes[:-1, 1, 0])) ) if right_line > 0: start_w += right_line @@ -282,9 +273,7 @@ class TextDetector(object): start_w = end_w if len(sub_dt_boxes) > 0: if dt_boxes.shape[0] == 0: - dt_boxes = sub_dt_boxes + np.array( - [offset, 0], dtype=np.float32 - ) + dt_boxes = sub_dt_boxes + np.array([offset, 0], dtype=np.float32) else: dt_boxes = np.append( dt_boxes, @@ -295,4 +284,3 @@ class TextDetector(object): else: dt_boxes, elapse = self.predict(img) return dt_boxes, elapse - diff --git a/src/models/thrid_party/paddleocr/infer/predict_rec.py b/texteller/models/thrid_party/paddleocr/infer/predict_rec.py similarity index 96% rename from src/models/thrid_party/paddleocr/infer/predict_rec.py rename to texteller/models/thrid_party/paddleocr/infer/predict_rec.py index a2d4a47..603f64f 100755 --- a/src/models/thrid_party/paddleocr/infer/predict_rec.py +++ b/texteller/models/thrid_party/paddleocr/infer/predict_rec.py @@ -39,7 +39,9 @@ class TextRecognizer(object): self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")] self.rec_batch_num = args.rec_batch_num self.rec_algorithm = args.rec_algorithm - self.postprocess_op = CTCLabelDecode(character_dict_path=args.rec_char_dict_path, use_space_char=args.use_space_char) + self.postprocess_op = CTCLabelDecode( + character_dict_path=args.rec_char_dict_path, use_space_char=args.use_space_char + ) ( self.predictor, self.input_tensor, @@ -143,13 +145,9 @@ class TextRecognizer(object): imgC, imgH, imgW = image_shape feature_dim = int((imgH / 8) * (imgW / 8)) - encoder_word_pos = ( - np.array(range(0, feature_dim)).reshape((feature_dim, 1)).astype("int64") - ) + encoder_word_pos = np.array(range(0, feature_dim)).reshape((feature_dim, 1)).astype("int64") gsrm_word_pos = ( - np.array(range(0, max_text_length)) - 
.reshape((max_text_length, 1)) - .astype("int64") + np.array(range(0, max_text_length)).reshape((max_text_length, 1)).astype("int64") ) gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) @@ -355,9 +353,7 @@ class TextRecognizer(object): max_wh_ratio = max(max_wh_ratio, wh_ratio) wh_ratio_list.append(wh_ratio) for ino in range(beg_img_no, end_img_no): - norm_img = self.resize_norm_img( - img_list[indices[ino]], max_wh_ratio - ) + norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio) norm_img = norm_img[np.newaxis, :] norm_img_batch.append(norm_img) norm_img_batch = np.concatenate(norm_img_batch) diff --git a/src/models/thrid_party/paddleocr/infer/utility.py b/texteller/models/thrid_party/paddleocr/infer/utility.py similarity index 94% rename from src/models/thrid_party/paddleocr/infer/utility.py rename to texteller/models/thrid_party/paddleocr/infer/utility.py index e92a77c..b2404d8 100644 --- a/src/models/thrid_party/paddleocr/infer/utility.py +++ b/texteller/models/thrid_party/paddleocr/infer/utility.py @@ -92,9 +92,7 @@ def init_args(): parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") parser.add_argument("--rec_batch_num", type=int, default=6) parser.add_argument("--max_text_length", type=int, default=25) - parser.add_argument( - "--rec_char_dict_path", type=str, default="./ppocr_keys_v1.txt" - ) + parser.add_argument("--rec_char_dict_path", type=str, default="./ppocr_keys_v1.txt") parser.add_argument("--use_space_char", type=str2bool, default=True) parser.add_argument("--vis_font_path", type=str, default="./doc/fonts/simfang.ttf") parser.add_argument("--drop_score", type=float, default=0.5) @@ -107,9 +105,7 @@ def init_args(): # PGNet params parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5) - parser.add_argument( - "--e2e_char_dict_path", type=str, default="./ppocr/utils/ic15_dict.txt" - ) + parser.add_argument("--e2e_char_dict_path", type=str, default="./ppocr/utils/ic15_dict.txt") parser.add_argument("--e2e_pgnet_valid_set", type=str, default="totaltext") parser.add_argument("--e2e_pgnet_mode", type=str, default="fast") @@ -194,15 +190,12 @@ def create_predictor(args, mode, logger): if not os.path.exists(model_file_path): raise ValueError("not find model file path {}".format(model_file_path)) if args.use_gpu: - sess = ort.InferenceSession( - model_file_path, providers=["CUDAExecutionProvider"] - ) + sess = ort.InferenceSession(model_file_path, providers=["CUDAExecutionProvider"]) else: sess = ort.InferenceSession(model_file_path) return sess, sess.get_inputs()[0], None, None - def get_output_tensors(args, mode, predictor): output_names = predictor.get_output_names() output_tensors = [] @@ -333,12 +326,8 @@ def draw_ocr_box_txt( def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"): - box_height = int( - math.sqrt((box[0][0] - box[3][0]) ** 2 + (box[0][1] - box[3][1]) ** 2) - ) - box_width = int( - math.sqrt((box[0][0] - box[1][0]) ** 2 + (box[0][1] - box[1][1]) ** 2) - ) + box_height = int(math.sqrt((box[0][0] - box[3][0]) ** 2 + (box[0][1] - box[3][1]) ** 2)) + box_width = int(math.sqrt((box[0][0] - box[1][0]) ** 2 + (box[0][1] - box[1][1]) ** 2)) if box_height > 2 * box_width and box_height > 30: img_text = Image.new("RGB", (box_height, box_width), (255, 255, 255)) @@ -354,9 +343,7 @@ def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"): font = create_font(txt, (box_width, box_height), font_path) draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) -
pts1 = np.float32( - [[0, 0], [box_width, 0], [box_width, box_height], [0, box_height]] - ) + pts1 = np.float32([[0, 0], [box_width, 0], [box_width, box_height], [0, box_height]]) pts2 = np.array(box, dtype=np.float32) M = cv2.getPerspectiveTransform(pts1, pts2) @@ -411,9 +398,7 @@ def str_count(s): return s_len - math.ceil(en_dg_count / 2) -def text_visual( - texts, scores, img_h=400, img_w=600, threshold=0.0, font_path="./doc/simfang.ttf" -): +def text_visual(texts, scores, img_h=400, img_w=600, threshold=0.0, font_path="./doc/simfang.ttf"): """ create new blank img and draw txt on it args: @@ -425,9 +410,7 @@ def text_visual( return(array): """ if scores is not None: - assert len(texts) == len( - scores - ), "The number of txts and corresponding scores must match" + assert len(texts) == len(scores), "The number of txts and corresponding scores must match" def create_blank_img(): blank_img = np.ones(shape=[img_h, img_w], dtype=np.int8) * 255 @@ -518,14 +501,10 @@ def get_rotate_crop_image(img, points): """ assert len(points) == 4, "shape of points must be 4*2" img_crop_width = int( - max( - np.linalg.norm(points[0] - points[1]), np.linalg.norm(points[2] - points[3]) - ) + max(np.linalg.norm(points[0] - points[1]), np.linalg.norm(points[2] - points[3])) ) img_crop_height = int( - max( - np.linalg.norm(points[0] - points[3]), np.linalg.norm(points[1] - points[2]) - ) + max(np.linalg.norm(points[0] - points[3]), np.linalg.norm(points[1] - points[2])) ) pts_std = np.float32( [ @@ -605,6 +584,8 @@ def get_image_file_list(img_file, infer_list=None): logger_initialized = {} + + @functools.lru_cache() def get_logger(name="ppocr", log_file=None, log_level=logging.DEBUG): """Initialize and get a logger by name. @@ -654,14 +635,10 @@ def get_rotate_crop_image(img, points): """ assert len(points) == 4, "shape of points must be 4*2" img_crop_width = int( - max( - np.linalg.norm(points[0] - points[1]), np.linalg.norm(points[2] - points[3]) - ) + max(np.linalg.norm(points[0] - points[1]), np.linalg.norm(points[2] - points[3])) ) img_crop_height = int( - max( - np.linalg.norm(points[0] - points[3]), np.linalg.norm(points[1] - points[2]) - ) + max(np.linalg.norm(points[0] - points[3]), np.linalg.norm(points[1] - points[2])) ) pts_std = np.float32( [ @@ -708,6 +685,5 @@ def get_minarea_rect_crop(img, points): return crop_img - if __name__ == "__main__": pass diff --git a/src/models/tokenizer/train.py b/texteller/models/tokenizer/train.py similarity index 89% rename from src/models/tokenizer/train.py rename to texteller/models/tokenizer/train.py index aa44521..80e5e0e 100644 --- a/src/models/tokenizer/train.py +++ b/texteller/models/tokenizer/train.py @@ -15,10 +15,9 @@ if __name__ == '__main__': dataset = load_dataset('../ocr_model/train/dataset/loader.py')['train'] new_tokenizer = tokenizer.train_new_from_iterator( - text_iterator=dataset['latex_formula'], - + text_iterator=dataset['latex_formula'], # If you want to use a different vocab size, **change VOCAB_SIZE from globals.py** - vocab_size=VOCAB_SIZE + vocab_size=VOCAB_SIZE, ) # Save the new tokenizer for later training and inference diff --git a/texteller/models/utils/__init__.py b/texteller/models/utils/__init__.py new file mode 100644 index 0000000..3597062 --- /dev/null +++ b/texteller/models/utils/__init__.py @@ -0,0 +1 @@ +from .mix_inference import mix_inference diff --git a/texteller/models/utils/__pycache__/__init__.cpython-310.pyc b/texteller/models/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..7b3278e 
Binary files /dev/null and b/texteller/models/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/texteller/models/utils/__pycache__/mix_inference.cpython-310.pyc b/texteller/models/utils/__pycache__/mix_inference.cpython-310.pyc new file mode 100644 index 0000000..cadc973 Binary files /dev/null and b/texteller/models/utils/__pycache__/mix_inference.cpython-310.pyc differ diff --git a/src/models/utils/mix_inference.py b/texteller/models/utils/mix_inference.py similarity index 91% rename from src/models/utils/mix_inference.py rename to texteller/models/utils/mix_inference.py index 9da3c85..0f8aa4f 100644 --- a/src/models/utils/mix_inference.py +++ b/texteller/models/utils/mix_inference.py @@ -20,12 +20,12 @@ MAXV = 999999999 def mask_img(img, bboxes: List[Bbox], bg_color: np.ndarray) -> np.ndarray: mask_img = img.copy() for bbox in bboxes: - mask_img[bbox.p.y:bbox.p.y + bbox.h, bbox.p.x:bbox.p.x + bbox.w] = bg_color + mask_img[bbox.p.y : bbox.p.y + bbox.h, bbox.p.x : bbox.p.x + bbox.w] = bg_color return mask_img def bbox_merge(sorted_bboxes: List[Bbox]) -> List[Bbox]: - if (len(sorted_bboxes) == 0): + if len(sorted_bboxes) == 0: return [] bboxes = sorted_bboxes.copy() guard = Bbox(MAXV, bboxes[-1].p.y, -1, -1, label="guard") @@ -61,7 +61,7 @@ def split_conflict(ocr_bboxes: List[Bbox], latex_bboxes: List[Bbox]) -> List[Bbo candidate = heapq.heappop(bboxes) curr = heapq.heappop(bboxes) idx = 0 - while (len(bboxes) > 0): + while len(bboxes) > 0: idx += 1 assert candidate.p.x <= curr.p.x or not candidate.same_row(curr) @@ -85,7 +85,7 @@ def split_conflict(ocr_bboxes: List[Bbox], latex_bboxes: List[Bbox]) -> List[Bbo curr.p.x = candidate.ur_point.x heapq.heappush(bboxes, curr) curr = heapq.heappop(bboxes) - + elif candidate.ur_point.x >= curr.ur_point.x: assert not (candidate.label != "text" and curr.label != "text") @@ -100,8 +100,8 @@ def split_conflict(ocr_bboxes: List[Bbox], latex_bboxes: List[Bbox]) -> List[Bbo candidate.ur_point.x - curr.ur_point.x, label="text", confidence=candidate.confidence, - content=None - ) + content=None, + ), ) candidate.w = curr.p.x - candidate.p.x res.append(candidate) @@ -128,7 +128,7 @@ def slice_from_image(img: np.ndarray, ocr_bboxes: List[Bbox]) -> List[np.ndarray for bbox in ocr_bboxes: x, y = int(bbox.p.x), int(bbox.p.y) w, h = int(bbox.w), int(bbox.h) - sliced_img = img[y:y+h, x:x+w] + sliced_img = img[y : y + h, x : x + w] sliced_imgs.append(sliced_img) return sliced_imgs @@ -137,20 +137,17 @@ def mix_inference( img_path: str, infer_config, latex_det_model, - lang_ocr_models, - latex_rec_models, accelerator="cpu", - num_beams=1 + num_beams=1, ) -> str: ''' Take a mixed image of formulas and text, and output a str (in Markdown syntax) ''' global img img = cv2.imread(img_path) - corners = [tuple(img[0, 0]), tuple(img[0, -1]), - tuple(img[-1, 0]), tuple(img[-1, -1])] + corners = [tuple(img[0, 0]), tuple(img[0, -1]), tuple(img[-1, 0]), tuple(img[-1, -1])] bg_color = np.array(Counter(corners).most_common(1)[0][0]) start_time = time.time() @@ -172,10 +169,13 @@ def mix_inference( print(f"ocr_det_model time: {end_time - start_time:.2f}s") ocr_bboxes = [ Bbox( - p[0][0], p[0][1], p[3][1]-p[0][1], p[1][0]-p[0][0], + p[0][0], + p[0][1], + p[3][1] - p[0][1], + p[1][0] - p[0][0], label="text", confidence=None, - content=None + content=None, ) for p in det_prediction ] @@ -198,12 +198,14 @@ def mix_inference( assert len(rec_predictions) == len(ocr_bboxes) for content, bbox in zip(rec_predictions, ocr_bboxes): bbox.content = content[0] - - latex_imgs =[] + +
@@ -137,20 +137,17 @@ def mix_inference(
     img_path: str,
     infer_config,
     latex_det_model,
-    lang_ocr_models,
-    latex_rec_models,
     accelerator="cpu",
-    num_beams=1
+    num_beams=1,
 ) -> str:
     '''
     Input a mixed image of formula text and output str (in markdown syntax)
     '''
     global img
     img = cv2.imread(img_path)
-    corners = [tuple(img[0, 0]), tuple(img[0, -1]),
-               tuple(img[-1, 0]), tuple(img[-1, -1])]
+    corners = [tuple(img[0, 0]), tuple(img[0, -1]), tuple(img[-1, 0]), tuple(img[-1, -1])]
     bg_color = np.array(Counter(corners).most_common(1)[0][0])
 
     start_time = time.time()
@@ -172,10 +169,13 @@ def mix_inference(
     print(f"ocr_det_model time: {end_time - start_time:.2f}s")
     ocr_bboxes = [
         Bbox(
-            p[0][0], p[0][1], p[3][1]-p[0][1], p[1][0]-p[0][0],
+            p[0][0],
+            p[0][1],
+            p[3][1] - p[0][1],
+            p[1][0] - p[0][0],
             label="text",
             confidence=None,
-            content=None
+            content=None,
         )
         for p in det_prediction
     ]
@@ -198,12 +198,14 @@ def mix_inference(
     assert len(rec_predictions) == len(ocr_bboxes)
     for content, bbox in zip(rec_predictions, ocr_bboxes):
         bbox.content = content[0]
-
-    latex_imgs =[]
+
+    latex_imgs = []
     for bbox in latex_bboxes:
-        latex_imgs.append(img[bbox.p.y:bbox.p.y + bbox.h, bbox.p.x:bbox.p.x + bbox.w])
+        latex_imgs.append(img[bbox.p.y : bbox.p.y + bbox.h, bbox.p.x : bbox.p.x + bbox.w])
 
     start_time = time.time()
-    latex_rec_res = latex_rec_predict(*latex_rec_models, latex_imgs, accelerator, num_beams, max_tokens=800)
+    latex_rec_res = latex_rec_predict(
+        *latex_rec_models, latex_imgs, accelerator, num_beams, max_tokens=800
+    )
     end_time = time.time()
     print(f"latex_rec_model time: {end_time - start_time:.2f}s")
 
@@ -214,7 +216,6 @@ def mix_inference(
         elif bbox.label == "isolated":
             bbox.content = '\n\n' + r"$$" + bbox.content + r"$$" + '\n\n'
 
-
     bboxes = sorted(ocr_bboxes + latex_bboxes)
     if bboxes == []:
         return ""
@@ -223,11 +224,7 @@ def mix_inference(
     prev = Bbox(bboxes[0].p.x, bboxes[0].p.y, -1, -1, label="guard")
     for curr in bboxes:
         # Add the formula number back to the isolated formula
-        if (
-            prev.label == "isolated"
-            and curr.label == "text"
-            and prev.same_row(curr)
-        ):
+        if prev.label == "isolated" and curr.label == "text" and prev.same_row(curr):
            curr.content = curr.content.strip()
             if curr.content.startswith('(') and curr.content.endswith(')'):
                 curr.content = curr.content[1:-1]
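Note: the tail of `mix_inference` sorts text and formula boxes into reading order and folds equation numbers like `(3.1)` back into the preceding display formula. A stripped-down sketch of that assembly step follows, using a hypothetical lightweight box type; the real `Bbox` ordering, `same_row` tolerance, and the exact way the repo re-attaches the number (here rendered as a KaTeX `\tag`, which is an assumption) live in the code above.

```python
from dataclasses import dataclass

@dataclass
class Box:  # hypothetical stand-in for the repo's Bbox
    x: float
    y: float
    label: str      # "text", "embedded", or "isolated"
    content: str

    def same_row(self, other: "Box", tol: float = 10.0) -> bool:
        return abs(self.y - other.y) < tol

def assemble(boxes: list[Box]) -> str:
    # Sort top-to-bottom, then left-to-right: a simple reading order
    boxes = sorted(boxes, key=lambda b: (b.y, b.x))
    parts: list[str] = []
    prev = None
    for curr in boxes:
        if curr.label == "isolated":
            curr.content = "\n\n$$" + curr.content + "$$\n\n"
        # Fold an "(eq-number)" text box back into the preceding display formula
        if prev and prev.label == "isolated" and curr.label == "text" and prev.same_row(curr):
            num = curr.content.strip().strip("()")
            # parts[-1] ends with "$$\n\n"; splice a \tag in before the closing $$
            parts[-1] = parts[-1][: -len("$$\n\n")] + rf" \tag{{{num}}}$$" + "\n\n"
            prev = curr
            continue
        parts.append(curr.content)
        prev = curr
    return " ".join(parts)

# e.g. assemble([Box(0, 0, "isolated", "E=mc^2"), Box(50, 0, "text", "(3.1)")])
```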
diff --git a/src/server.py b/texteller/server.py
similarity index 77%
rename from src/server.py
rename to texteller/server.py
index e6c0320..e17db20 100644
--- a/src/server.py
+++ b/texteller/server.py
@@ -23,12 +23,8 @@ LIBPATH = Path(sys.executable).parent.parent / 'lib' / ('python' + PYTHON_VERSIO
 CUDNNPATH = LIBPATH / 'nvidia' / 'cudnn' / 'lib'
 
 parser = argparse.ArgumentParser()
-parser.add_argument(
-    '-ckpt', '--checkpoint_dir', type=str
-)
-parser.add_argument(
-    '-tknz', '--tokenizer_dir', type=str
-)
+parser.add_argument('-ckpt', '--checkpoint_dir', type=str)
+parser.add_argument('-tknz', '--tokenizer_dir', type=str)
 parser.add_argument('-port', '--server_port', type=int, default=8000)
 parser.add_argument('--num_replicas', type=int, default=1)
 parser.add_argument('--ncpu_per_replica', type=float, default=1.0)
@@ -41,66 +37,67 @@ parser.add_argument('-onnx', action='store_true', help='using onnx runtime')
 args = parser.parse_args()
 if args.ngpu_per_replica > 0 and not args.inference_mode == 'cuda':
     raise ValueError("--inference-mode must be cuda or mps if ngpu_per_replica > 0")
-
+
 
 @serve.deployment(
-    num_replicas=args.num_replicas,
+    num_replicas=args.num_replicas,
     ray_actor_options={
-        "num_cpus": args.ncpu_per_replica,
-        "num_gpus": args.ngpu_per_replica * 1.0 / 2
-    }
+        "num_cpus": args.ncpu_per_replica,
+        "num_gpus": args.ngpu_per_replica * 1.0 / 2,
+    },
 )
 class TexTellerRecServer:
     def __init__(
-        self,
-        checkpoint_path: str,
-        tokenizer_path: str,
+        self,
+        checkpoint_path: str,
+        tokenizer_path: str,
         inf_mode: str = 'cpu',
         use_onnx: bool = False,
-        num_beams: int = 1
+        num_beams: int = 1,
     ) -> None:
-        self.model = TexTeller.from_pretrained(checkpoint_path, use_onnx=use_onnx, onnx_provider=inf_mode)
+        self.model = TexTeller.from_pretrained(
+            checkpoint_path, use_onnx=use_onnx, onnx_provider=inf_mode
+        )
         self.tokenizer = TexTeller.get_tokenizer(tokenizer_path)
         self.inf_mode = inf_mode
         self.num_beams = num_beams
 
         if not use_onnx:
             self.model = self.model.to(inf_mode) if inf_mode != 'cpu' else self.model
-
+
     def predict(self, image_nparray) -> str:
-        return to_katex(rec_inference(
-            self.model, self.tokenizer, [image_nparray],
-            accelerator=self.inf_mode, num_beams=self.num_beams
-        )[0])
+        return to_katex(
+            rec_inference(
+                self.model,
+                self.tokenizer,
+                [image_nparray],
+                accelerator=self.inf_mode,
+                num_beams=self.num_beams,
+            )[0]
+        )
+
 
 @serve.deployment(
-    num_replicas=args.num_replicas,
+    num_replicas=args.num_replicas,
     ray_actor_options={
-        "num_cpus": args.ncpu_per_replica,
+        "num_cpus": args.ncpu_per_replica,
         "num_gpus": args.ngpu_per_replica * 1.0 / 2,
-        "runtime_env": {
-            "env_vars": {
-                "LD_LIBRARY_PATH": f"{str(CUDNNPATH)}/:$LD_LIBRARY_PATH"
-            }
-        }
+        "runtime_env": {"env_vars": {"LD_LIBRARY_PATH": f"{str(CUDNNPATH)}/:$LD_LIBRARY_PATH"}},
     },
 )
 class TexTellerDetServer:
-    def __init__(
-        self,
-        inf_mode='cpu'
-    ):
+    def __init__(self, inf_mode='cpu'):
         self.infer_config = PredictConfig("./models/det_model/model/infer_cfg.yml")
         self.latex_det_model = InferenceSession(
-            "./models/det_model/model/rtdetr_r50vd_6x_coco.onnx",
-            providers=['CUDAExecutionProvider'] if inf_mode == 'cuda' else ['CPUExecutionProvider']
+            "./models/det_model/model/rtdetr_r50vd_6x_coco.onnx",
+            providers=['CUDAExecutionProvider'] if inf_mode == 'cuda' else ['CPUExecutionProvider'],
         )
 
     async def predict(self, image_nparray) -> str:
         with tempfile.TemporaryDirectory() as temp_dir:
             img_path = f"{temp_dir}/temp_image.jpg"
             cv2.imwrite(img_path, image_nparray)
-
+
             latex_bboxes = det_inference(img_path, self.latex_det_model, self.infer_config)
             return latex_bboxes
 
@@ -110,10 +107,10 @@ class Ingress:
     def __init__(self, det_server: DeploymentHandle, rec_server: DeploymentHandle) -> None:
         self.det_server = det_server
         self.texteller_server = rec_server
-
+
     async def __call__(self, request: Request) -> str:
         request_path = request.url.path
-        form = await request.form()
+        form = await request.form()
         img_rb = await form['img'].read()
         img_nparray = np.frombuffer(img_rb, np.uint8)
 
@@ -121,7 +118,7 @@ class Ingress:
         img_nparray = cv2.cvtColor(img_nparray, cv2.COLOR_BGR2RGB)
 
         if request_path.startswith("/fdet"):
-            if self.det_server == None:
+            if self.det_server is None:
                 return "[ERROR] rtdetr_r50vd_6x_coco.onnx not found."
             pred = await self.det_server.predict.remote(img_nparray)
             return pred
@@ -140,18 +137,19 @@ if __name__ == '__main__':
     serve.start(http_options={"host": "0.0.0.0", "port": args.server_port})
 
     rec_server = TexTellerRecServer.bind(
-        ckpt_dir, tknz_dir,
+        ckpt_dir,
+        tknz_dir,
         inf_mode=args.inference_mode,
         use_onnx=args.onnx,
-        num_beams=args.num_beams
+        num_beams=args.num_beams,
     )
     det_server = None
     if Path('./models/det_model/model/rtdetr_r50vd_6x_coco.onnx').exists():
         det_server = TexTellerDetServer.bind(args.inference_mode)
 
     ingress = Ingress.bind(det_server, rec_server)
-    # ingress_handle = serve.run(ingress, route_prefix="/predict")
-    ingress_handle = serve.run(ingress, route_prefix="/")
+    # ingress_handle = serve.run(ingress, route_prefix="/predict")
+    ingress_handle = serve.run(ingress, route_prefix="/")
 
     while True:
         time.sleep(1)
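Note: the server rename does not change the wire protocol; `Ingress` still reads the upload from the multipart form field named `img`. A minimal client sketch is below; the host, port, and the exact route names are read off this file but should be treated as assumptions (only the `/fdet` prefix check is visible in the hunk above).

```python
# Minimal client sketch for texteller/server.py (routes are assumptions)
import requests

with open("formula.png", "rb") as f:
    # the Ingress handler reads the upload from the form field named 'img'
    resp = requests.post("http://localhost:8000/predict", files={"img": f})
print(resp.text)  # KaTeX-ready LaTeX from TexTellerRecServer

# Formula *detection* goes through the /fdet route instead
with open("page.png", "rb") as f:
    resp = requests.post("http://localhost:8000/fdet", files={"img": f})
```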
diff --git a/src/start_web.bat b/texteller/start_web.bat
similarity index 100%
rename from src/start_web.bat
rename to texteller/start_web.bat
diff --git a/src/start_web.sh b/texteller/start_web.sh
similarity index 100%
rename from src/start_web.sh
rename to texteller/start_web.sh
diff --git a/src/train_config.yaml b/texteller/train_config.yaml
similarity index 100%
rename from src/train_config.yaml
rename to texteller/train_config.yaml
diff --git a/src/web.py b/texteller/web.py
similarity index 83%
rename from src/web.py
rename to texteller/web.py
index 7e47a8a..7a497d9 100644
--- a/src/web.py
+++ b/texteller/web.py
@@ -20,10 +20,7 @@ from models.ocr_model.utils.inference import inference as latex_recognition
 from models.ocr_model.utils.to_katex import to_katex
 
 
-st.set_page_config(
-    page_title="TexTeller",
-    page_icon="🧮"
-)
+st.set_page_config(page_title="TexTeller", page_icon="🧮")
 
 html_string = '''
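Note: the web.py hunk above cuts off mid-file. For orientation, the Streamlit entry point reduces to the page-config call plus an upload-and-recognize loop; below is a hedged sketch under assumed names (`latex_recognition` and `to_katex` are the imports shown above, everything else is illustrative, including the placeholder result).

```python
# Sketch of the web demo's core loop; model loading and result wiring are assumptions
import streamlit as st

st.set_page_config(page_title="TexTeller", page_icon="🧮")

uploaded = st.file_uploader("Upload a formula image", type=["png", "jpg", "jpeg"])
if uploaded is not None:
    st.image(uploaded)
    # In the real app: run latex_recognition(...) and clean up with to_katex(...)
    katex_str = r"\frac{a}{b}"  # placeholder result
    st.latex(katex_str)
    st.code(katex_str, language="latex")
```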