commit 126026cb484fefd6c1881fa304597a0a1410cfb2
Author: 三洋三洋 <1258009915@qq.com>
Date:   Mon Jan 15 05:48:36 2024 +0000

    Initial commit

diff --git a/.swp b/.swp
new file mode 100644
index 0000000..ff496de
Binary files /dev/null and b/.swp differ
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..26ee243
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+# TexTeller
+
+* python3.10
+
+* Rendered images use density 200
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..791fa24
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,75 @@
+absl-py==2.0.0
+accelerate==0.26.0
+aiohttp==3.9.1
+aiosignal==1.3.1
+async-timeout==4.0.3
+attrs==23.2.0
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+datasets==2.16.1
+dill==0.3.7
+filelock==3.13.1
+frozenlist==1.4.1
+fsspec==2023.10.0
+google-auth==2.26.2
+google-auth-oauthlib==1.2.0
+grpcio==1.60.0
+huggingface-hub==0.20.2
+idna==3.6
+Jinja2==3.1.2
+Markdown==3.5.2
+MarkupSafe==2.1.3
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.15
+networkx==3.2.1
+numpy==1.26.3
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.18.1
+nvidia-nvjitlink-cu12==12.3.101
+nvidia-nvtx-cu12==12.1.105
+oauthlib==3.2.2
+packaging==23.2
+pandas==2.1.4
+pillow==10.2.0
+protobuf==4.23.4
+psutil==5.9.7
+pyarrow==14.0.2
+pyarrow-hotfix==0.6
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+regex==2023.12.25
+requests==2.31.0
+requests-oauthlib==1.3.1
+rsa==4.9
+safetensors==0.4.1
+six==1.16.0
+sympy==1.12
+tensorboard==2.15.1
+tensorboard-data-server==0.7.2
+tensorboardX==2.6.2.2
+tokenizers==0.15.0
+torch==2.1.2
+torchaudio==2.1.2
+torchvision==0.16.2
+tqdm==4.66.1
+transformers==4.36.2
+triton==2.1.0
+typing_extensions==4.9.0
+tzdata==2023.4
+urllib3==2.1.0
+Werkzeug==3.0.1
+xxhash==3.4.1
+yarl==1.9.4
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..64aa4a5
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+# Set the CUDA devices
+export CUDA_VISIBLE_DEVICES=0,1,2,4
+
+# Run the Python script and redirect its output to a log file
+nohup python -m src.models.resizer.train.train > train_result_pred_height_v3.log 2>&1 &
diff --git a/src/globals.py b/src/globals.py
new file mode 100644
index 0000000..cdf0538
--- /dev/null
+++ b/src/globals.py
@@ -0,0 +1,33 @@
+# Mean and standard deviation of (grayscaled) formula images
+IMAGE_MEAN = 0.9545467
+IMAGE_STD = 0.15394445
+
+
+# ========================= Parameters for the TeXify model ============================= #
+
+# Minimum and maximum width/height of input images
+MIN_HEIGHT = 32
+MAX_HEIGHT = 512
+MIN_WIDTH = 32
+MAX_WIDTH = 1280
+# In LaTeX-OCR these are 32, 192, 32 and 672 respectively
+
+# Rendering density of the images in the TeXify model's dataset
+TEXIFY_INPUT_DENSITY = 80
+
+# ============================================================================= #
+
+
+# ========================= Parameters for the Resizer model ============================= #
+
+# Rendering density of the images in the Resizer model's dataset
+RESIZER_INPUT_DENSITY = 200
+
+LABEL_RATIO = 1.0 * TEXIFY_INPUT_DENSITY / RESIZER_INPUT_DENSITY
+
+NUM_CLASSES = 1  # the model predicts by regression (a sigmoid at the end predicts 0~1)
+NUM_CHANNELS = 1  # single-channel (grayscale) input images
+
+# Fixed image size used while training the Resizer
+RESIZER_IMG_SIZE = 448
+# ============================================================================= #
diff --git a/src/inference.py b/src/inference.py
new file mode 100644
index 0000000..e69de29
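Note on src/globals.py: the Resizer regresses the height an image would have at TeXify's rendering density, so the label scale is LABEL_RATIO = TEXIFY_INPUT_DENSITY / RESIZER_INPUT_DENSITY = 80 / 200 = 0.4. A minimal sketch of that mapping (the target_height helper is hypothetical, not part of this commit):

    # Map a height measured on a density-200 render to the equivalent
    # height at TeXify's density-80 rendering.
    TEXIFY_INPUT_DENSITY = 80
    RESIZER_INPUT_DENSITY = 200
    LABEL_RATIO = 1.0 * TEXIFY_INPUT_DENSITY / RESIZER_INPUT_DENSITY  # 0.4

    def target_height(height_at_density_200: int) -> float:
        # Hypothetical helper: the regression label used by preprocess_fn
        return height_at_density_200 * LABEL_RATIO

    print(target_height(500))  # 200.0 -- a 500 px crop maps to a 200 px label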
diff --git a/src/models/resizer/inference.py b/src/models/resizer/inference.py
new file mode 100644
index 0000000..92b1ba3
--- /dev/null
+++ b/src/models/resizer/inference.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+import os
+import argparse
+import torch
+
+from pathlib import Path
+from PIL import Image
+from .model.Resizer import Resizer
+from .utils import preprocess_fn
+
+from munch import Munch
+
+
+def load_resizer():
+    model = Resizer.from_pretrained('/home/lhy/code/TeXify/src/models/resizer/train/res_wo_sigmoid_train_result_v2/checkpoint-96000')
+    model.eval()
+    return model
+
+
+def load_teller():
+    arguments = Munch(
+        {
+            'config': '/home/lhy/code/LaTeX-OCR/pix2tex/model/checkpoints/pix2tex/config.yaml',
+            'checkpoint': '/home/lhy/code/LaTeX-OCR/pix2tex/model/checkpoints/pix2tex_v1/pix2tex_v1_e30_step4265.pth',
+            'no_cuda': False,
+            'no_resize': False
+        }
+    )
+    ...
+
+
+def inference_v2(img: Image.Image):
+    # img = img.convert('RGB') if img.format == 'PNG' else img
+    # processed_img = preprocess_fn({"pixel_values": [img]})
+
+    # resizer = load_resizer(resizer_path)
+    # inpu = torch.stack(processed_img['pixel_values'])
+    # pred_size = resizer(inpu)
+
+    # teller = load_teller(teller_path)
+    ...
+
+
+def inference(args):
+    img = Image.open(args.image)
+    img = img.convert('RGB') if img.format == 'PNG' else img
+    processed_img = preprocess_fn({"pixel_values": [img]})
+
+    ckt_path = Path(args.checkpoint).resolve()
+    model = Resizer.from_pretrained(ckt_path)
+    model.eval()
+    inpu = torch.stack(processed_img['pixel_values'])
+    pred = model(inpu)
+    print(pred)
+
+    ...
+
+
+if __name__ == "__main__":
+    cur_dirpath = os.getcwd()
+    script_dirpath = Path(__file__).resolve().parent
+    os.chdir(script_dirpath)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-img', '--image', type=str, required=True)
+    parser.add_argument('-ckt', '--checkpoint', type=str, required=True)
+
+    args = parser.parse_args([
+        '-img', '/home/lhy/code/TeXify/src/models/resizer/foo5_140h.jpg',
+        '-ckt', '/home/lhy/code/TeXify/src/models/resizer/train/train_result_pred_height_v5'
+    ])
+    inference(args)
+
+    os.chdir(cur_dirpath)
\ No newline at end of file
diff --git a/src/models/resizer/model/Resizer.py b/src/models/resizer/model/Resizer.py
new file mode 100644
index 0000000..7442a80
--- /dev/null
+++ b/src/models/resizer/model/Resizer.py
@@ -0,0 +1,5 @@
+from transformers import ResNetForImageClassification
+
+class Resizer(ResNetForImageClassification):
+    def __init__(self, config):
+        super().__init__(config)
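Note on Resizer.py: the subclass adds nothing on top of ResNetForImageClassification; the regression behavior comes entirely from the config. With config.num_labels == 1 and float labels, transformers infers problem_type "regression" and applies MSELoss, so no custom loss or Trainer subclass is needed. A minimal usage sketch under that assumption (random inputs, illustrative only):

    import torch
    from transformers import ResNetConfig
    from src.models.resizer.model.Resizer import Resizer

    config = ResNetConfig(num_channels=1, num_labels=1)  # grayscale in, one regression output
    model = Resizer(config)

    pixel_values = torch.randn(2, 1, 448, 448)  # batch of two 448x448 grayscale images
    labels = torch.tensor([180.0, 96.0])        # target heights, already scaled by LABEL_RATIO
    out = model(pixel_values=pixel_values, labels=labels)
    print(out.loss)          # MSE between logits and labels
    print(out.logits.shape)  # torch.Size([2, 1])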
diff --git a/src/models/resizer/train/train.py b/src/models/resizer/train/train.py
new file mode 100644
index 0000000..19b18f7
--- /dev/null
+++ b/src/models/resizer/train/train.py
@@ -0,0 +1,122 @@
+import os
+import datasets
+
+from pathlib import Path
+from transformers import (
+    ResNetConfig,
+    TrainingArguments,
+    Trainer
+)
+
+from ..utils import preprocess_fn
+from ..model.Resizer import Resizer
+from ....globals import NUM_CHANNELS, NUM_CLASSES, RESIZER_IMG_SIZE
+
+
+def train():
+    cur_dirpath = os.getcwd()
+    script_dirpath = Path(__file__).resolve().parent
+    os.chdir(script_dirpath)
+
+    data = datasets.load_dataset("./dataset").shuffle(seed=42)
+    data = data.rename_column("images", "pixel_values")
+    data = data.flatten_indices()  # flatten_indices() returns a new dataset; assign it back
+    data = data.with_transform(preprocess_fn)
+    train_data, test_data = data['train'], data['test']
+
+    config = ResNetConfig(
+        num_channels=NUM_CHANNELS,
+        num_labels=NUM_CLASSES,
+        img_size=RESIZER_IMG_SIZE
+    )
+    model = Resizer(config)
+    model = Resizer.from_pretrained("/home/lhy/code/TeXify/src/models/resizer/train/train_result_pred_height_v4/checkpoint-213000")
+
+    training_args = TrainingArguments(
+        # resume_from_checkpoint="/home/lhy/code/TeXify/src/models/resizer/train/train_result_pred_height_v3/checkpoint-94500",
+        max_grad_norm=1.0,
+        # use_cpu=True,
+        seed=42,  # random seed, for reproducibility
+        # data_seed=42,  # also fix the data sampler's sampling
+        # full_determinism=True,  # make the whole run fully deterministic (hurts training; debug only)
+
+        output_dir='./train_result_pred_height_v5',  # output directory
+        overwrite_output_dir=False,  # if the output directory exists, keep its contents
+        report_to=["tensorboard"],  # log to TensorBoard,
+        #+view the logs by running: tensorboard --logdir ./logs
+
+        logging_dir=None,  # directory for the TensorBoard log files
+        log_level="info",
+        logging_strategy="steps",  # log every fixed number of steps
+        logging_steps=500,  # step interval between log records
+        logging_nan_inf_filter=False,  # also log steps where loss is nan or inf
+
+        num_train_epochs=50,  # total number of training epochs
+        # max_steps=3,  # maximum number of training steps; if set,
+        #+num_train_epochs is ignored (usually for debugging)
+
+        # label_names=['your_label_name'],  # label key in the data_loader; defaults to 'labels' if unset
+
+        per_device_train_batch_size=55,  # batch size per GPU
+        per_device_eval_batch_size=48*2,  # evaluation batch size per GPU
+        auto_find_batch_size=False,  # automatically search for a batch size that fits (exponential decay)
+
+        optim='adamw_torch',  # many AdamW variants are available (more efficient than the classic AdamW)
+        #+once optim is set, there is no need to pass an optimizer to the Trainer
+        lr_scheduler_type="cosine",  # learning-rate scheduler type
+        warmup_ratio=0.1,  # fraction of total training steps used for warmup
+        # warmup_steps=500,  # number of warmup steps
+        weight_decay=0,  # weight decay
+        learning_rate=5e-5,  # learning rate
+        fp16=False,  # whether to train with 16-bit floats
+        gradient_accumulation_steps=1,  # gradient accumulation steps; when the batch size cannot be made large, use this to emulate a large effective batch size
+        gradient_checkpointing=False,  # when True, drops some intermediates during forward (recomputed for backward), trading forward time for lower GPU memory pressure
+        label_smoothing_factor=0.0,  # soft labels; 0 means disabled
+        # debug='underflow_overflow',  # check for overflow during training and warn if it occurs (usually for debugging)
+        torch_compile=True,  # compile the model with torch.compile (better training and inference performance)
+        #+requires torch > 2.0, and this feature is still not very stable
+        # deepspeed='your_json_path',  # train with DeepSpeed; needs the path to ds_config.json
+        #+when using DeepSpeed with the Trainer, make sure ds_config.json matches the Trainer's settings (learning rate, batch size, gradient accumulation steps, ...)
+        #+mismatches cause very strange bugs (that are usually hard to track down)
+
+        dataloader_pin_memory=True,  # speeds up data transfer between CPU and GPU
+        dataloader_num_workers=16,  # data loading uses no extra processes by default
+        dataloader_drop_last=True,  # drop the last minibatch
+
+        evaluation_strategy="steps",  # evaluation strategy, "steps" or "epoch"
+        eval_steps=500,  # if evaluation_strategy="steps"
+        # eval_steps=10,  # if evaluation_strategy="steps"
+
+        save_strategy="steps",  # checkpoint saving strategy
+        save_steps=1500,  # step interval between saves
+        save_total_limit=5,  # maximum number of checkpoints to keep; the oldest is deleted once exceeded
+
+        load_best_model_at_end=True,  # load the best model at the end of training
+        metric_for_best_model="eval_loss",  # metric used to pick the best model
+        greater_is_better=False,  # smaller metric values are better
+
+        do_train=True,  # whether to train, mostly toggled for debugging
+        do_eval=True,  # whether to evaluate, mostly toggled for debugging
+
+        remove_unused_columns=True,  # drop unused columns (features); defaults to True,
+        #+making it easier to unpack inputs into the model's call function
+
+        push_to_hub=False,  # upload to the Hub after training; run huggingface-cli login first to configure credentials (stored in the cache folder)
+        hub_model_id="a_different_name",  # model name
+        #+each time the model is saved, it is uploaded to the Hub;
+        #+after training, remember to call trainer.push_to_hub() to upload the training arguments and the validation results
+    )

+    trainer = Trainer(
+        model,
+        training_args,
+        train_dataset=train_data,
+        eval_dataset=test_data,
+    )
+    trainer.train()
+
+    os.chdir(cur_dirpath)
+
+
+if __name__ == '__main__':
+    train()
diff --git a/src/models/resizer/utils/__init__.py b/src/models/resizer/utils/__init__.py
new file mode 100644
index 0000000..e29033f
--- /dev/null
+++ b/src/models/resizer/utils/__init__.py
@@ -0,0 +1 @@
+from .preprocess import preprocess_fn
\ No newline at end of file
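Note on train.py: with_transform(preprocess_fn) materializes nothing; preprocess_fn runs lazily on each batch as the Trainer's dataloader fetches it, so only the renamed pixel_values column needs to exist up front. A toy sketch of the same pattern (the toy column and the 0.4 factor stand in for the real dataset script and LABEL_RATIO):

    import datasets

    def toy_preprocess(batch):
        # Runs on every access, like preprocess_fn in train()
        batch["labels"] = [h * 0.4 for h in batch["height"]]
        return batch

    toy = datasets.Dataset.from_dict({"height": [128, 500]}).with_transform(toy_preprocess)
    print(toy[0]["labels"])  # 51.2 -- computed on access, never stored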
diff --git a/src/models/resizer/utils/preprocess.py b/src/models/resizer/utils/preprocess.py
new file mode 100644
index 0000000..426e7b3
--- /dev/null
+++ b/src/models/resizer/utils/preprocess.py
@@ -0,0 +1,73 @@
+import torch
+from torchvision.transforms import v2
+
+from PIL import Image, ImageChops
+from ....globals import (
+    IMAGE_MEAN, IMAGE_STD,
+    LABEL_RATIO,
+    RESIZER_IMG_SIZE,
+)
+
+from typing import (
+    Any,
+    List,
+    Dict,
+)
+
+
+def trim_white_border(image: Image.Image):
+    if image.mode == 'RGB':
+        bg_color = (255, 255, 255)
+    elif image.mode == 'RGBA':
+        bg_color = (255, 255, 255, 255)
+    elif image.mode == 'L':
+        bg_color = 255
+    else:
+        raise ValueError("Unsupported image mode")
+    bg = Image.new(image.mode, image.size, bg_color)
+    diff = ImageChops.difference(image, bg)
+    diff = ImageChops.add(diff, diff, 2.0, -100)
+    bbox = diff.getbbox()
+    if bbox:
+        return image.crop(bbox)
+    return image  # all-white image: nothing to trim (avoid implicitly returning None)
+
+
+def preprocess_fn(samples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+    imgs = samples['pixel_values']
+    imgs = [trim_white_border(img) for img in imgs]
+    labels = [float(img.height * LABEL_RATIO) for img in imgs]
+
+    transform = v2.Compose([
+        v2.ToImage(),
+        v2.ToDtype(torch.uint8, scale=True),
+        v2.Grayscale(),
+        v2.Resize(
+            size=RESIZER_IMG_SIZE - 1,  # size must be strictly smaller than max_size
+            interpolation=v2.InterpolationMode.BICUBIC,
+            max_size=RESIZER_IMG_SIZE,
+            antialias=True
+        ),
+        v2.ToDtype(torch.float32, scale=True),
+        v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),
+    ])
+    imgs = transform(imgs)
+    imgs = [
+        v2.functional.pad(
+            img,
+            padding=[0, 0, RESIZER_IMG_SIZE - img.shape[2], RESIZER_IMG_SIZE - img.shape[1]]
+        )
+        for img in imgs
+    ]
+
+    res = {'pixel_values': imgs, 'labels': labels}
+    return res
+
+
+if __name__ == "__main__":  # unit test
+    import datasets
+    data = datasets.load_dataset("/home/lhy/code/TeXify/src/models/resizer/train/dataset/dataset.py").shuffle(seed=42)
+    data = data.with_transform(preprocess_fn)
+    train_data, test_data = data['train'], data['test']
+
+    inpu = train_data[:10]
+    pause = 1
diff --git a/src/web.py b/src/web.py
new file mode 100644
index 0000000..c2a0b22
--- /dev/null
+++ b/src/web.py
@@ -0,0 +1,33 @@
+import streamlit as st
+import time
+
+from stqdm import stqdm
+
+# Center the title
+with st.columns(3)[1]:
+    st.title(":rainbow[TexTeller] :sparkles:")
+
+if "start" not in st.session_state:
+    st.balloons()
+    st.session_state["start"] = 1
+
+uploaded_file = st.file_uploader("", type=['jpg', 'png'])
+st.divider()
+
+if uploaded_file:
+    st.image(uploaded_file, caption="Input image")
+
+for _ in stqdm(range(10), st_container=st.sidebar):
+    time.sleep(0.1)
+
+with st.spinner('Wait for it...'):
+    time.sleep(5)
+
+st.success('Done!')
+
+
+with st.empty():
+    for seconds in range(60):
+        st.write(f"⏳ {seconds} seconds have passed")
+        time.sleep(1)
+    st.write("✔️ 1 minute over!")
\ No newline at end of file
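Note on src/web.py: the page is still a scaffold; the progress bar, spinner and timer are all placeholders driven by time.sleep, and the uploaded image is displayed but never fed to a model. A hedged sketch of wiring the upload to the Resizer, reusing load_resizer and preprocess_fn from this commit (the predict_height glue function is an assumption, and load_resizer still hard-codes its checkpoint path):

    import io
    import torch
    from PIL import Image

    from src.models.resizer.inference import load_resizer
    from src.models.resizer.utils import preprocess_fn

    def predict_height(file_bytes: bytes) -> float:
        # Hypothetical glue: predicted image height at TeXify's density (80)
        img = Image.open(io.BytesIO(file_bytes)).convert('RGB')
        batch = preprocess_fn({"pixel_values": [img]})
        model = load_resizer()
        with torch.no_grad():
            logits = model(torch.stack(batch["pixel_values"])).logits
        return logits.squeeze().item()

    # In web.py: if uploaded_file: st.write(predict_height(uploaded_file.getvalue()))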