[refactor] Init
texteller/__init__.py (new file)
@@ -0,0 +1 @@
+from texteller.api import *
texteller/api/__init__.py (new file)
@@ -0,0 +1,24 @@
+from .detection import latex_detect
+from .format import format_latex
+from .inference import img2latex, paragraph2md
+from .katex import to_katex
+from .load import (
+    load_latexdet_model,
+    load_model,
+    load_textdet_model,
+    load_textrec_model,
+    load_tokenizer,
+)
+
+__all__ = [
+    "to_katex",
+    "format_latex",
+    "img2latex",
+    "paragraph2md",
+    "load_model",
+    "load_tokenizer",
+    "load_latexdet_model",
+    "load_textrec_model",
+    "load_textdet_model",
+    "latex_detect",
+]
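For orientation, the block above is the entire public surface of the package; a minimal sketch of the intended call flow (the image path is a placeholder):

```python
from texteller import img2latex, load_model, load_tokenizer

model = load_model(use_onnx=False)  # with no directory given, weights come from the Hugging Face repo
tokenizer = load_tokenizer()
print(img2latex(model, tokenizer, ["formula.png"], out_format="katex")[0])
```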
texteller/api/criterias/__init__.py (new file)
@@ -0,0 +1,4 @@
+from .ngram import DetectRepeatingNgramCriteria
+
+
+__all__ = ["DetectRepeatingNgramCriteria"]
@@ -1,16 +1,8 @@
 import torch
-import numpy as np
-
-from transformers import RobertaTokenizerFast, GenerationConfig, StoppingCriteria
-from typing import List, Union
-
-from .transforms import inference_transform
-from .helpers import convert2rgb
-from ..model.TexTeller import TexTeller
-from ...globals import MAX_TOKEN_SIZE
+from transformers import StoppingCriteria


-class EfficientDetectRepeatingNgramCriteria(StoppingCriteria):
+class DetectRepeatingNgramCriteria(StoppingCriteria):
     """
     Stops generation efficiently if any n-gram repeats.

@@ -69,48 +61,3 @@ class EfficientDetectRepeatingNgramCriteria(StoppingCriteria):
         # It's a new n-gram, add it to the set and continue
         self.seen_ngrams.add(last_ngram_tuple)
         return False  # Continue generation
-
-
-def inference(
-    model: TexTeller,
-    tokenizer: RobertaTokenizerFast,
-    imgs: Union[List[str], List[np.ndarray]],
-    accelerator: str = 'cpu',
-    num_beams: int = 1,
-    max_tokens=None,
-) -> List[str]:
-    if imgs == []:
-        return []
-    if hasattr(model, 'eval'):
-        # not onnx session, turn model.eval()
-        model.eval()
-    if isinstance(imgs[0], str):
-        imgs = convert2rgb(imgs)
-    else:  # already numpy array (rgb format)
-        assert isinstance(imgs[0], np.ndarray)
-        imgs = imgs
-    imgs = inference_transform(imgs)
-    pixel_values = torch.stack(imgs)
-
-    if hasattr(model, 'eval'):
-        # not onnx session, move weights to device
-        model = model.to(accelerator)
-        pixel_values = pixel_values.to(accelerator)
-
-    generate_config = GenerationConfig(
-        max_new_tokens=MAX_TOKEN_SIZE if max_tokens is None else max_tokens,
-        num_beams=num_beams,
-        do_sample=False,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        bos_token_id=tokenizer.bos_token_id,
-        # no_repeat_ngram_size=10,
-    )
-    pred = model.generate(
-        pixel_values.to(model.device),
-        generation_config=generate_config,
-        # stopping_criteria=[EfficientDetectRepeatingNgramCriteria(20)],
-    )
-
-    res = tokenizer.batch_decode(pred, skip_special_tokens=True)
-    return res
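`DetectRepeatingNgramCriteria` is no longer wired in anywhere by default; the removed `inference` helper only referenced it in a commented-out line. If it is wanted, the usual Hugging Face route would look roughly like the sketch below, where the n-gram size 20 comes from the old commented-out call, `model`, `pixel_values`, and `generate_config` are assumed from the removed function, and the constructor argument is an assumption about the class's signature:

```python
from transformers import StoppingCriteriaList

pred = model.generate(
    pixel_values.to(model.device),
    generation_config=generate_config,
    # assumed signature: stop as soon as any 20-gram repeats in the decoded sequence
    stopping_criteria=StoppingCriteriaList([DetectRepeatingNgramCriteria(20)]),
)
```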
texteller/api/detection/__init__.py (new file)
@@ -0,0 +1,3 @@
+from .detect import latex_detect
+
+__all__ = ["latex_detect"]
texteller/api/detection/detect.py (new file)
@@ -0,0 +1,48 @@
+from typing import List
+
+from onnxruntime import InferenceSession
+
+from texteller.types import Bbox
+
+from .preprocess import Compose
+
+_config = {
+    "mode": "paddle",
+    "draw_threshold": 0.5,
+    "metric": "COCO",
+    "use_dynamic_shape": False,
+    "arch": "DETR",
+    "min_subgraph_size": 3,
+    "preprocess": [
+        {"interp": 2, "keep_ratio": False, "target_size": [1600, 1600], "type": "Resize"},
+        {
+            "mean": [0.0, 0.0, 0.0],
+            "norm_type": "none",
+            "std": [1.0, 1.0, 1.0],
+            "type": "NormalizeImage",
+        },
+        {"type": "Permute"},
+    ],
+    "label_list": ["isolated", "embedding"],
+}
+
+
+def latex_detect(img_path: str, predictor: InferenceSession) -> List[Bbox]:
+    transforms = Compose(_config["preprocess"])
+    inputs = transforms(img_path)
+    inputs_name = [var.name for var in predictor.get_inputs()]
+    inputs = {k: inputs[k][None,] for k in inputs_name}
+
+    outputs = predictor.run(output_names=None, input_feed=inputs)[0]
+    res = []
+    for output in outputs:
+        cls_name = _config["label_list"][int(output[0])]
+        score = output[1]
+        xmin = int(max(output[2], 0))
+        ymin = int(max(output[3], 0))
+        xmax = int(output[4])
+        ymax = int(output[5])
+        if score > 0.5:
+            res.append(Bbox(xmin, ymin, ymax - ymin, xmax - xmin, cls_name, score))
+
+    return res
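A small sketch of calling the detector directly ("page.png" is a placeholder path); `label` and `confidence` are the `Bbox` fields filled in above:

```python
from texteller import latex_detect, load_latexdet_model

detector = load_latexdet_model()        # downloads the RT-DETR ONNX model on first use
for bbox in latex_detect("page.png", detector):
    print(bbox.label, bbox.confidence)  # "isolated" or "embedding", score above 0.5
```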
texteller/api/detection/preprocess.py (new file)
@@ -0,0 +1,161 @@
+import copy
+
+import cv2
+import numpy as np
+
+
+def decode_image(img_path):
+    if isinstance(img_path, str):
+        with open(img_path, "rb") as f:
+            im_read = f.read()
+        data = np.frombuffer(im_read, dtype="uint8")
+    else:
+        assert isinstance(img_path, np.ndarray)
+        data = img_path
+
+    im = cv2.imdecode(data, 1)  # BGR mode, but need RGB mode
+    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+    img_info = {
+        "im_shape": np.array(im.shape[:2], dtype=np.float32),
+        "scale_factor": np.array([1.0, 1.0], dtype=np.float32),
+    }
+    return im, img_info
+
+
+class Resize(object):
+    """resize image by target_size and max_size
+    Args:
+        target_size (int): the target size of image
+        keep_ratio (bool): whether keep_ratio or not, default true
+        interp (int): method of resize
+    """
+
+    def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR):
+        if isinstance(target_size, int):
+            target_size = [target_size, target_size]
+        self.target_size = target_size
+        self.keep_ratio = keep_ratio
+        self.interp = interp
+
+    def __call__(self, im, im_info):
+        """
+        Args:
+            im (np.ndarray): image (np.ndarray)
+            im_info (dict): info of image
+        Returns:
+            im (np.ndarray): processed image (np.ndarray)
+            im_info (dict): info of processed image
+        """
+        assert len(self.target_size) == 2
+        assert self.target_size[0] > 0 and self.target_size[1] > 0
+        im_channel = im.shape[2]
+        im_scale_y, im_scale_x = self.generate_scale(im)
+        im = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp)
+        im_info["im_shape"] = np.array(im.shape[:2]).astype("float32")
+        im_info["scale_factor"] = np.array([im_scale_y, im_scale_x]).astype("float32")
+        return im, im_info
+
+    def generate_scale(self, im):
+        """
+        Args:
+            im (np.ndarray): image (np.ndarray)
+        Returns:
+            im_scale_x: the resize ratio of X
+            im_scale_y: the resize ratio of Y
+        """
+        origin_shape = im.shape[:2]
+        im_c = im.shape[2]
+        if self.keep_ratio:
+            im_size_min = np.min(origin_shape)
+            im_size_max = np.max(origin_shape)
+            target_size_min = np.min(self.target_size)
+            target_size_max = np.max(self.target_size)
+            im_scale = float(target_size_min) / float(im_size_min)
+            if np.round(im_scale * im_size_max) > target_size_max:
+                im_scale = float(target_size_max) / float(im_size_max)
+            im_scale_x = im_scale
+            im_scale_y = im_scale
+        else:
+            resize_h, resize_w = self.target_size
+            im_scale_y = resize_h / float(origin_shape[0])
+            im_scale_x = resize_w / float(origin_shape[1])
+        return im_scale_y, im_scale_x
+
+
+class NormalizeImage(object):
+    """normalize image
+    Args:
+        mean (list): im - mean
+        std (list): im / std
+        is_scale (bool): whether need im / 255
+        norm_type (str): type in ['mean_std', 'none']
+    """
+
+    def __init__(self, mean, std, is_scale=True, norm_type="mean_std"):
+        self.mean = mean
+        self.std = std
+        self.is_scale = is_scale
+        self.norm_type = norm_type
+
+    def __call__(self, im, im_info):
+        """
+        Args:
+            im (np.ndarray): image (np.ndarray)
+            im_info (dict): info of image
+        Returns:
+            im (np.ndarray): processed image (np.ndarray)
+            im_info (dict): info of processed image
+        """
+        im = im.astype(np.float32, copy=False)
+        if self.is_scale:
+            scale = 1.0 / 255.0
+            im *= scale
+
+        if self.norm_type == "mean_std":
+            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
+            std = np.array(self.std)[np.newaxis, np.newaxis, :]
+            im -= mean
+            im /= std
+        return im, im_info
+
+
+class Permute(object):
+    """permute image
+    Args:
+        to_bgr (bool): whether convert RGB to BGR
+        channel_first (bool): whether convert HWC to CHW
+    """
+
+    def __init__(
+        self,
+    ):
+        super(Permute, self).__init__()
+
+    def __call__(self, im, im_info):
+        """
+        Args:
+            im (np.ndarray): image (np.ndarray)
+            im_info (dict): info of image
+        Returns:
+            im (np.ndarray): processed image (np.ndarray)
+            im_info (dict): info of processed image
+        """
+        im = im.transpose((2, 0, 1)).copy()
+        return im, im_info
+
+
+class Compose:
+    def __init__(self, transforms):
+        self.transforms = []
+        for op_info in transforms:
+            new_op_info = op_info.copy()
+            op_type = new_op_info.pop("type")
+            self.transforms.append(eval(op_type)(**new_op_info))
+
+    def __call__(self, img_path):
+        img, im_info = decode_image(img_path)
+        for t in self.transforms:
+            img, im_info = t(img, im_info)
+        inputs = copy.deepcopy(im_info)
+        inputs["image"] = img
+        return inputs
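`Compose` resolves each `type` name via `eval` against the classes defined in this module; a short sketch using the same operator list that `latex_detect` passes in ("page.png" is a placeholder path):

```python
from texteller.api.detection.preprocess import Compose

pipeline = Compose([
    {"interp": 2, "keep_ratio": False, "target_size": [1600, 1600], "type": "Resize"},
    {"mean": [0.0, 0.0, 0.0], "norm_type": "none", "std": [1.0, 1.0, 1.0], "type": "NormalizeImage"},
    {"type": "Permute"},
])
inputs = pipeline("page.png")
print(inputs["image"].shape)  # (3, 1600, 1600): CHW after Resize + Permute
print(inputs["im_shape"], inputs["scale_factor"])
```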
@@ -5,9 +5,8 @@ Based on the Rust implementation at https://github.com/WGUNDERWOOD/tex-fmt
 """

 import re
-import argparse
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Dict, Set
+from typing import List, Optional, Tuple

 # Constants
 LINE_END = "\n"
@@ -49,7 +48,7 @@ RE_SPLITTING_SHARED_LINE_CAPTURE = re.compile(f"(?P<prev>\\S.*?)(?P<env>{SPLITTI

 @dataclass
 class Args:
-    """Command line arguments and configuration."""
+    """Formatter configuration."""

     tabchar: str = " "
     tabsize: int = 4
@@ -542,13 +541,29 @@
     return state.indent.actual == 0


-def format_latex(
-    old_text: str, file: str = "input.tex", args: Optional[Args] = None
-) -> Tuple[str, List[Log]]:
-    """Central function to format a LaTeX string."""
-    if args is None:
-        args = Args()
+def format_latex(text: str) -> str:
+    """Format LaTeX text with default formatting options.
+
+    This is the main API function for formatting LaTeX text.
+    It uses pre-defined default values for all formatting parameters.
+
+    Args:
+        text: LaTeX text to format
+
+    Returns:
+        Formatted LaTeX text
+    """
+    # Use default configuration
+    args = Args()
+    file = "input.tex"
+
+    # Format and return only the text
+    formatted_text, _ = _format_latex(text, file, args)
+    return formatted_text.strip()
+
+
+def _format_latex(old_text: str, file: str, args: Args) -> Tuple[str, List[Log]]:
+    """Internal function to format a LaTeX string."""
     logs = []
     logs.append(Log(level="INFO", file=file, message="Formatting started."))
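The public entry point now hides files, `Args`, and logs entirely; a minimal sketch of the new call shape (the exact output depends on the formatter's defaults):

```python
from texteller import format_latex

# Old: formatted, logs = format_latex(text, "input.tex", Args())
formatted = format_latex(r"\begin{align}x &= 1 \\ y &= 2\end{align}")
print(formatted)
```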
@@ -636,63 +651,3 @@ def format_latex(
     logs.append(Log(level="INFO", file=file, message="Formatting complete."))

     return new_text, logs
-
-
-def main():
-    """Command-line entry point."""
-    parser = argparse.ArgumentParser(description="Format LaTeX files")
-    parser.add_argument("file", help="LaTeX file to format")
-    parser.add_argument(
-        "--tabchar",
-        choices=["space", "tab"],
-        default="space",
-        help="Character to use for indentation",
-    )
-    parser.add_argument("--tabsize", type=int, default=4, help="Number of spaces per indent level")
-    parser.add_argument("--wrap", action="store_true", help="Enable line wrapping")
-    parser.add_argument("--wraplen", type=int, default=80, help="Maximum line length")
-    parser.add_argument(
-        "--wrapmin", type=int, default=40, help="Minimum line length before wrapping"
-    )
-    parser.add_argument(
-        "--lists", nargs="+", default=[], help="Additional environments to indent as lists"
-    )
-    parser.add_argument("--verbose", "-v", action="count", default=0, help="Increase verbosity")
-    parser.add_argument("--output", "-o", help="Output file (default: overwrite input)")
-
-    args_parsed = parser.parse_args()
-
-    # Convert command line args to our Args class
-    args = Args(
-        tabchar="\t" if args_parsed.tabchar == "tab" else " ",
-        tabsize=args_parsed.tabsize,
-        wrap=args_parsed.wrap,
-        wraplen=args_parsed.wraplen,
-        wrapmin=args_parsed.wrapmin,
-        lists=args_parsed.lists,
-        verbosity=args_parsed.verbose,
-    )
-
-    # Read input file
-    with open(args_parsed.file, "r", encoding="utf-8") as f:
-        text = f.read()
-
-    # Format the text
-    formatted_text, logs = format_latex(text, args_parsed.file, args)
-
-    # Print logs if verbose
-    if args.verbosity > 0:
-        for log in logs:
-            if log.linum_new is not None:
-                print(f"{log.level} {log.file}:{log.linum_new}:{log.linum_old}: {log.message}")
-            else:
-                print(f"{log.level} {log.file}: {log.message}")
-
-    # Write output
-    output_file = args_parsed.output or args_parsed.file
-    with open(output_file, "w", encoding="utf-8") as f:
-        f.write(formatted_text)
-
-
-if __name__ == "__main__":
-    main()
texteller/api/inference.py (new file)
@@ -0,0 +1,241 @@
+import re
+import time
+from collections import Counter
+from typing import Literal
+
+import cv2
+import numpy as np
+import torch
+from onnxruntime import InferenceSession
+from optimum.onnxruntime import ORTModelForVision2Seq
+from transformers import GenerationConfig, RobertaTokenizerFast
+
+from texteller.constants import MAX_TOKEN_SIZE
+from texteller.logger import get_logger
+from texteller.paddleocr import predict_det, predict_rec
+from texteller.types import Bbox, TexTellerModel
+from texteller.utils import (
+    bbox_merge,
+    get_device,
+    mask_img,
+    readimgs,
+    remove_style,
+    slice_from_image,
+    split_conflict,
+    transform,
+    add_newlines,
+)
+
+from .detection import latex_detect
+from .format import format_latex
+from .katex import to_katex
+
+_logger = get_logger()
+
+
+def img2latex(
+    model: TexTellerModel,
+    tokenizer: RobertaTokenizerFast,
+    images: list[str] | list[np.ndarray],
+    device: torch.device | None = None,
+    out_format: Literal["latex", "katex"] = "latex",
+    keep_style: bool = False,
+    max_tokens: int = MAX_TOKEN_SIZE,
+    num_beams: int = 1,
+    no_repeat_ngram_size: int = 0,
+) -> list[str]:
+    """
+    Convert images to LaTeX or KaTeX formatted strings.
+
+    Args:
+        model: The TexTeller or ORTModelForVision2Seq model instance
+        tokenizer: The tokenizer for the model
+        images: List of image paths or numpy arrays (RGB format)
+        device: The torch device to use (defaults to available GPU or CPU)
+        out_format: Output format, either "latex" or "katex"
+        keep_style: Whether to keep the style of the LaTeX
+        max_tokens: Maximum number of tokens to generate
+        num_beams: Number of beams for beam search
+        no_repeat_ngram_size: Size of n-grams to prevent repetition
+
+    Returns:
+        List of LaTeX or KaTeX strings corresponding to each input image
+
+    Example usage:
+        >>> import torch
+        >>> from texteller import load_model, load_tokenizer, img2latex
+
+        >>> model = load_model(model_dir=None, use_onnx=False)
+        >>> tokenizer = load_tokenizer(tokenizer_dir=None)
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        >>> res = img2latex(model, tokenizer, ["path/to/image.png"], device=device, out_format="katex")
+    """
+    assert isinstance(images, list)
+    assert len(images) > 0
+
+    if device is None:
+        device = get_device()
+
+    if device.type != model.device.type:
+        if isinstance(model, ORTModelForVision2Seq):
+            _logger.warning(
+                f"Onnxruntime device mismatch: detected {str(device)} but model is on {str(model.device)}, using {str(model.device)} instead"
+            )
+        else:
+            model = model.to(device=device)
+
+    if isinstance(images[0], str):
+        images = readimgs(images)
+    else:  # already numpy array (rgb format)
+        assert isinstance(images[0], np.ndarray)
+        images = images
+
+    images = transform(images)
+    pixel_values = torch.stack(images)
+
+    generate_config = GenerationConfig(
+        max_new_tokens=max_tokens,
+        num_beams=num_beams,
+        do_sample=False,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        bos_token_id=tokenizer.bos_token_id,
+        no_repeat_ngram_size=no_repeat_ngram_size,
+    )
+    pred = model.generate(
+        pixel_values.to(model.device),
+        generation_config=generate_config,
+    )
+
+    res = tokenizer.batch_decode(pred, skip_special_tokens=True)
+
+    if out_format == "katex":
+        res = [to_katex(r) for r in res]
+
+    if not keep_style:
+        res = [remove_style(r) for r in res]
+
+    res = [format_latex(r) for r in res]
+    res = [add_newlines(r) for r in res]
+    return res
+
+
+def paragraph2md(
+    img_path: str,
+    latexdet_model: InferenceSession,
+    textdet_model: predict_det.TextDetector,
+    textrec_model: predict_rec.TextRecognizer,
+    latexrec_model: TexTellerModel,
+    tokenizer: RobertaTokenizerFast,
+    device: torch.device | None = None,
+    num_beams=1,
+) -> str:
+    """
+    Input a mixed image of formula text and output str (in markdown syntax)
+    """
+    img = cv2.imread(img_path)
+    corners = [tuple(img[0, 0]), tuple(img[0, -1]), tuple(img[-1, 0]), tuple(img[-1, -1])]
+    bg_color = np.array(Counter(corners).most_common(1)[0][0])
+
+    start_time = time.time()
+    latex_bboxes = latex_detect(img_path, latexdet_model)
+    end_time = time.time()
+    _logger.info(f"latex_det_model time: {end_time - start_time:.2f}s")
+    latex_bboxes = sorted(latex_bboxes)
+    latex_bboxes = bbox_merge(latex_bboxes)
+    masked_img = mask_img(img, latex_bboxes, bg_color)
+
+    start_time = time.time()
+    det_prediction, _ = textdet_model(masked_img)
+    end_time = time.time()
+    _logger.info(f"ocr_det_model time: {end_time - start_time:.2f}s")
+    ocr_bboxes = [
+        Bbox(
+            p[0][0],
+            p[0][1],
+            p[3][1] - p[0][1],
+            p[1][0] - p[0][0],
+            label="text",
+            confidence=None,
+            content=None,
+        )
+        for p in det_prediction
+    ]
+
+    ocr_bboxes = sorted(ocr_bboxes)
+    ocr_bboxes = bbox_merge(ocr_bboxes)
+    ocr_bboxes = split_conflict(ocr_bboxes, latex_bboxes)
+    ocr_bboxes = list(filter(lambda x: x.label == "text", ocr_bboxes))
+
+    sliced_imgs: list[np.ndarray] = slice_from_image(img, ocr_bboxes)
+    start_time = time.time()
+    rec_predictions, _ = textrec_model(sliced_imgs)
+    end_time = time.time()
+    _logger.info(f"ocr_rec_model time: {end_time - start_time:.2f}s")
+
+    assert len(rec_predictions) == len(ocr_bboxes)
+    for content, bbox in zip(rec_predictions, ocr_bboxes):
+        bbox.content = content[0]
+
+    latex_imgs = []
+    for bbox in latex_bboxes:
+        latex_imgs.append(img[bbox.p.y : bbox.p.y + bbox.h, bbox.p.x : bbox.p.x + bbox.w])
+    start_time = time.time()
+    latex_rec_res = img2latex(
+        model=latexrec_model,
+        tokenizer=tokenizer,
+        images=latex_imgs,
+        num_beams=num_beams,
+        out_format="katex",
+        device=device,
+        keep_style=False,
+    )
+    end_time = time.time()
+    _logger.info(f"latex_rec_model time: {end_time - start_time:.2f}s")
+
+    for bbox, content in zip(latex_bboxes, latex_rec_res):
+        if bbox.label == "embedding":
+            bbox.content = " $" + content + "$ "
+        elif bbox.label == "isolated":
+            bbox.content = "\n\n" + r"$$" + content + r"$$" + "\n\n"
+
+    bboxes = sorted(ocr_bboxes + latex_bboxes)
+    if bboxes == []:
+        return ""
+
+    md = ""
+    prev = Bbox(bboxes[0].p.x, bboxes[0].p.y, -1, -1, label="guard")
+    for curr in bboxes:
+        # Add the formula number back to the isolated formula
+        if prev.label == "isolated" and curr.label == "text" and prev.same_row(curr):
+            curr.content = curr.content.strip()
+            if curr.content.startswith("(") and curr.content.endswith(")"):
+                curr.content = curr.content[1:-1]
+
+            if re.search(r"\\tag\{.*\}$", md[:-4]) is not None:
+                # in case of multiple tag
+                md = md[:-5] + f", {curr.content}" + "}" + md[-4:]
+            else:
+                md = md[:-4] + f"\\tag{{{curr.content}}}" + md[-4:]
+            continue
+
+        if not prev.same_row(curr):
+            md += " "
+
+        if curr.label == "embedding":
+            # remove the bold effect from inline formulas
+            curr.content = remove_style(curr.content)
+
+            # change split environment into aligned
+            curr.content = curr.content.replace(r"\begin{split}", r"\begin{aligned}")
+            curr.content = curr.content.replace(r"\end{split}", r"\end{aligned}")
+
+            # remove extra spaces (keeping only one)
+            curr.content = re.sub(r" +", " ", curr.content)
+            assert curr.content.startswith("$") and curr.content.endswith("$")
+            curr.content = " $" + curr.content.strip("$") + "$ "
+        md += curr.content
+        prev = curr
+
+    return md.strip()
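`img2latex` already carries a doctest-style example; the analogous sketch for `paragraph2md`, with all five models loaded through the new loaders (the image path is a placeholder), would be:

```python
from texteller import (
    load_latexdet_model,
    load_model,
    load_textdet_model,
    load_textrec_model,
    load_tokenizer,
    paragraph2md,
)

md = paragraph2md(
    img_path="mixed_page.png",  # placeholder path
    latexdet_model=load_latexdet_model(),
    textdet_model=load_textdet_model(),
    textrec_model=load_textrec_model(),
    latexrec_model=load_model(),
    tokenizer=load_tokenizer(),
)
print(md)  # markdown with $...$ inline and $$...$$ display formulas
```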
@@ -1,73 +1,10 @@
 import re
-from .latex_formatter import format_latex
+
+from ..utils.latex import change_all
+from .format import format_latex


-def change(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r):
-    result = ""
-    i = 0
-    n = len(input_str)
-
-    while i < n:
-        if input_str[i : i + len(old_inst)] == old_inst:
-            # check if the old_inst is followed by old_surr_l
-            start = i + len(old_inst)
-        else:
-            result += input_str[i]
-            i += 1
-            continue
-
-        if start < n and input_str[start] == old_surr_l:
-            # found an old_inst followed by old_surr_l, now look for the matching old_surr_r
-            count = 1
-            j = start + 1
-            escaped = False
-            while j < n and count > 0:
-                if input_str[j] == '\\' and not escaped:
-                    escaped = True
-                    j += 1
-                    continue
-                if input_str[j] == old_surr_r and not escaped:
-                    count -= 1
-                    if count == 0:
-                        break
-                elif input_str[j] == old_surr_l and not escaped:
-                    count += 1
-                escaped = False
-                j += 1
-
-            if count == 0:
-                assert j < n
-                assert input_str[start] == old_surr_l
-                assert input_str[j] == old_surr_r
-                inner_content = input_str[start + 1 : j]
-                # Replace the content with new pattern
-                result += new_inst + new_surr_l + inner_content + new_surr_r
-                i = j + 1
-                continue
-            else:
-                assert count >= 1
-                assert j == n
-                print("Warning: unbalanced surrogate pair in input string")
-                result += new_inst + new_surr_l
-                i = start + 1
-                continue
-        else:
-            result += input_str[i:start]
-            i = start
-
-    if old_inst != new_inst and (old_inst + old_surr_l) in result:
-        return change(result, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r)
-    else:
-        return result
-
-
-def find_substring_positions(string, substring):
-    positions = [match.start() for match in re.finditer(re.escape(substring), string)]
-    return positions
-
-
-def rm_dollar_surr(content):
+def _rm_dollar_surr(content):
     pattern = re.compile(r'\\[a-zA-Z]+\$.*?\$|\$.*?\$')
     matches = pattern.findall(content)

@@ -79,19 +16,6 @@ def rm_dollar_surr(content):
     return content


-def change_all(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r):
-    pos = find_substring_positions(input_str, old_inst + old_surr_l)
-    res = list(input_str)
-    for p in pos[::-1]:
-        res[p:] = list(
-            change(
-                ''.join(res[p:]), old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r
-            )
-        )
-    res = ''.join(res)
-    return res
-
-
 def to_katex(formula: str) -> str:
     res = formula
     # remove mbox surrounding
@@ -182,13 +106,13 @@ def to_katex(formula: str) -> str:
     res = re.sub(r'(\\text\{[^}]*\}\s*){2,}', merge_texts, res)

     res = res.replace(r'\bf ', '')
-    res = rm_dollar_surr(res)
+    res = _rm_dollar_surr(res)

     # remove extra spaces (keeping only one)
     res = re.sub(r' +', ' ', res)

     # format latex
     res = res.strip()
-    res, logs = format_latex(res)
+    res = format_latex(res)

     return res
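Since `format_latex` no longer returns logs, `to_katex` now yields the converted string directly; a one-line sketch (output depends on the conversion rules above):

```python
from texteller import to_katex

print(to_katex(r"\mbox{speed} = {\bf v}"))
```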
texteller/api/load.py (new file)
@@ -0,0 +1,66 @@
+from pathlib import Path
+
+import wget
+from onnxruntime import InferenceSession
+from transformers import RobertaTokenizerFast
+
+from texteller.constants import LATEX_DET_MODEL_URL, TEXT_DET_MODEL_URL, TEXT_REC_MODEL_URL
+from texteller.globals import Globals
+from texteller.logger import get_logger
+from texteller.models import TexTeller
+from texteller.paddleocr import predict_det, predict_rec
+from texteller.paddleocr.utility import parse_args
+from texteller.utils import cuda_available, mkdir, resolve_path
+from texteller.types import TexTellerModel
+
+_logger = get_logger(__name__)
+
+
+def load_model(model_dir: str | None = None, use_onnx: bool = False) -> TexTellerModel:
+    return TexTeller.from_pretrained(model_dir, use_onnx=use_onnx)
+
+
+def load_tokenizer(tokenizer_dir: str | None = None) -> RobertaTokenizerFast:
+    return TexTeller.get_tokenizer(tokenizer_dir)
+
+
+def load_latexdet_model() -> InferenceSession:
+    fpath = _maybe_download(LATEX_DET_MODEL_URL)
+    return InferenceSession(
+        resolve_path(fpath),
+        providers=["CUDAExecutionProvider" if cuda_available() else "CPUExecutionProvider"],
+    )
+
+
+def load_textrec_model() -> predict_rec.TextRecognizer:
+    fpath = _maybe_download(TEXT_REC_MODEL_URL)
+    paddleocr_args = parse_args()
+    paddleocr_args.use_onnx = True
+    paddleocr_args.rec_model_dir = resolve_path(fpath)
+    paddleocr_args.use_gpu = cuda_available()
+    predictor = predict_rec.TextRecognizer(paddleocr_args)
+    return predictor
+
+
+def load_textdet_model() -> predict_det.TextDetector:
+    fpath = _maybe_download(TEXT_DET_MODEL_URL)
+    paddleocr_args = parse_args()
+    paddleocr_args.use_onnx = True
+    paddleocr_args.det_model_dir = resolve_path(fpath)
+    paddleocr_args.use_gpu = cuda_available()
+    predictor = predict_det.TextDetector(paddleocr_args)
+    return predictor
+
+
+def _maybe_download(url: str, dirpath: str | None = None, force: bool = False) -> Path:
+    if dirpath is None:
+        dirpath = Globals().cache_dir
+    mkdir(dirpath)
+
+    fname = Path(url).name
+    fpath = Path(dirpath) / fname
+    if not fpath.exists() or force:
+        _logger.info(f"Downloading {fname} from {url} to {fpath}")
+        wget.download(url, resolve_path(fpath))
+
+    return fpath
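`_maybe_download` caches under `Globals().cache_dir` (`~/.cache/texteller` by default), so repeated loader calls reuse the same files; a sketch:

```python
from texteller import load_latexdet_model, load_textdet_model

detector = load_latexdet_model()  # first call downloads rtdetr_r50vd_6x_coco.onnx
detector = load_latexdet_model()  # later calls reuse ~/.cache/texteller/rtdetr_r50vd_6x_coco.onnx
text_det = load_textdet_model()   # same caching pattern for the PaddleOCR detector
```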
texteller/cli/__init__.py (new file)
@@ -0,0 +1,25 @@
+"""
+CLI entry point for TexTeller.
+"""
+
+import time
+
+import click
+
+from texteller.cli.commands.inference import inference
+from texteller.cli.commands.launch import launch
+from texteller.cli.commands.web import web
+
+
+@click.group()
+def cli():
+    pass
+
+
+cli.add_command(inference)
+cli.add_command(web)
+cli.add_command(launch)
+
+
+if __name__ == "__main__":
+    cli()
texteller/cli/commands/__init__.py (new file)
@@ -0,0 +1,3 @@
+"""
+CLI commands for TexTeller
+"""
texteller/cli/commands/inference.py (new file)
@@ -0,0 +1,51 @@
+"""
+CLI command for formula inference from images.
+"""
+
+import click
+
+from texteller.api import img2latex, load_model, load_tokenizer
+
+
+@click.command()
+@click.argument("image_path", type=click.Path(exists=True, file_okay=True, dir_okay=False))
+@click.option(
+    "--model-path",
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+    default=None,
+    help="Path to the model directory; if not provided, the model from the Hugging Face repo is used",
+)
+@click.option(
+    "--tokenizer-path",
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+    default=None,
+    help="Path to the tokenizer directory; if not provided, the tokenizer from the Hugging Face repo is used",
+)
+@click.option(
+    "--output-format",
+    type=click.Choice(["latex", "katex"]),
+    default="katex",
+    help="Output format, either latex or katex",
+)
+@click.option(
+    "--keep-style",
+    is_flag=True,
+    default=False,
+    help="Whether to keep the style of the LaTeX (e.g. bold, italic, etc.)",
+)
+def inference(image_path, model_path, tokenizer_path, output_format, keep_style):
+    """
+    CLI command for formula inference from images.
+    """
+    model = load_model(model_dir=model_path)
+    tknz = load_tokenizer(tokenizer_dir=tokenizer_path)
+
+    pred = img2latex(
+        model=model,
+        tokenizer=tknz,
+        images=[image_path],
+        out_format=output_format,
+        keep_style=keep_style,
+    )[0]
+
+    click.echo(f"Predicted LaTeX: ```\n{pred}\n```")
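A hedged smoke-test sketch for this command using click's built-in test runner ("formula.png" is a placeholder that must exist, since the argument is declared with `exists=True`):

```python
from click.testing import CliRunner

from texteller.cli.commands.inference import inference

result = CliRunner().invoke(inference, ["formula.png", "--output-format", "latex"])
print(result.output)
```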
texteller/cli/commands/launch/__init__.py (new file)
@@ -0,0 +1,106 @@
+"""
+CLI commands for launching server.
+"""
+
+import sys
+import time
+
+import click
+from ray import serve
+
+from texteller.globals import Globals
+from texteller.utils import get_device
+
+
+@click.command()
+@click.option(
+    "-ckpt",
+    "--checkpoint_dir",
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+    default=None,
+    help="Path to the checkpoint directory; if not provided, the model from the Hugging Face repo is used",
+)
+@click.option(
+    "-tknz",
+    "--tokenizer_dir",
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+    default=None,
+    help="Path to the tokenizer directory; if not provided, the tokenizer from the Hugging Face repo is used",
+)
+@click.option(
+    "-p",
+    "--port",
+    type=int,
+    default=8000,
+    help="Port to run the server on",
+)
+@click.option(
+    "--num-replicas",
+    type=int,
+    default=1,
+    help="Number of replicas to run the server with",
+)
+@click.option(
+    "--ncpu-per-replica",
+    type=float,
+    default=1.0,
+    help="Number of CPUs per replica",
+)
+@click.option(
+    "--ngpu-per-replica",
+    type=float,
+    default=1.0,
+    help="Number of GPUs per replica",
+)
+@click.option(
+    "--num-beams",
+    type=int,
+    default=1,
+    help="Number of beams to use",
+)
+@click.option(
+    "--use-onnx",
+    is_flag=True,
+    type=bool,
+    default=False,
+    help="Use ONNX runtime",
+)
+def launch(
+    checkpoint_dir,
+    tokenizer_dir,
+    port,
+    num_replicas,
+    ncpu_per_replica,
+    ngpu_per_replica,
+    num_beams,
+    use_onnx,
+):
+    """Launch the api server"""
+    device = get_device()
+    if ngpu_per_replica > 0 and not device.type == "cuda":
+        click.echo(
+            click.style(
+                f"Error: --ngpu-per-replica > 0 but detected device is {device.type}",
+                fg="red",
+            )
+        )
+        sys.exit(1)
+
+    Globals().num_replicas = num_replicas
+    Globals().ncpu_per_replica = ncpu_per_replica
+    Globals().ngpu_per_replica = ngpu_per_replica
+    from texteller.cli.commands.launch.server import Ingress, TexTellerServer
+
+    serve.start(http_options={"host": "0.0.0.0", "port": port})
+    rec_server = TexTellerServer.bind(
+        checkpoint_dir=checkpoint_dir,
+        tokenizer_dir=tokenizer_dir,
+        use_onnx=use_onnx,
+        num_beams=num_beams,
+    )
+    ingress = Ingress.bind(rec_server)
+
+    serve.run(ingress, route_prefix="/predict")
+
+    while True:
+        time.sleep(1)
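One ordering detail in `launch`: `server.py` is imported only after the `Globals()` fields are assigned, because the `@serve.deployment(...)` arguments on `TexTellerServer` (next file) are evaluated at import time. Condensed:

```python
# Deferred import: the decorator in server.py reads Globals() at import time,
# so the replica settings must be populated first.
Globals().num_replicas = num_replicas
from texteller.cli.commands.launch.server import Ingress, TexTellerServer
```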
texteller/cli/commands/launch/server.py (new file)
@@ -0,0 +1,69 @@
+import numpy as np
+import cv2
+
+from starlette.requests import Request
+from ray import serve
+from ray.serve.handle import DeploymentHandle
+
+from texteller.api import load_model, load_tokenizer, img2latex
+from texteller.utils import get_device
+from texteller.globals import Globals
+from typing import Literal
+
+
+@serve.deployment(
+    num_replicas=Globals().num_replicas,
+    ray_actor_options={
+        "num_cpus": Globals().ncpu_per_replica,
+        "num_gpus": Globals().ngpu_per_replica * 1.0 / 2,
+    },
+)
+class TexTellerServer:
+    def __init__(
+        self,
+        checkpoint_dir: str,
+        tokenizer_dir: str,
+        use_onnx: bool = False,
+        out_format: Literal["latex", "katex"] = "katex",
+        keep_style: bool = False,
+        num_beams: int = 1,
+    ) -> None:
+        self.model = load_model(
+            model_dir=checkpoint_dir,
+            use_onnx=use_onnx,
+        )
+        self.tokenizer = load_tokenizer(tokenizer_dir=tokenizer_dir)
+        self.num_beams = num_beams
+        self.out_format = out_format
+        self.keep_style = keep_style
+
+        if not use_onnx:
+            self.model = self.model.to(get_device())
+
+    def predict(self, image_nparray: np.ndarray) -> str:
+        return img2latex(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            images=[image_nparray],
+            device=get_device(),
+            out_format=self.out_format,
+            keep_style=self.keep_style,
+            num_beams=self.num_beams,
+        )[0]
+
+
+@serve.deployment()
+class Ingress:
+    def __init__(self, rec_server: DeploymentHandle) -> None:
+        self.texteller_server = rec_server
+
+    async def __call__(self, request: Request) -> str:
+        form = await request.form()
+        img_rb = await form["img"].read()
+
+        img_nparray = np.frombuffer(img_rb, np.uint8)
+        img_nparray = cv2.imdecode(img_nparray, cv2.IMREAD_COLOR)
+        img_nparray = cv2.cvtColor(img_nparray, cv2.COLOR_BGR2RGB)
+
+        pred = await self.texteller_server.predict.remote(img_nparray)
+        return pred
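The standalone client script removed later in this diff posted to `/frec`; with `serve.run(ingress, route_prefix="/predict")` and the `form["img"]` field read in `Ingress`, an updated client sketch (placeholder image path) would be:

```python
import requests

with open("formula.png", "rb") as f:
    resp = requests.post("http://127.0.0.1:8000/predict", files={"img": f})
print(resp.text)
```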
texteller/cli/commands/web/__init__.py (new file)
@@ -0,0 +1,9 @@
+import os
+import click
+from pathlib import Path
+
+
+@click.command()
+def web():
+    """Launch the web interface for TexTeller."""
+    os.system(f"streamlit run {Path(__file__).parent / 'streamlit_demo.py'}")
texteller/cli/commands/web/streamlit_demo.py (new file)
@@ -0,0 +1,225 @@
+import base64
+import io
+import os
+import re
+import shutil
+import tempfile
+
+import streamlit as st
+from PIL import Image
+from streamlit_paste_button import paste_image_button as pbutton
+
+from texteller.api import (
+    img2latex,
+    load_latexdet_model,
+    load_model,
+    load_textdet_model,
+    load_textrec_model,
+    load_tokenizer,
+    paragraph2md,
+)
+from texteller.cli.commands.web.style import (
+    HEADER_HTML,
+    IMAGE_EMBED_HTML,
+    IMAGE_INFO_HTML,
+    SUCCESS_GIF_HTML,
+)
+from texteller.utils import str2device
+
+st.set_page_config(page_title="TexTeller", page_icon="🧮")
+
+
+@st.cache_resource
+def get_texteller(use_onnx):
+    return load_model(use_onnx=use_onnx)
+
+
+@st.cache_resource
+def get_tokenizer():
+    return load_tokenizer()
+
+
+@st.cache_resource
+def get_latexdet_model():
+    return load_latexdet_model()
+
+
+@st.cache_resource()
+def get_textrec_model():
+    return load_textrec_model()
+
+
+@st.cache_resource()
+def get_textdet_model():
+    return load_textdet_model()
+
+
+def get_image_base64(img_file):
+    buffered = io.BytesIO()
+    img_file.seek(0)
+    img = Image.open(img_file)
+    img.save(buffered, format="PNG")
+    return base64.b64encode(buffered.getvalue()).decode()
+
+
+def on_file_upload():
+    st.session_state["UPLOADED_FILE_CHANGED"] = True
+
+
+def change_side_bar():
+    st.session_state["CHANGE_SIDEBAR_FLAG"] = True
+
+
+if "start" not in st.session_state:
+    st.session_state["start"] = 1
+    st.toast("Hooray!", icon="🎉")
+
+if "UPLOADED_FILE_CHANGED" not in st.session_state:
+    st.session_state["UPLOADED_FILE_CHANGED"] = False
+
+if "CHANGE_SIDEBAR_FLAG" not in st.session_state:
+    st.session_state["CHANGE_SIDEBAR_FLAG"] = False
+
+if "INF_MODE" not in st.session_state:
+    st.session_state["INF_MODE"] = "Formula recognition"
+
+
+# ====== <sidebar> ======
+
+with st.sidebar:
+    num_beams = 1
+
+    st.markdown("# 🔨️ Config")
+    st.markdown("")
+
+    inf_mode = st.selectbox(
+        "Inference mode",
+        ("Formula recognition", "Paragraph recognition"),
+        on_change=change_side_bar,
+    )
+
+    num_beams = st.number_input(
+        "Number of beams", min_value=1, max_value=20, step=1, on_change=change_side_bar
+    )
+
+    device = st.radio("device", ("cpu", "cuda", "mps"), on_change=change_side_bar)
+
+    st.markdown("## Speedup")
+    use_onnx = st.toggle("ONNX Runtime ")
+
+
+# ====== </sidebar> ======
+
+
+# ====== <page> ======
+
+latexrec_model = get_texteller(use_onnx)
+tokenizer = get_tokenizer()
+
+if inf_mode == "Paragraph recognition":
+    latexdet_model = get_latexdet_model()
+    textrec_model = get_textrec_model()
+    textdet_model = get_textdet_model()
+
+st.markdown(HEADER_HTML, unsafe_allow_html=True)
+
+uploaded_file = st.file_uploader(" ", type=["jpg", "png"], on_change=on_file_upload)
+
+paste_result = pbutton(
+    label="📋 Paste an image",
+    background_color="#5BBCFF",
+    hover_background_color="#3498db",
+)
+st.write("")
+
+if st.session_state["CHANGE_SIDEBAR_FLAG"] is True:
+    st.session_state["CHANGE_SIDEBAR_FLAG"] = False
+elif uploaded_file or paste_result.image_data is not None:
+    if st.session_state["UPLOADED_FILE_CHANGED"] is False and paste_result.image_data is not None:
+        uploaded_file = io.BytesIO()
+        paste_result.image_data.save(uploaded_file, format="PNG")
+        uploaded_file.seek(0)
+
+    if st.session_state["UPLOADED_FILE_CHANGED"] is True:
+        st.session_state["UPLOADED_FILE_CHANGED"] = False
+
+    img = Image.open(uploaded_file)
+
+    temp_dir = tempfile.mkdtemp()
+    png_fpath = os.path.join(temp_dir, "image.png")
+    img.save(png_fpath, "PNG")
+
+    with st.container(height=300):
+        img_base64 = get_image_base64(uploaded_file)
+
+        st.markdown(
+            IMAGE_EMBED_HTML.format(img_base64=img_base64),
+            unsafe_allow_html=True,
+        )
+        st.markdown(
+            IMAGE_INFO_HTML.format(img_height=img.height, img_width=img.width),
+            unsafe_allow_html=True,
+        )
+
+    st.write("")
+
+    with st.spinner("Predicting..."):
+        if inf_mode == "Formula recognition":
+            pred = img2latex(
+                model=latexrec_model,
+                tokenizer=tokenizer,
+                images=[png_fpath],
+                device=str2device(device),
+                out_format="katex",
+                num_beams=num_beams,
+                keep_style=False,
+            )[0]
+        else:
+            pred = paragraph2md(
+                img_path=png_fpath,
+                latexdet_model=latexdet_model,
+                textdet_model=textdet_model,
+                textrec_model=textrec_model,
+                latexrec_model=latexrec_model,
+                tokenizer=tokenizer,
+                device=str2device(device),
+                num_beams=num_beams,
+            )
+
+    st.success("Completed!", icon="✅")
+    # st.markdown(SUCCESS_GIF_HTML, unsafe_allow_html=True)
+    # st.text_area("Predicted LaTeX", pred, height=150)
+    if inf_mode == "Formula recognition":
+        st.code(pred, language="latex")
+    elif inf_mode == "Paragraph recognition":
+        st.code(pred, language="markdown")
+    else:
+        raise ValueError(f"Invalid inference mode: {inf_mode}")
+
+    if inf_mode == "Formula recognition":
+        st.latex(pred)
+    elif inf_mode == "Paragraph recognition":
+        mixed_res = re.split(r"(\$\$.*?\$\$)", pred, flags=re.DOTALL)
+        for text in mixed_res:
+            if text.startswith("$$") and text.endswith("$$"):
+                st.latex(text.strip("$$"))
+            else:
+                st.markdown(text)
+
+    st.write("")
+    st.write("")
+
+    with st.expander(":star2: :gray[Tips for better results]"):
+        st.markdown("""
+            * :mag_right: Use a clear and high-resolution image.
+            * :scissors: Crop images as accurately as possible.
+            * :jigsaw: Split large multi-line formulas into smaller ones.
+            * :page_facing_up: Use images with **white background and black text** as much as possible.
+            * :book: Use a font with good readability.
+        """)
+    shutil.rmtree(temp_dir)
+
+    paste_result.image_data = None
+
+# ====== </page> ======
texteller/cli/commands/web/style.py (new file)
@@ -0,0 +1,55 @@
+from texteller.utils import lines_dedent
+
+
+HEADER_HTML = lines_dedent("""
+    <h1 style="color: black; text-align: center;">
+        <img src="https://raw.githubusercontent.com/OleehyO/TexTeller/main/assets/fire.svg" width="100">
+        𝚃𝚎𝚡𝚃𝚎𝚕𝚕𝚎𝚛
+        <img src="https://raw.githubusercontent.com/OleehyO/TexTeller/main/assets/fire.svg" width="100">
+    </h1>
+""")
+
+SUCCESS_GIF_HTML = lines_dedent("""
+    <h1 style="color: black; text-align: center;">
+        <img src="https://slackmojis.com/emojis/90621-clapclap-e/download" width="50">
+        <img src="https://slackmojis.com/emojis/90621-clapclap-e/download" width="50">
+        <img src="https://slackmojis.com/emojis/90621-clapclap-e/download" width="50">
+    </h1>
+""")
+
+FAIL_GIF_HTML = lines_dedent("""
+    <h1 style="color: black; text-align: center;">
+        <img src="https://slackmojis.com/emojis/51439-allthethings_intensifies/download">
+        <img src="https://slackmojis.com/emojis/51439-allthethings_intensifies/download">
+        <img src="https://slackmojis.com/emojis/51439-allthethings_intensifies/download">
+    </h1>
+""")
+
+IMAGE_EMBED_HTML = lines_dedent("""
+    <style>
+        .centered-container {{
+            text-align: center;
+        }}
+        .centered-image {{
+            display: block;
+            margin-left: auto;
+            margin-right: auto;
+            max-height: 350px;
+            max-width: 100%;
+        }}
+    </style>
+    <div class="centered-container">
+        <img src="data:image/png;base64,{img_base64}" class="centered-image" alt="Input image">
+    </div>
+""")
+
+IMAGE_INFO_HTML = lines_dedent("""
+    <style>
+        .centered-container {{
+            text-align: center;
+        }}
+    </style>
+    <div class="centered-container">
+        <p style="color:gray;">Input image ({img_height}✖️{img_width})</p>
+    </div>
+""")
@@ -1,12 +0,0 @@
-import requests
-
-rec_server_url = "http://127.0.0.1:8000/frec"
-det_server_url = "http://127.0.0.1:8000/fdet"
-
-img_path = "/your/image/path/"
-with open(img_path, 'rb') as img:
-    files = {'img': img}
-    response = requests.post(rec_server_url, files=files)
-    # response = requests.post(det_server_url, files=files)
-
-print(response.text)
@@ -21,3 +21,13 @@ MIN_RESIZE_RATIO = 0.75
 # Minimum height and width for input image for TexTeller
 MIN_HEIGHT = 12
 MIN_WIDTH = 30
+
+LATEX_DET_MODEL_URL = (
+    "https://huggingface.co/TonyLee1256/texteller_det/resolve/main/rtdetr_r50vd_6x_coco.onnx"
+)
+TEXT_REC_MODEL_URL = (
+    "https://huggingface.co/OleehyO/paddleocrv4.onnx/resolve/main/ch_PP-OCRv4_server_rec.onnx"
+)
+TEXT_DET_MODEL_URL = (
+    "https://huggingface.co/OleehyO/paddleocrv4.onnx/resolve/main/ch_PP-OCRv4_det.onnx"
+)
texteller/globals.py (new file)
@@ -0,0 +1,41 @@
+import logging
+from pathlib import Path
+
+
+class Globals:
+    """
+    Singleton class for managing global variables with predefined and dynamic attributes.
+
+    Usage Example:
+        >>> # 1. Access predefined variable (with default value)
+        >>> print(Globals().repo_name)  # Output: OleehyO/TexTeller
+
+        >>> # 2. Modify predefined variable
+        >>> Globals().repo_name = "NewRepo/NewProject"
+        >>> print(Globals().repo_name)  # Output: NewRepo/NewProject
+
+        >>> # 3. Dynamically add new variable
+        >>> Globals().new_var = "hello"
+        >>> print(Globals().new_var)  # Output: hello
+
+        >>> # 4. View all variables
+        >>> print(Globals())  # Output: <Globals: {'repo_name': ..., 'new_var': ...}>
+    """
+
+    _instance = None
+    _initialized = False
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self):
+        if not self._initialized:
+            self.repo_name = "OleehyO/TexTeller"
+            self.logging_level = logging.INFO
+            self.cache_dir = Path("~/.cache/texteller").expanduser().resolve()
+            self.__class__._initialized = True
+
+    def __repr__(self):
+        return f"<Globals: {self.__dict__}>"
@@ -1,96 +0,0 @@
import os
import argparse
import glob
import subprocess

import onnxruntime
from pathlib import Path

from models.det_model.inference import PredictConfig, predict_image


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--infer_cfg", type=str, help="infer_cfg.yml", default="./models/det_model/model/infer_cfg.yml"
)
parser.add_argument(
    '--onnx_file',
    type=str,
    help="onnx model file path",
    default="./models/det_model/model/rtdetr_r50vd_6x_coco.onnx",
)
parser.add_argument("--image_dir", type=str, default='./testImgs')
parser.add_argument("--image_file", type=str)
parser.add_argument("--imgsave_dir", type=str, default="./detect_results")
parser.add_argument(
    '--use_gpu', action='store_true', help='Whether to use GPU for inference', default=True
)


def get_test_images(infer_dir, infer_img):
    """
    Get image path list in TEST mode
    """
    assert (
        infer_img is not None or infer_dir is not None
    ), "--image_file or --image_dir should be set"
    assert infer_img is None or os.path.isfile(infer_img), "{} is not a file".format(infer_img)
    assert infer_dir is None or os.path.isdir(infer_dir), "{} is not a directory".format(infer_dir)

    # infer_img has a higher priority
    if infer_img and os.path.isfile(infer_img):
        return [infer_img]

    images = set()
    infer_dir = os.path.abspath(infer_dir)
    assert os.path.isdir(infer_dir), "infer_dir {} is not a directory".format(infer_dir)
    exts = ['jpg', 'jpeg', 'png', 'bmp']
    exts += [ext.upper() for ext in exts]
    for ext in exts:
        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
    images = list(images)

    assert len(images) > 0, "no image found in {}".format(infer_dir)
    print("Found {} inference images in total.".format(len(images)))

    return images


def download_file(url, filename):
    print(f"Downloading {filename}...")
    subprocess.run(["wget", "-q", "--show-progress", "-O", filename, url], check=True)
    print("Download complete.")


if __name__ == '__main__':
    cur_path = os.getcwd()
    script_dirpath = Path(__file__).resolve().parent
    os.chdir(script_dirpath)

    FLAGS = parser.parse_args()

    if not os.path.exists(FLAGS.infer_cfg):
        infer_cfg_url = "https://huggingface.co/TonyLee1256/texteller_det/resolve/main/infer_cfg.yml?download=true"
        download_file(infer_cfg_url, FLAGS.infer_cfg)

    if not os.path.exists(FLAGS.onnx_file):
        onnx_file_url = "https://huggingface.co/TonyLee1256/texteller_det/resolve/main/rtdetr_r50vd_6x_coco.onnx?download=true"
        download_file(onnx_file_url, FLAGS.onnx_file)

    # load image list
    img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)

    if FLAGS.use_gpu:
        predictor = onnxruntime.InferenceSession(
            FLAGS.onnx_file, providers=['CUDAExecutionProvider']
        )
    else:
        predictor = onnxruntime.InferenceSession(
            FLAGS.onnx_file, providers=['CPUExecutionProvider']
        )
    # load infer config
    infer_config = PredictConfig(FLAGS.infer_cfg)

    predict_image(FLAGS.imgsave_dir, infer_config, predictor, img_list)

    os.chdir(cur_path)
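For reference, the deleted script's argparse surface implied invocations like the following (the file name is hypothetical, since the diff does not show the script's path):

python detect.py --image_dir ./testImgs --imgsave_dir ./detect_results
python detect.py --image_file ./testImgs/example.png   # --image_file takes priority over --image_dir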
@@ -1,81 +0,0 @@
import os
import argparse
import cv2 as cv

from pathlib import Path
from onnxruntime import InferenceSession
from models.thrid_party.paddleocr.infer import predict_det, predict_rec
from models.thrid_party.paddleocr.infer import utility

from models.utils import mix_inference
from models.ocr_model.utils.to_katex import to_katex
from models.ocr_model.utils.inference import inference as latex_inference

from models.ocr_model.model.TexTeller import TexTeller
from models.det_model.inference import PredictConfig


if __name__ == '__main__':
    os.chdir(Path(__file__).resolve().parent)
    parser = argparse.ArgumentParser()
    parser.add_argument('-img', type=str, required=True, help='path to the input image')
    parser.add_argument(
        '--inference-mode',
        type=str,
        default='cpu',
        help='Inference mode, select one of cpu, cuda, or mps',
    )
    parser.add_argument(
        '--num-beam', type=int, default=1, help='number of beam search for decoding'
    )
    parser.add_argument('-mix', action='store_true', help='use mix mode')

    args = parser.parse_args()

    # You can use your own checkpoint and tokenizer path.
    print('Loading model and tokenizer...')
    latex_rec_model = TexTeller.from_pretrained()
    tokenizer = TexTeller.get_tokenizer()
    print('Model and tokenizer loaded.')

    img_path = args.img
    img = cv.imread(img_path)
    print('Inference...')
    if not args.mix:
        res = latex_inference(latex_rec_model, tokenizer, [img], args.inference_mode, args.num_beam)
        res = to_katex(res[0])
        print(res)
    else:
        infer_config = PredictConfig("./models/det_model/model/infer_cfg.yml")
        latex_det_model = InferenceSession("./models/det_model/model/rtdetr_r50vd_6x_coco.onnx")

        use_gpu = args.inference_mode == 'cuda'
        SIZE_LIMIT = 20 * 1024 * 1024
        det_model_dir = "./models/thrid_party/paddleocr/checkpoints/det/default_model.onnx"
        rec_model_dir = "./models/thrid_party/paddleocr/checkpoints/rec/default_model.onnx"
        # The CPU inference of the detection model will be faster than the GPU inference (in onnxruntime)
        det_use_gpu = False
        rec_use_gpu = use_gpu and not (os.path.getsize(rec_model_dir) < SIZE_LIMIT)

        paddleocr_args = utility.parse_args()
        paddleocr_args.use_onnx = True
        paddleocr_args.det_model_dir = det_model_dir
        paddleocr_args.rec_model_dir = rec_model_dir

        paddleocr_args.use_gpu = det_use_gpu
        detector = predict_det.TextDetector(paddleocr_args)
        paddleocr_args.use_gpu = rec_use_gpu
        recognizer = predict_rec.TextRecognizer(paddleocr_args)

        lang_ocr_models = [detector, recognizer]
        latex_rec_models = [latex_rec_model, tokenizer]
        res = mix_inference(
            img_path,
            infer_config,
            latex_det_model,
            lang_ocr_models,
            latex_rec_models,
            args.inference_mode,
            args.num_beam,
        )
        print(res)
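Its command-line surface, for reference (script name hypothetical):

python inference.py -img ./test.png --inference-mode cuda --num-beam 3   # formula-only recognition
python inference.py -img ./page.png --inference-mode cuda -mix           # mixed text + formula mode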
96
texteller/logger.py
Normal file
@@ -0,0 +1,96 @@
import inspect
import logging
import os
from datetime import datetime
from logging import Logger

import colorama
from colorama import Fore, Style

from texteller.globals import Globals

# Initialize colorama for colored console output
colorama.init(autoreset=True)


TEMPLATE = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"


class ColoredFormatter(logging.Formatter):
    """Custom formatter to add colors based on log level."""

    FORMATS = {  # noqa: E501
        logging.DEBUG: Fore.LIGHTBLACK_EX + TEMPLATE + Style.RESET_ALL,
        logging.INFO: Fore.WHITE + TEMPLATE + Style.RESET_ALL,
        logging.WARNING: Fore.YELLOW + TEMPLATE + Style.RESET_ALL,
        logging.ERROR: Fore.RED + TEMPLATE + Style.RESET_ALL,
        logging.CRITICAL: Fore.RED + Style.BRIGHT + TEMPLATE + Style.RESET_ALL,
    }  # noqa: E501

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno, self.FORMATS[logging.INFO])
        formatter = logging.Formatter(log_fmt, datefmt="%Y-%m-%d %H:%M:%S")
        return formatter.format(record)


def get_logger(name: str | None = None, use_file_handler: bool = False) -> Logger:
    """
    Creates and configures a logger named after `name` (if provided) or the caller's module.
    If the derived module path has more than two components, only the first two are kept.

    Args:
        name (str, optional): Custom logger name. If None, derives from the caller's module.
        use_file_handler (bool, optional): Whether to add a file handler. Defaults to False.

    Returns:
        Logger: Configured logger with colored console output and an optional file handler.
    """
    # If name is not provided, derive it from the caller's module
    if name is None:
        # Get the caller's stack frame
        frame = inspect.stack()[1]
        module = inspect.getmodule(frame[0])
        if module and module.__name__:
            module_name = module.__name__
            # Split module name and take first two components if too long
            parts = module_name.split(".")
            if len(parts) > 2:
                name = ".".join(parts[:2])
            else:
                name = module_name
        else:
            name = "root"

    # Create or get logger
    logger = logging.getLogger(name)

    # Prevent duplicate handlers
    if logger.handlers:
        return logger

    # Set logger level
    logger.setLevel(Globals().logging_level)

    # Create console handler with colored formatter
    console_handler = logging.StreamHandler()
    console_handler.setLevel(Globals().logging_level)
    console_formatter = ColoredFormatter()
    console_handler.setFormatter(console_formatter)
    logger.addHandler(console_handler)

    # Create file handler
    if use_file_handler:
        log_dir = "logs"
        os.makedirs(log_dir, exist_ok=True)
        log_file = os.path.join(log_dir, f"{datetime.now().strftime('%Y%m%d')}.log")
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(Globals().logging_level)
        # File formatter (no colors)
        file_formatter = logging.Formatter(TEMPLATE, datefmt="%Y-%m-%d %H:%M:%S")
        file_handler.setFormatter(file_formatter)
        logger.addHandler(file_handler)

    # Prevent logger from propagating to root logger
    logger.propagate = False

    return logger
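A quick sketch of how the new logger is meant to be used (hypothetical call sites):

from texteller.logger import get_logger

logger = get_logger()                    # name derived from the caller's module (first two components)
logger.info("colored console output")

cli_logger = get_logger("texteller.cli", use_file_handler=True)  # also appends to logs/YYYYMMDD.log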
3
texteller/models/__init__.py
Normal file
@@ -0,0 +1,3 @@
from .texteller import TexTeller

__all__ = ['TexTeller']
@@ -1,226 +0,0 @@
import os
import time
import yaml
import numpy as np
import cv2

from tqdm import tqdm
from typing import List
from .preprocess import Compose
from .Bbox import Bbox


# Global dictionary
SUPPORT_MODELS = {
    'YOLO',
    'PPYOLOE',
    'RCNN',
    'SSD',
    'Face',
    'FCOS',
    'SOLOv2',
    'TTFNet',
    'S2ANet',
    'JDE',
    'FairMOT',
    'DeepSORT',
    'GFL',
    'PicoDet',
    'CenterNet',
    'TOOD',
    'RetinaNet',
    'StrongBaseline',
    'STGCN',
    'YOLOX',
    'HRNet',
    'DETR',
}


class PredictConfig(object):
    """set config of preprocess, postprocess and visualize
    Args:
        infer_config (str): path of infer_cfg.yml
    """

    def __init__(self, infer_config):
        # parsing Yaml config for Preprocess
        with open(infer_config) as f:
            yml_conf = yaml.safe_load(f)
        self.check_model(yml_conf)
        self.arch = yml_conf['arch']
        self.preprocess_infos = yml_conf['Preprocess']
        self.min_subgraph_size = yml_conf['min_subgraph_size']
        self.label_list = yml_conf['label_list']
        self.use_dynamic_shape = yml_conf['use_dynamic_shape']
        self.draw_threshold = yml_conf.get("draw_threshold", 0.5)
        self.mask = yml_conf.get("mask", False)
        self.tracker = yml_conf.get("tracker", None)
        self.nms = yml_conf.get("NMS", None)
        self.fpn_stride = yml_conf.get("fpn_stride", None)

        color_pool = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)]
        self.colors = {
            label: color_pool[i % len(color_pool)] for i, label in enumerate(self.label_list)
        }

        if self.arch == 'RCNN' and yml_conf.get('export_onnx', False):
            print('The RCNN export model is used for ONNX and it only supports batch_size = 1')
        self.print_config()

    def check_model(self, yml_conf):
        """
        Raises:
            ValueError: loaded model not in supported model type
        """
        for support_model in SUPPORT_MODELS:
            if support_model in yml_conf['arch']:
                return True
        raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf['arch'], SUPPORT_MODELS))

    def print_config(self):
        print('----------- Model Configuration -----------')
        print('%s: %s' % ('Model Arch', self.arch))
        print('%s: ' % ('Transform Order'))
        for op_info in self.preprocess_infos:
            print('--%s: %s' % ('transform op', op_info['type']))
        print('--------------------------------------------')


def draw_bbox(image, outputs, infer_config):
    for output in outputs:
        cls_id, score, xmin, ymin, xmax, ymax = output
        if score > infer_config.draw_threshold:
            label = infer_config.label_list[int(cls_id)]
            color = infer_config.colors[label]
            cv2.rectangle(image, (int(xmin), int(ymin)), (int(xmax), int(ymax)), color, 2)
            cv2.putText(
                image,
                "{}: {:.2f}".format(label, score),
                (int(xmin), int(ymin - 5)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                color,
                2,
            )
    return image


def predict_image(imgsave_dir, infer_config, predictor, img_list):
    # load preprocess transforms
    transforms = Compose(infer_config.preprocess_infos)
    errImgList = []

    # Check and create subimg_save_dir if not exist
    subimg_save_dir = os.path.join(imgsave_dir, 'subimages')
    os.makedirs(subimg_save_dir, exist_ok=True)

    first_image_skipped = False
    total_time = 0
    num_images = 0
    # predict image
    for img_path in tqdm(img_list):
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: Could not read image {img_path}. Skipping...")
            errImgList.append(img_path)
            continue

        inputs = transforms(img_path)
        inputs_name = [var.name for var in predictor.get_inputs()]
        inputs = {k: inputs[k][None,] for k in inputs_name}

        # Start timing
        start_time = time.time()

        outputs = predictor.run(output_names=None, input_feed=inputs)

        # Stop timing
        end_time = time.time()
        inference_time = end_time - start_time
        if not first_image_skipped:
            first_image_skipped = True
        else:
            total_time += inference_time
            num_images += 1
        print(
            f"ONNXRuntime predict time for {os.path.basename(img_path)}: {inference_time:.4f} seconds"
        )

        print("ONNXRuntime predict: ")
        if infer_config.arch in ["HRNet"]:
            print(np.array(outputs[0]))
        else:
            bboxes = np.array(outputs[0])
            for bbox in bboxes:
                if bbox[0] > -1 and bbox[1] > infer_config.draw_threshold:
                    print(f"{int(bbox[0])} {bbox[1]} " f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}")

        # Save the subimages (crop from the original image)
        subimg_counter = 1
        for output in np.array(outputs[0]):
            cls_id, score, xmin, ymin, xmax, ymax = output
            if score > infer_config.draw_threshold:
                label = infer_config.label_list[int(cls_id)]
                subimg = img[int(max(ymin, 0)) : int(ymax), int(max(xmin, 0)) : int(xmax)]
                if len(subimg) == 0:
                    continue

                subimg_filename = f"{os.path.splitext(os.path.basename(img_path))[0]}_{label}_{xmin:.2f}_{ymin:.2f}_{xmax:.2f}_{ymax:.2f}.jpg"
                subimg_path = os.path.join(subimg_save_dir, subimg_filename)
                cv2.imwrite(subimg_path, subimg)
                subimg_counter += 1

        # Draw bounding boxes and save the image with bounding boxes
        img_with_mask = img.copy()
        for output in np.array(outputs[0]):
            cls_id, score, xmin, ymin, xmax, ymax = output
            if score > infer_config.draw_threshold:
                cv2.rectangle(
                    img_with_mask,
                    (int(xmin), int(ymin)),
                    (int(xmax), int(ymax)),
                    (255, 255, 255),
                    -1,
                )  # mask the detected region with white

        img_with_bbox = draw_bbox(img, np.array(outputs[0]), infer_config)

        output_dir = imgsave_dir
        os.makedirs(output_dir, exist_ok=True)
        draw_box_dir = os.path.join(output_dir, 'draw_box')
        mask_white_dir = os.path.join(output_dir, 'mask_white')
        os.makedirs(draw_box_dir, exist_ok=True)
        os.makedirs(mask_white_dir, exist_ok=True)

        output_file_mask = os.path.join(mask_white_dir, os.path.basename(img_path))
        output_file_bbox = os.path.join(draw_box_dir, os.path.basename(img_path))
        cv2.imwrite(output_file_mask, img_with_mask)
        cv2.imwrite(output_file_bbox, img_with_bbox)

    avg_time_per_image = total_time / num_images if num_images > 0 else 0
    print(f"Total inference time for {num_images} images: {total_time:.4f} seconds")
    print(f"Average time per image: {avg_time_per_image:.4f} seconds")
    print("ErrorImgs:")
    print(errImgList)


def predict(img_path: str, predictor, infer_config) -> List[Bbox]:
    transforms = Compose(infer_config.preprocess_infos)
    inputs = transforms(img_path)
    inputs_name = [var.name for var in predictor.get_inputs()]
    inputs = {k: inputs[k][None,] for k in inputs_name}

    outputs = predictor.run(output_names=None, input_feed=inputs)[0]
    res = []
    for output in outputs:
        cls_name = infer_config.label_list[int(output[0])]
        score = output[1]
        xmin = int(max(output[2], 0))
        ymin = int(max(output[3], 0))
        xmax = int(output[4])
        ymax = int(output[5])
        if score > infer_config.draw_threshold:
            res.append(Bbox(xmin, ymin, ymax - ymin, xmax - xmin, cls_name, score))

    return res
@@ -1,27 +0,0 @@
mode: paddle
draw_threshold: 0.5
metric: COCO
use_dynamic_shape: false
arch: DETR
min_subgraph_size: 3
Preprocess:
- interp: 2
  keep_ratio: false
  target_size:
  - 1600
  - 1600
  type: Resize
- mean:
  - 0.0
  - 0.0
  - 0.0
  norm_type: none
  std:
  - 1.0
  - 1.0
  - 1.0
  type: NormalizeImage
- type: Permute
label_list:
- isolated
- embedding
@@ -1,485 +0,0 @@
import numpy as np
import cv2
import copy


def decode_image(img_path):
    if isinstance(img_path, str):
        with open(img_path, 'rb') as f:
            im_read = f.read()
        data = np.frombuffer(im_read, dtype='uint8')
    else:
        assert isinstance(img_path, np.ndarray)
        data = img_path

    im = cv2.imdecode(data, 1)  # BGR mode, but need RGB mode
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    img_info = {
        "im_shape": np.array(im.shape[:2], dtype=np.float32),
        "scale_factor": np.array([1.0, 1.0], dtype=np.float32),
    }
    return im, img_info


class Resize(object):
    """resize image by target_size and max_size
    Args:
        target_size (int): the target size of image
        keep_ratio (bool): whether to keep the aspect ratio, default True
        interp (int): method of resize
    """

    def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR):
        if isinstance(target_size, int):
            target_size = [target_size, target_size]
        self.target_size = target_size
        self.keep_ratio = keep_ratio
        self.interp = interp

    def __call__(self, im, im_info):
        """
        Args:
            im (np.ndarray): image (np.ndarray)
            im_info (dict): info of image
        Returns:
            im (np.ndarray): processed image (np.ndarray)
            im_info (dict): info of processed image
        """
        assert len(self.target_size) == 2
        assert self.target_size[0] > 0 and self.target_size[1] > 0
        im_channel = im.shape[2]
        im_scale_y, im_scale_x = self.generate_scale(im)
        im = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp)
        im_info['im_shape'] = np.array(im.shape[:2]).astype('float32')
        im_info['scale_factor'] = np.array([im_scale_y, im_scale_x]).astype('float32')
        return im, im_info

    def generate_scale(self, im):
        """
        Args:
            im (np.ndarray): image (np.ndarray)
        Returns:
            im_scale_x: the resize ratio of X
            im_scale_y: the resize ratio of Y
        """
        origin_shape = im.shape[:2]
        im_c = im.shape[2]
        if self.keep_ratio:
            im_size_min = np.min(origin_shape)
            im_size_max = np.max(origin_shape)
            target_size_min = np.min(self.target_size)
            target_size_max = np.max(self.target_size)
            im_scale = float(target_size_min) / float(im_size_min)
            if np.round(im_scale * im_size_max) > target_size_max:
                im_scale = float(target_size_max) / float(im_size_max)
            im_scale_x = im_scale
            im_scale_y = im_scale
        else:
            resize_h, resize_w = self.target_size
            im_scale_y = resize_h / float(origin_shape[0])
            im_scale_x = resize_w / float(origin_shape[1])
        return im_scale_y, im_scale_x


class NormalizeImage(object):
    """normalize image
    Args:
        mean (list): im - mean
        std (list): im / std
        is_scale (bool): whether need im / 255
        norm_type (str): type in ['mean_std', 'none']
    """

    def __init__(self, mean, std, is_scale=True, norm_type='mean_std'):
        self.mean = mean
        self.std = std
        self.is_scale = is_scale
        self.norm_type = norm_type

    def __call__(self, im, im_info):
        """
        Args:
            im (np.ndarray): image (np.ndarray)
            im_info (dict): info of image
        Returns:
            im (np.ndarray): processed image (np.ndarray)
            im_info (dict): info of processed image
        """
        im = im.astype(np.float32, copy=False)
        if self.is_scale:
            scale = 1.0 / 255.0
            im *= scale

        if self.norm_type == 'mean_std':
            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
            std = np.array(self.std)[np.newaxis, np.newaxis, :]
            im -= mean
            im /= std
        return im, im_info


class Permute(object):
    """permute image
    Args:
        to_bgr (bool): whether convert RGB to BGR
        channel_first (bool): whether convert HWC to CHW
    """

    def __init__(
        self,
    ):
        super(Permute, self).__init__()

    def __call__(self, im, im_info):
        """
        Args:
            im (np.ndarray): image (np.ndarray)
            im_info (dict): info of image
        Returns:
            im (np.ndarray): processed image (np.ndarray)
            im_info (dict): info of processed image
        """
        im = im.transpose((2, 0, 1)).copy()
        return im, im_info


class PadStride(object):
    """padding image for model with FPN, instead of PadBatch(pad_to_stride) in the original config
    Args:
        stride (bool): model with FPN need image shape % stride == 0
    """

    def __init__(self, stride=0):
        self.coarsest_stride = stride

    def __call__(self, im, im_info):
        """
        Args:
            im (np.ndarray): image (np.ndarray)
            im_info (dict): info of image
        Returns:
            im (np.ndarray): processed image (np.ndarray)
            im_info (dict): info of processed image
        """
        coarsest_stride = self.coarsest_stride
        if coarsest_stride <= 0:
            return im, im_info
        im_c, im_h, im_w = im.shape
        pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
        pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
        padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
        padding_im[:, :im_h, :im_w] = im
        return padding_im, im_info


class LetterBoxResize(object):
    def __init__(self, target_size):
        """
        Resize image to target size, convert normalized xywh to pixel xyxy
        format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]).
        Args:
            target_size (int|list): image target size.
        """
        super(LetterBoxResize, self).__init__()
        if isinstance(target_size, int):
            target_size = [target_size, target_size]
        self.target_size = target_size

    def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)):
        # letterbox: resize a rectangular image to a padded rectangular
        shape = img.shape[:2]  # [height, width]
        ratio_h = float(height) / shape[0]
        ratio_w = float(width) / shape[1]
        ratio = min(ratio_h, ratio_w)
        new_shape = (round(shape[1] * ratio), round(shape[0] * ratio))  # [width, height]
        padw = (width - new_shape[0]) / 2
        padh = (height - new_shape[1]) / 2
        top, bottom = round(padh - 0.1), round(padh + 0.1)
        left, right = round(padw - 0.1), round(padw + 0.1)

        img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA)  # resized, no border
        img = cv2.copyMakeBorder(
            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color
        )  # padded rectangular
        return img, ratio, padw, padh

    def __call__(self, im, im_info):
        """
        Args:
            im (np.ndarray): image (np.ndarray)
            im_info (dict): info of image
        Returns:
            im (np.ndarray): processed image (np.ndarray)
            im_info (dict): info of processed image
        """
        assert len(self.target_size) == 2
        assert self.target_size[0] > 0 and self.target_size[1] > 0
        height, width = self.target_size
        h, w = im.shape[:2]
        im, ratio, padw, padh = self.letterbox(im, height=height, width=width)

        new_shape = [round(h * ratio), round(w * ratio)]
        im_info['im_shape'] = np.array(new_shape, dtype=np.float32)
        im_info['scale_factor'] = np.array([ratio, ratio], dtype=np.float32)
        return im, im_info


class Pad(object):
    def __init__(self, size, fill_value=[114.0, 114.0, 114.0]):
        """
        Pad image to a specified size.
        Args:
            size (list[int]): image target size
            fill_value (list[float]): rgb value of pad area, default (114.0, 114.0, 114.0)
        """
        super(Pad, self).__init__()
        if isinstance(size, int):
            size = [size, size]
        self.size = size
        self.fill_value = fill_value

    def __call__(self, im, im_info):
        im_h, im_w = im.shape[:2]
        h, w = self.size
        if h == im_h and w == im_w:
            im = im.astype(np.float32)
            return im, im_info

        canvas = np.ones((h, w, 3), dtype=np.float32)
        canvas *= np.array(self.fill_value, dtype=np.float32)
        canvas[0:im_h, 0:im_w, :] = im.astype(np.float32)
        im = canvas
        return im, im_info


def rotate_point(pt, angle_rad):
    """Rotate a point by an angle.

    Args:
        pt (list[float]): 2 dimensional point to be rotated
        angle_rad (float): rotation angle in radians

    Returns:
        list[float]: Rotated point.
    """
    assert len(pt) == 2
    sn, cs = np.sin(angle_rad), np.cos(angle_rad)
    new_x = pt[0] * cs - pt[1] * sn
    new_y = pt[0] * sn + pt[1] * cs
    rotated_pt = [new_x, new_y]

    return rotated_pt


def _get_3rd_point(a, b):
    """To calculate the affine matrix, three pairs of points are required. This
    function is used to get the 3rd point, given 2D points a & b.

    The 3rd point is defined by rotating vector `a - b` by 90 degrees
    anticlockwise, using b as the rotation center.

    Args:
        a (np.ndarray): point(x,y)
        b (np.ndarray): point(x,y)

    Returns:
        np.ndarray: The 3rd point.
    """
    assert len(a) == 2
    assert len(b) == 2
    direction = a - b
    third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32)

    return third_pt


def get_affine_transform(center, input_size, rot, output_size, shift=(0.0, 0.0), inv=False):
    """Get the affine transform matrix, given the center/scale/rot/output_size.

    Args:
        center (np.ndarray[2, ]): Center of the bounding box (x, y).
        input_size (np.ndarray[2, ]): Scale of the bounding box
            wrt [width, height].
        rot (float): Rotation angle (degree).
        output_size (np.ndarray[2, ]): Size of the destination heatmaps.
        shift (0-100%): Shift translation ratio wrt the width/height.
            Default (0., 0.).
        inv (bool): Option to inverse the affine transform direction.
            (inv=False: src->dst or inv=True: dst->src)

    Returns:
        np.ndarray: The transform matrix.
    """
    assert len(center) == 2
    assert len(output_size) == 2
    assert len(shift) == 2
    if not isinstance(input_size, (np.ndarray, list)):
        input_size = np.array([input_size, input_size], dtype=np.float32)
    scale_tmp = input_size

    shift = np.array(shift)
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = rotate_point([0.0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0.0, dst_w * -0.5])

    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    src[2, :] = _get_3rd_point(src[0, :], src[1, :])

    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
    dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans


class WarpAffine(object):
    """Warp affine the image"""

    def __init__(self, keep_res=False, pad=31, input_h=512, input_w=512, scale=0.4, shift=0.1):
        self.keep_res = keep_res
        self.pad = pad
        self.input_h = input_h
        self.input_w = input_w
        self.scale = scale
        self.shift = shift

    def __call__(self, im, im_info):
        """
        Args:
            im (np.ndarray): image (np.ndarray)
            im_info (dict): info of image
        Returns:
            im (np.ndarray): processed image (np.ndarray)
            im_info (dict): info of processed image
        """
        img = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)

        h, w = img.shape[:2]

        if self.keep_res:
            input_h = (h | self.pad) + 1
            input_w = (w | self.pad) + 1
            s = np.array([input_w, input_h], dtype=np.float32)
            c = np.array([w // 2, h // 2], dtype=np.float32)

        else:
            s = max(h, w) * 1.0
            input_h, input_w = self.input_h, self.input_w
            c = np.array([w / 2.0, h / 2.0], dtype=np.float32)

        trans_input = get_affine_transform(c, s, 0, [input_w, input_h])
        img = cv2.resize(img, (w, h))
        inp = cv2.warpAffine(img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)
        return inp, im_info


# keypoint preprocess
def get_warp_matrix(theta, size_input, size_dst, size_target):
    """This code is based on
    https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py

    Calculate the transformation matrix under the constraint of unbiased.
    Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased
    Data Processing for Human Pose Estimation (CVPR 2020).

    Args:
        theta (float): Rotation angle in degrees.
        size_input (np.ndarray): Size of input image [w, h].
        size_dst (np.ndarray): Size of output image [w, h].
        size_target (np.ndarray): Size of ROI in input plane [w, h].

    Returns:
        matrix (np.ndarray): A matrix for transformation.
    """
    theta = np.deg2rad(theta)
    matrix = np.zeros((2, 3), dtype=np.float32)
    scale_x = size_dst[0] / size_target[0]
    scale_y = size_dst[1] / size_target[1]
    matrix[0, 0] = np.cos(theta) * scale_x
    matrix[0, 1] = -np.sin(theta) * scale_x
    matrix[0, 2] = scale_x * (
        -0.5 * size_input[0] * np.cos(theta)
        + 0.5 * size_input[1] * np.sin(theta)
        + 0.5 * size_target[0]
    )
    matrix[1, 0] = np.sin(theta) * scale_y
    matrix[1, 1] = np.cos(theta) * scale_y
    matrix[1, 2] = scale_y * (
        -0.5 * size_input[0] * np.sin(theta)
        - 0.5 * size_input[1] * np.cos(theta)
        + 0.5 * size_target[1]
    )
    return matrix


class TopDownEvalAffine(object):
    """apply affine transform to image and coords

    Args:
        trainsize (list): [w, h], the standard size used to train
        use_udp (bool): whether to use Unbiased Data Processing.
        records (dict): the dict containing the image and coords

    Returns:
        records (dict): the dict containing the image and coords after the transform
    """

    def __init__(self, trainsize, use_udp=False):
        self.trainsize = trainsize
        self.use_udp = use_udp

    def __call__(self, image, im_info):
        rot = 0
        imshape = im_info['im_shape'][::-1]
        center = im_info['center'] if 'center' in im_info else imshape / 2.0
        scale = im_info['scale'] if 'scale' in im_info else imshape
        if self.use_udp:
            trans = get_warp_matrix(
                rot, center * 2.0, [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale
            )
            image = cv2.warpAffine(
                image,
                trans,
                (int(self.trainsize[0]), int(self.trainsize[1])),
                flags=cv2.INTER_LINEAR,
            )
        else:
            trans = get_affine_transform(center, scale, rot, self.trainsize)
            image = cv2.warpAffine(
                image,
                trans,
                (int(self.trainsize[0]), int(self.trainsize[1])),
                flags=cv2.INTER_LINEAR,
            )

        return image, im_info


class Compose:
    def __init__(self, transforms):
        self.transforms = []
        for op_info in transforms:
            new_op_info = op_info.copy()
            op_type = new_op_info.pop('type')
            self.transforms.append(eval(op_type)(**new_op_info))

    def __call__(self, img_path):
        img, im_info = decode_image(img_path)
        for t in self.transforms:
            img, im_info = t(img, im_info)
        inputs = copy.deepcopy(im_info)
        inputs['image'] = img
        return inputs
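`Compose` resolves each `type` in the YAML `Preprocess` list to the class of the same name in this module (via `eval`) and passes the remaining keys as constructor kwargs, so the `infer_cfg.yml` above maps directly onto three ops. A sketch of the equivalent hand-built pipeline, for illustration only:

ops = [
    Resize(interp=2, keep_ratio=False, target_size=[1600, 1600]),
    NormalizeImage(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0], norm_type='none'),
    Permute(),
]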
@@ -1,43 +0,0 @@
from pathlib import Path

from ...globals import VOCAB_SIZE, FIXED_IMG_SIZE, IMG_CHANNELS, MAX_TOKEN_SIZE

from transformers import RobertaTokenizerFast, VisionEncoderDecoderModel, VisionEncoderDecoderConfig


class TexTeller(VisionEncoderDecoderModel):
    REPO_NAME = 'OleehyO/TexTeller'

    def __init__(self):
        config = VisionEncoderDecoderConfig.from_pretrained(
            Path(__file__).resolve().parent / "config.json"
        )
        config.encoder.image_size = FIXED_IMG_SIZE
        config.encoder.num_channels = IMG_CHANNELS
        config.decoder.vocab_size = VOCAB_SIZE
        config.decoder.max_position_embeddings = MAX_TOKEN_SIZE

        super().__init__(config=config)

    @classmethod
    def from_pretrained(cls, model_path: str = None, use_onnx=False, onnx_provider=None):
        if model_path is None or model_path == 'default':
            if not use_onnx:
                return VisionEncoderDecoderModel.from_pretrained(cls.REPO_NAME)
            else:
                from optimum.onnxruntime import ORTModelForVision2Seq

                use_gpu = True if onnx_provider == 'cuda' else False
                return ORTModelForVision2Seq.from_pretrained(
                    cls.REPO_NAME,
                    provider="CUDAExecutionProvider" if use_gpu else "CPUExecutionProvider",
                )
        model_path = Path(model_path).resolve()
        return VisionEncoderDecoderModel.from_pretrained(str(model_path))

    @classmethod
    def get_tokenizer(cls, tokenizer_path: str = None) -> RobertaTokenizerFast:
        if tokenizer_path is None or tokenizer_path == 'default':
            return RobertaTokenizerFast.from_pretrained(cls.REPO_NAME)
        tokenizer_path = Path(tokenizer_path).resolve()
        return RobertaTokenizerFast.from_pretrained(str(tokenizer_path))
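`from_pretrained` covered three loading paths; a hedged usage sketch based on its signature:

model = TexTeller.from_pretrained()                                           # PyTorch weights from the hub
onnx_model = TexTeller.from_pretrained(use_onnx=True, onnx_provider='cuda')   # ORT session via optimum
local_model = TexTeller.from_pretrained('/path/to/checkpoint')                # local checkpoint
tokenizer = TexTeller.get_tokenizer()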
@@ -1,168 +0,0 @@
{
  "_name_or_path": "OleehyO/TexTeller",
  "architectures": [
    "VisionEncoderDecoderModel"
  ],
  "decoder": {
    "_name_or_path": "",
    "activation_dropout": 0.0,
    "activation_function": "gelu",
    "add_cross_attention": true,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "classifier_dropout": 0.0,
    "cross_attention_hidden_size": 768,
    "d_model": 1024,
    "decoder_attention_heads": 16,
    "decoder_ffn_dim": 4096,
    "decoder_layerdrop": 0.0,
    "decoder_layers": 12,
    "decoder_start_token_id": 2,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.1,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "init_std": 0.02,
    "is_decoder": true,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layernorm_embedding": true,
    "length_penalty": 1.0,
    "max_length": 20,
    "max_position_embeddings": 1024,
    "min_length": 0,
    "model_type": "trocr",
    "no_repeat_ngram_size": 0,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 1,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "scale_embedding": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false,
    "use_cache": false,
    "use_learned_position_embeddings": true,
    "vocab_size": 15000
  },
  "encoder": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_probs_dropout_prob": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "encoder_stride": 16,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.0,
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_size": 448,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-12,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "vit",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 12,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_channels": 1,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 16,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "qkv_bias": false,
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false
  },
  "is_encoder_decoder": true,
  "model_type": "vision-encoder-decoder",
  "tie_word_embeddings": false,
  "transformers_version": "4.41.2",
  "use_cache": true
}
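For cross-reference, the fields that `TexTeller.__init__` overrode via the `globals` constants correspond to the values baked into this config (an inference from the two files shown, since the constants' definitions do not appear in this diff):

config.encoder.image_size = FIXED_IMG_SIZE                # 448 above
config.encoder.num_channels = IMG_CHANNELS                # 1 above
config.decoder.vocab_size = VOCAB_SIZE                    # 15000 above
config.decoder.max_position_embeddings = MAX_TOKEN_SIZE   # 1024 above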
[35 binary image files deleted (previews 1.8 KiB to 12 KiB; no pixel data is recoverable from the diff)]
@@ -1,35 +0,0 @@
{"file_name": "0.png", "latex_formula": "\\[\\mathbb{C}^{4}\\stackrel{{\\pi_{1}}}{{\\longleftarrow}}\\mathcal{ F}\\stackrel{{\\pi_{2}}}{{\\rightarrow}}\\mathcal{PT},\\]"}
{"file_name": "1.png", "latex_formula": "\\[W^{*}_{Z}(x_{1},x_{2})=W_{f\\lrcorner Z}(y_{1},y_{2})=\\mathcal{P}\\exp\\left( \\int_{\\gamma}A_{\\mu}dx^{\\mu}\\right).\\]"}
{"file_name": "2.png", "latex_formula": "\\[G=W^{*}_{Z}(q,p)=\\tilde{H}H^{-1}\\]"}
{"file_name": "3.png", "latex_formula": "\\[H=W^{*}_{Z}(p,x),\\ \\ \\tilde{H}=W^{*}_{Z}(q,x).\\]"}
{"file_name": "4.png", "latex_formula": "\\[v\\cdot f^{*}A|_{x}=(f\\lrcorner Z)_{*}v\\cdot A|_{f\\lrcorner Z(x)},\\quad x\\in Z, \\ v\\in T_{x}Z.\\]"}
{"file_name": "5.png", "latex_formula": "\\[(f\\lrcorner Z)_{*}v\\cdot A|_{f\\lrcorner Z(x)}=v^{\\alpha\\dot{\\alpha}}\\Big{(} \\frac{\\partial y^{\\beta\\dot{\\beta}}}{\\partial x^{\\alpha\\dot{\\alpha}}}A_{\\beta \\dot{\\beta}}\\Big{)}\\Big{|}_{f\\lrcorner Z(x)},\\ x\\in Z,\\ v\\in T_{x}Z,\\]"}
{"file_name": "6.png", "latex_formula": "\\[\\{T_{i},T_{j}\\}=\\{\\tilde{T}^{i},\\tilde{T}^{j}\\}=0,\\ \\ \\{T_{i},\\tilde{T}^{j}\\}=2i \\delta^{j}_{i}D,\\]"}
{"file_name": "7.png", "latex_formula": "\\[(\\partial_{s},q_{i},\\tilde{q}^{k})\\rightarrow(D,M^{j}_{i}T_{j},\\tilde{M}^{k}_ {l}\\tilde{T}^{l}),\\]"}
{"file_name": "8.png", "latex_formula": "\\[M^{i}_{j}\\tilde{M}^{j}_{k}=\\delta^{i}_{k}.\\]"}
{"file_name": "9.png", "latex_formula": "\\[Q_{i\\alpha}=q_{i\\alpha}+\\omega_{i\\alpha},\\ \\tilde{Q}^{i}_{\\dot{\\alpha}}=q^{i}_{ \\dot{\\alpha}}+\\tilde{\\omega}^{i}_{\\dot{\\alpha}},\\ D_{\\alpha\\dot{\\alpha}}= \\partial_{\\alpha\\dot{\\alpha}}+A_{\\alpha\\dot{\\alpha}}.\\]"}
{"file_name": "10.png", "latex_formula": "\\[\\hat{f}(g,\\theta^{i\\alpha},\\tilde{\\theta}^{\\dot{\\alpha}}_{j})=(f(g),[V^{-1}]^ {\\alpha}_{\\beta}\\theta^{i\\beta},[\\tilde{V}^{-1}]^{\\dot{\\alpha}}_{\\dot{\\beta}} \\tilde{\\theta}^{\\dot{\\beta}}_{j}),\\ g\\in{\\cal G},\\]"}
{"file_name": "11.png", "latex_formula": "\\[v^{\\beta\\dot{\\beta}}V^{\\alpha}_{\\beta}\\tilde{V}^{\\dot{\\alpha}}_{\\dot{\\beta}} =((f\\lrcorner L_{0})_{*}v)^{\\alpha\\dot{\\alpha}},\\]"}
{"file_name": "12.png", "latex_formula": "\\[\\omega_{i\\alpha}=\\tilde{\\theta}^{\\dot{\\alpha}}_{i}h_{\\alpha\\dot{\\alpha}}(x^{ \\beta\\dot{\\beta}},\\tau^{\\beta\\dot{\\beta}}),\\ \\ \\tilde{\\omega}^{i}_{\\alpha}=\\theta^{i\\alpha}\\tilde{h}_{\\alpha\\dot{\\alpha}}(x^{ \\beta\\dot{\\beta}},\\tau^{\\beta\\dot{\\beta}}),\\]"}
{"file_name": "13.png", "latex_formula": "\\[\\begin{split}&\\lambda^{\\alpha}\\hat{f}^{*}\\omega_{i\\alpha}(z)= \\tilde{\\theta}^{\\dot{\\beta}}_{i}\\lambda^{\\alpha}\\left(V^{\\beta}_{\\alpha}h_{ \\beta\\dot{\\beta}}(x^{\\prime},\\tau^{\\prime})\\right),\\\\ &\\tilde{\\lambda}^{\\dot{\\alpha}}\\hat{f}^{*}\\tilde{\\omega}^{i}_{ \\dot{\\alpha}}(z)=\\theta^{i\\beta}\\tilde{\\lambda}^{\\dot{\\alpha}}\\left(\\tilde{V}^ {\\dot{\\beta}}_{\\dot{\\alpha}}\\tilde{h}_{\\beta\\dot{\\beta}}(x^{\\prime},\\tau^{ \\prime})\\right),\\end{split}\\]"}
{"file_name": "14.png", "latex_formula": "\\[A_{\\alpha\\dot{\\alpha}}=A_{\\alpha\\dot{\\alpha}}(x^{\\beta\\dot{\\beta}},\\tau^{ \\beta\\dot{\\beta}})\\]"}
{"file_name": "15.png", "latex_formula": "\\[D=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}D_{\\alpha\\dot{\\alpha}}\\]"}
{"file_name": "16.png", "latex_formula": "\\[D=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}\\partial_{\\alpha\\dot{\\alpha}}\\]"}
{"file_name": "17.png", "latex_formula": "\\[[v_{1}\\cdot D^{*},v_{2}\\cdot D^{*}]=0\\]"}
{"file_name": "18.png", "latex_formula": "\\[\\Phi_{A}=(\\omega_{i\\alpha},\\tilde{\\omega}^{i}_{\\dot{\\alpha}},A_{\\alpha\\dot{ \\alpha}})\\]"}
{"file_name": "19.png", "latex_formula": "\\[\\hat{f}:{\\cal F}^{6|4N}\\rightarrow{\\cal F}^{6|4N}\\]"}
{"file_name": "20.png", "latex_formula": "\\[\\sigma=(s,\\xi^{i},\\tilde{\\xi}_{j})\\in\\mathbb{C}^{1|2N}\\]"}
{"file_name": "21.png", "latex_formula": "\\[\\tau^{\\alpha\\dot{\\alpha}}(h_{\\alpha\\dot{\\alpha}}+\\tilde{h}_{\\alpha\\dot{\\alpha} })=0\\]"}
{"file_name": "22.png", "latex_formula": "\\[\\tau^{\\alpha\\dot{\\alpha}}\\rightarrow[V^{-1}]^{\\alpha}_{\\beta}[\\tilde{V}^{-1}]^{ \\dot{\\alpha}}_{\\dot{\\beta}}\\tau^{\\beta\\dot{\\beta}}\\]"}
{"file_name": "23.png", "latex_formula": "\\[\\tau^{\\beta\\dot{\\beta}}=\\sum_{i}\\theta^{i\\beta}\\tilde{\\theta}^{\\dot{\\beta}}_{i}\\]"}
{"file_name": "24.png", "latex_formula": "\\[\\theta^{i\\alpha}\\omega_{i\\alpha}+\\tilde{\\theta}^{i}_{\\dot{\\alpha}}\\tilde{ \\omega}^{\\dot{\\alpha}}_{i}=0\\]"}
{"file_name": "25.png", "latex_formula": "\\[\\tilde{T}^{i}=\\tilde{\\lambda}^{\\dot{\\alpha}}\\tilde{Q}^{i}_{\\dot{\\alpha}}\\]"}
{"file_name": "26.png", "latex_formula": "\\[\\tilde{T}^{i}=\\tilde{\\lambda}^{\\dot{\\alpha}}\\tilde{q}^{i}_{\\dot{\\alpha}}\\]"}
{"file_name": "27.png", "latex_formula": "\\[\\tilde{\\lambda}^{\\dot{\\alpha}}f^{*}A_{\\alpha\\dot{\\alpha}}=H^{-1}\\tilde{ \\lambda}^{\\dot{\\alpha}}\\partial_{\\alpha\\dot{\\alpha}}H\\]"}
{"file_name": "28.png", "latex_formula": "\\[\\tilde{q}^{i}=\\partial_{\\tilde{\\xi}_{i}}+i\\xi^{i}\\partial_{s}\\]"}
{"file_name": "29.png", "latex_formula": "\\[\\tilde{q}^{i}_{\\dot{\\alpha}}=\\frac{\\partial}{\\partial\\tilde{\\theta}^{\\dot{ \\alpha}}_{i}}+i\\theta^{i\\alpha}\\frac{\\partial}{\\partial x^{\\alpha\\dot{\\alpha}}}\\]"}
{"file_name": "30.png", "latex_formula": "\\[f\\lrcorner L(z)=\\pi_{1}\\circ f(z,\\lambda,\\tilde{\\lambda})\\ \\forall z\\in L\\]"}
{"file_name": "31.png", "latex_formula": "\\[q_{i\\alpha}=\\frac{\\partial}{\\partial\\theta^{i\\alpha}}+i\\tilde{\\theta}^{\\dot{ \\alpha}}_{i}\\frac{\\partial}{\\partial x^{\\alpha\\dot{\\alpha}}}\\]"}
{"file_name": "32.png", "latex_formula": "\\[q_{i}=\\partial_{\\xi^{i}}+i\\tilde{\\xi}_{i}\\partial_{s}\\]"}
{"file_name": "33.png", "latex_formula": "\\[v^{\\alpha\\dot{\\alpha}}=\\lambda^{\\alpha}\\tilde{\\lambda}^{\\dot{\\alpha}}\\]"}
{"file_name": "34.png", "latex_formula": "\\[z^{A}=(x^{\\alpha\\dot{\\alpha}},\\theta^{i\\alpha},\\tilde{\\theta}^{\\dot{\\alpha}}_{ j})\\]"}
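These rows follow the `datasets` imagefolder convention, a `metadata.jsonl` keyed by `file_name` next to the images, which is what lets the training script below attach each `latex_formula` label in a single call. A hedged sketch (directory name assumed):

from datasets import load_dataset

# assumes the JSONL is saved as dataset/metadata.jsonl alongside the PNG files
dataset = load_dataset("imagefolder", data_dir="dataset")["train"]
print(dataset[0]["latex_formula"])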
@@ -1,114 +0,0 @@
import os

from functools import partial
from pathlib import Path

from datasets import load_dataset
from transformers import (
    Trainer,
    TrainingArguments,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    GenerationConfig,
)

from .training_args import CONFIG
from ..model.TexTeller import TexTeller
from ..utils.functional import (
    tokenize_fn,
    collate_fn,
    img_train_transform,
    img_inf_transform,
    filter_fn,
)
from ..utils.metrics import bleu_metric
from ...globals import MAX_TOKEN_SIZE, MIN_WIDTH, MIN_HEIGHT


def train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokenizer):
    training_args = TrainingArguments(**CONFIG)
    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=collate_fn_with_tokenizer,
    )

    trainer.train(resume_from_checkpoint=None)


def evaluate(model, tokenizer, eval_dataset, collate_fn):
    eval_config = CONFIG.copy()
    eval_config['predict_with_generate'] = True
    generate_config = GenerationConfig(
        max_new_tokens=MAX_TOKEN_SIZE,
        num_beams=1,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
    )
    eval_config['generation_config'] = generate_config
    seq2seq_config = Seq2SeqTrainingArguments(**eval_config)

    trainer = Seq2SeqTrainer(
        model,
        seq2seq_config,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=collate_fn,
        compute_metrics=partial(bleu_metric, tokenizer=tokenizer),
    )

    eval_res = trainer.evaluate()
    print(eval_res)


if __name__ == '__main__':
    script_dirpath = Path(__file__).resolve().parent
    os.chdir(script_dirpath)

    # dataset = load_dataset(str(Path('./dataset/loader.py').resolve()))['train']
    dataset = load_dataset("imagefolder", data_dir=str(script_dirpath / 'dataset'))['train']
    dataset = dataset.filter(
        lambda x: x['image'].height > MIN_HEIGHT and x['image'].width > MIN_WIDTH
    )
    dataset = dataset.shuffle(seed=42)
    dataset = dataset.flatten_indices()

    tokenizer = TexTeller.get_tokenizer()
    # If you want to use your own tokenizer, modify the path accordingly:
    # tokenizer = TexTeller.get_tokenizer('/path/to/your/tokenizer')
    filter_fn_with_tokenizer = partial(filter_fn, tokenizer=tokenizer)
    dataset = dataset.filter(filter_fn_with_tokenizer, num_proc=8)

    map_fn = partial(tokenize_fn, tokenizer=tokenizer)
    tokenized_dataset = dataset.map(
        map_fn, batched=True, remove_columns=dataset.column_names, num_proc=8
    )

    # Split dataset into train and eval, ratio 9:1
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset, eval_dataset = split_dataset['train'], split_dataset['test']
    train_dataset = train_dataset.with_transform(img_train_transform)
    eval_dataset = eval_dataset.with_transform(img_inf_transform)
    collate_fn_with_tokenizer = partial(collate_fn, tokenizer=tokenizer)

    # Train from scratch
    model = TexTeller()
    # or train from the TexTeller pre-trained model: model = TexTeller.from_pretrained()

    # If you want to train from a pre-trained model, modify the path to your checkpoint, e.g.
    # model = TexTeller.from_pretrained(
    #     '/path/to/your/model_checkpoint'
    # )

    enable_train = True
    enable_evaluate = False
    if enable_train:
        train(model, tokenizer, train_dataset, eval_dataset, collate_fn_with_tokenizer)
    if enable_evaluate and len(eval_dataset) > 0:
        evaluate(model, tokenizer, eval_dataset, collate_fn_with_tokenizer)
@@ -1,31 +0,0 @@
CONFIG = {
    "seed": 42,  # Random seed for reproducibility
    "use_cpu": False,  # Whether to use CPU (it's easier to debug on CPU when first testing the code)
    "learning_rate": 5e-5,  # Learning rate
    "num_train_epochs": 10,  # Total number of training epochs
    "per_device_train_batch_size": 4,  # Batch size per GPU for training
    "per_device_eval_batch_size": 8,  # Batch size per GPU for evaluation
    "output_dir": "train_result",  # Output directory
    "overwrite_output_dir": False,  # If the output directory exists, do not delete its content
    "report_to": ["tensorboard"],  # Report logs to TensorBoard
    "save_strategy": "steps",  # Strategy to save checkpoints
    "save_steps": 500,  # Interval of steps between checkpoints; can be an int or a float in (0, 1), where a float is a ratio of total training steps (e.g., 1.0 / 2000)
    "save_total_limit": 5,  # Maximum number of checkpoints to keep; the oldest are deleted when this number is exceeded
    "logging_strategy": "steps",  # Log every certain number of steps
    "logging_steps": 500,  # Number of steps between each log
    "logging_nan_inf_filter": False,  # Record logs even when loss is nan or inf
    "optim": "adamw_torch",  # Optimizer
    "lr_scheduler_type": "cosine",  # Learning rate scheduler
    "warmup_ratio": 0.1,  # Ratio of warmup steps in total training steps (e.g., for 1000 steps, the first 100 steps ramp lr from 0 to the set lr)
    "max_grad_norm": 1.0,  # Gradient clipping: ensure the norm of the gradients does not exceed 1.0 (default 1.0)
    "fp16": False,  # Whether to train in 16-bit floating point (generally not recommended, as the loss can easily explode)
    "bf16": False,  # Whether to train in bfloat16 (recommended if the architecture supports it)
    "gradient_accumulation_steps": 1,  # Gradient accumulation steps; use this to emulate a large batch size when the per-device batch size cannot be large
    "jit_mode_eval": False,  # Whether to use PyTorch JIT trace during eval (can speed up the model, but the model must be static, otherwise it will throw errors)
    "torch_compile": False,  # Whether to use torch.compile to compile the model (for better training and inference performance)
    "dataloader_pin_memory": True,  # Can speed up data transfer between CPU and GPU
    "dataloader_num_workers": 1,  # Data-loading worker processes; usually set to 4 * number of GPUs used
    "evaluation_strategy": "steps",  # Evaluation strategy, can be "steps" or "epoch"
    "eval_steps": 500,  # Evaluation interval when evaluation_strategy="steps"
    "remove_unused_columns": False,  # Don't change this unless you really know what you are doing.
}
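For orientation, a minimal sketch of how this dict is consumed: copy it, override a few knobs, and unpack it into TrainingArguments, as the train script above does. The override values here are hypothetical examples, not tuned settings.

    from transformers import TrainingArguments

    from .training_args import CONFIG

    debug_config = CONFIG.copy()
    debug_config["use_cpu"] = True         # step through on CPU while debugging
    debug_config["num_train_epochs"] = 1   # quick smoke test
    training_args = TrainingArguments(**debug_config)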
@@ -1,60 +0,0 @@
import torch

from transformers import DataCollatorForLanguageModeling
from typing import List, Dict, Any
from .transforms import train_transform, inference_transform
from ...globals import MIN_HEIGHT, MIN_WIDTH, MAX_TOKEN_SIZE


def left_move(x: torch.Tensor, pad_val):
    assert len(x.shape) == 2, 'x should be 2-dimensional'
    lefted_x = torch.ones_like(x)
    lefted_x[:, :-1] = x[:, 1:]
    lefted_x[:, -1] = pad_val
    return lefted_x


def tokenize_fn(samples: Dict[str, List[Any]], tokenizer=None) -> Dict[str, List[Any]]:
    assert tokenizer is not None, 'tokenizer should not be None'
    tokenized_formula = tokenizer(samples['latex_formula'], return_special_tokens_mask=True)
    tokenized_formula['pixel_values'] = samples['image']
    return tokenized_formula


def collate_fn(samples: List[Dict[str, Any]], tokenizer=None) -> Dict[str, List[Any]]:
    assert tokenizer is not None, 'tokenizer should not be None'
    pixel_values = [dic.pop('pixel_values') for dic in samples]

    clm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    batch = clm_collator(samples)
    batch['pixel_values'] = pixel_values
    batch['decoder_input_ids'] = batch.pop('input_ids')
    batch['decoder_attention_mask'] = batch.pop('attention_mask')

    # Left-shift the labels (and decoder_attention_mask)
    batch['labels'] = left_move(batch['labels'], -100)

    # Convert the list of images into one tensor with shape (B, C, H, W)
    batch['pixel_values'] = torch.stack(batch['pixel_values'], dim=0)
    return batch


def img_train_transform(samples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
    processed_img = train_transform(samples['pixel_values'])
    samples['pixel_values'] = processed_img
    return samples


def img_inf_transform(samples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
    processed_img = inference_transform(samples['pixel_values'])
    samples['pixel_values'] = processed_img
    return samples


def filter_fn(sample, tokenizer=None) -> bool:
    return (
        sample['image'].height > MIN_HEIGHT
        and sample['image'].width > MIN_WIDTH
        and len(tokenizer(sample['latex_formula'])['input_ids']) < MAX_TOKEN_SIZE - 10
    )
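A toy run of left_move, assuming the definitions above: each row shifts one position to the left and the freed slot is filled with pad_val, which is how the labels are aligned for next-token prediction.

    import torch

    x = torch.tensor([[1, 2, 3],
                      [4, 5, 6]])
    left_move(x, -100)
    # tensor([[   2,    3, -100],
    #         [   5,    6, -100]])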
@@ -1,26 +0,0 @@
import cv2
import numpy as np
from typing import List


def convert2rgb(image_paths: List[str]) -> List[np.ndarray]:
    processed_images = []
    for path in image_paths:
        image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        if image is None:
            print(f"Image at {path} could not be read.")
            continue
        if image.dtype == np.uint16:
            print(f'Converting {path} to 8-bit, image may be lossy.')
            image = cv2.convertScaleAbs(image, alpha=(255.0 / 65535.0))

        channels = 1 if len(image.shape) == 2 else image.shape[2]
        if channels == 4:
            image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
        elif channels == 1:
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif channels == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        processed_images.append(image)

    return processed_images
@@ -1,25 +0,0 @@
import evaluate
import numpy as np
import os

from pathlib import Path
from typing import Dict
from transformers import EvalPrediction, RobertaTokenizer


def bleu_metric(eval_preds: EvalPrediction, tokenizer: RobertaTokenizer) -> Dict:
    cur_dir = Path(os.getcwd())
    os.chdir(Path(__file__).resolve().parent)
    metric = evaluate.load(
        'google_bleu'
    )  # Will download the metric from huggingface if not already downloaded
    os.chdir(cur_dir)

    logits, labels = eval_preds.predictions, eval_preds.label_ids
    preds = logits

    labels = np.where(labels == -100, 1, labels)

    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return metric.compute(predictions=preds, references=labels)
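A standalone sanity check of the metric call, with hypothetical strings; an exact match should score 1.0.

    import evaluate

    metric = evaluate.load('google_bleu')
    res = metric.compute(
        predictions=[r"\frac { a } { b }"],
        references=[[r"\frac { a } { b }"]],
    )
    print(res["google_bleu"])  # 1.0 for an exact match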
@@ -1,152 +0,0 @@
from augraphy import *
import random


def ocr_augmentation_pipeline():
    pre_phase = []

    ink_phase = [
        InkColorSwap(
            ink_swap_color="random",
            ink_swap_sequence_number_range=(5, 10),
            ink_swap_min_width_range=(2, 3),
            ink_swap_max_width_range=(100, 120),
            ink_swap_min_height_range=(2, 3),
            ink_swap_max_height_range=(100, 120),
            ink_swap_min_area_range=(10, 20),
            ink_swap_max_area_range=(400, 500),
            # p=0.2
            p=0.4,
        ),
        LinesDegradation(
            line_roi=(0.0, 0.0, 1.0, 1.0),
            line_gradient_range=(32, 255),
            line_gradient_direction=(0, 2),
            line_split_probability=(0.2, 0.4),
            line_replacement_value=(250, 255),
            line_min_length=(30, 40),
            line_long_to_short_ratio=(5, 7),
            line_replacement_probability=(0.4, 0.5),
            line_replacement_thickness=(1, 3),
            # p=0.2
            p=0.4,
        ),
        # ============================
        OneOf(
            [
                Dithering(
                    dither="floyd-steinberg",
                    order=(3, 5),
                ),
                InkBleed(
                    intensity_range=(0.1, 0.2),
                    kernel_size=random.choice([(7, 7), (5, 5), (3, 3)]),
                    severity=(0.4, 0.6),
                ),
            ],
            # p=0.2
            p=0.4,
        ),
        # ============================
        # ============================
        InkShifter(
            text_shift_scale_range=(18, 27),
            text_shift_factor_range=(1, 4),
            text_fade_range=(0, 2),
            blur_kernel_size=(5, 5),
            blur_sigma=0,
            noise_type="perlin",
            # p=0.2
            p=0.4,
        ),
        # ============================
    ]

    paper_phase = [
        NoiseTexturize(  # tested
            sigma_range=(3, 10),
            turbulence_range=(2, 5),
            texture_width_range=(300, 500),
            texture_height_range=(300, 500),
            # p=0.2
            p=0.4,
        ),
        BrightnessTexturize(  # tested
            texturize_range=(0.9, 0.99),
            deviation=0.03,
            # p=0.2
            p=0.4,
        ),
    ]

    post_phase = [
        ColorShift(  # tested
            color_shift_offset_x_range=(3, 5),
            color_shift_offset_y_range=(3, 5),
            color_shift_iterations=(2, 3),
            color_shift_brightness_range=(0.9, 1.1),
            color_shift_gaussian_kernel_range=(3, 3),
            # p=0.2
            p=0.4,
        ),
        DirtyDrum(  # tested
            line_width_range=(1, 6),
            line_concentration=random.uniform(0.05, 0.15),
            direction=random.randint(0, 2),
            noise_intensity=random.uniform(0.6, 0.95),
            noise_value=(64, 224),
            ksize=random.choice([(3, 3), (5, 5), (7, 7)]),
            sigmaX=0,
            # p=0.2
            p=0.4,
        ),
        # =====================================
        OneOf(
            [
                LightingGradient(
                    light_position=None,
                    direction=None,
                    max_brightness=255,
                    min_brightness=0,
                    mode="gaussian",
                    linear_decay_rate=None,
                    transparency=None,
                ),
                Brightness(
                    brightness_range=(0.9, 1.1),
                    min_brightness=0,
                    min_brightness_value=(120, 150),
                ),
                Gamma(
                    gamma_range=(0.9, 1.1),
                ),
            ],
            # p=0.2
            p=0.4,
        ),
        # =====================================
        # =====================================
        OneOf(
            [
                SubtleNoise(
                    subtle_range=random.randint(5, 10),
                ),
                Jpeg(
                    quality_range=(70, 95),
                ),
            ],
            # p=0.2
            p=0.4,
        ),
        # =====================================
    ]

    pipeline = AugraphyPipeline(
        ink_phase=ink_phase,
        paper_phase=paper_phase,
        post_phase=post_phase,
        pre_phase=pre_phase,
        log=False,
    )

    return pipeline
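The returned AugraphyPipeline is callable on a uint8 RGB array, which is exactly how train_pipeline is applied in transforms.py below. A toy invocation:

    import numpy as np

    pipeline = ocr_augmentation_pipeline()
    img = np.full((64, 256, 3), 255, dtype=np.uint8)  # blank white canvas
    augmented = pipeline(img)  # augmented uint8 image of the same kind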
@@ -1,177 +0,0 @@
import torch
import random
import numpy as np
import cv2

from torchvision.transforms import v2
from typing import List, Union
from PIL import Image
from collections import Counter

from ...globals import (
    IMG_CHANNELS,
    FIXED_IMG_SIZE,
    IMAGE_MEAN,
    IMAGE_STD,
    MAX_RESIZE_RATIO,
    MIN_RESIZE_RATIO,
)
from .ocr_aug import ocr_augmentation_pipeline

# train_pipeline = default_augraphy_pipeline(scan_only=True)
train_pipeline = ocr_augmentation_pipeline()

general_transform_pipeline = v2.Compose(
    [
        v2.ToImage(),
        v2.ToDtype(torch.uint8, scale=True),  # optional, most inputs are already uint8 at this point
        v2.Grayscale(),
        v2.Resize(
            size=FIXED_IMG_SIZE - 1,
            interpolation=v2.InterpolationMode.BICUBIC,
            max_size=FIXED_IMG_SIZE,
            antialias=True,
        ),
        v2.ToDtype(torch.float32, scale=True),  # Normalize expects float input
        v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),
        # v2.ToPILImage()
    ]
)


def trim_white_border(image: np.ndarray):
    if len(image.shape) != 3 or image.shape[2] != 3:
        raise ValueError("Image is not in RGB format or channel is not in third dimension")

    if image.dtype != np.uint8:
        raise ValueError("Image should be stored in uint8")

    corners = [tuple(image[0, 0]), tuple(image[0, -1]), tuple(image[-1, 0]), tuple(image[-1, -1])]
    bg_color = Counter(corners).most_common(1)[0][0]
    bg_color_np = np.array(bg_color, dtype=np.uint8)

    h, w = image.shape[:2]
    bg = np.full((h, w, 3), bg_color_np, dtype=np.uint8)

    diff = cv2.absdiff(image, bg)
    mask = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)

    threshold = 15
    _, diff = cv2.threshold(mask, threshold, 255, cv2.THRESH_BINARY)

    x, y, w, h = cv2.boundingRect(diff)

    trimmed_image = image[y : y + h, x : x + w]

    return trimmed_image


def add_white_border(image: np.ndarray, max_size: int) -> np.ndarray:
    randi = [random.randint(0, max_size) for _ in range(4)]
    pad_height_size = randi[1] + randi[3]
    pad_width_size = randi[0] + randi[2]
    if pad_height_size + image.shape[0] < 30:
        compensate_height = int((30 - (pad_height_size + image.shape[0])) * 0.5) + 1
        randi[1] += compensate_height
        randi[3] += compensate_height
    if pad_width_size + image.shape[1] < 30:
        compensate_width = int((30 - (pad_width_size + image.shape[1])) * 0.5) + 1
        randi[0] += compensate_width
        randi[2] += compensate_width
    return v2.functional.pad(
        torch.from_numpy(image).permute(2, 0, 1),
        padding=randi,
        padding_mode='constant',
        fill=(255, 255, 255),
    )


def padding(images: List[torch.Tensor], required_size: int) -> List[torch.Tensor]:
    images = [
        v2.functional.pad(
            img, padding=[0, 0, required_size - img.shape[2], required_size - img.shape[1]]
        )
        for img in images
    ]
    return images


def random_resize(images: List[np.ndarray], minr: float, maxr: float) -> List[np.ndarray]:
    if len(images[0].shape) != 3 or images[0].shape[2] != 3:
        raise ValueError("Image is not in RGB format or channel is not in third dimension")

    ratios = [random.uniform(minr, maxr) for _ in range(len(images))]
    return [
        cv2.resize(
            img, (int(img.shape[1] * r), int(img.shape[0] * r)), interpolation=cv2.INTER_LANCZOS4
        )  # anti-aliasing
        for img, r in zip(images, ratios)
    ]


def rotate(image: np.ndarray, min_angle: int, max_angle: int) -> np.ndarray:
    # Get the center of the image to define the point of rotation
    image_center = tuple(np.array(image.shape[1::-1]) / 2)

    # Generate a random angle within the specified range
    angle = random.randint(min_angle, max_angle)

    # Get the rotation matrix for rotating the image around its center
    rotation_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)

    # Determine the size of the rotated image
    cos = np.abs(rotation_mat[0, 0])
    sin = np.abs(rotation_mat[0, 1])
    new_width = int((image.shape[0] * sin) + (image.shape[1] * cos))
    new_height = int((image.shape[0] * cos) + (image.shape[1] * sin))

    # Adjust the rotation matrix to take into account translation
    rotation_mat[0, 2] += (new_width / 2) - image_center[0]
    rotation_mat[1, 2] += (new_height / 2) - image_center[1]

    # Rotate the image with the specified border color (white in this case)
    rotated_image = cv2.warpAffine(
        image, rotation_mat, (new_width, new_height), borderValue=(255, 255, 255)
    )

    return rotated_image


def ocr_aug(image: np.ndarray) -> np.ndarray:
    if random.random() < 0.2:
        image = rotate(image, -5, 5)
    image = add_white_border(image, max_size=25).permute(1, 2, 0).numpy()
    image = train_pipeline(image)
    return image


def train_transform(images: List[Image.Image]) -> List[torch.Tensor]:
    assert IMG_CHANNELS == 1, "Only support grayscale images for now"

    images = [np.array(img.convert('RGB')) for img in images]
    # random resize first
    images = random_resize(images, MIN_RESIZE_RATIO, MAX_RESIZE_RATIO)
    images = [trim_white_border(image) for image in images]

    # OCR augmentation
    images = [ocr_aug(image) for image in images]

    # general transform pipeline
    images = [general_transform_pipeline(image) for image in images]
    # padding to fixed size
    images = padding(images, FIXED_IMG_SIZE)
    return images


def inference_transform(images: List[Union[np.ndarray, Image.Image]]) -> List[torch.Tensor]:
    assert IMG_CHANNELS == 1, "Only support grayscale images for now"
    images = [
        np.array(img.convert('RGB')) if isinstance(img, Image.Image) else img for img in images
    ]
    images = [trim_white_border(image) for image in images]
    # general transform pipeline
    images = [general_transform_pipeline(image) for image in images]
    # padding to fixed size
    images = padding(images, FIXED_IMG_SIZE)

    return images
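A short sketch of the inference path through these helpers, assuming a hypothetical image path; stacking the returned list reproduces the batch shape the model consumes.

    import torch
    from PIL import Image

    imgs = [Image.open('/path/to/formula.png')]  # hypothetical path
    tensors = inference_transform(imgs)          # fixed-size grayscale tensors
    pixel_values = torch.stack(tensors)          # (B, 1, FIXED_IMG_SIZE, FIXED_IMG_SIZE)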
48
texteller/models/texteller.py
Normal file
@@ -0,0 +1,48 @@
from pathlib import Path

from transformers import RobertaTokenizerFast, VisionEncoderDecoderConfig, VisionEncoderDecoderModel

from texteller.constants import (
    FIXED_IMG_SIZE,
    IMG_CHANNELS,
    MAX_TOKEN_SIZE,
    VOCAB_SIZE,
)
from texteller.globals import Globals
from texteller.types import TexTellerModel
from texteller.utils import cuda_available


class TexTeller(VisionEncoderDecoderModel):
    def __init__(self):
        config = VisionEncoderDecoderConfig.from_pretrained(Globals().repo_name)
        config.encoder.image_size = FIXED_IMG_SIZE
        config.encoder.num_channels = IMG_CHANNELS
        config.decoder.vocab_size = VOCAB_SIZE
        config.decoder.max_position_embeddings = MAX_TOKEN_SIZE

        super().__init__(config=config)

    @classmethod
    def from_pretrained(cls, model_dir: str | None = None, use_onnx=False) -> TexTellerModel:
        if model_dir is None or model_dir == Globals().repo_name:
            if not use_onnx:
                return VisionEncoderDecoderModel.from_pretrained(Globals().repo_name)
            else:
                from optimum.onnxruntime import ORTModelForVision2Seq

                return ORTModelForVision2Seq.from_pretrained(
                    Globals().repo_name,
                    provider="CUDAExecutionProvider"
                    if cuda_available()
                    else "CPUExecutionProvider",
                )
        model_dir = Path(model_dir).resolve()
        return VisionEncoderDecoderModel.from_pretrained(str(model_dir))

    @classmethod
    def get_tokenizer(cls, tokenizer_dir: str = None) -> RobertaTokenizerFast:
        if tokenizer_dir is None or tokenizer_dir == Globals().repo_name:
            return RobertaTokenizerFast.from_pretrained(Globals().repo_name)
        tokenizer_dir = Path(tokenizer_dir).resolve()
        return RobertaTokenizerFast.from_pretrained(str(tokenizer_dir))
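A minimal load sketch for the new module, assuming the default weights at Globals().repo_name are reachable:

    from texteller.models.texteller import TexTeller

    model = TexTeller.from_pretrained()    # default VisionEncoderDecoderModel weights
    tokenizer = TexTeller.get_tokenizer()  # matching RobertaTokenizerFast
    model.eval()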
@@ -1,24 +0,0 @@
import os
from pathlib import Path
from datasets import load_dataset
from ..ocr_model.model.TexTeller import TexTeller
from ..globals import VOCAB_SIZE


if __name__ == '__main__':
    script_dirpath = Path(__file__).resolve().parent
    os.chdir(script_dirpath)

    tokenizer = TexTeller.get_tokenizer()

    # Don't forget to configure your dataset path in loader.py
    dataset = load_dataset('../ocr_model/train/dataset/loader.py')['train']

    new_tokenizer = tokenizer.train_new_from_iterator(
        text_iterator=dataset['latex_formula'],
        # If you want to use a different vocab size, **change VOCAB_SIZE in globals.py**
        vocab_size=VOCAB_SIZE,
    )

    # Save the new tokenizer for later training and inference
    new_tokenizer.save_pretrained('./your_dir_name')
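Once saved, the retrained tokenizer can be loaded back through the same helper for training and inference:

    tokenizer = TexTeller.get_tokenizer('./your_dir_name')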
@@ -1 +0,0 @@
from .mix_inference import mix_inference
@@ -1,261 +0,0 @@
import re
import heapq
import cv2
import time
import numpy as np

from collections import Counter
from typing import List
from PIL import Image

from ..det_model.inference import predict as latex_det_predict
from ..det_model.Bbox import Bbox, draw_bboxes

from ..ocr_model.utils.inference import inference as latex_rec_predict
from ..ocr_model.utils.to_katex import to_katex, change_all

MAXV = 999999999


def mask_img(img, bboxes: List[Bbox], bg_color: np.ndarray) -> np.ndarray:
    mask_img = img.copy()
    for bbox in bboxes:
        mask_img[bbox.p.y : bbox.p.y + bbox.h, bbox.p.x : bbox.p.x + bbox.w] = bg_color
    return mask_img


def bbox_merge(sorted_bboxes: List[Bbox]) -> List[Bbox]:
    if len(sorted_bboxes) == 0:
        return []
    bboxes = sorted_bboxes.copy()
    guard = Bbox(MAXV, bboxes[-1].p.y, -1, -1, label="guard")
    bboxes.append(guard)
    res = []
    prev = bboxes[0]
    for curr in bboxes:
        if prev.ur_point.x <= curr.p.x or not prev.same_row(curr):
            res.append(prev)
            prev = curr
        else:
            prev.w = max(prev.w, curr.ur_point.x - prev.p.x)
    return res


def split_conflict(ocr_bboxes: List[Bbox], latex_bboxes: List[Bbox]) -> List[Bbox]:
    if latex_bboxes == []:
        return ocr_bboxes
    if ocr_bboxes == [] or len(ocr_bboxes) == 1:
        return ocr_bboxes

    bboxes = sorted(ocr_bboxes + latex_bboxes)

    # log results
    for idx, bbox in enumerate(bboxes):
        bbox.content = str(idx)
    draw_bboxes(Image.fromarray(img), bboxes, name="before_split_confict.png")

    assert len(bboxes) > 1

    heapq.heapify(bboxes)
    res = []
    candidate = heapq.heappop(bboxes)
    curr = heapq.heappop(bboxes)
    idx = 0
    while len(bboxes) > 0:
        idx += 1
        assert candidate.p.x <= curr.p.x or not candidate.same_row(curr)

        if candidate.ur_point.x <= curr.p.x or not candidate.same_row(curr):
            res.append(candidate)
            candidate = curr
            curr = heapq.heappop(bboxes)
        elif candidate.ur_point.x < curr.ur_point.x:
            assert not (candidate.label != "text" and curr.label != "text")
            if candidate.label == "text" and curr.label == "text":
                candidate.w = curr.ur_point.x - candidate.p.x
                curr = heapq.heappop(bboxes)
            elif candidate.label != curr.label:
                if candidate.label == "text":
                    candidate.w = curr.p.x - candidate.p.x
                    res.append(candidate)
                    candidate = curr
                    curr = heapq.heappop(bboxes)
                else:
                    curr.w = curr.ur_point.x - candidate.ur_point.x
                    curr.p.x = candidate.ur_point.x
                    heapq.heappush(bboxes, curr)
                    curr = heapq.heappop(bboxes)

        elif candidate.ur_point.x >= curr.ur_point.x:
            assert not (candidate.label != "text" and curr.label != "text")

            if candidate.label == "text":
                assert curr.label != "text"
                heapq.heappush(
                    bboxes,
                    Bbox(
                        curr.ur_point.x,
                        candidate.p.y,
                        candidate.h,
                        candidate.ur_point.x - curr.ur_point.x,
                        label="text",
                        confidence=candidate.confidence,
                        content=None,
                    ),
                )
                candidate.w = curr.p.x - candidate.p.x
                res.append(candidate)
                candidate = curr
                curr = heapq.heappop(bboxes)
            else:
                assert curr.label == "text"
                curr = heapq.heappop(bboxes)
        else:
            assert False
    res.append(candidate)
    res.append(curr)

    # log results
    for idx, bbox in enumerate(res):
        bbox.content = str(idx)
    draw_bboxes(Image.fromarray(img), res, name="after_split_confict.png")

    return res


def slice_from_image(img: np.ndarray, ocr_bboxes: List[Bbox]) -> List[np.ndarray]:
    sliced_imgs = []
    for bbox in ocr_bboxes:
        x, y = int(bbox.p.x), int(bbox.p.y)
        w, h = int(bbox.w), int(bbox.h)
        sliced_img = img[y : y + h, x : x + w]
        sliced_imgs.append(sliced_img)
    return sliced_imgs


def mix_inference(
    img_path: str,
    infer_config,
    latex_det_model,
    lang_ocr_models,
    latex_rec_models,
    accelerator="cpu",
    num_beams=1,
) -> str:
    '''
    Take a mixed image of formulas and text and output a str (in markdown syntax)
    '''
    global img
    img = cv2.imread(img_path)
    corners = [tuple(img[0, 0]), tuple(img[0, -1]), tuple(img[-1, 0]), tuple(img[-1, -1])]
    bg_color = np.array(Counter(corners).most_common(1)[0][0])

    start_time = time.time()
    latex_bboxes = latex_det_predict(img_path, latex_det_model, infer_config)
    end_time = time.time()
    print(f"latex_det_model time: {end_time - start_time:.2f}s")
    latex_bboxes = sorted(latex_bboxes)
    # log results
    draw_bboxes(Image.fromarray(img), latex_bboxes, name="latex_bboxes(unmerged).png")
    latex_bboxes = bbox_merge(latex_bboxes)
    # log results
    draw_bboxes(Image.fromarray(img), latex_bboxes, name="latex_bboxes(merged).png")
    masked_img = mask_img(img, latex_bboxes, bg_color)

    det_model, rec_model = lang_ocr_models
    start_time = time.time()
    det_prediction, _ = det_model(masked_img)
    end_time = time.time()
    print(f"ocr_det_model time: {end_time - start_time:.2f}s")
    ocr_bboxes = [
        Bbox(
            p[0][0],
            p[0][1],
            p[3][1] - p[0][1],
            p[1][0] - p[0][0],
            label="text",
            confidence=None,
            content=None,
        )
        for p in det_prediction
    ]
    # log results
    draw_bboxes(Image.fromarray(img), ocr_bboxes, name="ocr_bboxes(unmerged).png")

    ocr_bboxes = sorted(ocr_bboxes)
    ocr_bboxes = bbox_merge(ocr_bboxes)
    # log results
    draw_bboxes(Image.fromarray(img), ocr_bboxes, name="ocr_bboxes(merged).png")
    ocr_bboxes = split_conflict(ocr_bboxes, latex_bboxes)
    ocr_bboxes = list(filter(lambda x: x.label == "text", ocr_bboxes))

    sliced_imgs: List[np.ndarray] = slice_from_image(img, ocr_bboxes)
    start_time = time.time()
    rec_predictions, _ = rec_model(sliced_imgs)
    end_time = time.time()
    print(f"ocr_rec_model time: {end_time - start_time:.2f}s")

    assert len(rec_predictions) == len(ocr_bboxes)
    for content, bbox in zip(rec_predictions, ocr_bboxes):
        bbox.content = content[0]

    latex_imgs = []
    for bbox in latex_bboxes:
        latex_imgs.append(img[bbox.p.y : bbox.p.y + bbox.h, bbox.p.x : bbox.p.x + bbox.w])
    start_time = time.time()
    latex_rec_res = latex_rec_predict(
        *latex_rec_models, latex_imgs, accelerator, num_beams, max_tokens=800
    )
    end_time = time.time()
    print(f"latex_rec_model time: {end_time - start_time:.2f}s")

    for bbox, content in zip(latex_bboxes, latex_rec_res):
        bbox.content = to_katex(content)
        if bbox.label == "embedding":
            bbox.content = " $" + bbox.content + "$ "
        elif bbox.label == "isolated":
            bbox.content = '\n\n' + r"$$" + bbox.content + r"$$" + '\n\n'

    bboxes = sorted(ocr_bboxes + latex_bboxes)
    if bboxes == []:
        return ""

    md = ""
    prev = Bbox(bboxes[0].p.x, bboxes[0].p.y, -1, -1, label="guard")
    for curr in bboxes:
        # Add the formula number back to the isolated formula
        if prev.label == "isolated" and curr.label == "text" and prev.same_row(curr):
            curr.content = curr.content.strip()
            if curr.content.startswith('(') and curr.content.endswith(')'):
                curr.content = curr.content[1:-1]

            if re.search(r'\\tag\{.*\}$', md[:-4]) is not None:
                # in case of multiple tags
                md = md[:-5] + f', {curr.content}' + '}' + md[-4:]
            else:
                md = md[:-4] + f'\\tag{{{curr.content}}}' + md[-4:]
            continue

        if not prev.same_row(curr):
            md += " "

        if curr.label == "embedding":
            # remove the bold effect from inline formulas
            curr.content = change_all(curr.content, r'\bm', r' ', r'{', r'}', r'', r' ')
            curr.content = change_all(curr.content, r'\boldsymbol', r' ', r'{', r'}', r'', r' ')
            curr.content = change_all(curr.content, r'\textit', r' ', r'{', r'}', r'', r' ')
            curr.content = change_all(curr.content, r'\textbf', r' ', r'{', r'}', r'', r' ')
            curr.content = change_all(curr.content, r'\textbf', r' ', r'{', r'}', r'', r' ')
            curr.content = change_all(curr.content, r'\mathbf', r' ', r'{', r'}', r'', r' ')

            # change split environment into aligned
            curr.content = curr.content.replace(r'\begin{split}', r'\begin{aligned}')
            curr.content = curr.content.replace(r'\end{split}', r'\end{aligned}')

            # remove extra spaces (keeping only one)
            curr.content = re.sub(r' +', ' ', curr.content)
            assert curr.content.startswith(' $') and curr.content.endswith('$ ')
            curr.content = ' $' + curr.content[2:-2].strip() + '$ '
        md += curr.content
        prev = curr
    return md.strip()
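A hedged wiring sketch for the deleted entry point; every model object below is a placeholder for whatever the det/ocr loaders in this repo actually return.

    # Hypothetical wiring: infer_config, latex_det_model, det_model, rec_model,
    # model and tokenizer all come from the project's own loaders.
    md = mix_inference(
        'page.png',                # hypothetical mixed text/formula image
        infer_config,              # detector PredictConfig
        latex_det_model,           # ONNX InferenceSession for formula detection
        (det_model, rec_model),    # text detection + recognition pair
        (model, tokenizer),        # TexTeller model + tokenizer
        accelerator='cpu',
        num_beams=1,
    )
    print(md)  # markdown with inline $...$ and isolated $$...$$ formulas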
@@ -81,7 +81,7 @@ class BaseRecLabelDecode(object):
         word_list = []
         word_col_list = []
         state_list = []
-        valid_col = np.where(selection == True)[0]
+        valid_col = np.where(selection)[0]
 
         for c_i, char in enumerate(text):
             if "\u4e00" <= char <= "\u9fff":
@@ -12,25 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-import sys
-
-__dir__ = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(__dir__)
-sys.path.insert(0, os.path.abspath(os.path.join(__dir__, "../..")))
 
 os.environ["FLAGS_allocator_strategy"] = "auto_growth"
 
-import sys
 import time
 
-import cv2
 import numpy as np
 
-# import tools.infer.utility as utility
-import utility
-from DBPostProcess import DBPostProcess
-from operators import DetResizeForTest, KeepKeys, NormalizeImage, ToCHWImage
-from utility import get_logger
+from .DBPostProcess import DBPostProcess
+from .operators import DetResizeForTest, KeepKeys, NormalizeImage, ToCHWImage
+from .utility import create_predictor, get_logger
 
 
 def transform(data, ops=None):
@@ -82,7 +73,7 @@ class TextDetector(object):
             self.input_tensor,
             self.output_tensors,
             self.config,
-        ) = utility.create_predictor(args, "det", logger)
+        ) = create_predictor(args, "det", logger)
 
         assert self.use_onnx
         if self.use_onnx:
@@ -1,155 +0,0 @@
import sys
import argparse
import tempfile
import time
import numpy as np
import cv2

from pathlib import Path
from starlette.requests import Request
from ray import serve
from ray.serve.handle import DeploymentHandle
from onnxruntime import InferenceSession

from texteller.models.ocr_model.utils.inference import inference as rec_inference
from texteller.models.det_model.inference import predict as det_inference
from texteller.models.ocr_model.model.TexTeller import TexTeller
from texteller.models.det_model.inference import PredictConfig
from texteller.models.ocr_model.utils.to_katex import to_katex


PYTHON_VERSION = str(sys.version_info.major) + '.' + str(sys.version_info.minor)
LIBPATH = Path(sys.executable).parent.parent / 'lib' / ('python' + PYTHON_VERSION) / 'site-packages'
CUDNNPATH = LIBPATH / 'nvidia' / 'cudnn' / 'lib'

parser = argparse.ArgumentParser()
parser.add_argument('-ckpt', '--checkpoint_dir', type=str)
parser.add_argument('-tknz', '--tokenizer_dir', type=str)
parser.add_argument('-port', '--server_port', type=int, default=8000)
parser.add_argument('--num_replicas', type=int, default=1)
parser.add_argument('--ncpu_per_replica', type=float, default=1.0)
parser.add_argument('--ngpu_per_replica', type=float, default=0.0)

parser.add_argument('--inference-mode', type=str, default='cpu')
parser.add_argument('--num_beams', type=int, default=1)
parser.add_argument('-onnx', action='store_true', help='using onnx runtime')

args = parser.parse_args()
if args.ngpu_per_replica > 0 and not args.inference_mode == 'cuda':
    raise ValueError("--inference-mode must be cuda or mps if ngpu_per_replica > 0")


@serve.deployment(
    num_replicas=args.num_replicas,
    ray_actor_options={
        "num_cpus": args.ncpu_per_replica,
        "num_gpus": args.ngpu_per_replica * 1.0 / 2,
    },
)
class TexTellerRecServer:
    def __init__(
        self,
        checkpoint_path: str,
        tokenizer_path: str,
        inf_mode: str = 'cpu',
        use_onnx: bool = False,
        num_beams: int = 1,
    ) -> None:
        self.model = TexTeller.from_pretrained(
            checkpoint_path, use_onnx=use_onnx, onnx_provider=inf_mode
        )
        self.tokenizer = TexTeller.get_tokenizer(tokenizer_path)
        self.inf_mode = inf_mode
        self.num_beams = num_beams

        if not use_onnx:
            self.model = self.model.to(inf_mode) if inf_mode != 'cpu' else self.model

    def predict(self, image_nparray) -> str:
        return to_katex(
            rec_inference(
                self.model,
                self.tokenizer,
                [image_nparray],
                accelerator=self.inf_mode,
                num_beams=self.num_beams,
            )[0]
        )


@serve.deployment(
    num_replicas=args.num_replicas,
    ray_actor_options={
        "num_cpus": args.ncpu_per_replica,
        "num_gpus": args.ngpu_per_replica * 1.0 / 2,
        "runtime_env": {"env_vars": {"LD_LIBRARY_PATH": f"{str(CUDNNPATH)}/:$LD_LIBRARY_PATH"}},
    },
)
class TexTellerDetServer:
    def __init__(self, inf_mode='cpu'):
        self.infer_config = PredictConfig("./models/det_model/model/infer_cfg.yml")
        self.latex_det_model = InferenceSession(
            "./models/det_model/model/rtdetr_r50vd_6x_coco.onnx",
            providers=['CUDAExecutionProvider'] if inf_mode == 'cuda' else ['CPUExecutionProvider'],
        )

    async def predict(self, image_nparray) -> str:
        with tempfile.TemporaryDirectory() as temp_dir:
            img_path = f"{temp_dir}/temp_image.jpg"
            cv2.imwrite(img_path, image_nparray)

            latex_bboxes = det_inference(img_path, self.latex_det_model, self.infer_config)
            return latex_bboxes


@serve.deployment()
class Ingress:
    def __init__(self, det_server: DeploymentHandle, rec_server: DeploymentHandle) -> None:
        self.det_server = det_server
        self.texteller_server = rec_server

    async def __call__(self, request: Request) -> str:
        request_path = request.url.path
        form = await request.form()
        img_rb = await form['img'].read()

        img_nparray = np.frombuffer(img_rb, np.uint8)
        img_nparray = cv2.imdecode(img_nparray, cv2.IMREAD_COLOR)
        img_nparray = cv2.cvtColor(img_nparray, cv2.COLOR_BGR2RGB)

        if request_path.startswith("/fdet"):
            if self.det_server is None:
                return "[ERROR] rtdetr_r50vd_6x_coco.onnx not found."
            pred = await self.det_server.predict.remote(img_nparray)
            return pred

        elif request_path.startswith("/frec"):
            pred = await self.texteller_server.predict.remote(img_nparray)
            return pred

        else:
            return "[ERROR] Invalid request path"


if __name__ == '__main__':
    ckpt_dir = args.checkpoint_dir
    tknz_dir = args.tokenizer_dir

    serve.start(http_options={"host": "0.0.0.0", "port": args.server_port})
    rec_server = TexTellerRecServer.bind(
        ckpt_dir,
        tknz_dir,
        inf_mode=args.inference_mode,
        use_onnx=args.onnx,
        num_beams=args.num_beams,
    )
    det_server = None
    if Path('./models/det_model/model/rtdetr_r50vd_6x_coco.onnx').exists():
        det_server = TexTellerDetServer.bind(args.inference_mode)
    ingress = Ingress.bind(det_server, rec_server)

    # ingress_handle = serve.run(ingress, route_prefix="/predict")
    ingress_handle = serve.run(ingress, route_prefix="/")

    while True:
        time.sleep(1)
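For reference, a client call against this server's routes (/frec for formula recognition, /fdet for detection), matching the 'img' form field it reads; host, port, and file name are hypothetical.

    import requests

    with open('formula.png', 'rb') as f:
        resp = requests.post('http://localhost:8000/frec', files={'img': f})
    print(resp.text)  # KaTeX-compatible LaTeX via to_katex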
@@ -1,9 +0,0 @@
@echo off
SETLOCAL ENABLEEXTENSIONS

set CHECKPOINT_DIR=default
set TOKENIZER_DIR=default

streamlit run web.py

ENDLOCAL
@@ -1,7 +0,0 @@
#!/usr/bin/env bash
set -exu

export CHECKPOINT_DIR="default"
export TOKENIZER_DIR="default"

streamlit run web.py
@@ -1,14 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
gpu_ids: all
num_processes: 1
machine_rank: 0
main_training_function: main
num_machines: 1
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
12
texteller/types/__init__.py
Normal file
@@ -0,0 +1,12 @@
from typing import TypeAlias

from optimum.onnxruntime import ORTModelForVision2Seq
from transformers import VisionEncoderDecoderModel

from .bbox import Bbox


TexTellerModel: TypeAlias = VisionEncoderDecoderModel | ORTModelForVision2Seq


__all__ = ["Bbox", "TexTellerModel"]
@@ -1,10 +1,3 @@
-import os
-
-from PIL import Image, ImageDraw
-from typing import List
-from pathlib import Path
-
-
 class Point:
     def __init__(self, x: int, y: int):
         self.x = int(x)
@@ -51,9 +44,9 @@
         return 1.0 * abs(self.p.y - other.p.y) / max(self.h, other.h) < self.THREADHOLD
 
     def __lt__(self, other) -> bool:
-        '''
+        """
         from top to bottom, from left to right
-        '''
+        """
         if not self.same_row(other):
             return self.p.y < other.p.y
         else:
@@ -61,29 +54,3 @@
 
     def __repr__(self) -> str:
         return f"Bbox(upper_left_point={self.p}, h={self.h}, w={self.w}), label={self.label}, confident={self.confidence}, content={self.content})"
-
-
-def draw_bboxes(img: Image.Image, bboxes: List[Bbox], name="annotated_image.png"):
-    curr_work_dir = Path(os.getcwd())
-    log_dir = curr_work_dir / "logs"
-    log_dir.mkdir(exist_ok=True)
-    drawer = ImageDraw.Draw(img)
-    for bbox in bboxes:
-        # Calculate the coordinates for the rectangle to be drawn
-        left = bbox.p.x
-        top = bbox.p.y
-        right = bbox.p.x + bbox.w
-        bottom = bbox.p.y + bbox.h
-
-        # Draw the rectangle on the image
-        drawer.rectangle([left, top, right, bottom], outline="green", width=1)
-
-        # Optionally, add text label if it exists
-        if bbox.label:
-            drawer.text((left, top), bbox.label, fill="blue")
-
-        if bbox.content:
-            drawer.text((left, bottom - 10), bbox.content[:10], fill="red")
-
-    # Save the image with drawn rectangles
-    img.save(log_dir / name)
26
texteller/utils/__init__.py
Normal file
@@ -0,0 +1,26 @@
from .device import get_device, cuda_available, mps_available, str2device
from .image import readimgs, transform
from .latex import change_all, remove_style, add_newlines
from .path import mkdir, resolve_path
from .misc import lines_dedent
from .bbox import mask_img, bbox_merge, split_conflict, slice_from_image, draw_bboxes

__all__ = [
    "get_device",
    "cuda_available",
    "mps_available",
    "str2device",
    "readimgs",
    "transform",
    "change_all",
    "remove_style",
    "add_newlines",
    "mkdir",
    "resolve_path",
    "lines_dedent",
    "mask_img",
    "bbox_merge",
    "split_conflict",
    "slice_from_image",
    "draw_bboxes",
]
142
texteller/utils/bbox.py
Normal file
@@ -0,0 +1,142 @@
import heapq
import os
from pathlib import Path

import numpy as np
from PIL import Image, ImageDraw

from texteller.types import Bbox

_MAXV = 999999999


def mask_img(img, bboxes: list[Bbox], bg_color: np.ndarray) -> np.ndarray:
    mask_img = img.copy()
    for bbox in bboxes:
        mask_img[bbox.p.y : bbox.p.y + bbox.h, bbox.p.x : bbox.p.x + bbox.w] = bg_color
    return mask_img


def bbox_merge(sorted_bboxes: list[Bbox]) -> list[Bbox]:
    if len(sorted_bboxes) == 0:
        return []
    bboxes = sorted_bboxes.copy()
    guard = Bbox(_MAXV, bboxes[-1].p.y, -1, -1, label="guard")
    bboxes.append(guard)
    res = []
    prev = bboxes[0]
    for curr in bboxes:
        if prev.ur_point.x <= curr.p.x or not prev.same_row(curr):
            res.append(prev)
            prev = curr
        else:
            prev.w = max(prev.w, curr.ur_point.x - prev.p.x)
    return res


def split_conflict(ocr_bboxes: list[Bbox], latex_bboxes: list[Bbox]) -> list[Bbox]:
    if latex_bboxes == []:
        return ocr_bboxes
    if ocr_bboxes == [] or len(ocr_bboxes) == 1:
        return ocr_bboxes

    bboxes = sorted(ocr_bboxes + latex_bboxes)

    assert len(bboxes) > 1

    heapq.heapify(bboxes)
    res = []
    candidate = heapq.heappop(bboxes)
    curr = heapq.heappop(bboxes)
    idx = 0
    while len(bboxes) > 0:
        idx += 1
        assert candidate.p.x <= curr.p.x or not candidate.same_row(curr)

        if candidate.ur_point.x <= curr.p.x or not candidate.same_row(curr):
            res.append(candidate)
            candidate = curr
            curr = heapq.heappop(bboxes)
        elif candidate.ur_point.x < curr.ur_point.x:
            assert not (candidate.label != "text" and curr.label != "text")
            if candidate.label == "text" and curr.label == "text":
                candidate.w = curr.ur_point.x - candidate.p.x
                curr = heapq.heappop(bboxes)
            elif candidate.label != curr.label:
                if candidate.label == "text":
                    candidate.w = curr.p.x - candidate.p.x
                    res.append(candidate)
                    candidate = curr
                    curr = heapq.heappop(bboxes)
                else:
                    curr.w = curr.ur_point.x - candidate.ur_point.x
                    curr.p.x = candidate.ur_point.x
                    heapq.heappush(bboxes, curr)
                    curr = heapq.heappop(bboxes)

        elif candidate.ur_point.x >= curr.ur_point.x:
            assert not (candidate.label != "text" and curr.label != "text")

            if candidate.label == "text":
                assert curr.label != "text"
                heapq.heappush(
                    bboxes,
                    Bbox(
                        curr.ur_point.x,
                        candidate.p.y,
                        candidate.h,
                        candidate.ur_point.x - curr.ur_point.x,
                        label="text",
                        confidence=candidate.confidence,
                        content=None,
                    ),
                )
                candidate.w = curr.p.x - candidate.p.x
                res.append(candidate)
                candidate = curr
                curr = heapq.heappop(bboxes)
            else:
                assert curr.label == "text"
                curr = heapq.heappop(bboxes)
        else:
            assert False
    res.append(candidate)
    res.append(curr)

    return res


def slice_from_image(img: np.ndarray, ocr_bboxes: list[Bbox]) -> list[np.ndarray]:
    sliced_imgs = []
    for bbox in ocr_bboxes:
        x, y = int(bbox.p.x), int(bbox.p.y)
        w, h = int(bbox.w), int(bbox.h)
        sliced_img = img[y : y + h, x : x + w]
        sliced_imgs.append(sliced_img)
    return sliced_imgs


def draw_bboxes(img: Image.Image, bboxes: list[Bbox], name="annotated_image.png"):
    curr_work_dir = Path(os.getcwd())
    log_dir = curr_work_dir / "logs"
    log_dir.mkdir(exist_ok=True)
    drawer = ImageDraw.Draw(img)
    for bbox in bboxes:
        # Calculate the coordinates for the rectangle to be drawn
        left = bbox.p.x
        top = bbox.p.y
        right = bbox.p.x + bbox.w
        bottom = bbox.p.y + bbox.h

        # Draw the rectangle on the image
        drawer.rectangle([left, top, right, bottom], outline="green", width=1)

        # Optionally, add text label if it exists
        if bbox.label:
            drawer.text((left, top), bbox.label, fill="blue")

        if bbox.content:
            drawer.text((left, bottom - 10), bbox.content[:10], fill="red")

    # Save the image with drawn rectangles
    img.save(log_dir / name)
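A toy run of bbox_merge, assuming Bbox defaults its confidence/content arguments as the guard construction above suggests; the positional argument order follows that same construction (x, y, h, w).

    from texteller.types import Bbox
    from texteller.utils import bbox_merge

    a = Bbox(0, 0, 10, 50, label="text")   # spans x in [0, 50)
    b = Bbox(40, 0, 10, 30, label="text")  # same row, overlaps a
    merged = bbox_merge(sorted([a, b]))
    # one box remains, widened to cover both: merged[0].w == 70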
41
texteller/utils/device.py
Normal file
@@ -0,0 +1,41 @@
from typing import Literal

import torch


def str2device(device_str: Literal["cpu", "cuda", "mps"]) -> torch.device:
    if device_str == "cpu":
        return torch.device("cpu")
    elif device_str == "cuda":
        return torch.device("cuda")
    elif device_str == "mps":
        return torch.device("mps")
    else:
        raise ValueError(f"Invalid device: {device_str}")


def get_device(device_index: int = None) -> torch.device:
    """
    Automatically detect the best available device for inference.

    Args:
        device_index: The index of the GPU device to use if multiple are available.
            Defaults to None, which uses the first available GPU.

    Returns:
        torch.device: Selected device for model inference.
    """
    if cuda_available():
        return str2device("cuda")
    elif mps_available():
        return str2device("mps")
    else:
        return str2device("cpu")


def cuda_available() -> bool:
    return torch.cuda.is_available()


def mps_available() -> bool:
    return torch.backends.mps.is_available()
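Typical use of the device helpers:

    from texteller.utils import get_device

    device = get_device()  # prefers cuda, then mps, then cpu
    # model = model.to(device)  # assuming `model` is a loaded TexTeller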
121
texteller/utils/image.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
from collections import Counter
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
from torchvision.transforms import v2
|
||||||
|
|
||||||
|
from texteller.constants import (
|
||||||
|
FIXED_IMG_SIZE,
|
||||||
|
IMG_CHANNELS,
|
||||||
|
IMAGE_MEAN,
|
||||||
|
IMAGE_STD,
|
||||||
|
)
|
||||||
|
from texteller.logger import get_logger
|
||||||
|
|
||||||
|
|
||||||
|
_logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
def readimgs(image_paths: list[str]) -> list[np.ndarray]:
|
||||||
|
"""
|
||||||
|
Read and preprocess a list of images from their file paths.
|
||||||
|
|
||||||
|
This function reads each image from the provided paths, handles different
|
||||||
|
bit depths (converting 16-bit to 8-bit if necessary), and normalizes color
|
||||||
|
channels to RGB format regardless of the original color space (BGR, BGRA,
|
||||||
|
or grayscale).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
image_paths (list[str]): A list of file paths to the images to be read.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list[np.ndarray]: A list of NumPy arrays containing the preprocessed images
|
||||||
|
in RGB format. Images that could not be read are skipped.
|
||||||
|
"""
|
||||||
|
processed_images = []
|
||||||
|
for path in image_paths:
|
||||||
|
image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
|
||||||
|
if image is None:
|
||||||
|
raise ValueError(f"Image at {path} could not be read.")
|
||||||
|
if image.dtype == np.uint16:
|
||||||
|
_logger.warning(f'Converting {path} to 8-bit, image may be lossy.')
|
||||||
|
image = cv2.convertScaleAbs(image, alpha=(255.0 / 65535.0))
|
||||||
|
|
||||||
|
channels = 1 if len(image.shape) == 2 else image.shape[2]
|
||||||
|
if channels == 4:
|
||||||
|
image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
|
||||||
|
elif channels == 1:
|
||||||
|
image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
|
||||||
|
elif channels == 3:
|
||||||
|
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
||||||
|
processed_images.append(image)
|
||||||
|
|
||||||
|
return processed_images
|
||||||
|
|
||||||
|
|
||||||
|
def trim_white_border(image: np.ndarray) -> np.ndarray:
    if len(image.shape) != 3 or image.shape[2] != 3:
        raise ValueError("Image is not in RGB format or channel is not in third dimension")

    if image.dtype != np.uint8:
        raise ValueError("Image should be stored as uint8")

    # Estimate the background color from the four corner pixels (majority vote)
    corners = [tuple(image[0, 0]), tuple(image[0, -1]), tuple(image[-1, 0]), tuple(image[-1, -1])]
    bg_color = Counter(corners).most_common(1)[0][0]
    bg_color_np = np.array(bg_color, dtype=np.uint8)

    h, w = image.shape[:2]
    bg = np.full((h, w, 3), bg_color_np, dtype=np.uint8)

    # Pixels differing from the background by more than the threshold are foreground
    diff = cv2.absdiff(image, bg)
    mask = cv2.cvtColor(diff, cv2.COLOR_RGB2GRAY)

    threshold = 15
    _, diff = cv2.threshold(mask, threshold, 255, cv2.THRESH_BINARY)

    x, y, w, h = cv2.boundingRect(diff)

    trimmed_image = image[y : y + h, x : x + w]

    return trimmed_image


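# A synthetic example of the corner-vote trimming above: a dark rectangle on a
# white canvas is cropped to its bounding box (import path assumed as before):
import numpy as np

from texteller.utils.image import trim_white_border

canvas = np.full((100, 100, 3), 255, dtype=np.uint8)  # white background
canvas[40:60, 30:70] = 0  # black rectangle

trimmed = trim_white_border(canvas)
assert trimmed.shape == (20, 40, 3)  # cropped to the rectangle's extent
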
def padding(images: List[torch.Tensor], required_size: int) -> List[torch.Tensor]:
    # Pad each image on the right and bottom only, so content stays anchored
    # at the top-left corner ([left, top, right, bottom] padding order).
    images = [
        v2.functional.pad(
            img, padding=[0, 0, required_size - img.shape[2], required_size - img.shape[1]]
        )
        for img in images
    ]
    return images


def transform(images: List[Union[np.ndarray, Image.Image]]) -> List[torch.Tensor]:
    general_transform_pipeline = v2.Compose(
        [
            v2.ToImage(),
            v2.ToDtype(torch.uint8, scale=True),
            v2.Grayscale(),
            v2.Resize(
                size=FIXED_IMG_SIZE - 1,
                interpolation=v2.InterpolationMode.BICUBIC,
                max_size=FIXED_IMG_SIZE,
                antialias=True,
            ),
            v2.ToDtype(torch.float32, scale=True),  # Normalize expects float input
            v2.Normalize(mean=[IMAGE_MEAN], std=[IMAGE_STD]),
        ]
    )

    assert IMG_CHANNELS == 1, "Only support grayscale images for now"
    images = [
        np.array(img.convert("RGB")) if isinstance(img, Image.Image) else img for img in images
    ]
    images = [trim_white_border(image) for image in images]
    images = [general_transform_pipeline(image) for image in images]
    images = padding(images, FIXED_IMG_SIZE)

    return images
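# End to end, transform yields normalized single-channel tensors padded to a
# fixed square, ready to stack into a batch. A sketch, assuming the import
# paths above:
import numpy as np
import torch

from texteller.constants import FIXED_IMG_SIZE
from texteller.utils.image import transform

img = np.full((32, 64, 3), 255, dtype=np.uint8)
img[8:24, 16:48] = 0  # non-white content so trimming keeps a region

batch = torch.stack(transform([img]))
assert batch.shape == (1, 1, FIXED_IMG_SIZE, FIXED_IMG_SIZE)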
128
texteller/utils/latex.py
Normal file
@@ -0,0 +1,128 @@
import re


def _change(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r):
    result = ""
    i = 0
    n = len(input_str)

    while i < n:
        if input_str[i : i + len(old_inst)] != old_inst:
            result += input_str[i]
            i += 1
            continue

        # Found old_inst; check whether it is followed by old_surr_l
        start = i + len(old_inst)
        if start < n and input_str[start] == old_surr_l:
            # Look for the matching old_surr_r, tracking nesting and escapes
            count = 1
            j = start + 1
            escaped = False
            while j < n and count > 0:
                if input_str[j] == "\\" and not escaped:
                    escaped = True
                    j += 1
                    continue
                if input_str[j] == old_surr_r and not escaped:
                    count -= 1
                    if count == 0:
                        break
                elif input_str[j] == old_surr_l and not escaped:
                    count += 1
                escaped = False
                j += 1

            if count == 0:
                assert j < n
                assert input_str[start] == old_surr_l
                assert input_str[j] == old_surr_r
                inner_content = input_str[start + 1 : j]
                # Replace the delimited content with the new pattern
                result += new_inst + new_surr_l + inner_content + new_surr_r
                i = j + 1
                continue
            else:
                assert count >= 1
                assert j == n
                print("Warning: unbalanced delimiters in input string")
                result += new_inst + new_surr_l
                i = start + 1
                continue
        else:
            result += input_str[i:start]
            i = start

    if old_inst != new_inst and (old_inst + old_surr_l) in result:
        return _change(result, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r)
    else:
        return result


def _find_substring_positions(string, substring):
    positions = [match.start() for match in re.finditer(re.escape(substring), string)]
    return positions


def change_all(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r):
    # Rewrite every occurrence of old_inst + old_surr_l from right to left so
    # earlier replacements do not shift the positions of later ones.
    pos = _find_substring_positions(input_str, old_inst + old_surr_l)
    res = list(input_str)
    for p in pos[::-1]:
        res[p:] = list(
            _change(
                "".join(res[p:]), old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r
            )
        )
    res = "".join(res)
    return res


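# change_all rewrites every `old_inst old_surr_l ... old_surr_r` span while
# keeping the inner content and respecting nested delimiters. A sketch,
# converting \dfrac to \frac (import path assumed):
from texteller.utils.latex import change_all

s = r"\dfrac{a}{b} + \dfrac{1}{\dfrac{x}{y}}"
out = change_all(s, r"\dfrac", r"\frac", "{", "}", "{", "}")
assert out == r"\frac{a}{b} + \frac{1}{\frac{x}{y}}"
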
def remove_style(input_str: str) -> str:
    # Strip common styling commands, keeping only their inner content
    input_str = change_all(input_str, r"\bm", r" ", r"{", r"}", r"", r" ")
    input_str = change_all(input_str, r"\boldsymbol", r" ", r"{", r"}", r"", r" ")
    input_str = change_all(input_str, r"\textit", r" ", r"{", r"}", r"", r" ")
    input_str = change_all(input_str, r"\textbf", r" ", r"{", r"}", r"", r" ")
    input_str = change_all(input_str, r"\mathbf", r" ", r"{", r"}", r"", r" ")
    output_str = input_str.strip()
    return output_str


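# remove_style strips the styling wrappers listed above but keeps their
# arguments; the space-padded replacements can leave extra whitespace, so the
# check below compares tokens rather than raw strings (import path assumed):
from texteller.utils.latex import remove_style

assert remove_style(r"\textbf{x} + \bm{\alpha}").split() == ["x", "+", r"\alpha"]
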
def add_newlines(latex_str: str) -> str:
    """
    Adds newlines to a LaTeX string based on specific patterns, ensuring no
    duplicate newlines are added around begin/end environments:

    - After \\ (if not already followed by a newline)
    - Before \\begin{...} (if not already preceded by a newline)
    - After \\begin{...} (if not already followed by a newline)
    - Before \\end{...} (if not already preceded by a newline)
    - After \\end{...} (if not already followed by a newline)

    Args:
        latex_str: The input LaTeX string.

    Returns:
        The LaTeX string with added newlines, avoiding duplicates.
    """
    processed_str = latex_str

    # 1. Replace whitespace around \begin{...} with \n...\n
    #    \s* matches zero or more whitespace characters (space, tab, newline);
    #    the \begin{...} part is captured in group 1 (\g<1>)
    processed_str = re.sub(r"\s*(\\begin\{[^}]*\})\s*", r"\n\g<1>\n", processed_str)

    # 2. Replace whitespace around \end{...} with \n...\n (same logic as for \begin)
    processed_str = re.sub(r"\s*(\\end\{[^}]*\})\s*", r"\n\g<1>\n", processed_str)

    # 3. Add a newline after \\ (if not already followed by one)
    processed_str = re.sub(r"\\\\(?!\n| )|\\\\ ", r"\\\\\n", processed_str)

    # 4. Cleanup: collapse multiple consecutive newlines into a single newline.
    #    This handles cases where the replacements above created \n\n.
    processed_str = re.sub(r"\n{2,}", "\n", processed_str)

    # Remove leading/trailing whitespace (including any single newlines at the
    # very start/end introduced by the replacements) from the entire result.
    return processed_str.strip()
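# A sketch of add_newlines on a small align environment (import path assumed):
from texteller.utils.latex import add_newlines

src = r"\begin{align} a \\ b \end{align}"
assert add_newlines(src) == "\\begin{align}\na \\\\\nb\n\\end{align}"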
5
texteller/utils/misc.py
Normal file
@@ -0,0 +1,5 @@
from textwrap import dedent


def lines_dedent(s: str) -> str:
    return dedent(s).strip()
52
texteller/utils/path.py
Normal file
@@ -0,0 +1,52 @@
from pathlib import Path
from typing import Literal

from texteller.logger import get_logger

_logger = get_logger(__name__)


def resolve_path(path: str | Path) -> str:
    if isinstance(path, str):
        path = Path(path)
    return str(path.expanduser().resolve())


def touch(path: str | Path) -> None:
    if isinstance(path, str):
        path = Path(path)
    path.touch(exist_ok=True)


def mkdir(path: str | Path) -> None:
    if isinstance(path, str):
        path = Path(path)
    path.mkdir(parents=True, exist_ok=True)


def rmfile(path: str | Path) -> None:
    if isinstance(path, str):
        path = Path(path)
    path.unlink(missing_ok=False)


def rmdir(path: str | Path, mode: Literal["empty", "recursive"] = "empty") -> None:
    """Remove a directory.

    Args:
        path: Path to the directory to remove
        mode: "empty" to only remove an empty directory, "recursive" to
            recursively remove the directory and all of its contents
    """
    if isinstance(path, str):
        path = Path(path)

    if mode == "empty":
        path.rmdir()
        _logger.info(f"Removed empty directory: {path}")
    elif mode == "recursive":
        import shutil

        shutil.rmtree(path)
        _logger.info(f"Recursively removed directory and all contents: {path}")
    else:
        raise ValueError(f"Invalid mode: {mode}. Must be 'empty' or 'recursive'")
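# A usage sketch tying the path helpers together (paths are illustrative):
from texteller.utils.path import mkdir, resolve_path, rmdir, touch

mkdir("/tmp/texteller_demo/sub")
touch("/tmp/texteller_demo/sub/marker.txt")
print(resolve_path("~"))  # expanded, absolute home path
rmdir("/tmp/texteller_demo", mode="recursive")  # removes the whole tree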