1) 实现了文本-公式混排识别; 2) 重构了项目结构

This commit is contained in:
三洋三洋
2024-04-21 00:05:14 +08:00
parent eab6e4c85d
commit 185b2e3db6
19 changed files with 753 additions and 296 deletions

View File

@@ -0,0 +1,85 @@
from PIL import Image, ImageDraw
from typing import List
class Point:
    """A 2-D pixel coordinate; both components are coerced to int."""

    def __init__(self, x: int, y: int):
        self.x = int(x)
        self.y = int(y)

    def __repr__(self) -> str:
        return f"Point(x={self.x}, y={self.y})"


class Bbox:
    """Axis-aligned bounding box anchored at its upper-left corner.

    Attributes:
        p: upper-left corner (Point).
        h, w: height and width in pixels.
        label: optional class label (e.g. "text", "embedding", "isolated").
        confidence: detector confidence score.
        content: recognized text/LaTeX, attached later in the pipeline.
    """

    # Maximum top-edge offset (relative to the taller box's height) for two
    # boxes to still count as the same row.  NOTE(review): name keeps the
    # original misspelling ("THREADHOLD") because external code may reference
    # this class attribute.
    THREADHOLD = 0.4

    def __init__(self, x, y, h, w, label: str = None, confidence: float = 0, content: str = None):
        self.p = Point(x, y)
        self.h = int(h)
        self.w = int(w)
        self.label = label
        self.confidence = confidence
        self.content = content

    @property
    def ul_point(self) -> Point:
        """Upper-left corner."""
        return self.p

    @property
    def ur_point(self) -> Point:
        """Upper-right corner."""
        return Point(self.p.x + self.w, self.p.y)

    @property
    def ll_point(self) -> Point:
        """Lower-left corner."""
        return Point(self.p.x, self.p.y + self.h)

    @property
    def lr_point(self) -> Point:
        """Lower-right corner."""
        return Point(self.p.x + self.w, self.p.y + self.h)

    def same_row(self, other) -> bool:
        """Return True if this box and *other* lie on the same text row."""
        # One box's vertical span contains the other's -> same row.
        if (
            (self.p.y >= other.p.y and self.ll_point.y <= other.ll_point.y)
            or (self.p.y <= other.p.y and self.ll_point.y >= other.ll_point.y)
        ):
            return True
        # Vertically disjoint -> different rows.
        if self.ll_point.y <= other.p.y or self.p.y >= other.ll_point.y:
            return False
        # Partial overlap: compare the top-edge offset against the taller height.
        return 1.0 * abs(self.p.y - other.p.y) / max(self.h, other.h) < self.THREADHOLD

    def __lt__(self, other) -> bool:
        '''
        from top to bottom, from left to right
        '''
        if not self.same_row(other):
            return self.p.y < other.p.y
        else:
            return self.p.x < other.p.x

    def __repr__(self) -> str:
        # Fixed: the original had a stray ')' after the w= field and
        # misspelled "confidence" as "confident"; all fields now render
        # inside one balanced pair of parentheses.
        return (
            f"Bbox(upper_left_point={self.p}, h={self.h}, w={self.w}, "
            f"label={self.label}, confidence={self.confidence}, content={self.content})"
        )
def draw_bboxes(img: Image.Image, bboxes: List[Bbox], name="annotated_image.png"):
    """Outline each bbox on *img* (with optional label/content text) and save
    the annotated image to *name*."""
    canvas = ImageDraw.Draw(img)
    for box in bboxes:
        # Rectangle corners from the box's anchor and extent.
        x0, y0 = box.p.x, box.p.y
        x1, y1 = x0 + box.w, y0 + box.h
        canvas.rectangle([x0, y0, x1, y1], outline="green", width=1)
        # Class label at the top-left corner, when present.
        if box.label:
            canvas.text((x0, y0), box.label, fill="blue")
        # First few characters of recognized content near the bottom edge.
        if box.content:
            canvas.text((x0, y1 - 10), box.content[:10], fill="red")
    img.save(name)

View File

@@ -0,0 +1,161 @@
import os
import yaml
import numpy as np
import cv2
from tqdm import tqdm
from typing import List
from .preprocess import Compose
from .Bbox import Bbox
# Model families accepted by PredictConfig.check_model: the exported
# config's `arch` string must contain one of these substrings.
SUPPORT_MODELS = {
    'YOLO', 'PPYOLOE', 'RCNN', 'SSD', 'Face', 'FCOS', 'SOLOv2', 'TTFNet',
    'S2ANet', 'JDE', 'FairMOT', 'DeepSORT', 'GFL', 'PicoDet', 'CenterNet',
    'TOOD', 'RetinaNet', 'StrongBaseline', 'STGCN', 'YOLOX', 'HRNet',
    'DETR'
}
class PredictConfig(object):
    """Parsed detector configuration: preprocess pipeline, labels, thresholds.

    Args:
        infer_config (str): path of infer_cfg.yml
    """

    def __init__(self, infer_config):
        # Parse the exported YAML config for preprocessing and metadata.
        with open(infer_config) as f:
            cfg = yaml.safe_load(f)
        self.check_model(cfg)
        self.arch = cfg['arch']
        self.preprocess_infos = cfg['Preprocess']
        self.min_subgraph_size = cfg['min_subgraph_size']
        self.label_list = cfg['label_list']
        self.use_dynamic_shape = cfg['use_dynamic_shape']
        self.draw_threshold = cfg.get("draw_threshold", 0.5)
        self.mask = cfg.get("mask", False)
        self.tracker = cfg.get("tracker", None)
        self.nms = cfg.get("NMS", None)
        self.fpn_stride = cfg.get("fpn_stride", None)
        # Assign every label a drawing color, cycling through a small palette.
        palette = [(0, 255, 0), (255, 0, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255)]
        self.colors = {
            label: palette[i % len(palette)]
            for i, label in enumerate(self.label_list)
        }
        if self.arch == 'RCNN' and cfg.get('export_onnx', False):
            print(
                'The RCNN export model is used for ONNX and it only supports batch_size = 1'
            )
        self.print_config()

    def check_model(self, yml_conf):
        """Validate that the configured arch belongs to a supported family.

        Raises:
            ValueError: loaded model not in supported model type
        """
        if any(model in yml_conf['arch'] for model in SUPPORT_MODELS):
            return True
        raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[
            'arch'], SUPPORT_MODELS))

    def print_config(self):
        """Dump the parsed configuration to stdout."""
        print('----------- Model Configuration -----------')
        print(f"Model Arch: {self.arch}")
        print("Transform Order: ")
        for op in self.preprocess_infos:
            print(f"--transform op: {op['type']}")
        print('--------------------------------------------')
def draw_bbox(image, outputs, infer_config):
    """Draw every above-threshold detection on *image* and return it.

    Each row of *outputs* is [cls_id, score, xmin, ymin, xmax, ymax].
    """
    threshold = infer_config.draw_threshold
    for cls_id, score, xmin, ymin, xmax, ymax in outputs:
        if score <= threshold:
            continue
        label = infer_config.label_list[int(cls_id)]
        color = infer_config.colors[label]
        top_left = (int(xmin), int(ymin))
        bottom_right = (int(xmax), int(ymax))
        cv2.rectangle(image, top_left, bottom_right, color, 2)
        caption = "{}: {:.2f}".format(label, score)
        # Caption sits just above the box's top edge.
        cv2.putText(image, caption, (int(xmin), int(ymin - 5)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    return image
def predict_image(imgsave_dir, infer_config, predictor, img_list):
    """Batch-detect over *img_list* and save crops plus annotated images.

    For every readable image: run the ONNX predictor, print raw detections,
    crop each above-threshold region into `<imgsave_dir>/subimages/`, and
    save an annotated copy as `output_<basename>` in *imgsave_dir*.
    Unreadable image paths are collected and reported at the end.
    """
    # load preprocess transforms
    transforms = Compose(infer_config.preprocess_infos)
    errImgList = []
    # Check and create subimg_save_dir if not exist
    subimg_save_dir = os.path.join(imgsave_dir, 'subimages')
    os.makedirs(subimg_save_dir, exist_ok=True)
    # predict image
    for img_path in tqdm(img_list):
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: Could not read image {img_path}. Skipping...")
            errImgList.append(img_path)
            continue
        inputs = transforms(img_path)
        inputs_name = [var.name for var in predictor.get_inputs()]
        # Add a leading batch axis for every model input (batch_size = 1).
        inputs = {k: inputs[k][None, ] for k in inputs_name}
        outputs = predictor.run(output_names=None, input_feed=inputs)
        print("ONNXRuntime predict: ")
        if infer_config.arch in ["HRNet"]:
            # Keypoint-style output: just dump the raw array.
            print(np.array(outputs[0]))
        else:
            # Detection rows: [cls_id, score, xmin, ymin, xmax, ymax].
            # Rows with cls_id <= -1 are skipped — presumably detector
            # padding; confirm against the exporter.
            bboxes = np.array(outputs[0])
            for bbox in bboxes:
                if bbox[0] > -1 and bbox[1] > infer_config.draw_threshold:
                    print(f"{int(bbox[0])} {bbox[1]} "
                          f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}")
        # Save the subimages (crop from the original image)
        subimg_counter = 1  # NOTE(review): incremented but never used in the filename
        for output in np.array(outputs[0]):
            cls_id, score, xmin, ymin, xmax, ymax = output
            if score > infer_config.draw_threshold:
                label = infer_config.label_list[int(cls_id)]
                # Clamp the crop's top-left to the image; oversize xmax/ymax
                # are tolerated by numpy slicing.
                subimg = img[int(max(ymin, 0)):int(ymax), int(max(xmin, 0)):int(xmax)]
                if len(subimg) == 0:
                    continue
                subimg_filename = f"{os.path.splitext(os.path.basename(img_path))[0]}_{label}_{xmin:.2f}_{ymin:.2f}_{xmax:.2f}_{ymax:.2f}.jpg"
                subimg_path = os.path.join(subimg_save_dir, subimg_filename)
                cv2.imwrite(subimg_path, subimg)
                subimg_counter += 1
        # Draw bounding boxes and save the image with bounding boxes
        img_with_bbox = draw_bbox(img, np.array(outputs[0]), infer_config)
        output_dir = imgsave_dir
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, "output_" + os.path.basename(img_path))
        cv2.imwrite(output_file, img_with_bbox)
    print("ErrorImgs:")
    print(errImgList)
def predict(img_path: str, predictor, infer_config) -> List[Bbox]:
    """Run the ONNX detector on one image and return above-threshold Bboxes.

    Args:
        img_path: path of the input image.
        predictor: an onnxruntime InferenceSession for the detector.
        infer_config: PredictConfig with preprocess_infos / label_list /
            draw_threshold.

    Returns:
        List of Bbox(x, y, h, w, label, confidence) for detections whose
        score exceeds infer_config.draw_threshold.
    """
    transforms = Compose(infer_config.preprocess_infos)
    inputs = transforms(img_path)
    inputs_name = [var.name for var in predictor.get_inputs()]
    # Add a leading batch axis for every model input (batch_size = 1).
    inputs = {k: inputs[k][None, ] for k in inputs_name}
    outputs = predictor.run(output_names=None, input_feed=inputs)[0]
    res = []
    for output in outputs:
        cls_id, score = output[0], output[1]
        # Skip padding rows (cls_id == -1), mirroring the `bbox[0] > -1`
        # guard in predict_image; a negative id would otherwise index
        # label_list from the end and yield a bogus label.
        if cls_id < 0 or score <= infer_config.draw_threshold:
            continue
        cls_name = infer_config.label_list[int(cls_id)]
        xmin = int(max(output[2], 0))
        ymin = int(max(output[3], 0))
        xmax = int(output[4])
        ymax = int(output[5])
        res.append(Bbox(xmin, ymin, ymax - ymin, xmax - xmin, cls_name, score))
    return res

View File

@@ -8,8 +8,8 @@ Preprocess:
- interp: 2
keep_ratio: false
target_size:
- 640
- 640
- 1600
- 1600
type: Resize
- mean:
- 0.0

View File

@@ -4,9 +4,14 @@ import copy
def decode_image(img_path):
with open(img_path, 'rb') as f:
im_read = f.read()
data = np.frombuffer(im_read, dtype='uint8')
if isinstance(img_path, str):
with open(img_path, 'rb') as f:
im_read = f.read()
data = np.frombuffer(im_read, dtype='uint8')
else:
assert isinstance(img_path, np.ndarray)
data = img_path
im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
img_info = {

View File

@@ -4,19 +4,22 @@ import numpy as np
from transformers import RobertaTokenizerFast, GenerationConfig
from typing import List, Union
from models.ocr_model.model.TexTeller import TexTeller
from models.ocr_model.utils.transforms import inference_transform
from models.ocr_model.utils.helpers import convert2rgb
from models.globals import MAX_TOKEN_SIZE
from .transforms import inference_transform
from .helpers import convert2rgb
from ..model.TexTeller import TexTeller
from ...globals import MAX_TOKEN_SIZE
def inference(
model: TexTeller,
tokenizer: RobertaTokenizerFast,
imgs: Union[List[str], List[np.ndarray]],
inf_mode: str = 'cpu',
accelerator: str = 'cpu',
num_beams: int = 1,
max_tokens = None
) -> List[str]:
if imgs == []:
return []
model.eval()
if isinstance(imgs[0], str):
imgs = convert2rgb(imgs)
@@ -26,11 +29,11 @@ def inference(
imgs = inference_transform(imgs)
pixel_values = torch.stack(imgs)
model = model.to(inf_mode)
pixel_values = pixel_values.to(inf_mode)
model = model.to(accelerator)
pixel_values = pixel_values.to(accelerator)
generate_config = GenerationConfig(
max_new_tokens=MAX_TOKEN_SIZE,
max_new_tokens=MAX_TOKEN_SIZE if max_tokens is None else max_tokens,
num_beams=num_beams,
do_sample=False,
pad_token_id=tokenizer.pad_token_id,

View File

@@ -0,0 +1,110 @@
import re
def change(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r):
    """Rewrite every `old_inst old_surr_l ... old_surr_r` occurrence in
    *input_str* as `new_inst new_surr_l ... new_surr_r`.

    Delimiters are matched with nesting and backslash-escape awareness.
    An old_inst NOT followed by old_surr_l is dropped from the output
    (which also keeps the trailing recursion finite).  If the closing
    delimiter is missing, a warning is printed and scanning resumes just
    past the opening delimiter.
    """
    result = ""
    i = 0
    n = len(input_str)
    while i < n:
        if input_str[i:i+len(old_inst)] == old_inst:
            # check if the old_inst is followed by old_surr_l
            start = i + len(old_inst)
        else:
            result += input_str[i]
            i += 1
            continue
        if start < n and input_str[start] == old_surr_l:
            # found an old_inst followed by old_surr_l, now look for the matching old_surr_r
            count = 1
            j = start + 1
            escaped = False
            while j < n and count > 0:
                if input_str[j] == '\\' and not escaped:
                    escaped = True
                    j += 1
                    continue
                if input_str[j] == old_surr_r and not escaped:
                    count -= 1
                    if count == 0:
                        break
                elif input_str[j] == old_surr_l and not escaped:
                    count += 1
                escaped = False
                j += 1
            if count == 0:
                assert j < n
                assert input_str[start] == old_surr_l
                assert input_str[j] == old_surr_r
                inner_content = input_str[start + 1:j]
                # Replace the content with new pattern
                result += new_inst + new_surr_l + inner_content + new_surr_r
                i = j + 1
                continue
            else:
                # Fixed: the scan hit the end of the string with delimiters
                # still open, so count can legitimately be 1 — the original
                # `assert count > 1` crashed with AssertionError on any
                # simple unbalanced input such as "\mbox{abc".
                assert count >= 1 and j == n
                print("Warning: unbalanced surrogate pair in input string")
                result += new_inst + new_surr_l
                i = start + 1
                continue
        else:
            # old_inst without its opening delimiter: drop it and move on.
            i = start
    if old_inst != new_inst and old_inst in result:
        # The rewrite may have re-exposed old_inst; repeat until fixed point.
        return change(result, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r)
    else:
        return result
def to_katex(formula: str) -> str:
    """Normalize model-produced LaTeX so that KaTeX can render it.

    Transformations (all via `change`):
      * strip \\mbox{...} wrappers;
      * rewrite size commands "\\large$...$" as "\\large{...}";
      * rewrite \\boldmath$...$ as \\bm{...};
      * drop the brace group after delimiter-sizing commands (\\left{ -> \\left);
      * turn display math \\[...\\] into its contents plus \\newline, dropping
        a trailing \\newline.
    """
    res = formula
    # remove mbox surrounding
    res = change(res, r'\mbox', r'', r'{', r'}', r'', r'')
    # Fixed: the original iterated zip(list, list) — zipping the list with
    # itself — which is just a plain loop over the same instruction.
    size_instructions = [
        r'\Huge',
        r'\huge',
        r'\LARGE',
        r'\Large',
        r'\large',
        r'\normalsize',
        r'\small',
        r'\footnotesize',
        r'\scriptsize',
        r'\tiny'
    ]
    for ins in size_instructions:
        res = change(res, ins, ins, r'$', r'$', '{', '}')
    res = change(res, r'\boldmath', r'\bm', r'$', r'$', r'{', r'}')
    delimiter_instructions = [
        r'\left',
        r'\middle',
        r'\right',
        r'\big',
        r'\Big',
        r'\bigg',
        r'\Bigg',
        r'\bigl',
        r'\Bigl',
        r'\biggl',
        r'\Biggl',
        r'\bigm',
        r'\Bigm',
        r'\biggm',
        r'\Biggm',
        r'\bigr',
        r'\Bigr',
        r'\biggr',
        r'\Biggr'
    ]
    for ins in delimiter_instructions:
        res = change(res, ins, ins, r'{', r'}', r'', r'')
    # \[...\] display math -> contents followed by \newline.
    res = re.sub(r'\\\[(.*?)\\\]', r'\1\\newline', res)
    if res.endswith(r'\newline'):
        res = res[:-len(r'\newline')]
    return res

View File

@@ -0,0 +1 @@
from .mix_inference import mix_inference

View File

@@ -0,0 +1,257 @@
import re
import heapq
import cv2
import numpy as np
from onnxruntime import InferenceSession
from collections import Counter
from typing import List
from PIL import Image
from surya.ocr import run_ocr
from surya.detection import batch_text_detection
from surya.input.processing import slice_polys_from_image, slice_bboxes_from_image
from surya.recognition import batch_recognition
from surya.model.detection import segformer
from surya.model.recognition.model import load_model
from surya.model.recognition.processor import load_processor
from ..det_model.inference import PredictConfig
from ..det_model.inference import predict as latex_det_predict
from ..det_model.Bbox import Bbox, draw_bboxes
from ..ocr_model.model.TexTeller import TexTeller
from ..ocr_model.utils.inference import inference as latex_rec_predict
from ..ocr_model.utils.to_katex import to_katex
# Sentinel x-coordinate larger than any real pixel position; used for guard boxes.
MAXV = 999999999
def mask_img(img, bboxes: List["Bbox"], bg_color: np.ndarray) -> np.ndarray:
    """Return a copy of *img* with every bbox region painted *bg_color*.

    Args:
        img: H x W x C image array; not modified.
        bboxes: regions to blank out.
        bg_color: per-channel fill value broadcast over each region.
    """
    # Renamed from `mask_img`: the original local shadowed the function name.
    masked = img.copy()
    for bbox in bboxes:
        masked[bbox.p.y:bbox.p.y + bbox.h, bbox.p.x:bbox.p.x + bbox.w] = bg_color
    return masked
def bbox_merge(sorted_bboxes: List[Bbox]) -> List[Bbox]:
    """Merge horizontally overlapping boxes that sit on the same row.

    Expects *sorted_bboxes* already in reading order; returns a new list in
    which each run of same-row, x-overlapping boxes is collapsed into one
    widened box (the run's boxes are widened in place).
    """
    if not sorted_bboxes:
        return []
    work = list(sorted_bboxes)
    # Sentinel far to the right guarantees the final run gets flushed.
    work.append(Bbox(MAXV, work[-1].p.y, -1, -1, label="guard"))
    merged = []
    prev = work[0]
    for curr in work:
        overlaps = prev.ur_point.x > curr.p.x and prev.same_row(curr)
        if overlaps:
            # Extend the accumulating box to cover curr.
            prev.w = max(prev.w, curr.ur_point.x - prev.p.x)
        else:
            merged.append(prev)
            prev = curr
    return merged
def split_conflict(ocr_bboxes: List[Bbox], latex_bboxes: List[Bbox]) -> List[Bbox]:
    """Resolve horizontal conflicts between OCR text boxes and formula boxes.

    Pops boxes in reading order from a min-heap (ordered by Bbox.__lt__) and
    reconciles each adjacent overlapping pair on the same row:
      * text vs text    -> merged into one wider text box;
      * text vs formula -> the text box is trimmed, split, or dropped so the
        formula region wins (the pair is never formula vs formula, asserted).
    Returns the adjusted list; input boxes may be mutated.

    NOTE(review): relies on the module-global `img` (assigned inside
    mix_inference) for the debug renderings below — calling this function on
    its own raises NameError.  The PNG dumps look like leftover debug
    instrumentation.
    """
    if latex_bboxes == []:
        return ocr_bboxes
    if ocr_bboxes == [] or len(ocr_bboxes) == 1:
        return ocr_bboxes
    bboxes = sorted(ocr_bboxes + latex_bboxes)
    ######## debug #########
    for idx, bbox in enumerate(bboxes):
        bbox.content = str(idx)
    draw_bboxes(Image.fromarray(img), bboxes, name="before_split_confict.png")
    ######## debug ###########
    assert len(bboxes) > 1
    heapq.heapify(bboxes)
    res = []
    # `candidate` is the box currently being reconciled against the next box `curr`.
    candidate = heapq.heappop(bboxes)
    curr = heapq.heappop(bboxes)
    idx = 0
    while (len(bboxes) > 0):
        idx += 1
        assert candidate.p.x < curr.p.x or not candidate.same_row(curr)
        if candidate.ur_point.x <= curr.p.x or not candidate.same_row(curr):
            # No horizontal overlap: keep candidate and advance the window.
            res.append(candidate)
            candidate = curr
            curr = heapq.heappop(bboxes)
        elif candidate.ur_point.x < curr.ur_point.x:
            # Partial overlap; at most one of the pair is a formula box.
            assert not (candidate.label != "text" and curr.label != "text")
            if candidate.label == "text" and curr.label == "text":
                # Two overlapping text boxes: widen candidate to cover both.
                candidate.w = curr.ur_point.x - candidate.p.x
                curr = heapq.heappop(bboxes)
            elif candidate.label != curr.label:
                if candidate.label == "text":
                    # Text overlapped by a formula on its right: trim the text
                    # to end where the formula starts.
                    candidate.w = curr.p.x - candidate.p.x
                    res.append(candidate)
                    candidate = curr
                    curr = heapq.heappop(bboxes)
                else:
                    # Formula overlapped by text: move the text box to start
                    # right after the formula and re-queue it.
                    curr.w = curr.ur_point.x - candidate.ur_point.x
                    curr.p.x = candidate.ur_point.x
                    heapq.heappush(bboxes, curr)
                    curr = heapq.heappop(bboxes)
        elif candidate.ur_point.x >= curr.ur_point.x:
            # curr lies entirely inside candidate horizontally.
            assert not (candidate.label != "text" and curr.label != "text")
            if candidate.label == "text":
                assert curr.label != "text"
                # Text fully covers a formula: keep the left part of the text
                # now, re-queue the remainder to the right of the formula.
                heapq.heappush(
                    bboxes,
                    Bbox(
                        curr.ur_point.x,
                        candidate.p.y,
                        candidate.h,
                        candidate.ur_point.x - curr.ur_point.x,
                        label="text",
                        confidence=candidate.confidence,
                        content=None
                    )
                )
                candidate.w = curr.p.x - candidate.p.x
                res.append(candidate)
                candidate = curr
                curr = heapq.heappop(bboxes)
            else:
                # Text fully inside a formula box: discard the text box.
                assert curr.label == "text"
                curr = heapq.heappop(bboxes)
        else:
            # Unreachable: the branches above are exhaustive.
            assert False
    # Flush the last pair still held in the window.
    res.append(candidate)
    res.append(curr)
    ######## debug #########
    for idx, bbox in enumerate(res):
        bbox.content = str(idx)
    draw_bboxes(Image.fromarray(img), res, name="after_split_confict.png")
    ######## debug ###########
    return res
def mix_inference(
    img_path: str,
    language: str,
    infer_config,
    latex_det_model,
    lang_ocr_models,
    latex_rec_models,
    accelerator="cpu",
    num_beams=1
) -> str:
    '''
    Input a mixed image of formula text and output str (in markdown syntax)

    Pipeline: detect formula regions, mask them out, run line-level OCR on
    the remainder, resolve text/formula box conflicts, recognize each formula
    crop with TexTeller, then stitch everything back in reading order.

    Args:
        img_path: path of the mixed text/formula image.
        language: OCR language code forwarded to surya (e.g. "zh").
        infer_config: PredictConfig for the formula detector.
        latex_det_model: ONNX InferenceSession of the formula detector.
        lang_ocr_models: (det_model, det_processor, rec_model, rec_processor).
        latex_rec_models: (TexTeller model, tokenizer).
        accelerator: device string for formula recognition.
        num_beams: beam count for formula decoding.

    NOTE(review): writes several debug PNGs into the working directory and
    publishes `img` as a module global for split_conflict — side effects
    worth removing before production use.
    '''
    global img
    img = cv2.imread(img_path)
    # Guess the background color by majority vote over the four corner pixels.
    corners = [tuple(img[0, 0]), tuple(img[0, -1]),
               tuple(img[-1, 0]), tuple(img[-1, -1])]
    bg_color = np.array(Counter(corners).most_common(1)[0][0])
    # 1) Detect formula regions, then merge overlapping ones per row.
    latex_bboxes = latex_det_predict(img_path, latex_det_model, infer_config)
    latex_bboxes = sorted(latex_bboxes)
    draw_bboxes(Image.fromarray(img), latex_bboxes, name="latex_bboxes(unmerged).png")
    latex_bboxes = bbox_merge(latex_bboxes)
    draw_bboxes(Image.fromarray(img), latex_bboxes, name="latex_bboxes(merged).png")
    # 2) Blank formulas out so text detection only sees prose.
    masked_img = mask_img(img, latex_bboxes, bg_color)
    det_model, det_processor, rec_model, rec_processor = lang_ocr_models
    images = [Image.fromarray(masked_img)]
    det_prediction = batch_text_detection(images, det_model, det_processor)[0]
    draw_bboxes(Image.fromarray(img), latex_bboxes, name="ocr_bboxes(unmerged).png")
    lang = [language]
    slice_map = []
    all_slices = []
    all_langs = []
    # Convert surya boxes ([x0, y0, x1, y1]) into Bbox(x, y, h, w).
    ocr_bboxes = [
        Bbox(
            p.bbox[0], p.bbox[1], p.bbox[3] - p.bbox[1], p.bbox[2] - p.bbox[0],
            label="text",
            confidence=p.confidence,
            content=None
        )
        for p in det_prediction.bboxes
    ]
    # 3) Merge text boxes, then split any that collide with formula boxes.
    ocr_bboxes = sorted(ocr_bboxes)
    ocr_bboxes = bbox_merge(ocr_bboxes)
    draw_bboxes(Image.fromarray(img), ocr_bboxes, name="ocr_bboxes(merged).png")
    ocr_bboxes = split_conflict(ocr_bboxes, latex_bboxes)
    ocr_bboxes = list(filter(lambda x: x.label == "text", ocr_bboxes))
    # 4) OCR every remaining text region via polygon slices.
    polygons = [
        [
            [bbox.ul_point.x, bbox.ul_point.y],
            [bbox.ur_point.x, bbox.ur_point.y],
            [bbox.lr_point.x, bbox.lr_point.y],
            [bbox.ll_point.x, bbox.ll_point.y]
        ]
        for bbox in ocr_bboxes
    ]
    slices = slice_polys_from_image(images[0], polygons)
    slice_map.append(len(slices))
    all_slices.extend(slices)
    all_langs.extend([lang] * len(slices))
    rec_predictions, _ = batch_recognition(all_slices, all_langs, rec_model, rec_processor)
    assert len(rec_predictions) == len(ocr_bboxes)
    for content, bbox in zip(rec_predictions, ocr_bboxes):
        bbox.content = content
    # 5) Recognize each formula crop with TexTeller and wrap it in math markers.
    latex_imgs = []
    for bbox in latex_bboxes:
        latex_imgs.append(img[bbox.p.y:bbox.p.y + bbox.h, bbox.p.x:bbox.p.x + bbox.w])
    latex_rec_res = latex_rec_predict(*latex_rec_models, latex_imgs, accelerator, num_beams, max_tokens=200)
    for bbox, content in zip(latex_bboxes, latex_rec_res):
        bbox.content = to_katex(content)
        if bbox.label == "embedding":
            bbox.content = " $" + bbox.content + "$ "
        elif bbox.label == "isolated":
            bbox.content = '\n' + r"$$" + bbox.content + r"$$" + '\n'
    # 6) Emit markdown in reading order, breaking lines between rows.
    bboxes = sorted(ocr_bboxes + latex_bboxes)
    if bboxes == []:
        return ""
    md = ""
    # Zero-size sentinel on the first box's row, so no newline precedes
    # the very first element.
    prev = Bbox(bboxes[0].p.x, bboxes[0].p.y, -1, -1, label="guard")
    # prev = bboxes[0]
    for curr in bboxes:
        if not prev.same_row(curr):
            md += "\n"
        md += curr.content
        # A "(n)" equation number right after a display formula gets its own line.
        if (
            prev.label == "isolated"
            and curr.label == "text"
            and bool(re.fullmatch(r"\([1-9]\d*?\)", curr.content))
        ):
            md += '\n'
        prev = curr
    return md
if __name__ == '__main__':
    # Ad-hoc smoke test wired to developer-local paths; adjust before running.
    img_path = "/Users/Leehy/Code/TexTeller/test3.png"
    # latex_det_model = InferenceSession("/Users/Leehy/Code/TexTeller/src/models/det_model/model/rtdetr_r50vd_6x_coco_trained_on_IBEM_en_papers.onnx")
    latex_det_model = InferenceSession("/Users/Leehy/Code/TexTeller/src/models/det_model/model/rtdetr_r50vd_6x_coco.onnx")
    infer_config = PredictConfig("/Users/Leehy/Code/TexTeller/src/models/det_model/model/infer_cfg.yml")
    # surya text detection + recognition models.
    det_processor, det_model = segformer.load_processor(), segformer.load_model()
    rec_model, rec_processor = load_model(), load_processor()
    lang_ocr_models = (det_model, det_processor, rec_model, rec_processor)
    # TexTeller formula-recognition model + tokenizer.
    texteller = TexTeller.from_pretrained()
    tokenizer = TexTeller.get_tokenizer()
    latex_rec_models = (texteller, tokenizer)
    res = mix_inference(img_path, "zh", infer_config, latex_det_model, lang_ocr_models, latex_rec_models)
    print(res)
    # Removed the trailing `pause = 1` — an unused debugger-breakpoint leftover.