From e77c222469131e974f79d76a57b640d0d2c5ed93 Mon Sep 17 00:00:00 2001
From: OleehyO
Date: Mon, 21 Apr 2025 12:22:46 +0000
Subject: [PATCH] deploy: 3de00312eb693bbd0b3a51a9c3b0417c37b45294

---
 _modules/index.html                          | 386 ++++++++++++
 _modules/texteller/api/detection/detect.html | 454 ++++++++++++++
 _modules/texteller/api/inference.html        | 669 +++++++++++++++++++
 _modules/texteller/api/katex.html            | 517 ++++++++++++++++
 _modules/texteller/api/load.html             | 553 ++++++++++++++++
 api.html                                     | 295 +++++++-
 genindex.html                                |  48 ++
 objects.inv                                  | Bin 550 -> 730 bytes
 searchindex.js                               |   2 +-
 9 files changed, 2913 insertions(+), 11 deletions(-)
 create mode 100644 _modules/index.html
 create mode 100644 _modules/texteller/api/detection/detect.html
 create mode 100644 _modules/texteller/api/inference.html
 create mode 100644 _modules/texteller/api/katex.html
 create mode 100644 _modules/texteller/api/load.html

diff --git a/_modules/index.html b/_modules/index.html
new file mode 100644
index 0000000..7c3e83a
--- /dev/null
+++ b/_modules/index.html
@@ -0,0 +1,386 @@
+ [Sphinx "Overview: module code — TexTeller documentation" page: head, theme assets, navigation, and links to the texteller.api module pages; generated markup omitted]
\ No newline at end of file

diff --git a/_modules/texteller/api/detection/detect.html b/_modules/texteller/api/detection/detect.html
new file mode 100644
index 0000000..a6d7e54
--- /dev/null
+++ b/_modules/texteller/api/detection/detect.html
@@ -0,0 +1,454 @@
+ [Sphinx viewcode page "texteller.api.detection.detect — TexTeller documentation": head, theme assets, and navigation; generated markup omitted]
Source code for texteller.api.detection.detect

+from typing import List
+
+from onnxruntime import InferenceSession
+
+from texteller.types import Bbox
+
+from .preprocess import Compose
+
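+# Detector preprocessing/inference settings used by latex_detect below: resize inputs to
+# 1600x1600 without keeping aspect ratio, identity normalization (mean 0, std 1),
+# channel-first permute, and two output labels ("isolated" and "embedding").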
+_config = {
+    "mode": "paddle",
+    "draw_threshold": 0.5,
+    "metric": "COCO",
+    "use_dynamic_shape": False,
+    "arch": "DETR",
+    "min_subgraph_size": 3,
+    "preprocess": [
+        {"interp": 2, "keep_ratio": False, "target_size": [1600, 1600], "type": "Resize"},
+        {
+            "mean": [0.0, 0.0, 0.0],
+            "norm_type": "none",
+            "std": [1.0, 1.0, 1.0],
+            "type": "NormalizeImage",
+        },
+        {"type": "Permute"},
+    ],
+    "label_list": ["isolated", "embedding"],
+}
+
+
+
+def latex_detect(img_path: str, predictor: InferenceSession) -> List[Bbox]:
+    """
+    Detect LaTeX formulas in an image and classify them as isolated or embedded.
+
+    This function uses an ONNX model to detect LaTeX formulas in images. The model
+    identifies two types of LaTeX formulas:
+    - 'isolated': Standalone LaTeX formulas (typically displayed equations)
+    - 'embedding': Inline LaTeX formulas embedded within text
+
+    Args:
+        img_path: Path to the input image file
+        predictor: ONNX InferenceSession model for LaTeX detection
+
+    Returns:
+        List of Bbox objects representing the detected LaTeX formulas with their
+        positions, classifications, and confidence scores
+
+    Example:
+        >>> from texteller.api import load_latexdet_model, latex_detect
+        >>> model = load_latexdet_model()
+        >>> bboxes = latex_detect("path/to/image.png", model)
+    """
+    transforms = Compose(_config["preprocess"])
+    inputs = transforms(img_path)
+    inputs_name = [var.name for var in predictor.get_inputs()]
+    inputs = {k: inputs[k][None,] for k in inputs_name}
+
+    outputs = predictor.run(output_names=None, input_feed=inputs)[0]
+    res = []
+    for output in outputs:
+        cls_name = _config["label_list"][int(output[0])]
+        score = output[1]
+        xmin = int(max(output[2], 0))
+        ymin = int(max(output[3], 0))
+        xmax = int(output[4])
+        ymax = int(output[5])
+        if score > 0.5:
+            res.append(Bbox(xmin, ymin, ymax - ymin, xmax - xmin, cls_name, score))
+
+    return res
\ No newline at end of file

diff --git a/_modules/texteller/api/inference.html b/_modules/texteller/api/inference.html
new file mode 100644
index 0000000..93957a1
--- /dev/null
+++ b/_modules/texteller/api/inference.html
@@ -0,0 +1,669 @@
+ [Sphinx viewcode page "texteller.api.inference — TexTeller documentation": head, theme assets, and navigation; generated markup omitted]
Source code for texteller.api.inference

+import re
+import time
+from collections import Counter
+from typing import Literal
+
+import cv2
+import numpy as np
+import torch
+from onnxruntime import InferenceSession
+from optimum.onnxruntime import ORTModelForVision2Seq
+from transformers import GenerationConfig, RobertaTokenizerFast
+
+from texteller.constants import MAX_TOKEN_SIZE
+from texteller.logger import get_logger
+from texteller.paddleocr import predict_det, predict_rec
+from texteller.types import Bbox, TexTellerModel
+from texteller.utils import (
+    bbox_merge,
+    get_device,
+    mask_img,
+    readimgs,
+    remove_style,
+    slice_from_image,
+    split_conflict,
+    transform,
+    add_newlines,
+)
+
+from .detection import latex_detect
+from .format import format_latex
+from .katex import to_katex
+
+_logger = get_logger()
+
+
+
+def img2latex(
+    model: TexTellerModel,
+    tokenizer: RobertaTokenizerFast,
+    images: list[str] | list[np.ndarray],
+    device: torch.device | None = None,
+    out_format: Literal["latex", "katex"] = "latex",
+    keep_style: bool = False,
+    max_tokens: int = MAX_TOKEN_SIZE,
+    num_beams: int = 1,
+    no_repeat_ngram_size: int = 0,
+) -> list[str]:
+    """
+    Convert images to LaTeX or KaTeX formatted strings.
+
+    Args:
+        model: The TexTeller or ORTModelForVision2Seq model instance
+        tokenizer: The tokenizer for the model
+        images: List of image paths or numpy arrays (RGB format)
+        device: The torch device to use (defaults to available GPU or CPU)
+        out_format: Output format, either "latex" or "katex"
+        keep_style: Whether to keep the style of the LaTeX
+        max_tokens: Maximum number of tokens to generate
+        num_beams: Number of beams for beam search
+        no_repeat_ngram_size: Size of n-grams to prevent repetition
+
+    Returns:
+        List of LaTeX or KaTeX strings corresponding to each input image
+
+    Example:
+        >>> import torch
+        >>> from texteller import load_model, load_tokenizer, img2latex
+        >>>
+        >>> model = load_model(model_dir=None, use_onnx=False)
+        >>> tokenizer = load_tokenizer(tokenizer_dir=None)
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        >>>
+        >>> res = img2latex(model, tokenizer, ["path/to/image.png"], device=device, out_format="katex")
+    """
+    assert isinstance(images, list)
+    assert len(images) > 0
+
+    if device is None:
+        device = get_device()
+
+    if device.type != model.device.type:
+        if isinstance(model, ORTModelForVision2Seq):
+            _logger.warning(
+                f"Onnxruntime device mismatch: detected {str(device)} but model is on {str(model.device)}, using {str(model.device)} instead"
+            )
+        else:
+            model = model.to(device=device)
+
+    if isinstance(images[0], str):
+        images = readimgs(images)
+    else:  # already numpy array (rgb format)
+        assert isinstance(images[0], np.ndarray)
+        images = images
+
+    images = transform(images)
+    pixel_values = torch.stack(images)
+
+    generate_config = GenerationConfig(
+        max_new_tokens=max_tokens,
+        num_beams=num_beams,
+        do_sample=False,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        bos_token_id=tokenizer.bos_token_id,
+        no_repeat_ngram_size=no_repeat_ngram_size,
+    )
+    pred = model.generate(
+        pixel_values.to(model.device),
+        generation_config=generate_config,
+    )
+
+    res = tokenizer.batch_decode(pred, skip_special_tokens=True)
+
+    if out_format == "katex":
+        res = [to_katex(r) for r in res]
+
+    if not keep_style:
+        res = [remove_style(r) for r in res]
+
+    res = [format_latex(r) for r in res]
+    res = [add_newlines(r) for r in res]
+    return res
+
+
+def paragraph2md(
+    img_path: str,
+    latexdet_model: InferenceSession,
+    textdet_model: predict_det.TextDetector,
+    textrec_model: predict_rec.TextRecognizer,
+    latexrec_model: TexTellerModel,
+    tokenizer: RobertaTokenizerFast,
+    device: torch.device | None = None,
+    num_beams=1,
+) -> str:
+    """
+    Convert an image containing both text and mathematical formulas to markdown format.
+
+    This function processes a mixed-content image by:
+    1. Detecting mathematical formulas using a latex detection model
+    2. Masking detected formula areas and detecting text regions using OCR
+    3. Recognizing text in the detected regions
+    4. Converting formula regions to LaTeX using the latex recognition model
+    5. Combining all detected elements into a properly formatted markdown string
+
+    Args:
+        img_path: Path to the input image containing text and formulas
+        latexdet_model: ONNX InferenceSession for LaTeX formula detection
+        textdet_model: OCR text detector model
+        textrec_model: OCR text recognition model
+        latexrec_model: TexTeller model for LaTeX formula recognition
+        tokenizer: Tokenizer for the LaTeX recognition model
+        device: The torch device to use (defaults to available GPU or CPU)
+        num_beams: Number of beams for beam search during LaTeX generation
+
+    Example:
+        >>> from texteller import load_latexdet_model, load_textdet_model, load_textrec_model, load_model, load_tokenizer, paragraph2md
+        >>>
+        >>> # Load all required models
+        >>> latexdet_model = load_latexdet_model()
+        >>> textdet_model = load_textdet_model()
+        >>> textrec_model = load_textrec_model()
+        >>> latexrec_model = load_model()
+        >>> tokenizer = load_tokenizer()
+        >>>
+        >>> # Convert image to markdown
+        >>> markdown_text = paragraph2md(
+        ...     img_path="path/to/mixed_content_image.jpg",
+        ...     latexdet_model=latexdet_model,
+        ...     textdet_model=textdet_model,
+        ...     textrec_model=textrec_model,
+        ...     latexrec_model=latexrec_model,
+        ...     tokenizer=tokenizer,
+        ... )
+
+    Returns:
+        Markdown formatted string containing the recognized text and formulas
+    """
+    img = cv2.imread(img_path)
+    corners = [tuple(img[0, 0]), tuple(img[0, -1]), tuple(img[-1, 0]), tuple(img[-1, -1])]
+    bg_color = np.array(Counter(corners).most_common(1)[0][0])
+
+    start_time = time.time()
+    latex_bboxes = latex_detect(img_path, latexdet_model)
+    end_time = time.time()
+    _logger.info(f"latex_det_model time: {end_time - start_time:.2f}s")
+    latex_bboxes = sorted(latex_bboxes)
+    latex_bboxes = bbox_merge(latex_bboxes)
+    masked_img = mask_img(img, latex_bboxes, bg_color)
+
+    start_time = time.time()
+    det_prediction, _ = textdet_model(masked_img)
+    end_time = time.time()
+    _logger.info(f"ocr_det_model time: {end_time - start_time:.2f}s")
+    ocr_bboxes = [
+        Bbox(
+            p[0][0],
+            p[0][1],
+            p[3][1] - p[0][1],
+            p[1][0] - p[0][0],
+            label="text",
+            confidence=None,
+            content=None,
+        )
+        for p in det_prediction
+    ]
+
+    ocr_bboxes = sorted(ocr_bboxes)
+    ocr_bboxes = bbox_merge(ocr_bboxes)
+    ocr_bboxes = split_conflict(ocr_bboxes, latex_bboxes)
+    ocr_bboxes = list(filter(lambda x: x.label == "text", ocr_bboxes))
+
+    sliced_imgs: list[np.ndarray] = slice_from_image(img, ocr_bboxes)
+    start_time = time.time()
+    rec_predictions, _ = textrec_model(sliced_imgs)
+    end_time = time.time()
+    _logger.info(f"ocr_rec_model time: {end_time - start_time:.2f}s")
+
+    assert len(rec_predictions) == len(ocr_bboxes)
+    for content, bbox in zip(rec_predictions, ocr_bboxes):
+        bbox.content = content[0]
+
+    latex_imgs = []
+    for bbox in latex_bboxes:
+        latex_imgs.append(img[bbox.p.y : bbox.p.y + bbox.h, bbox.p.x : bbox.p.x + bbox.w])
+    start_time = time.time()
+    latex_rec_res = img2latex(
+        model=latexrec_model,
+        tokenizer=tokenizer,
+        images=latex_imgs,
+        num_beams=num_beams,
+        out_format="katex",
+        device=device,
+        keep_style=False,
+    )
+    end_time = time.time()
+    _logger.info(f"latex_rec_model time: {end_time - start_time:.2f}s")
+
+    for bbox, content in zip(latex_bboxes, latex_rec_res):
+        if bbox.label == "embedding":
+            bbox.content = " $" + content + "$ "
+        elif bbox.label == "isolated":
+            bbox.content = "\n\n" + r"$$" + content + r"$$" + "\n\n"
+
+    bboxes = sorted(ocr_bboxes + latex_bboxes)
+    if bboxes == []:
+        return ""
+
+    md = ""
+    prev = Bbox(bboxes[0].p.x, bboxes[0].p.y, -1, -1, label="guard")
+    for curr in bboxes:
+        # Add the formula number back to the isolated formula
+        if prev.label == "isolated" and curr.label == "text" and prev.same_row(curr):
+            curr.content = curr.content.strip()
+            if curr.content.startswith("(") and curr.content.endswith(")"):
+                curr.content = curr.content[1:-1]
+
+            if re.search(r"\\tag\{.*\}$", md[:-4]) is not None:
+                # in case of multiple tag
+                md = md[:-5] + f", {curr.content}" + "}" + md[-4:]
+            else:
+                md = md[:-4] + f"\\tag{{{curr.content}}}" + md[-4:]
+            continue
+
+        if not prev.same_row(curr):
+            md += " "
+
+        if curr.label == "embedding":
+            # remove the bold effect from inline formulas
+            curr.content = remove_style(curr.content)
+
+            # change split environment into aligned
+            curr.content = curr.content.replace(r"\begin{split}", r"\begin{aligned}")
+            curr.content = curr.content.replace(r"\end{split}", r"\end{aligned}")
+
+            # remove extra spaces (keeping only one)
+            curr.content = re.sub(r" +", " ", curr.content)
+            assert curr.content.startswith("$") and curr.content.endswith("$")
+            curr.content = " $" + curr.content.strip("$") + "$ "
+
+        md += curr.content
+        prev = curr
+
+    return md.strip()
\ No newline at end of file

diff --git a/_modules/texteller/api/katex.html b/_modules/texteller/api/katex.html
new file mode 100644
index 0000000..e7ac432
--- /dev/null
+++ b/_modules/texteller/api/katex.html
@@ -0,0 +1,517 @@
+ [Sphinx viewcode page "texteller.api.katex — TexTeller documentation": head, theme assets, and navigation; generated markup omitted]
Source code for texteller.api.katex

+import re
+
+from ..utils.latex import change_all
+from .format import format_latex
+
+
+def _rm_dollar_surr(content):
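+    # Strip the "$...$" delimiters from inline math spans that are not attached to a
+    # LaTeX command; matches like "\foo$...$" are left untouched.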
+    pattern = re.compile(r"\\[a-zA-Z]+\$.*?\$|\$.*?\$")
+    matches = pattern.findall(content)
+
+    for match in matches:
+        if not re.match(r"\\[a-zA-Z]+", match):
+            new_match = match.strip("$")
+            content = content.replace(match, " " + new_match + " ")
+
+    return content
+
+
+
+def to_katex(formula: str) -> str:
+    """
+    Convert LaTeX formula to KaTeX-compatible format.
+
+    This function processes a LaTeX formula string and converts it to a format
+    that is compatible with KaTeX rendering. It removes unsupported commands
+    and structures, simplifies LaTeX environments, and optimizes the formula
+    for web display.
+
+    Args:
+        formula: LaTeX formula string to convert
+
+    Returns:
+        KaTeX-compatible formula string
+    """
+    res = formula
+    # remove mbox surrounding
+    res = change_all(res, r"\mbox ", r" ", r"{", r"}", r"", r"")
+    res = change_all(res, r"\mbox", r" ", r"{", r"}", r"", r"")
+    # remove hbox surrounding
+    res = re.sub(r"\\hbox to ?-? ?\d+\.\d+(pt)?\{", r"\\hbox{", res)
+    res = change_all(res, r"\hbox", r" ", r"{", r"}", r"", r" ")
+    # remove raise surrounding
+    res = re.sub(r"\\raise ?-? ?\d+\.\d+(pt)?", r" ", res)
+    # remove makebox
+    res = re.sub(r"\\makebox ?\[\d+\.\d+(pt)?\]\{", r"\\makebox{", res)
+    res = change_all(res, r"\makebox", r" ", r"{", r"}", r"", r" ")
+    # remove vbox surrounding, scalebox surrounding
+    res = re.sub(r"\\raisebox\{-? ?\d+\.\d+(pt)?\}\{", r"\\raisebox{", res)
+    res = re.sub(r"\\scalebox\{-? ?\d+\.\d+(pt)?\}\{", r"\\scalebox{", res)
+    res = change_all(res, r"\scalebox", r" ", r"{", r"}", r"", r" ")
+    res = change_all(res, r"\raisebox", r" ", r"{", r"}", r"", r" ")
+    res = change_all(res, r"\vbox", r" ", r"{", r"}", r"", r" ")
+
+    origin_instructions = [
+        r"\Huge",
+        r"\huge",
+        r"\LARGE",
+        r"\Large",
+        r"\large",
+        r"\normalsize",
+        r"\small",
+        r"\footnotesize",
+        r"\tiny",
+    ]
+    for old_ins, new_ins in zip(origin_instructions, origin_instructions):
+        res = change_all(res, old_ins, new_ins, r"$", r"$", "{", "}")
+    res = change_all(res, r"\mathbf", r"\bm", r"{", r"}", r"{", r"}")
+    res = change_all(res, r"\boldmath ", r"\bm", r"{", r"}", r"{", r"}")
+    res = change_all(res, r"\boldmath", r"\bm", r"{", r"}", r"{", r"}")
+    res = change_all(res, r"\boldmath ", r"\bm", r"$", r"$", r"{", r"}")
+    res = change_all(res, r"\boldmath", r"\bm", r"$", r"$", r"{", r"}")
+    res = change_all(res, r"\scriptsize", r"\scriptsize", r"$", r"$", r"{", r"}")
+    res = change_all(res, r"\emph", r"\textit", r"{", r"}", r"{", r"}")
+    res = change_all(res, r"\emph ", r"\textit", r"{", r"}", r"{", r"}")
+
+    # remove bold command
+    res = change_all(res, r"\bm", r" ", r"{", r"}", r"", r"")
+
+    origin_instructions = [
+        r"\left",
+        r"\middle",
+        r"\right",
+        r"\big",
+        r"\Big",
+        r"\bigg",
+        r"\Bigg",
+        r"\bigl",
+        r"\Bigl",
+        r"\biggl",
+        r"\Biggl",
+        r"\bigm",
+        r"\Bigm",
+        r"\biggm",
+        r"\Biggm",
+        r"\bigr",
+        r"\Bigr",
+        r"\biggr",
+        r"\Biggr",
+    ]
+    for origin_ins in origin_instructions:
+        res = change_all(res, origin_ins, origin_ins, r"{", r"}", r"", r"")
+
+    res = re.sub(r"\\\[(.*?)\\\]", r"\1\\newline", res)
+
+    if res.endswith(r"\newline"):
+        res = res[:-8]
+
+    # remove multiple spaces
+    res = re.sub(r"(\\,){1,}", " ", res)
+    res = re.sub(r"(\\!){1,}", " ", res)
+    res = re.sub(r"(\\;){1,}", " ", res)
+    res = re.sub(r"(\\:){1,}", " ", res)
+    res = re.sub(r"\\vspace\{.*?}", "", res)
+
+    # merge consecutive text
+    def merge_texts(match):
+        texts = match.group(0)
+        merged_content = "".join(re.findall(r"\\text\{([^}]*)\}", texts))
+        return f"\\text{{{merged_content}}}"
+
+    res = re.sub(r"(\\text\{[^}]*\}\s*){2,}", merge_texts, res)
+
+    res = res.replace(r"\bf ", "")
+    res = _rm_dollar_surr(res)
+
+    # remove extra spaces (keeping only one)
+    res = re.sub(r" +", " ", res)
+
+    # format latex
+    res = res.strip()
+    res = format_latex(res)
+
+    return res
\ No newline at end of file

diff --git a/_modules/texteller/api/load.html b/_modules/texteller/api/load.html
new file mode 100644
index 0000000..1c3ad0f
--- /dev/null
+++ b/_modules/texteller/api/load.html
@@ -0,0 +1,553 @@
+ [Sphinx viewcode page "texteller.api.load — TexTeller documentation": head, theme assets, and navigation; generated markup omitted]
Source code for texteller.api.load

+from pathlib import Path
+
+import wget
+from onnxruntime import InferenceSession
+from transformers import RobertaTokenizerFast
+
+from texteller.constants import LATEX_DET_MODEL_URL, TEXT_DET_MODEL_URL, TEXT_REC_MODEL_URL
+from texteller.globals import Globals
+from texteller.logger import get_logger
+from texteller.models import TexTeller
+from texteller.paddleocr import predict_det, predict_rec
+from texteller.paddleocr.utility import parse_args
+from texteller.utils import cuda_available, mkdir, resolve_path
+from texteller.types import TexTellerModel
+
+_logger = get_logger(__name__)
+
+
+
+def load_model(model_dir: str | None = None, use_onnx: bool = False) -> TexTellerModel:
+    """
+    Load the TexTeller model for LaTeX recognition.
+
+    This function loads the main TexTeller model, which is responsible for
+    converting images to LaTeX. It can load either the standard PyTorch model
+    or the optimized ONNX version.
+
+    Args:
+        model_dir: Directory containing the model files. If None, uses the default model.
+        use_onnx: Whether to load the ONNX version of the model for faster inference.
+            Requires the 'optimum' package and ONNX Runtime.
+
+    Returns:
+        Loaded TexTeller model instance
+
+    Example:
+        >>> from texteller import load_model
+        >>>
+        >>> model = load_model(use_onnx=True)
+    """
+    return TexTeller.from_pretrained(model_dir, use_onnx=use_onnx)
+
+
+def load_tokenizer(tokenizer_dir: str | None = None) -> RobertaTokenizerFast:
+    """
+    Load the tokenizer for the TexTeller model.
+
+    This function loads the tokenizer used by the TexTeller model for
+    encoding and decoding LaTeX sequences.
+
+    Args:
+        tokenizer_dir: Directory containing the tokenizer files. If None, uses the default tokenizer.
+
+    Returns:
+        RobertaTokenizerFast instance
+
+    Example:
+        >>> from texteller import load_tokenizer
+        >>>
+        >>> tokenizer = load_tokenizer()
+    """
+    return TexTeller.get_tokenizer(tokenizer_dir)
+
+
+def load_latexdet_model() -> InferenceSession:
+    """
+    Load the LaTeX detection model.
+
+    This function loads the model responsible for detecting LaTeX formulas in images.
+    The model is implemented as an ONNX InferenceSession for optimal performance.
+
+    Returns:
+        ONNX InferenceSession for LaTeX detection
+
+    Example:
+        >>> from texteller import load_latexdet_model
+        >>>
+        >>> detector = load_latexdet_model()
+    """
+    fpath = _maybe_download(LATEX_DET_MODEL_URL)
+    return InferenceSession(
+        resolve_path(fpath),
+        providers=["CUDAExecutionProvider" if cuda_available() else "CPUExecutionProvider"],
+    )
+
+
+def load_textrec_model() -> predict_rec.TextRecognizer:
+    """
+    Load the text recognition model.
+
+    This function loads the model responsible for recognizing regular text in images.
+    It's based on PaddleOCR's text recognition model.
+
+    Returns:
+        PaddleOCR TextRecognizer instance
+
+    Example:
+        >>> from texteller import load_textrec_model
+        >>>
+        >>> text_recognizer = load_textrec_model()
+    """
+    fpath = _maybe_download(TEXT_REC_MODEL_URL)
+    paddleocr_args = parse_args()
+    paddleocr_args.use_onnx = True
+    paddleocr_args.rec_model_dir = resolve_path(fpath)
+    paddleocr_args.use_gpu = cuda_available()
+    predictor = predict_rec.TextRecognizer(paddleocr_args)
+    return predictor
+
+
+def load_textdet_model() -> predict_det.TextDetector:
+    """
+    Load the text detection model.
+
+    This function loads the model responsible for detecting text regions in images.
+    It's based on PaddleOCR's text detection model.
+
+    Returns:
+        PaddleOCR TextDetector instance
+
+    Example:
+        >>> from texteller import load_textdet_model
+        >>>
+        >>> text_detector = load_textdet_model()
+    """
+    fpath = _maybe_download(TEXT_DET_MODEL_URL)
+    paddleocr_args = parse_args()
+    paddleocr_args.use_onnx = True
+    paddleocr_args.det_model_dir = resolve_path(fpath)
+    paddleocr_args.use_gpu = cuda_available()
+    predictor = predict_det.TextDetector(paddleocr_args)
+    return predictor
+
+
+def _maybe_download(url: str, dirpath: str | None = None, force: bool = False) -> Path:
+    """
+    Download a file if it doesn't already exist.
+
+    Args:
+        url: URL to download from
+        dirpath: Directory to save the file in. If None, uses the default cache directory.
+        force: Whether to force download even if the file already exists
+
+    Returns:
+        Path to the downloaded file
+    """
+    if dirpath is None:
+        dirpath = Globals().cache_dir
+    mkdir(dirpath)
+
+    fname = Path(url).name
+    fpath = Path(dirpath) / fname
+    if not fpath.exists() or force:
+        _logger.info(f"Downloading {fname} from {url} to {fpath}")
+        wget.download(url, resolve_path(fpath))
+
+    return fpath
\ No newline at end of file

diff --git a/api.html b/api.html
index 9dd9d3e..58c863d 100644
--- a/api.html
+++ b/api.html
@@ -373,11 +373,30 @@ document.write(`
 [sidebar navigation markup changes omitted]
@@ -404,18 +423,255 @@ document.write(`
Image to LaTeX Conversion#

+
+
img2latex(model: VisionEncoderDecoderModel | ORTModelForVision2Seq, tokenizer: RobertaTokenizerFast, images: list[str] | list[ndarray], device: device | None = None, out_format: Literal['latex', 'katex'] = 'latex', keep_style: bool = False, max_tokens: int = 1024, num_beams: int = 1, no_repeat_ngram_size: int = 0) → list[str]

Convert images to LaTeX or KaTeX formatted strings.

Parameters:
  • model – The TexTeller or ORTModelForVision2Seq model instance
  • tokenizer – The tokenizer for the model
  • images – List of image paths or numpy arrays (RGB format)
  • device – The torch device to use (defaults to available GPU or CPU)
  • out_format – Output format, either “latex” or “katex”
  • keep_style – Whether to keep the style of the LaTeX
  • max_tokens – Maximum number of tokens to generate
  • num_beams – Number of beams for beam search
  • no_repeat_ngram_size – Size of n-grams to prevent repetition

Returns:
  List of LaTeX or KaTeX strings corresponding to each input image

Example

>>> import torch
>>> from texteller import load_model, load_tokenizer, img2latex
>>>
>>> model = load_model(model_dir=None, use_onnx=False)
>>> tokenizer = load_tokenizer(tokenizer_dir=None)
>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
>>>
>>> res = img2latex(model, tokenizer, ["path/to/image.png"], device=device, out_format="katex")
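Since image arrays are expected in RGB order while OpenCV loads BGR, a minimal sketch for passing an already-loaded image (variable names are illustrative):

>>> import cv2
>>> bgr = cv2.imread("path/to/image.png")
>>> rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
>>> res = img2latex(model, tokenizer, [rgb], device=device)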
+

Paragraph to Markdown Conversion#

+
+
paragraph2md(img_path: str, latexdet_model: InferenceSession, textdet_model: TextDetector, textrec_model: TextRecognizer, latexrec_model: VisionEncoderDecoderModel | ORTModelForVision2Seq, tokenizer: RobertaTokenizerFast, device: device | None = None, num_beams=1) → str

Convert an image containing both text and mathematical formulas to markdown format.

This function processes a mixed-content image by:
1. Detecting mathematical formulas using a latex detection model
2. Masking detected formula areas and detecting text regions using OCR
3. Recognizing text in the detected regions
4. Converting formula regions to LaTeX using the latex recognition model
5. Combining all detected elements into a properly formatted markdown string

Parameters:
  • img_path – Path to the input image containing text and formulas
  • latexdet_model – ONNX InferenceSession for LaTeX formula detection
  • textdet_model – OCR text detector model
  • textrec_model – OCR text recognition model
  • latexrec_model – TexTeller model for LaTeX formula recognition
  • tokenizer – Tokenizer for the LaTeX recognition model
  • device – The torch device to use (defaults to available GPU or CPU)
  • num_beams – Number of beams for beam search during LaTeX generation

Returns:
  Markdown formatted string containing the recognized text and formulas

Example

>>> from texteller import load_latexdet_model, load_textdet_model, load_textrec_model, load_model, load_tokenizer, paragraph2md
>>>
>>> # Load all required models
>>> latexdet_model = load_latexdet_model()
>>> textdet_model = load_textdet_model()
>>> textrec_model = load_textrec_model()
>>> latexrec_model = load_model()
>>> tokenizer = load_tokenizer()
>>>
>>> # Convert image to markdown
>>> markdown_text = paragraph2md(
...     img_path="path/to/mixed_content_image.jpg",
...     latexdet_model=latexdet_model,
...     textdet_model=textdet_model,
...     textrec_model=textrec_model,
...     latexrec_model=latexrec_model,
...     tokenizer=tokenizer,
... )
+

LaTeX Detection#

+
+
latex_detect(img_path: str, predictor: InferenceSession) → List[Bbox]

Detect LaTeX formulas in an image and classify them as isolated or embedded.

This function uses an ONNX model to detect LaTeX formulas in images. The model identifies two types of LaTeX formulas:
- ‘isolated’: Standalone LaTeX formulas (typically displayed equations)
- ‘embedding’: Inline LaTeX formulas embedded within text

Parameters:
  • img_path – Path to the input image file
  • predictor – ONNX InferenceSession model for LaTeX detection

Returns:
  List of Bbox objects representing the detected LaTeX formulas with their positions, classifications, and confidence scores

Example

>>> from texteller.api import load_latexdet_model, latex_detect
>>> model = load_latexdet_model()
>>> bboxes = latex_detect("path/to/image.png", model)
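The returned Bbox objects carry position and label information; a minimal follow-up sketch, assuming Bbox exposes the p.x/p.y/w/h/label/confidence attributes used by paragraph2md:

>>> for bbox in bboxes:
...     print(bbox.label, bbox.confidence, bbox.p.x, bbox.p.y, bbox.w, bbox.h)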

Model Loading#

+
+
load_model(model_dir: str | None = None, use_onnx: bool = False) → VisionEncoderDecoderModel | ORTModelForVision2Seq

Load the TexTeller model for LaTeX recognition.

This function loads the main TexTeller model, which is responsible for converting images to LaTeX. It can load either the standard PyTorch model or the optimized ONNX version.

Parameters:
  • model_dir – Directory containing the model files. If None, uses the default model.
  • use_onnx – Whether to load the ONNX version of the model for faster inference. Requires the ‘optimum’ package and ONNX Runtime.

Returns:
  Loaded TexTeller model instance

Example

>>> from texteller import load_model
>>>
>>> model = load_model(use_onnx=True)
+ +
+
load_tokenizer(tokenizer_dir: str | None = None) → RobertaTokenizerFast

Load the tokenizer for the TexTeller model.

This function loads the tokenizer used by the TexTeller model for encoding and decoding LaTeX sequences.

Parameters:
  • tokenizer_dir – Directory containing the tokenizer files. If None, uses the default tokenizer.

Returns:
  RobertaTokenizerFast instance

Example

>>> from texteller import load_tokenizer
>>>
>>> tokenizer = load_tokenizer()
+ +
+
load_latexdet_model() → InferenceSession

Load the LaTeX detection model.

This function loads the model responsible for detecting LaTeX formulas in images. The model is implemented as an ONNX InferenceSession for optimal performance.

Returns:
  ONNX InferenceSession for LaTeX detection

Example

>>> from texteller import load_latexdet_model
>>>
>>> detector = load_latexdet_model()
+ +
+
load_textdet_model() → TextDetector

Load the text detection model.

This function loads the model responsible for detecting text regions in images. It’s based on PaddleOCR’s text detection model.

Returns:
  PaddleOCR TextDetector instance

Example

>>> from texteller import load_textdet_model
>>>
>>> text_detector = load_textdet_model()
+ +
+
load_textrec_model() → TextRecognizer

Load the text recognition model.

This function loads the model responsible for recognizing regular text in images. It’s based on PaddleOCR’s text recognition model.

Returns:
  PaddleOCR TextRecognizer instance

Example

>>> from texteller import load_textrec_model
>>>
>>> text_recognizer = load_textrec_model()
+

KaTeX Conversion#

+
+
to_katex(formula: str) → str

Convert LaTeX formula to KaTeX-compatible format.

This function processes a LaTeX formula string and converts it to a format that is compatible with KaTeX rendering. It removes unsupported commands and structures, simplifies LaTeX environments, and optimizes the formula for web display.

Parameters:
  • formula – LaTeX formula string to convert

Returns:
  KaTeX-compatible formula string
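Example (added for parity with the other entries; a minimal sketch, assuming to_katex is importable from texteller.api as listed in the index):

>>> from texteller.api import to_katex
>>> katex_str = to_katex(r"\mbox{speed} = \frac{d}{t}")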
+
@@ -455,11 +711,30 @@ document.write(` diff --git a/genindex.html b/genindex.html index f03f1e6..e142200 100644 --- a/genindex.html +++ b/genindex.html @@ -314,8 +314,56 @@ document.write(`

Index

I | L | P | T

I
  • img2latex() (in module texteller.api)

L
  • latex_detect() (in module texteller.api.detection)
  • load_latexdet_model() (in module texteller.api)
  • load_model() (in module texteller.api)
  • load_textdet_model() (in module texteller.api)
  • load_textrec_model() (in module texteller.api)
  • load_tokenizer() (in module texteller.api)

P
  • paragraph2md() (in module texteller.api)

T
  • to_katex() (in module texteller.api)
+ diff --git a/objects.inv b/objects.inv index d6b1518cd775f7e375235491c04cf058c3ed7134..af3592fbdde62abff633bd4f160ddd6c4afcd4e4 100644 GIT binary patch delta 621 zcmV-z0+Riv1lk3Vcz;(-Z`&{oz2{d5Xs%AN9(x;B3<%Z&*_y*{fo3y~VOjEIc}w=| zr;k{&S}RiAb*D(=dn8556eHj$fDqs-+*Yds1mvP(bVaa$?$d%+d93`6TPl-6>^MAOk23q|Ap3ZAA$0f(@ z;jSstS%U5ASRroJj&}leqPQpchdfQ@An4;}jlRJiIM5s{_(uL_d=O7lUtj`;ss@*k z36|;VXE~$4fOjg+X`^BMT4|7A=n-$>JNjfb&RL!2;}z^n(4vXLelY=qFHt7Khhgu` z>D!d4OH$D?V1LZVJn8%QXm-w!y)PFsz112*gQhLgC6iL(+ zP$(MLp;7{H=zu)rW~!^DIGo}|5f_8F(s$5h+>_r^lRf&h7$6#f>&#b0;Ake&bn61 zMK#PL=4S19m&wM3a?W?cN0W3Vm;ODh(~b}yoJ=m>Ixm?XJ%3D+x``sTr)*G|3`$Z@ z;9SmjJoyZx5&91k)hMh;yql9aNQ++Bx+S1_ZKDIs^Mg+_GUM;r=r{6qx<4N1efr8@ H;BHuv2SYOq delta 440 zcmV;p0Z0DY1*Qa$cz;yOZi6royyq({HCGz-+FRAA5^5sRCWl^K*nowxsWENx_w@q; zPU3))rYCIg?07x9%nX<0UQ`lQGI^x<6DSHEkUq?0LD5qbqIcMU02ae!)`0KSHxTXd z>C^*^hlmt7g+TIvVhP(I=Ik3pML6@2`thNTAj#1Jui*nd?|&oRW%;-}j$p4|Mq5P! z5@P8mJVeoh4;y~@U|Q4y zw9v~2;3z{_0^8$(vd3nm*05azr6ACp$8`ZrV(F%u(MkG2AK3|{9yBlU2C%FJR9?P4 z;H6pgvk9vtrhoK&MGh$JZTd`KHHq^;#e`QU2lJ-aXr}&0xq^awQmjeaza{nG5<2UiBENWeF*;=% iOF);|Mg>^JnVV+M?0qX6kJNNd_CA4b<14?!3=l5##?(sy diff --git a/searchindex.js b/searchindex.js index 2d703f1..2088780 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"API Documentation": [[1, "api-documentation"]], "API Reference": [[0, null]], "Features": [[1, "features"]], "Image to LaTeX Conversion": [[0, "image-to-latex-conversion"]], "Installation": [[1, "installation"]], "KaTeX Conversion": [[0, "katex-conversion"]], "LaTeX Detection": [[0, "latex-detection"]], "Model Loading": [[0, "model-loading"]], "Paragraph to Markdown Conversion": [[0, "paragraph-to-markdown-conversion"]], "Quick Start": [[1, "quick-start"]], "Table of Contents": [[0, "table-of-contents"]], "TexTeller Documentation": [[1, null]]}, "docnames": ["api", "index"], "envversion": {"nbsphinx": 4, "sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["api.rst", "index.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"0": 1, "For": 1, "all": 1, "an": 1, "can": 1, "code": 1, "contain": 1, "convers": 1, "convert": [0, 1], "detail": [0, 1], "detect": 1, "document": 0, "fals": 1, "format": 1, "formula": [0, 1], "from": 1, "i": 0, "imag": 1, "img2latex": 1, "import": 1, "latex": 1, "latex_detector": 1, "latex_model": 1, "load": 1, "load_latexdet_model": 1, "load_model": 1, "load_textdet_model": 1, "load_textrec_model": 1, "load_token": 1, "locat": 1, "markdown": 1, "mix": [0, 1], "mixed_imag": 1, "model": 1, "packag": 0, "paragraph": 1, "paragraph2md": 1, "path": 1, "pip": 1, "pleas": 1, "png": 1, "process": 1, "provid": 0, "recogn": 0, "refer": 1, "requir": 1, "section": 0, "see": 1, "text": [0, 1], "text_detector": 1, "text_recogn": 1, "textel": 0, "thi": 0, "token": 1, "tool": 0, "us": 1, "use_onnx": 1, "you": 1}, "titles": ["API Reference", "TexTeller Documentation"], "titleterms": {"api": [0, 1], "content": 0, "convers": 0, "detect": 0, "document": 1, "featur": 1, "imag": 0, "instal": 1, "katex": 0, "latex": 0, "load": 0, "markdown": 0, "model": 0, "paragraph": 0, "quick": 1, "refer": 0, "start": 1, "tabl": 0, "textel": 1}}) \ No newline at end of file +Search.setIndex({"alltitles": 
{"API Documentation": [[1, "api-documentation"]], "API Reference": [[0, null]], "Features": [[1, "features"]], "Image to LaTeX Conversion": [[0, "image-to-latex-conversion"]], "Installation": [[1, "installation"]], "KaTeX Conversion": [[0, "katex-conversion"]], "LaTeX Detection": [[0, "latex-detection"]], "Model Loading": [[0, "model-loading"]], "Paragraph to Markdown Conversion": [[0, "paragraph-to-markdown-conversion"]], "Quick Start": [[1, "quick-start"]], "Table of Contents": [[0, "table-of-contents"]], "TexTeller Documentation": [[1, null]]}, "docnames": ["api", "index"], "envversion": {"nbsphinx": 4, "sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["api.rst", "index.rst"], "indexentries": {"img2latex() (in module texteller.api)": [[0, "texteller.api.img2latex", false]], "latex_detect() (in module texteller.api.detection)": [[0, "texteller.api.detection.latex_detect", false]], "load_latexdet_model() (in module texteller.api)": [[0, "texteller.api.load_latexdet_model", false]], "load_model() (in module texteller.api)": [[0, "texteller.api.load_model", false]], "load_textdet_model() (in module texteller.api)": [[0, "texteller.api.load_textdet_model", false]], "load_textrec_model() (in module texteller.api)": [[0, "texteller.api.load_textrec_model", false]], "load_tokenizer() (in module texteller.api)": [[0, "texteller.api.load_tokenizer", false]], "paragraph2md() (in module texteller.api)": [[0, "texteller.api.paragraph2md", false]], "to_katex() (in module texteller.api)": [[0, "texteller.api.to_katex", false]]}, "objects": {"texteller.api": [[0, 0, 1, "", "img2latex"], [0, 0, 1, "", "load_latexdet_model"], [0, 0, 1, "", "load_model"], [0, 0, 1, "", "load_textdet_model"], [0, 0, 1, "", "load_textrec_model"], [0, 0, 1, "", "load_tokenizer"], [0, 0, 1, "", "paragraph2md"], [0, 0, 1, "", "to_katex"]], "texteller.api.detection": [[0, 0, 1, "", "latex_detect"]]}, "objnames": {"0": ["py", "function", "Python function"]}, "objtypes": {"0": "py:function"}, "terms": {"": 0, "0": [0, 1], "1": 0, "1024": 0, "2": 0, "3": 0, "4": 0, "5": 0, "For": 1, "If": 0, "It": 0, "The": 0, "all": [0, 1], "an": [0, 1], "area": 0, "arrai": 0, "avail": 0, "base": 0, "bbox": 0, "beam": 0, "bool": 0, "both": 0, "can": [0, 1], "classif": 0, "classifi": 0, "code": 1, "combin": 0, "command": 0, "compat": 0, "confid": 0, "contain": [0, 1], "convers": 1, "convert": [0, 1], "correspond": 0, "cpu": 0, "cuda": 0, "decod": 0, "default": 0, "detail": [0, 1], "detect": 1, "detector": 0, "devic": 0, "directori": 0, "displai": 0, "document": 0, "dure": 0, "each": 0, "either": 0, "element": 0, "els": 0, "embed": 0, "encod": 0, "environ": 0, "equat": 0, "exampl": 0, "fals": [0, 1], "faster": 0, "file": 0, "format": [0, 1], "formula": [0, 1], "from": [0, 1], "function": 0, "gener": 0, "gpu": 0, "gram": 0, "i": 0, "identifi": 0, "imag": 1, "img2latex": [0, 1], "img_path": 0, "implement": 0, "import": [0, 1], "infer": 0, "inferencesess": 0, "inlin": 0, "input": 0, "instanc": 0, "int": 0, "is_avail": 0, "isol": 0, "jpg": 0, "keep": 0, "keep_styl": 0, "latex": 1, "latex_detect": 0, "latex_detector": 1, "latex_model": 1, "latexdet_model": 0, "latexrec_model": 0, "list": 0, "liter": 0, "load": 1, "load_latexdet_model": [0, 1], 
"load_model": [0, 1], "load_textdet_model": [0, 1], "load_textrec_model": [0, 1], "load_token": [0, 1], "locat": 1, "main": 0, "markdown": 1, "markdown_text": 0, "mask": 0, "mathemat": 0, "max_token": 0, "maximum": 0, "mix": [0, 1], "mixed_content_imag": 0, "mixed_imag": 1, "model": 1, "model_dir": 0, "model_path": 0, "n": 0, "ndarrai": 0, "no_repeat_ngram_s": 0, "none": 0, "num_beam": 0, "number": 0, "numpi": 0, "object": 0, "ocr": 0, "onnx": 0, "optim": 0, "optimum": 0, "ortmodelforvision2seq": 0, "out_format": 0, "output": 0, "packag": 0, "paddleocr": 0, "paragraph": 1, "paragraph2md": [0, 1], "paramet": 0, "path": [0, 1], "perform": 0, "pip": 1, "pleas": 1, "png": [0, 1], "posit": 0, "predictor": 0, "prevent": 0, "process": [0, 1], "properli": 0, "provid": 0, "pytorch": 0, "re": 0, "recogn": 0, "recognit": 0, "refer": 1, "region": 0, "regular": 0, "remov": 0, "render": 0, "repetit": 0, "repres": 0, "requir": [0, 1], "respons": 0, "return": 0, "rgb": 0, "robertatokenizerfast": 0, "runtim": 0, "score": 0, "search": 0, "section": 0, "see": 1, "sequenc": 0, "simplifi": 0, "size": 0, "sourc": 0, "standalon": 0, "standard": 0, "str": 0, "string": 0, "structur": 0, "style": 0, "text": [0, 1], "text_detector": [0, 1], "text_recogn": [0, 1], "textdet_model": 0, "textdetector": 0, "textel": 0, "textrec_model": 0, "textrecogn": 0, "them": 0, "thi": 0, "to_katex": 0, "token": [0, 1], "tokenizer_dir": 0, "tokenizer_path": 0, "tool": 0, "torch": 0, "true": 0, "two": 0, "type": 0, "typic": 0, "unsupport": 0, "us": [0, 1], "use_onnx": [0, 1], "version": 0, "visionencoderdecodermodel": 0, "web": 0, "whether": 0, "which": 0, "within": 0, "you": 1}, "titles": ["API Reference", "TexTeller Documentation"], "titleterms": {"api": [0, 1], "content": 0, "convers": 0, "detect": 0, "document": 1, "featur": 1, "imag": 0, "instal": 1, "katex": 0, "latex": 0, "load": 0, "markdown": 0, "model": 0, "paragraph": 0, "quick": 1, "refer": 0, "start": 1, "tabl": 0, "textel": 1}}) \ No newline at end of file