diff --git a/_modules/index.html b/_modules/index.html
deleted file mode 100644
index 7c3e83a..0000000
--- a/_modules/index.html
+++ /dev/null
@@ -1,386 +0,0 @@
-Overview: module code — TexTeller documentation
\ No newline at end of file
diff --git a/_modules/texteller/api/detection/detect.html b/_modules/texteller/api/detection/detect.html
deleted file mode 100644
index a6d7e54..0000000
--- a/_modules/texteller/api/detection/detect.html
+++ /dev/null
@@ -1,454 +0,0 @@
-texteller.api.detection.detect — TexTeller documentation
Source code for texteller.api.detection.detect

-from typing import List
-
-from onnxruntime import InferenceSession
-
-from texteller.types import Bbox
-
-from .preprocess import Compose
-
-_config = {
-    "mode": "paddle",
-    "draw_threshold": 0.5,
-    "metric": "COCO",
-    "use_dynamic_shape": False,
-    "arch": "DETR",
-    "min_subgraph_size": 3,
-    "preprocess": [
-        {"interp": 2, "keep_ratio": False, "target_size": [1600, 1600], "type": "Resize"},
-        {
-            "mean": [0.0, 0.0, 0.0],
-            "norm_type": "none",
-            "std": [1.0, 1.0, 1.0],
-            "type": "NormalizeImage",
-        },
-        {"type": "Permute"},
-    ],
-    "label_list": ["isolated", "embedding"],
-}
-
-
-
-def latex_detect(img_path: str, predictor: InferenceSession) -> List[Bbox]:
-    """
-    Detect LaTeX formulas in an image and classify them as isolated or embedded.
-
-    This function uses an ONNX model to detect LaTeX formulas in images. The model
-    identifies two types of LaTeX formulas:
-    - 'isolated': Standalone LaTeX formulas (typically displayed equations)
-    - 'embedding': Inline LaTeX formulas embedded within text
-
-    Args:
-        img_path: Path to the input image file
-        predictor: ONNX InferenceSession model for LaTeX detection
-
-    Returns:
-        List of Bbox objects representing the detected LaTeX formulas with their
-        positions, classifications, and confidence scores
-
-    Example:
-        >>> from texteller.api import load_latexdet_model, latex_detect
-        >>> model = load_latexdet_model()
-        >>> bboxes = latex_detect("path/to/image.png", model)
-    """
-    transforms = Compose(_config["preprocess"])
-    inputs = transforms(img_path)
-    inputs_name = [var.name for var in predictor.get_inputs()]
-    inputs = {k: inputs[k][None,] for k in inputs_name}
-
-    outputs = predictor.run(output_names=None, input_feed=inputs)[0]
-    res = []
-    for output in outputs:
-        cls_name = _config["label_list"][int(output[0])]
-        score = output[1]
-        xmin = int(max(output[2], 0))
-        ymin = int(max(output[3], 0))
-        xmax = int(output[4])
-        ymax = int(output[5])
-        if score > 0.5:
-            res.append(Bbox(xmin, ymin, ymax - ymin, xmax - xmin, cls_name, score))
-
-    return res
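Each Bbox returned here carries a top-left point, height, width, label, and confidence, which is how the inference module later crops formula regions. A minimal, hypothetical sketch for eyeballing the detector output; the colours and output filename are illustrative, not part of the API:

# Hypothetical helper: draw the detected formula boxes on the source image.
# Field access (bbox.p.x, bbox.p.y, bbox.h, bbox.w, bbox.label) mirrors how
# paragraph2md crops formula regions; the output path is made up for the example.
import cv2

from texteller.api import load_latexdet_model, latex_detect


def draw_detections(img_path: str, out_path: str = "detections.png") -> None:
    predictor = load_latexdet_model()
    img = cv2.imread(img_path)
    for bbox in latex_detect(img_path, predictor):
        color = (0, 0, 255) if bbox.label == "isolated" else (0, 255, 0)
        cv2.rectangle(
            img,
            (bbox.p.x, bbox.p.y),
            (bbox.p.x + bbox.w, bbox.p.y + bbox.h),
            color,
            2,
        )
    cv2.imwrite(out_path, img)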
\ No newline at end of file
diff --git a/_modules/texteller/api/inference.html b/_modules/texteller/api/inference.html
deleted file mode 100644
index 93957a1..0000000
--- a/_modules/texteller/api/inference.html
+++ /dev/null
@@ -1,669 +0,0 @@
-texteller.api.inference — TexTeller documentation
Source code for texteller.api.inference

-import re
-import time
-from collections import Counter
-from typing import Literal
-
-import cv2
-import numpy as np
-import torch
-from onnxruntime import InferenceSession
-from optimum.onnxruntime import ORTModelForVision2Seq
-from transformers import GenerationConfig, RobertaTokenizerFast
-
-from texteller.constants import MAX_TOKEN_SIZE
-from texteller.logger import get_logger
-from texteller.paddleocr import predict_det, predict_rec
-from texteller.types import Bbox, TexTellerModel
-from texteller.utils import (
-    bbox_merge,
-    get_device,
-    mask_img,
-    readimgs,
-    remove_style,
-    slice_from_image,
-    split_conflict,
-    transform,
-    add_newlines,
-)
-
-from .detection import latex_detect
-from .format import format_latex
-from .katex import to_katex
-
-_logger = get_logger()
-
-
-
-def img2latex(
-    model: TexTellerModel,
-    tokenizer: RobertaTokenizerFast,
-    images: list[str] | list[np.ndarray],
-    device: torch.device | None = None,
-    out_format: Literal["latex", "katex"] = "latex",
-    keep_style: bool = False,
-    max_tokens: int = MAX_TOKEN_SIZE,
-    num_beams: int = 1,
-    no_repeat_ngram_size: int = 0,
-) -> list[str]:
-    """
-    Convert images to LaTeX or KaTeX formatted strings.
-
-    Args:
-        model: The TexTeller or ORTModelForVision2Seq model instance
-        tokenizer: The tokenizer for the model
-        images: List of image paths or numpy arrays (RGB format)
-        device: The torch device to use (defaults to available GPU or CPU)
-        out_format: Output format, either "latex" or "katex"
-        keep_style: Whether to keep the style of the LaTeX
-        max_tokens: Maximum number of tokens to generate
-        num_beams: Number of beams for beam search
-        no_repeat_ngram_size: Size of n-grams to prevent repetition
-
-    Returns:
-        List of LaTeX or KaTeX strings corresponding to each input image
-
-    Example:
-        >>> import torch
-        >>> from texteller import load_model, load_tokenizer, img2latex
-        >>>
-        >>> model = load_model(model_path=None, use_onnx=False)
-        >>> tokenizer = load_tokenizer(tokenizer_path=None)
-        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        >>>
-        >>> res = img2latex(model, tokenizer, ["path/to/image.png"], device=device, out_format="katex")
-    """
-    assert isinstance(images, list)
-    assert len(images) > 0
-
-    if device is None:
-        device = get_device()
-
-    if device.type != model.device.type:
-        if isinstance(model, ORTModelForVision2Seq):
-            _logger.warning(
-                f"Onnxruntime device mismatch: detected {str(device)} but model is on {str(model.device)}, using {str(model.device)} instead"
-            )
-        else:
-            model = model.to(device=device)
-
-    if isinstance(images[0], str):
-        images = readimgs(images)
-    else:  # already numpy array (rgb format)
-        assert isinstance(images[0], np.ndarray)
-        images = images
-
-    images = transform(images)
-    pixel_values = torch.stack(images)
-
-    generate_config = GenerationConfig(
-        max_new_tokens=max_tokens,
-        num_beams=num_beams,
-        do_sample=False,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        bos_token_id=tokenizer.bos_token_id,
-        no_repeat_ngram_size=no_repeat_ngram_size,
-    )
-    pred = model.generate(
-        pixel_values.to(model.device),
-        generation_config=generate_config,
-    )
-
-    res = tokenizer.batch_decode(pred, skip_special_tokens=True)
-
-    if out_format == "katex":
-        res = [to_katex(r) for r in res]
-
-    if not keep_style:
-        res = [remove_style(r) for r in res]
-
-    res = [format_latex(r) for r in res]
-    res = [add_newlines(r) for r in res]
-    return res
-
-
-def paragraph2md(
-    img_path: str,
-    latexdet_model: InferenceSession,
-    textdet_model: predict_det.TextDetector,
-    textrec_model: predict_rec.TextRecognizer,
-    latexrec_model: TexTellerModel,
-    tokenizer: RobertaTokenizerFast,
-    device: torch.device | None = None,
-    num_beams=1,
-) -> str:
-    """
-    Convert an image containing both text and mathematical formulas to markdown format.
-
-    This function processes a mixed-content image by:
-    1. Detecting mathematical formulas using a latex detection model
-    2. Masking detected formula areas and detecting text regions using OCR
-    3. Recognizing text in the detected regions
-    4. Converting formula regions to LaTeX using the latex recognition model
-    5. Combining all detected elements into a properly formatted markdown string
-
-    Args:
-        img_path: Path to the input image containing text and formulas
-        latexdet_model: ONNX InferenceSession for LaTeX formula detection
-        textdet_model: OCR text detector model
-        textrec_model: OCR text recognition model
-        latexrec_model: TexTeller model for LaTeX formula recognition
-        tokenizer: Tokenizer for the LaTeX recognition model
-        device: The torch device to use (defaults to available GPU or CPU)
-        num_beams: Number of beams for beam search during LaTeX generation
-
-    Returns:
-        Markdown formatted string containing the recognized text and formulas
-
-    Example:
-        >>> from texteller import load_latexdet_model, load_textdet_model, load_textrec_model, load_tokenizer, paragraph2md
-        >>>
-        >>> # Load all required models
-        >>> latexdet_model = load_latexdet_model()
-        >>> textdet_model = load_textdet_model()
-        >>> textrec_model = load_textrec_model()
-        >>> latexrec_model = load_model()
-        >>> tokenizer = load_tokenizer()
-        >>>
-        >>> # Convert image to markdown
-        >>> markdown_text = paragraph2md(
-        ...     img_path="path/to/mixed_content_image.jpg",
-        ...     latexdet_model=latexdet_model,
-        ...     textdet_model=textdet_model,
-        ...     textrec_model=textrec_model,
-        ...     latexrec_model=latexrec_model,
-        ...     tokenizer=tokenizer,
-        ... )
-    """
-    img = cv2.imread(img_path)
-    corners = [tuple(img[0, 0]), tuple(img[0, -1]), tuple(img[-1, 0]), tuple(img[-1, -1])]
-    bg_color = np.array(Counter(corners).most_common(1)[0][0])
-
-    start_time = time.time()
-    latex_bboxes = latex_detect(img_path, latexdet_model)
-    end_time = time.time()
-    _logger.info(f"latex_det_model time: {end_time - start_time:.2f}s")
-    latex_bboxes = sorted(latex_bboxes)
-    latex_bboxes = bbox_merge(latex_bboxes)
-    masked_img = mask_img(img, latex_bboxes, bg_color)
-
-    start_time = time.time()
-    det_prediction, _ = textdet_model(masked_img)
-    end_time = time.time()
-    _logger.info(f"ocr_det_model time: {end_time - start_time:.2f}s")
-    ocr_bboxes = [
-        Bbox(
-            p[0][0],
-            p[0][1],
-            p[3][1] - p[0][1],
-            p[1][0] - p[0][0],
-            label="text",
-            confidence=None,
-            content=None,
-        )
-        for p in det_prediction
-    ]
-
-    ocr_bboxes = sorted(ocr_bboxes)
-    ocr_bboxes = bbox_merge(ocr_bboxes)
-    ocr_bboxes = split_conflict(ocr_bboxes, latex_bboxes)
-    ocr_bboxes = list(filter(lambda x: x.label == "text", ocr_bboxes))
-
-    sliced_imgs: list[np.ndarray] = slice_from_image(img, ocr_bboxes)
-    start_time = time.time()
-    rec_predictions, _ = textrec_model(sliced_imgs)
-    end_time = time.time()
-    _logger.info(f"ocr_rec_model time: {end_time - start_time:.2f}s")
-
-    assert len(rec_predictions) == len(ocr_bboxes)
-    for content, bbox in zip(rec_predictions, ocr_bboxes):
-        bbox.content = content[0]
-
-    latex_imgs = []
-    for bbox in latex_bboxes:
-        latex_imgs.append(img[bbox.p.y : bbox.p.y + bbox.h, bbox.p.x : bbox.p.x + bbox.w])
-    start_time = time.time()
-    latex_rec_res = img2latex(
-        model=latexrec_model,
-        tokenizer=tokenizer,
-        images=latex_imgs,
-        num_beams=num_beams,
-        out_format="katex",
-        device=device,
-        keep_style=False,
-    )
-    end_time = time.time()
-    _logger.info(f"latex_rec_model time: {end_time - start_time:.2f}s")
-
-    for bbox, content in zip(latex_bboxes, latex_rec_res):
-        if bbox.label == "embedding":
-            bbox.content = " $" + content + "$ "
-        elif bbox.label == "isolated":
-            bbox.content = "\n\n" + r"$$" + content + r"$$" + "\n\n"
-
-    bboxes = sorted(ocr_bboxes + latex_bboxes)
-    if bboxes == []:
-        return ""
-
-    md = ""
-    prev = Bbox(bboxes[0].p.x, bboxes[0].p.y, -1, -1, label="guard")
-    for curr in bboxes:
-        # Add the formula number back to the isolated formula
-        if prev.label == "isolated" and curr.label == "text" and prev.same_row(curr):
-            curr.content = curr.content.strip()
-            if curr.content.startswith("(") and curr.content.endswith(")"):
-                curr.content = curr.content[1:-1]
-
-            if re.search(r"\\tag\{.*\}$", md[:-4]) is not None:
-                # in case of multiple tag
-                md = md[:-5] + f", {curr.content}" + "}" + md[-4:]
-            else:
-                md = md[:-4] + f"\\tag{{{curr.content}}}" + md[-4:]
-            continue
-
-        if not prev.same_row(curr):
-            md += " "
-
-        if curr.label == "embedding":
-            # remove the bold effect from inline formulas
-            curr.content = remove_style(curr.content)
-
-            # change split environment into aligned
-            curr.content = curr.content.replace(r"\begin{split}", r"\begin{aligned}")
-            curr.content = curr.content.replace(r"\end{split}", r"\end{aligned}")
-
-            # remove extra spaces (keeping only one)
-            curr.content = re.sub(r" +", " ", curr.content)
-            assert curr.content.startswith("$") and curr.content.endswith("$")
-            curr.content = " $" + curr.content.strip("$") + "$ "
-        md += curr.content
-        prev = curr
-
-    return md.strip()
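One detail worth calling out from img2latex above: when the images argument is a list of numpy arrays, the array branch skips readimgs() and expects RGB order, whereas cv2.imread returns BGR. A small sketch of that conversion; the filename is a placeholder:

# Passing an in-memory image to img2latex: convert OpenCV's BGR output to RGB
# first, since numpy inputs are used as-is. "formula.png" is a placeholder name.
import cv2

from texteller import img2latex, load_model, load_tokenizer

model = load_model()
tokenizer = load_tokenizer()

bgr = cv2.imread("formula.png")
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
print(img2latex(model, tokenizer, [rgb])[0])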
\ No newline at end of file
diff --git a/_modules/texteller/api/katex.html b/_modules/texteller/api/katex.html
deleted file mode 100644
index e7ac432..0000000
--- a/_modules/texteller/api/katex.html
+++ /dev/null
@@ -1,517 +0,0 @@
-texteller.api.katex — TexTeller documentation
Source code for texteller.api.katex

-import re
-
-from ..utils.latex import change_all
-from .format import format_latex
-
-
-def _rm_dollar_surr(content):
-    pattern = re.compile(r"\\[a-zA-Z]+\$.*?\$|\$.*?\$")
-    matches = pattern.findall(content)
-
-    for match in matches:
-        if not re.match(r"\\[a-zA-Z]+", match):
-            new_match = match.strip("$")
-            content = content.replace(match, " " + new_match + " ")
-
-    return content
-
-
-
-def to_katex(formula: str) -> str:
-    """
-    Convert LaTeX formula to KaTeX-compatible format.
-
-    This function processes a LaTeX formula string and converts it to a format
-    that is compatible with KaTeX rendering. It removes unsupported commands
-    and structures, simplifies LaTeX environments, and optimizes the formula
-    for web display.
-
-    Args:
-        formula: LaTeX formula string to convert
-
-    Returns:
-        KaTeX-compatible formula string
-    """
-    res = formula
-    # remove mbox surrounding
-    res = change_all(res, r"\mbox ", r" ", r"{", r"}", r"", r"")
-    res = change_all(res, r"\mbox", r" ", r"{", r"}", r"", r"")
-    # remove hbox surrounding
-    res = re.sub(r"\\hbox to ?-? ?\d+\.\d+(pt)?\{", r"\\hbox{", res)
-    res = change_all(res, r"\hbox", r" ", r"{", r"}", r"", r" ")
-    # remove raise surrounding
-    res = re.sub(r"\\raise ?-? ?\d+\.\d+(pt)?", r" ", res)
-    # remove makebox
-    res = re.sub(r"\\makebox ?\[\d+\.\d+(pt)?\]\{", r"\\makebox{", res)
-    res = change_all(res, r"\makebox", r" ", r"{", r"}", r"", r" ")
-    # remove vbox surrounding, scalebox surrounding
-    res = re.sub(r"\\raisebox\{-? ?\d+\.\d+(pt)?\}\{", r"\\raisebox{", res)
-    res = re.sub(r"\\scalebox\{-? ?\d+\.\d+(pt)?\}\{", r"\\scalebox{", res)
-    res = change_all(res, r"\scalebox", r" ", r"{", r"}", r"", r" ")
-    res = change_all(res, r"\raisebox", r" ", r"{", r"}", r"", r" ")
-    res = change_all(res, r"\vbox", r" ", r"{", r"}", r"", r" ")
-
-    origin_instructions = [
-        r"\Huge",
-        r"\huge",
-        r"\LARGE",
-        r"\Large",
-        r"\large",
-        r"\normalsize",
-        r"\small",
-        r"\footnotesize",
-        r"\tiny",
-    ]
-    for old_ins, new_ins in zip(origin_instructions, origin_instructions):
-        res = change_all(res, old_ins, new_ins, r"$", r"$", "{", "}")
-    res = change_all(res, r"\mathbf", r"\bm", r"{", r"}", r"{", r"}")
-    res = change_all(res, r"\boldmath ", r"\bm", r"{", r"}", r"{", r"}")
-    res = change_all(res, r"\boldmath", r"\bm", r"{", r"}", r"{", r"}")
-    res = change_all(res, r"\boldmath ", r"\bm", r"$", r"$", r"{", r"}")
-    res = change_all(res, r"\boldmath", r"\bm", r"$", r"$", r"{", r"}")
-    res = change_all(res, r"\scriptsize", r"\scriptsize", r"$", r"$", r"{", r"}")
-    res = change_all(res, r"\emph", r"\textit", r"{", r"}", r"{", r"}")
-    res = change_all(res, r"\emph ", r"\textit", r"{", r"}", r"{", r"}")
-
-    # remove bold command
-    res = change_all(res, r"\bm", r" ", r"{", r"}", r"", r"")
-
-    origin_instructions = [
-        r"\left",
-        r"\middle",
-        r"\right",
-        r"\big",
-        r"\Big",
-        r"\bigg",
-        r"\Bigg",
-        r"\bigl",
-        r"\Bigl",
-        r"\biggl",
-        r"\Biggl",
-        r"\bigm",
-        r"\Bigm",
-        r"\biggm",
-        r"\Biggm",
-        r"\bigr",
-        r"\Bigr",
-        r"\biggr",
-        r"\Biggr",
-    ]
-    for origin_ins in origin_instructions:
-        res = change_all(res, origin_ins, origin_ins, r"{", r"}", r"", r"")
-
-    res = re.sub(r"\\\[(.*?)\\\]", r"\1\\newline", res)
-
-    if res.endswith(r"\newline"):
-        res = res[:-8]
-
-    # remove multiple spaces
-    res = re.sub(r"(\\,){1,}", " ", res)
-    res = re.sub(r"(\\!){1,}", " ", res)
-    res = re.sub(r"(\\;){1,}", " ", res)
-    res = re.sub(r"(\\:){1,}", " ", res)
-    res = re.sub(r"\\vspace\{.*?}", "", res)
-
-    # merge consecutive text
-    def merge_texts(match):
-        texts = match.group(0)
-        merged_content = "".join(re.findall(r"\\text\{([^}]*)\}", texts))
-        return f"\\text{{{merged_content}}}"
-
-    res = re.sub(r"(\\text\{[^}]*\}\s*){2,}", merge_texts, res)
-
-    res = res.replace(r"\bf ", "")
-    res = _rm_dollar_surr(res)
-
-    # remove extra spaces (keeping only one)
-    res = re.sub(r" +", " ", res)
-
-    # format latex
-    res = res.strip()
-    res = format_latex(res)
-
-    return res
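A quick way to exercise to_katex in isolation is to feed it a formula that uses the constructs it rewrites (\mbox, size commands, \big delimiters). The input below is invented for the example, and the exact output depends on the replacement rules above, so it is only printed:

# Hedged sanity check for to_katex: the input string is made up, and the printed
# result simply reflects whatever the rules above produce.
from texteller.api import to_katex

raw = r"\mbox{area} = \big( \frac{\large a \cdot b}{2} \big)"
print(to_katex(raw))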
\ No newline at end of file
diff --git a/_modules/texteller/api/load.html b/_modules/texteller/api/load.html
deleted file mode 100644
index 1c3ad0f..0000000
--- a/_modules/texteller/api/load.html
+++ /dev/null
@@ -1,553 +0,0 @@
-texteller.api.load — TexTeller documentation
Source code for texteller.api.load

-from pathlib import Path
-
-import wget
-from onnxruntime import InferenceSession
-from transformers import RobertaTokenizerFast
-
-from texteller.constants import LATEX_DET_MODEL_URL, TEXT_DET_MODEL_URL, TEXT_REC_MODEL_URL
-from texteller.globals import Globals
-from texteller.logger import get_logger
-from texteller.models import TexTeller
-from texteller.paddleocr import predict_det, predict_rec
-from texteller.paddleocr.utility import parse_args
-from texteller.utils import cuda_available, mkdir, resolve_path
-from texteller.types import TexTellerModel
-
-_logger = get_logger(__name__)
-
-
-
-def load_model(model_dir: str | None = None, use_onnx: bool = False) -> TexTellerModel:
-    """
-    Load the TexTeller model for LaTeX recognition.
-
-    This function loads the main TexTeller model, which is responsible for
-    converting images to LaTeX. It can load either the standard PyTorch model
-    or the optimized ONNX version.
-
-    Args:
-        model_dir: Directory containing the model files. If None, uses the default model.
-        use_onnx: Whether to load the ONNX version of the model for faster inference.
-            Requires the 'optimum' package and ONNX Runtime.
-
-    Returns:
-        Loaded TexTeller model instance
-
-    Example:
-        >>> from texteller import load_model
-        >>>
-        >>> model = load_model(use_onnx=True)
-    """
-    return TexTeller.from_pretrained(model_dir, use_onnx=use_onnx)
-
-
-def load_tokenizer(tokenizer_dir: str | None = None) -> RobertaTokenizerFast:
-    """
-    Load the tokenizer for the TexTeller model.
-
-    This function loads the tokenizer used by the TexTeller model for
-    encoding and decoding LaTeX sequences.
-
-    Args:
-        tokenizer_dir: Directory containing the tokenizer files. If None, uses the default tokenizer.
-
-    Returns:
-        RobertaTokenizerFast instance
-
-    Example:
-        >>> from texteller import load_tokenizer
-        >>>
-        >>> tokenizer = load_tokenizer()
-    """
-    return TexTeller.get_tokenizer(tokenizer_dir)
-
-
-def load_latexdet_model() -> InferenceSession:
-    """
-    Load the LaTeX detection model.
-
-    This function loads the model responsible for detecting LaTeX formulas in images.
-    The model is implemented as an ONNX InferenceSession for optimal performance.
-
-    Returns:
-        ONNX InferenceSession for LaTeX detection
-
-    Example:
-        >>> from texteller import load_latexdet_model
-        >>>
-        >>> detector = load_latexdet_model()
-    """
-    fpath = _maybe_download(LATEX_DET_MODEL_URL)
-    return InferenceSession(
-        resolve_path(fpath),
-        providers=["CUDAExecutionProvider" if cuda_available() else "CPUExecutionProvider"],
-    )
-
-
-def load_textrec_model() -> predict_rec.TextRecognizer:
-    """
-    Load the text recognition model.
-
-    This function loads the model responsible for recognizing regular text in images.
-    It's based on PaddleOCR's text recognition model.
-
-    Returns:
-        PaddleOCR TextRecognizer instance
-
-    Example:
-        >>> from texteller import load_textrec_model
-        >>>
-        >>> text_recognizer = load_textrec_model()
-    """
-    fpath = _maybe_download(TEXT_REC_MODEL_URL)
-    paddleocr_args = parse_args()
-    paddleocr_args.use_onnx = True
-    paddleocr_args.rec_model_dir = resolve_path(fpath)
-    paddleocr_args.use_gpu = cuda_available()
-    predictor = predict_rec.TextRecognizer(paddleocr_args)
-    return predictor
-
-
-def load_textdet_model() -> predict_det.TextDetector:
-    """
-    Load the text detection model.
-
-    This function loads the model responsible for detecting text regions in images.
-    It's based on PaddleOCR's text detection model.
-
-    Returns:
-        PaddleOCR TextDetector instance
-
-    Example:
-        >>> from texteller import load_textdet_model
-        >>>
-        >>> text_detector = load_textdet_model()
-    """
-    fpath = _maybe_download(TEXT_DET_MODEL_URL)
-    paddleocr_args = parse_args()
-    paddleocr_args.use_onnx = True
-    paddleocr_args.det_model_dir = resolve_path(fpath)
-    paddleocr_args.use_gpu = cuda_available()
-    predictor = predict_det.TextDetector(paddleocr_args)
-    return predictor
-
-
-def _maybe_download(url: str, dirpath: str | None = None, force: bool = False) -> Path:
-    """
-    Download a file if it doesn't already exist.
-
-    Args:
-        url: URL to download from
-        dirpath: Directory to save the file in. If None, uses the default cache directory.
-        force: Whether to force download even if the file already exists
-
-    Returns:
-        Path to the downloaded file
-    """
-    if dirpath is None:
-        dirpath = Globals().cache_dir
-    mkdir(dirpath)
-
-    fname = Path(url).name
-    fpath = Path(dirpath) / fname
-    if not fpath.exists() or force:
-        _logger.info(f"Downloading {fname} from {url} to {fpath}")
-        wget.download(url, resolve_path(fpath))
-
-    return fpath
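Since _maybe_download caches weights under Globals().cache_dir, the auxiliary detection and OCR models can be fetched once ahead of time (for example during an image build) so later runs hit the cache. A hypothetical warm-up sketch:

# Hypothetical warm-up script: trigger the downloads once so later calls reuse
# the cache directory used by _maybe_download above.
from texteller import load_latexdet_model, load_textdet_model, load_textrec_model
from texteller.globals import Globals

print(f"Model weights are cached under: {Globals().cache_dir}")
load_latexdet_model()
load_textdet_model()
load_textrec_model()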
\ No newline at end of file
diff --git a/api.html b/api.html
index 58c863d..9dd9d3e 100644
--- a/api.html
+++ b/api.html
@@ -373,30 +373,11 @@ document.write(`
@@ -423,255 +404,18 @@ document.write(`

Image to LaTeX Conversion#

-img2latex(model: VisionEncoderDecoderModel | ORTModelForVision2Seq, tokenizer: RobertaTokenizerFast, images: list[str] | list[ndarray], device: device | None = None, out_format: Literal['latex', 'katex'] = 'latex', keep_style: bool = False, max_tokens: int = 1024, num_beams: int = 1, no_repeat_ngram_size: int = 0) → list[str]
-
-Convert images to LaTeX or KaTeX formatted strings.
-
-Parameters:
-  • model – The TexTeller or ORTModelForVision2Seq model instance
-  • tokenizer – The tokenizer for the model
-  • images – List of image paths or numpy arrays (RGB format)
-  • device – The torch device to use (defaults to available GPU or CPU)
-  • out_format – Output format, either "latex" or "katex"
-  • keep_style – Whether to keep the style of the LaTeX
-  • max_tokens – Maximum number of tokens to generate
-  • num_beams – Number of beams for beam search
-  • no_repeat_ngram_size – Size of n-grams to prevent repetition
-
-Returns:
-  List of LaTeX or KaTeX strings corresponding to each input image
-
-Example
-
->>> import torch
->>> from texteller import load_model, load_tokenizer, img2latex
->>>
->>> model = load_model(model_path=None, use_onnx=False)
->>> tokenizer = load_tokenizer(tokenizer_path=None)
->>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
->>>
->>> res = img2latex(model, tokenizer, ["path/to/image.png"], device=device, out_format="katex")
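The decoding parameters documented above can be tuned when greedy decoding struggles; a hedged example follows (the image path is a placeholder, and whether beam search actually helps depends on the input):

# Illustrative use of the generation knobs: beam search plus an n-gram
# repetition constraint. "long_equation.png" is a placeholder path.
from texteller import img2latex, load_model, load_tokenizer

model = load_model()
tokenizer = load_tokenizer()
outputs = img2latex(
    model,
    tokenizer,
    ["long_equation.png"],
    num_beams=3,              # beam search instead of greedy decoding
    no_repeat_ngram_size=5,   # discourage repeated n-grams in the output
    max_tokens=1024,          # the documented default
)
print(outputs[0])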

Paragraph to Markdown Conversion#

-paragraph2md(img_path: str, latexdet_model: InferenceSession, textdet_model: TextDetector, textrec_model: TextRecognizer, latexrec_model: VisionEncoderDecoderModel | ORTModelForVision2Seq, tokenizer: RobertaTokenizerFast, device: device | None = None, num_beams=1) → str
-
-Convert an image containing both text and mathematical formulas to markdown format.
-
-This function processes a mixed-content image by:
-  1. Detecting mathematical formulas using a latex detection model
-  2. Masking detected formula areas and detecting text regions using OCR
-  3. Recognizing text in the detected regions
-  4. Converting formula regions to LaTeX using the latex recognition model
-  5. Combining all detected elements into a properly formatted markdown string
-
-Parameters:
-  • img_path – Path to the input image containing text and formulas
-  • latexdet_model – ONNX InferenceSession for LaTeX formula detection
-  • textdet_model – OCR text detector model
-  • textrec_model – OCR text recognition model
-  • latexrec_model – TexTeller model for LaTeX formula recognition
-  • tokenizer – Tokenizer for the LaTeX recognition model
-  • device – The torch device to use (defaults to available GPU or CPU)
-  • num_beams – Number of beams for beam search during LaTeX generation
-
-Returns:
-  Markdown formatted string containing the recognized text and formulas
-
-Example
-
->>> from texteller import load_latexdet_model, load_textdet_model, load_textrec_model, load_tokenizer, paragraph2md
->>>
->>> # Load all required models
->>> latexdet_model = load_latexdet_model()
->>> textdet_model = load_textdet_model()
->>> textrec_model = load_textrec_model()
->>> latexrec_model = load_model()
->>> tokenizer = load_tokenizer()
->>>
->>> # Convert image to markdown
->>> markdown_text = paragraph2md(
-...     img_path="path/to/mixed_content_image.jpg",
-...     latexdet_model=latexdet_model,
-...     textdet_model=textdet_model,
-...     textrec_model=textrec_model,
-...     latexrec_model=latexrec_model,
-...     tokenizer=tokenizer,
-... )
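All five models above can be loaded once and reused across pages; a sketch of batch conversion under that assumption, with invented directory names and output layout:

# Batch-convert a folder of page images with paragraph2md, loading the models a
# single time. "scanned_pages" and "markdown_out" are placeholder directories.
from pathlib import Path

from texteller import (
    load_latexdet_model,
    load_model,
    load_textdet_model,
    load_textrec_model,
    load_tokenizer,
    paragraph2md,
)

latexdet_model = load_latexdet_model()
textdet_model = load_textdet_model()
textrec_model = load_textrec_model()
latexrec_model = load_model()
tokenizer = load_tokenizer()

out_dir = Path("markdown_out")
out_dir.mkdir(exist_ok=True)

for page in sorted(Path("scanned_pages").glob("*.png")):
    md = paragraph2md(
        img_path=str(page),
        latexdet_model=latexdet_model,
        textdet_model=textdet_model,
        textrec_model=textrec_model,
        latexrec_model=latexrec_model,
        tokenizer=tokenizer,
    )
    (out_dir / f"{page.stem}.md").write_text(md, encoding="utf-8")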

LaTeX Detection#

-latex_detect(img_path: str, predictor: InferenceSession) → List[Bbox]
-
-Detect LaTeX formulas in an image and classify them as isolated or embedded.
-
-This function uses an ONNX model to detect LaTeX formulas in images. The model
-identifies two types of LaTeX formulas:
-  • 'isolated': Standalone LaTeX formulas (typically displayed equations)
-  • 'embedding': Inline LaTeX formulas embedded within text
-
-Parameters:
-  • img_path – Path to the input image file
-  • predictor – ONNX InferenceSession model for LaTeX detection
-
-Returns:
-  List of Bbox objects representing the detected LaTeX formulas with their
-  positions, classifications, and confidence scores
-
-Example
-
->>> from texteller.api import load_latexdet_model, latex_detect
->>> model = load_latexdet_model()
->>> bboxes = latex_detect("path/to/image.png", model)

Model Loading#

-load_model(model_dir: str | None = None, use_onnx: bool = False) → VisionEncoderDecoderModel | ORTModelForVision2Seq
-
-Load the TexTeller model for LaTeX recognition.
-
-This function loads the main TexTeller model, which is responsible for
-converting images to LaTeX. It can load either the standard PyTorch model
-or the optimized ONNX version.
-
-Parameters:
-  • model_dir – Directory containing the model files. If None, uses the default model.
-  • use_onnx – Whether to load the ONNX version of the model for faster inference.
-    Requires the 'optimum' package and ONNX Runtime.
-
-Returns:
-  Loaded TexTeller model instance
-
-Example
-
->>> from texteller import load_model
->>>
->>> model = load_model(use_onnx=True)
-
-
-load_tokenizer(tokenizer_dir: str | None = None) → RobertaTokenizerFast
-
-Load the tokenizer for the TexTeller model.
-
-This function loads the tokenizer used by the TexTeller model for
-encoding and decoding LaTeX sequences.
-
-Parameters:
-  tokenizer_dir – Directory containing the tokenizer files. If None, uses the default tokenizer.
-
-Returns:
-  RobertaTokenizerFast instance
-
-Example
-
->>> from texteller import load_tokenizer
->>>
->>> tokenizer = load_tokenizer()
-
-
-load_latexdet_model() → InferenceSession
-
-Load the LaTeX detection model.
-
-This function loads the model responsible for detecting LaTeX formulas in images.
-The model is implemented as an ONNX InferenceSession for optimal performance.
-
-Returns:
-  ONNX InferenceSession for LaTeX detection
-
-Example
-
->>> from texteller import load_latexdet_model
->>>
->>> detector = load_latexdet_model()
-
-
-load_textdet_model() → TextDetector
-
-Load the text detection model.
-
-This function loads the model responsible for detecting text regions in images.
-It's based on PaddleOCR's text detection model.
-
-Returns:
-  PaddleOCR TextDetector instance
-
-Example
-
->>> from texteller import load_textdet_model
->>>
->>> text_detector = load_textdet_model()
-
-
-load_textrec_model() → TextRecognizer
-
-Load the text recognition model.
-
-This function loads the model responsible for recognizing regular text in images.
-It's based on PaddleOCR's text recognition model.
-
-Returns:
-  PaddleOCR TextRecognizer instance
-
-Example
-
->>> from texteller import load_textrec_model
->>>
->>> text_recognizer = load_textrec_model()
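For offline or pinned deployments, the loaders above accept explicit directories instead of the default weights; a minimal sketch with placeholder paths:

# Pointing the loaders at a local copy of the weights. The paths are placeholders;
# whether the model and tokenizer share a directory depends on how they were saved.
from texteller import load_model, load_tokenizer

model = load_model(model_dir="/opt/texteller/model", use_onnx=False)
tokenizer = load_tokenizer(tokenizer_dir="/opt/texteller/tokenizer")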

KaTeX Conversion#

-to_katex(formula: str) → str
-
-Convert LaTeX formula to KaTeX-compatible format.
-
-This function processes a LaTeX formula string and converts it to a format
-that is compatible with KaTeX rendering. It removes unsupported commands
-and structures, simplifies LaTeX environments, and optimizes the formula
-for web display.
-
-Parameters:
-  formula – LaTeX formula string to convert
-
-Returns:
-  KaTeX-compatible formula string
@@ -711,30 +455,11 @@ for web display.

diff --git a/genindex.html b/genindex.html
index e142200..f03f1e6 100644
--- a/genindex.html
+++ b/genindex.html
@@ -314,56 +314,8 @@ document.write(`

Index

-I | L | P | T

I

L

P

T
- diff --git a/objects.inv b/objects.inv index af3592f..d6b1518 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/searchindex.js b/searchindex.js index 2088780..2d703f1 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"API Documentation": [[1, "api-documentation"]], "API Reference": [[0, null]], "Features": [[1, "features"]], "Image to LaTeX Conversion": [[0, "image-to-latex-conversion"]], "Installation": [[1, "installation"]], "KaTeX Conversion": [[0, "katex-conversion"]], "LaTeX Detection": [[0, "latex-detection"]], "Model Loading": [[0, "model-loading"]], "Paragraph to Markdown Conversion": [[0, "paragraph-to-markdown-conversion"]], "Quick Start": [[1, "quick-start"]], "Table of Contents": [[0, "table-of-contents"]], "TexTeller Documentation": [[1, null]]}, "docnames": ["api", "index"], "envversion": {"nbsphinx": 4, "sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["api.rst", "index.rst"], "indexentries": {"img2latex() (in module texteller.api)": [[0, "texteller.api.img2latex", false]], "latex_detect() (in module texteller.api.detection)": [[0, "texteller.api.detection.latex_detect", false]], "load_latexdet_model() (in module texteller.api)": [[0, "texteller.api.load_latexdet_model", false]], "load_model() (in module texteller.api)": [[0, "texteller.api.load_model", false]], "load_textdet_model() (in module texteller.api)": [[0, "texteller.api.load_textdet_model", false]], "load_textrec_model() (in module texteller.api)": [[0, "texteller.api.load_textrec_model", false]], "load_tokenizer() (in module texteller.api)": [[0, "texteller.api.load_tokenizer", false]], "paragraph2md() (in module texteller.api)": [[0, "texteller.api.paragraph2md", false]], "to_katex() (in module texteller.api)": [[0, "texteller.api.to_katex", false]]}, "objects": {"texteller.api": [[0, 0, 1, "", "img2latex"], [0, 0, 1, "", "load_latexdet_model"], [0, 0, 1, "", "load_model"], [0, 0, 1, "", "load_textdet_model"], [0, 0, 1, "", "load_textrec_model"], [0, 0, 1, "", "load_tokenizer"], [0, 0, 1, "", "paragraph2md"], [0, 0, 1, "", "to_katex"]], "texteller.api.detection": [[0, 0, 1, "", "latex_detect"]]}, "objnames": {"0": ["py", "function", "Python function"]}, "objtypes": {"0": "py:function"}, "terms": {"": 0, "0": [0, 1], "1": 0, "1024": 0, "2": 0, "3": 0, "4": 0, "5": 0, "For": 1, "If": 0, "It": 0, "The": 0, "all": [0, 1], "an": [0, 1], "area": 0, "arrai": 0, "avail": 0, "base": 0, "bbox": 0, "beam": 0, "bool": 0, "both": 0, "can": [0, 1], "classif": 0, "classifi": 0, "code": 1, "combin": 0, "command": 0, "compat": 0, "confid": 0, "contain": [0, 1], "convers": 1, "convert": [0, 1], "correspond": 0, "cpu": 0, "cuda": 0, "decod": 0, "default": 0, "detail": [0, 1], "detect": 1, "detector": 0, "devic": 0, "directori": 0, "displai": 0, "document": 0, "dure": 0, "each": 0, "either": 0, "element": 0, "els": 0, "embed": 0, "encod": 0, "environ": 0, "equat": 0, "exampl": 0, "fals": [0, 1], "faster": 0, "file": 0, "format": [0, 1], "formula": [0, 1], "from": [0, 1], "function": 0, "gener": 0, "gpu": 0, "gram": 0, "i": 0, "identifi": 0, "imag": 1, "img2latex": [0, 1], "img_path": 0, "implement": 0, "import": [0, 1], "infer": 0, "inferencesess": 0, "inlin": 0, 
"input": 0, "instanc": 0, "int": 0, "is_avail": 0, "isol": 0, "jpg": 0, "keep": 0, "keep_styl": 0, "latex": 1, "latex_detect": 0, "latex_detector": 1, "latex_model": 1, "latexdet_model": 0, "latexrec_model": 0, "list": 0, "liter": 0, "load": 1, "load_latexdet_model": [0, 1], "load_model": [0, 1], "load_textdet_model": [0, 1], "load_textrec_model": [0, 1], "load_token": [0, 1], "locat": 1, "main": 0, "markdown": 1, "markdown_text": 0, "mask": 0, "mathemat": 0, "max_token": 0, "maximum": 0, "mix": [0, 1], "mixed_content_imag": 0, "mixed_imag": 1, "model": 1, "model_dir": 0, "model_path": 0, "n": 0, "ndarrai": 0, "no_repeat_ngram_s": 0, "none": 0, "num_beam": 0, "number": 0, "numpi": 0, "object": 0, "ocr": 0, "onnx": 0, "optim": 0, "optimum": 0, "ortmodelforvision2seq": 0, "out_format": 0, "output": 0, "packag": 0, "paddleocr": 0, "paragraph": 1, "paragraph2md": [0, 1], "paramet": 0, "path": [0, 1], "perform": 0, "pip": 1, "pleas": 1, "png": [0, 1], "posit": 0, "predictor": 0, "prevent": 0, "process": [0, 1], "properli": 0, "provid": 0, "pytorch": 0, "re": 0, "recogn": 0, "recognit": 0, "refer": 1, "region": 0, "regular": 0, "remov": 0, "render": 0, "repetit": 0, "repres": 0, "requir": [0, 1], "respons": 0, "return": 0, "rgb": 0, "robertatokenizerfast": 0, "runtim": 0, "score": 0, "search": 0, "section": 0, "see": 1, "sequenc": 0, "simplifi": 0, "size": 0, "sourc": 0, "standalon": 0, "standard": 0, "str": 0, "string": 0, "structur": 0, "style": 0, "text": [0, 1], "text_detector": [0, 1], "text_recogn": [0, 1], "textdet_model": 0, "textdetector": 0, "textel": 0, "textrec_model": 0, "textrecogn": 0, "them": 0, "thi": 0, "to_katex": 0, "token": [0, 1], "tokenizer_dir": 0, "tokenizer_path": 0, "tool": 0, "torch": 0, "true": 0, "two": 0, "type": 0, "typic": 0, "unsupport": 0, "us": [0, 1], "use_onnx": [0, 1], "version": 0, "visionencoderdecodermodel": 0, "web": 0, "whether": 0, "which": 0, "within": 0, "you": 1}, "titles": ["API Reference", "TexTeller Documentation"], "titleterms": {"api": [0, 1], "content": 0, "convers": 0, "detect": 0, "document": 1, "featur": 1, "imag": 0, "instal": 1, "katex": 0, "latex": 0, "load": 0, "markdown": 0, "model": 0, "paragraph": 0, "quick": 1, "refer": 0, "start": 1, "tabl": 0, "textel": 1}}) \ No newline at end of file +Search.setIndex({"alltitles": {"API Documentation": [[1, "api-documentation"]], "API Reference": [[0, null]], "Features": [[1, "features"]], "Image to LaTeX Conversion": [[0, "image-to-latex-conversion"]], "Installation": [[1, "installation"]], "KaTeX Conversion": [[0, "katex-conversion"]], "LaTeX Detection": [[0, "latex-detection"]], "Model Loading": [[0, "model-loading"]], "Paragraph to Markdown Conversion": [[0, "paragraph-to-markdown-conversion"]], "Quick Start": [[1, "quick-start"]], "Table of Contents": [[0, "table-of-contents"]], "TexTeller Documentation": [[1, null]]}, "docnames": ["api", "index"], "envversion": {"nbsphinx": 4, "sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["api.rst", "index.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"0": 1, "For": 1, "all": 1, "an": 1, "can": 1, "code": 1, "contain": 1, "convers": 1, "convert": [0, 1], "detail": [0, 1], "detect": 1, "document": 0, "fals": 1, 
"format": 1, "formula": [0, 1], "from": 1, "i": 0, "imag": 1, "img2latex": 1, "import": 1, "latex": 1, "latex_detector": 1, "latex_model": 1, "load": 1, "load_latexdet_model": 1, "load_model": 1, "load_textdet_model": 1, "load_textrec_model": 1, "load_token": 1, "locat": 1, "markdown": 1, "mix": [0, 1], "mixed_imag": 1, "model": 1, "packag": 0, "paragraph": 1, "paragraph2md": 1, "path": 1, "pip": 1, "pleas": 1, "png": 1, "process": 1, "provid": 0, "recogn": 0, "refer": 1, "requir": 1, "section": 0, "see": 1, "text": [0, 1], "text_detector": 1, "text_recogn": 1, "textel": 0, "thi": 0, "token": 1, "tool": 0, "us": 1, "use_onnx": 1, "you": 1}, "titles": ["API Reference", "TexTeller Documentation"], "titleterms": {"api": [0, 1], "content": 0, "convers": 0, "detect": 0, "document": 1, "featur": 1, "imag": 0, "instal": 1, "katex": 0, "latex": 0, "load": 0, "markdown": 0, "model": 0, "paragraph": 0, "quick": 1, "refer": 0, "start": 1, "tabl": 0, "textel": 1}}) \ No newline at end of file