diff --git a/.github/workflows/pr-welcome.yml b/.github/workflows/pr-welcome.yml
index 16841ed..5e981ac 100644
--- a/.github/workflows/pr-welcome.yml
+++ b/.github/workflows/pr-welcome.yml
@@ -4,6 +4,10 @@ on:
pull_request:
types: [opened]
+permissions:
+ pull-requests: write
+ issues: write
+
jobs:
welcome:
runs-on: ubuntu-latest
diff --git a/assets/fire.svg b/assets/fire.svg
index 8f9f7eb..522dff7 100644
--- a/assets/fire.svg
+++ b/assets/fire.svg
@@ -457,4 +457,4 @@
-
\ No newline at end of file
+
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 33d39e4..7a246ae 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -12,64 +12,64 @@
import os
import sys
-sys.path.insert(0, os.path.abspath('../..'))
+sys.path.insert(0, os.path.abspath("../.."))
# -- Project information -----------------------------------------------------
-project = 'TexTeller'
-copyright = '2025, TexTeller Team'
-author = 'TexTeller Team'
+project = "TexTeller"
+copyright = "2025, TexTeller Team"
+author = "TexTeller Team"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = [
- 'myst_parser',
- 'sphinx.ext.duration',
- 'sphinx.ext.intersphinx',
- 'sphinx.ext.autosectionlabel',
- 'sphinx.ext.autodoc',
- 'sphinx.ext.viewcode',
- 'sphinx.ext.napoleon',
- 'sphinx.ext.autosummary',
- 'sphinx_copybutton',
+ "myst_parser",
+ "sphinx.ext.duration",
+ "sphinx.ext.intersphinx",
+ "sphinx.ext.autosectionlabel",
+ "sphinx.ext.autodoc",
+ "sphinx.ext.viewcode",
+ "sphinx.ext.napoleon",
+ "sphinx.ext.autosummary",
+ "sphinx_copybutton",
# 'sphinx.ext.linkcode',
# 'sphinxarg.ext',
- 'sphinx_design',
- 'nbsphinx',
+ "sphinx_design",
+ "nbsphinx",
]
-templates_path = ['_templates']
+templates_path = ["_templates"]
exclude_patterns = []
# Autodoc settings
-autodoc_member_order = 'bysource'
+autodoc_member_order = "bysource"
add_module_names = False
-autoclass_content = 'both'
+autoclass_content = "both"
autodoc_default_options = {
- 'members': True,
- 'member-order': 'bysource',
- 'undoc-members': True,
- 'show-inheritance': True,
- 'imported-members': True,
+ "members": True,
+ "member-order": "bysource",
+ "undoc-members": True,
+ "show-inheritance": True,
+ "imported-members": True,
}
# Intersphinx settings
intersphinx_mapping = {
- 'python': ('https://docs.python.org/3', None),
- 'numpy': ('https://numpy.org/doc/stable', None),
- 'torch': ('https://pytorch.org/docs/stable', None),
- 'transformers': ('https://huggingface.co/docs/transformers/main/en', None),
+ "python": ("https://docs.python.org/3", None),
+ "numpy": ("https://numpy.org/doc/stable", None),
+ "torch": ("https://pytorch.org/docs/stable", None),
+ "transformers": ("https://huggingface.co/docs/transformers/main/en", None),
}
-html_theme = 'sphinx_book_theme'
+html_theme = "sphinx_book_theme"
html_theme_options = {
- 'repository_url': 'https://github.com/OleehyO/TexTeller',
- 'use_repository_button': True,
- 'use_issues_button': True,
- 'use_edit_page_button': True,
- 'use_download_button': True,
+ "repository_url": "https://github.com/OleehyO/TexTeller",
+ "use_repository_button": True,
+ "use_issues_button": True,
+ "use_edit_page_button": True,
+ "use_download_button": True,
}
html_logo = "../../assets/logo.svg"
diff --git a/examples/client_demo.py b/examples/client_demo.py
index a6445ad..cb3b76f 100644
--- a/examples/client_demo.py
+++ b/examples/client_demo.py
@@ -3,8 +3,8 @@ import requests
server_url = "http://127.0.0.1:8000/predict"
img_path = "/path/to/your/image"
-with open(img_path, 'rb') as img:
- files = {'img': img}
+with open(img_path, "rb") as img:
+ files = {"img": img}
response = requests.post(server_url, files=files)
print(response.text)
diff --git a/texteller/api/format.py b/texteller/api/format.py
index 0b76bc8..5e88981 100644
--- a/texteller/api/format.py
+++ b/texteller/api/format.py
@@ -19,8 +19,8 @@ TEXT_LINE_START = ""
COMMENT_LINE_START = "% "
# Opening and closing delimiters
-OPENS = ['{', '(', '[']
-CLOSES = ['}', ')', ']']
+OPENS = ["{", "(", "["]
+CLOSES = ["}", ")", "]"]
# Names of LaTeX verbatim environments
VERBATIMS = ["verbatim", "Verbatim", "lstlisting", "minted", "comment"]
@@ -138,7 +138,7 @@ class Pattern:
contains_env_end=ENV_END in s,
contains_item=ITEM in s,
contains_splitting=True,
- contains_comment='%' in s,
+ contains_comment="%" in s,
)
else:
return cls(
@@ -146,7 +146,7 @@ class Pattern:
contains_env_end=False,
contains_item=False,
contains_splitting=False,
- contains_comment='%' in s,
+ contains_comment="%" in s,
)
@@ -169,11 +169,11 @@ def find_comment_index(line: str, pattern: Pattern) -> Optional[int]:
in_command = False
for i, c in enumerate(line):
- if c == '\\':
+ if c == "\\":
in_command = True
elif in_command and not c.isalpha():
in_command = False
- elif c == '%' and not in_command:
+ elif c == "%" and not in_command:
return i
return None
@@ -390,10 +390,10 @@ def find_wrap_point(line: str, indent_length: int, args: Args) -> Optional[int]:
line_width += 1
if line_width > wrap_boundary and wrap_point is not None:
break
- if c == ' ' and prev_char != '\\':
+ if c == " " and prev_char != "\\":
if after_char:
wrap_point = i
- elif c != '%':
+ elif c != "%":
after_char = True
prev_char = c
@@ -483,8 +483,8 @@ def split_line(line: str, state: State, file: str, args: Args, logs: List[Log])
if not match:
return line, ""
- prev = match.group('prev')
- rest = match.group('env')
+ prev = match.group("prev")
+ rest = match.group("env")
if args.verbosity >= 3: # Trace level
logs.append(
@@ -517,8 +517,8 @@ def clean_text(text: str, args: Args) -> str:
text = RE_NEWLINES.sub(f"{LINE_END}{LINE_END}", text)
# Remove tabs if they shouldn't be used
- if args.tabchar != '\t':
- text = text.replace('\t', ' ' * args.tabsize)
+ if args.tabchar != "\t":
+ text = text.replace("\t", " " * args.tabsize)
# Remove trailing spaces
text = RE_TRAIL.sub(LINE_END, text)
@@ -577,7 +577,7 @@ def _format_latex(old_text: str, file: str, args: Args) -> Tuple[str, List[Log]]
new_text = ""
# Select the character used for indentation
- indent_char = '\t' if args.tabchar == '\t' else ' '
+ indent_char = "\t" if args.tabchar == "\t" else " "
# Get any extra environments to be indented as lists
lists_begin = [f"\\begin{{{l}}}" for l in args.lists]
diff --git a/texteller/api/katex.py b/texteller/api/katex.py
index 83eefdf..81c350c 100644
--- a/texteller/api/katex.py
+++ b/texteller/api/katex.py
@@ -5,13 +5,13 @@ from .format import format_latex
def _rm_dollar_surr(content):
- pattern = re.compile(r'\\[a-zA-Z]+\$.*?\$|\$.*?\$')
+ pattern = re.compile(r"\\[a-zA-Z]+\$.*?\$|\$.*?\$")
matches = pattern.findall(content)
for match in matches:
- if not re.match(r'\\[a-zA-Z]+', match):
- new_match = match.strip('$')
- content = content.replace(match, ' ' + new_match + ' ')
+ if not re.match(r"\\[a-zA-Z]+", match):
+ new_match = match.strip("$")
+ content = content.replace(match, " " + new_match + " ")
return content
@@ -33,97 +33,97 @@ def to_katex(formula: str) -> str:
"""
res = formula
# remove mbox surrounding
- res = change_all(res, r'\mbox ', r' ', r'{', r'}', r'', r'')
- res = change_all(res, r'\mbox', r' ', r'{', r'}', r'', r'')
+ res = change_all(res, r"\mbox ", r" ", r"{", r"}", r"", r"")
+ res = change_all(res, r"\mbox", r" ", r"{", r"}", r"", r"")
# remove hbox surrounding
- res = re.sub(r'\\hbox to ?-? ?\d+\.\d+(pt)?\{', r'\\hbox{', res)
- res = change_all(res, r'\hbox', r' ', r'{', r'}', r'', r' ')
+ res = re.sub(r"\\hbox to ?-? ?\d+\.\d+(pt)?\{", r"\\hbox{", res)
+ res = change_all(res, r"\hbox", r" ", r"{", r"}", r"", r" ")
# remove raise surrounding
- res = re.sub(r'\\raise ?-? ?\d+\.\d+(pt)?', r' ', res)
+ res = re.sub(r"\\raise ?-? ?\d+\.\d+(pt)?", r" ", res)
# remove makebox
- res = re.sub(r'\\makebox ?\[\d+\.\d+(pt)?\]\{', r'\\makebox{', res)
- res = change_all(res, r'\makebox', r' ', r'{', r'}', r'', r' ')
+ res = re.sub(r"\\makebox ?\[\d+\.\d+(pt)?\]\{", r"\\makebox{", res)
+ res = change_all(res, r"\makebox", r" ", r"{", r"}", r"", r" ")
# remove vbox surrounding, scalebox surrounding
- res = re.sub(r'\\raisebox\{-? ?\d+\.\d+(pt)?\}\{', r'\\raisebox{', res)
- res = re.sub(r'\\scalebox\{-? ?\d+\.\d+(pt)?\}\{', r'\\scalebox{', res)
- res = change_all(res, r'\scalebox', r' ', r'{', r'}', r'', r' ')
- res = change_all(res, r'\raisebox', r' ', r'{', r'}', r'', r' ')
- res = change_all(res, r'\vbox', r' ', r'{', r'}', r'', r' ')
+ res = re.sub(r"\\raisebox\{-? ?\d+\.\d+(pt)?\}\{", r"\\raisebox{", res)
+ res = re.sub(r"\\scalebox\{-? ?\d+\.\d+(pt)?\}\{", r"\\scalebox{", res)
+ res = change_all(res, r"\scalebox", r" ", r"{", r"}", r"", r" ")
+ res = change_all(res, r"\raisebox", r" ", r"{", r"}", r"", r" ")
+ res = change_all(res, r"\vbox", r" ", r"{", r"}", r"", r" ")
origin_instructions = [
- r'\Huge',
- r'\huge',
- r'\LARGE',
- r'\Large',
- r'\large',
- r'\normalsize',
- r'\small',
- r'\footnotesize',
- r'\tiny',
+ r"\Huge",
+ r"\huge",
+ r"\LARGE",
+ r"\Large",
+ r"\large",
+ r"\normalsize",
+ r"\small",
+ r"\footnotesize",
+ r"\tiny",
]
for old_ins, new_ins in zip(origin_instructions, origin_instructions):
- res = change_all(res, old_ins, new_ins, r'$', r'$', '{', '}')
- res = change_all(res, r'\mathbf', r'\bm', r'{', r'}', r'{', r'}')
- res = change_all(res, r'\boldmath ', r'\bm', r'{', r'}', r'{', r'}')
- res = change_all(res, r'\boldmath', r'\bm', r'{', r'}', r'{', r'}')
- res = change_all(res, r'\boldmath ', r'\bm', r'$', r'$', r'{', r'}')
- res = change_all(res, r'\boldmath', r'\bm', r'$', r'$', r'{', r'}')
- res = change_all(res, r'\scriptsize', r'\scriptsize', r'$', r'$', r'{', r'}')
- res = change_all(res, r'\emph', r'\textit', r'{', r'}', r'{', r'}')
- res = change_all(res, r'\emph ', r'\textit', r'{', r'}', r'{', r'}')
+ res = change_all(res, old_ins, new_ins, r"$", r"$", "{", "}")
+ res = change_all(res, r"\mathbf", r"\bm", r"{", r"}", r"{", r"}")
+ res = change_all(res, r"\boldmath ", r"\bm", r"{", r"}", r"{", r"}")
+ res = change_all(res, r"\boldmath", r"\bm", r"{", r"}", r"{", r"}")
+ res = change_all(res, r"\boldmath ", r"\bm", r"$", r"$", r"{", r"}")
+ res = change_all(res, r"\boldmath", r"\bm", r"$", r"$", r"{", r"}")
+ res = change_all(res, r"\scriptsize", r"\scriptsize", r"$", r"$", r"{", r"}")
+ res = change_all(res, r"\emph", r"\textit", r"{", r"}", r"{", r"}")
+ res = change_all(res, r"\emph ", r"\textit", r"{", r"}", r"{", r"}")
# remove bold command
- res = change_all(res, r'\bm', r' ', r'{', r'}', r'', r'')
+ res = change_all(res, r"\bm", r" ", r"{", r"}", r"", r"")
origin_instructions = [
- r'\left',
- r'\middle',
- r'\right',
- r'\big',
- r'\Big',
- r'\bigg',
- r'\Bigg',
- r'\bigl',
- r'\Bigl',
- r'\biggl',
- r'\Biggl',
- r'\bigm',
- r'\Bigm',
- r'\biggm',
- r'\Biggm',
- r'\bigr',
- r'\Bigr',
- r'\biggr',
- r'\Biggr',
+ r"\left",
+ r"\middle",
+ r"\right",
+ r"\big",
+ r"\Big",
+ r"\bigg",
+ r"\Bigg",
+ r"\bigl",
+ r"\Bigl",
+ r"\biggl",
+ r"\Biggl",
+ r"\bigm",
+ r"\Bigm",
+ r"\biggm",
+ r"\Biggm",
+ r"\bigr",
+ r"\Bigr",
+ r"\biggr",
+ r"\Biggr",
]
for origin_ins in origin_instructions:
- res = change_all(res, origin_ins, origin_ins, r'{', r'}', r'', r'')
+ res = change_all(res, origin_ins, origin_ins, r"{", r"}", r"", r"")
- res = re.sub(r'\\\[(.*?)\\\]', r'\1\\newline', res)
+ res = re.sub(r"\\\[(.*?)\\\]", r"\1\\newline", res)
- if res.endswith(r'\newline'):
+ if res.endswith(r"\newline"):
res = res[:-8]
# remove multiple spaces
- res = re.sub(r'(\\,){1,}', ' ', res)
- res = re.sub(r'(\\!){1,}', ' ', res)
- res = re.sub(r'(\\;){1,}', ' ', res)
- res = re.sub(r'(\\:){1,}', ' ', res)
- res = re.sub(r'\\vspace\{.*?}', '', res)
+ res = re.sub(r"(\\,){1,}", " ", res)
+ res = re.sub(r"(\\!){1,}", " ", res)
+ res = re.sub(r"(\\;){1,}", " ", res)
+ res = re.sub(r"(\\:){1,}", " ", res)
+ res = re.sub(r"\\vspace\{.*?}", "", res)
# merge consecutive text
def merge_texts(match):
texts = match.group(0)
- merged_content = ''.join(re.findall(r'\\text\{([^}]*)\}', texts))
- return f'\\text{{{merged_content}}}'
+ merged_content = "".join(re.findall(r"\\text\{([^}]*)\}", texts))
+ return f"\\text{{{merged_content}}}"
- res = re.sub(r'(\\text\{[^}]*\}\s*){2,}', merge_texts, res)
+ res = re.sub(r"(\\text\{[^}]*\}\s*){2,}", merge_texts, res)
- res = res.replace(r'\bf ', '')
+ res = res.replace(r"\bf ", "")
res = _rm_dollar_surr(res)
# remove extra spaces (keeping only one)
- res = re.sub(r' +', ' ', res)
+ res = re.sub(r" +", " ", res)
# format latex
res = res.strip()
diff --git a/texteller/models/__init__.py b/texteller/models/__init__.py
index 1beda4e..f0cf366 100644
--- a/texteller/models/__init__.py
+++ b/texteller/models/__init__.py
@@ -1,3 +1,3 @@
from .texteller import TexTeller
-__all__ = ['TexTeller']
+__all__ = ["TexTeller"]
diff --git a/texteller/utils/image.py b/texteller/utils/image.py
index cc50a3c..3af5000 100644
--- a/texteller/utils/image.py
+++ b/texteller/utils/image.py
@@ -41,7 +41,7 @@ def readimgs(image_paths: list[str]) -> list[np.ndarray]:
if image is None:
raise ValueError(f"Image at {path} could not be read.")
if image.dtype == np.uint16:
- _logger.warning(f'Converting {path} to 8-bit, image may be lossy.')
+ _logger.warning(f"Converting {path} to 8-bit, image may be lossy.")
image = cv2.convertScaleAbs(image, alpha=(255.0 / 65535.0))
channels = 1 if len(image.shape) == 2 else image.shape[2]
@@ -112,7 +112,7 @@ def transform(images: List[Union[np.ndarray, Image.Image]]) -> List[torch.Tensor
assert IMG_CHANNELS == 1, "Only support grayscale images for now"
images = [
- np.array(img.convert('RGB')) if isinstance(img, Image.Image) else img for img in images
+ np.array(img.convert("RGB")) if isinstance(img, Image.Image) else img for img in images
]
images = [trim_white_border(image) for image in images]
images = [general_transform_pipeline(image) for image in images]
diff --git a/texteller/utils/latex.py b/texteller/utils/latex.py
index d778924..85c729a 100644
--- a/texteller/utils/latex.py
+++ b/texteller/utils/latex.py
@@ -21,7 +21,7 @@ def _change(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, n
j = start + 1
escaped = False
while j < n and count > 0:
- if input_str[j] == '\\' and not escaped:
+ if input_str[j] == "\\" and not escaped:
escaped = True
j += 1
continue
@@ -71,10 +71,10 @@ def change_all(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l
for p in pos[::-1]:
res[p:] = list(
_change(
- ''.join(res[p:]), old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r
+ "".join(res[p:]), old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r
)
)
- res = ''.join(res)
+ res = "".join(res)
return res
@@ -121,7 +121,7 @@ def add_newlines(latex_str: str) -> str:
# 4. Cleanup: Collapse multiple consecutive newlines into a single newline.
# This handles cases where the replacements above might have created \n\n.
- processed_str = re.sub(r'\n{2,}', '\n', processed_str)
+ processed_str = re.sub(r"\n{2,}", "\n", processed_str)
# Remove leading/trailing whitespace (including potential single newlines
# at the very start/end resulting from the replacements) from the entire result.