[refactor] Init
This commit is contained in:
128
texteller/utils/latex.py
Normal file
128
texteller/utils/latex.py
Normal file
@@ -0,0 +1,128 @@
|
||||
import re
|
||||
|
||||
|
||||
def _change(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r):
|
||||
result = ""
|
||||
i = 0
|
||||
n = len(input_str)
|
||||
|
||||
while i < n:
|
||||
if input_str[i : i + len(old_inst)] == old_inst:
|
||||
# check if the old_inst is followed by old_surr_l
|
||||
start = i + len(old_inst)
|
||||
else:
|
||||
result += input_str[i]
|
||||
i += 1
|
||||
continue
|
||||
|
||||
if start < n and input_str[start] == old_surr_l:
|
||||
# found an old_inst followed by old_surr_l, now look for the matching old_surr_r
|
||||
count = 1
|
||||
j = start + 1
|
||||
escaped = False
|
||||
while j < n and count > 0:
|
||||
if input_str[j] == '\\' and not escaped:
|
||||
escaped = True
|
||||
j += 1
|
||||
continue
|
||||
if input_str[j] == old_surr_r and not escaped:
|
||||
count -= 1
|
||||
if count == 0:
|
||||
break
|
||||
elif input_str[j] == old_surr_l and not escaped:
|
||||
count += 1
|
||||
escaped = False
|
||||
j += 1
|
||||
|
||||
if count == 0:
|
||||
assert j < n
|
||||
assert input_str[start] == old_surr_l
|
||||
assert input_str[j] == old_surr_r
|
||||
inner_content = input_str[start + 1 : j]
|
||||
# Replace the content with new pattern
|
||||
result += new_inst + new_surr_l + inner_content + new_surr_r
|
||||
i = j + 1
|
||||
continue
|
||||
else:
|
||||
assert count >= 1
|
||||
assert j == n
|
||||
print("Warning: unbalanced surrogate pair in input string")
|
||||
result += new_inst + new_surr_l
|
||||
i = start + 1
|
||||
continue
|
||||
else:
|
||||
result += input_str[i:start]
|
||||
i = start
|
||||
|
||||
if old_inst != new_inst and (old_inst + old_surr_l) in result:
|
||||
return _change(result, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r)
|
||||
else:
|
||||
return result
|
||||
|
||||
|
||||
def _find_substring_positions(string, substring):
|
||||
positions = [match.start() for match in re.finditer(re.escape(substring), string)]
|
||||
return positions
|
||||
|
||||
|
||||
def change_all(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r):
|
||||
pos = _find_substring_positions(input_str, old_inst + old_surr_l)
|
||||
res = list(input_str)
|
||||
for p in pos[::-1]:
|
||||
res[p:] = list(
|
||||
_change(
|
||||
''.join(res[p:]), old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r
|
||||
)
|
||||
)
|
||||
res = ''.join(res)
|
||||
return res
|
||||
|
||||
|
||||
def remove_style(input_str: str) -> str:
|
||||
input_str = change_all(input_str, r"\bm", r" ", r"{", r"}", r"", r" ")
|
||||
input_str = change_all(input_str, r"\boldsymbol", r" ", r"{", r"}", r"", r" ")
|
||||
input_str = change_all(input_str, r"\textit", r" ", r"{", r"}", r"", r" ")
|
||||
input_str = change_all(input_str, r"\textbf", r" ", r"{", r"}", r"", r" ")
|
||||
input_str = change_all(input_str, r"\textbf", r" ", r"{", r"}", r"", r" ")
|
||||
input_str = change_all(input_str, r"\mathbf", r" ", r"{", r"}", r"", r" ")
|
||||
output_str = input_str.strip()
|
||||
return output_str
|
||||
|
||||
|
||||
def add_newlines(latex_str: str) -> str:
|
||||
"""
|
||||
Adds newlines to a LaTeX string based on specific patterns, ensuring no
|
||||
duplicate newlines are added around begin/end environments.
|
||||
- After \\ (if not already followed by newline)
|
||||
- Before \\begin{...} (if not already preceded by newline)
|
||||
- After \\begin{...} (if not already followed by newline)
|
||||
- Before \\end{...} (if not already preceded by newline)
|
||||
- After \\end{...} (if not already followed by newline)
|
||||
|
||||
Args:
|
||||
latex_str: The input LaTeX string.
|
||||
|
||||
Returns:
|
||||
The LaTeX string with added newlines, avoiding duplicates.
|
||||
"""
|
||||
processed_str = latex_str
|
||||
|
||||
# 1. Replace whitespace around \begin{...} with \n...\n
|
||||
# \s* matches zero or more whitespace characters (space, tab, newline)
|
||||
# Captures the \begin{...} part in group 1 (\g<1>)
|
||||
processed_str = re.sub(r"\s*(\\begin\{[^}]*\})\s*", r"\n\g<1>\n", processed_str)
|
||||
|
||||
# 2. Replace whitespace around \end{...} with \n...\n
|
||||
# Same logic as for \begin
|
||||
processed_str = re.sub(r"\s*(\\end\{[^}]*\})\s*", r"\n\g<1>\n", processed_str)
|
||||
|
||||
# 3. Add newline after \\ (if not already followed by newline)
|
||||
processed_str = re.sub(r"\\\\(?!\n| )|\\\\ ", r"\\\\\n", processed_str)
|
||||
|
||||
# 4. Cleanup: Collapse multiple consecutive newlines into a single newline.
|
||||
# This handles cases where the replacements above might have created \n\n.
|
||||
processed_str = re.sub(r'\n{2,}', '\n', processed_str)
|
||||
|
||||
# Remove leading/trailing whitespace (including potential single newlines
|
||||
# at the very start/end resulting from the replacements) from the entire result.
|
||||
return processed_str.strip()
|
||||
Reference in New Issue
Block a user