[refactor] Init

2025-04-16 14:23:02 +00:00
parent e0cbf2c99f
commit 0cba17d9ce
101 changed files with 1854 additions and 2758 deletions
--- a/texteller/utils/latex.py
+++ b/texteller/utils/latex.py
@@ -0,0 +1,128 @@
+import re
+
+
+def _change(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r):
+    result = ""
+    i = 0
+    n = len(input_str)
+
+    while i < n:
+        if input_str[i : i + len(old_inst)] == old_inst:
+            # check if the old_inst is followed by old_surr_l
+            start = i + len(old_inst)
+        else:
+            result += input_str[i]
+            i += 1
+            continue
+
+        if start < n and input_str[start] == old_surr_l:
+            # found an old_inst followed by old_surr_l, now look for the matching old_surr_r
+            count = 1
+            j = start + 1
+            escaped = False
+            while j < n and count > 0:
+                if input_str[j] == '\\' and not escaped:
+                    escaped = True
+                    j += 1
+                    continue
+                if input_str[j] == old_surr_r and not escaped:
+                    count -= 1
+                    if count == 0:
+                        break
+                elif input_str[j] == old_surr_l and not escaped:
+                    count += 1
+                escaped = False
+                j += 1
+
+            if count == 0:
+                assert j < n
+                assert input_str[start] == old_surr_l
+                assert input_str[j] == old_surr_r
+                inner_content = input_str[start + 1 : j]
+                # Replace the content with new pattern
+                result += new_inst + new_surr_l + inner_content + new_surr_r
+                i = j + 1
+                continue
+            else:
+                assert count >= 1
+                assert j == n
+                print("Warning: unbalanced surrogate pair in input string")
+                result += new_inst + new_surr_l
+                i = start + 1
+                continue
+        else:
+            result += input_str[i:start]
+            i = start
+
+    if old_inst != new_inst and (old_inst + old_surr_l) in result:
+        return _change(result, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r)
+    else:
+        return result
+
+
+def _find_substring_positions(string, substring):
+    positions = [match.start() for match in re.finditer(re.escape(substring), string)]
+    return positions
+
+
+def change_all(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r):
+    pos = _find_substring_positions(input_str, old_inst + old_surr_l)
+    res = list(input_str)
+    for p in pos[::-1]:
+        res[p:] = list(
+            _change(
+                ''.join(res[p:]), old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r
+            )
+        )
+    res = ''.join(res)
+    return res
+
+
+def remove_style(input_str: str) -> str:
+    input_str = change_all(input_str, r"\bm", r" ", r"{", r"}", r"", r" ")
+    input_str = change_all(input_str, r"\boldsymbol", r" ", r"{", r"}", r"", r" ")
+    input_str = change_all(input_str, r"\textit", r" ", r"{", r"}", r"", r" ")
+    input_str = change_all(input_str, r"\textbf", r" ", r"{", r"}", r"", r" ")
+    input_str = change_all(input_str, r"\textbf", r" ", r"{", r"}", r"", r" ")
+    input_str = change_all(input_str, r"\mathbf", r" ", r"{", r"}", r"", r" ")
+    output_str = input_str.strip()
+    return output_str
+
+
+def add_newlines(latex_str: str) -> str:
+    """
+    Adds newlines to a LaTeX string based on specific patterns, ensuring no
+    duplicate newlines are added around begin/end environments.
+    - After \\ (if not already followed by newline)
+    - Before \\begin{...} (if not already preceded by newline)
+    - After \\begin{...} (if not already followed by newline)
+    - Before \\end{...} (if not already preceded by newline)
+    - After \\end{...} (if not already followed by newline)
+
+    Args:
+        latex_str: The input LaTeX string.
+
+    Returns:
+        The LaTeX string with added newlines, avoiding duplicates.
+    """
+    processed_str = latex_str
+
+    # 1. Replace whitespace around \begin{...} with \n...\n
+    # \s* matches zero or more whitespace characters (space, tab, newline)
+    # Captures the \begin{...} part in group 1 (\g<1>)
+    processed_str = re.sub(r"\s*(\\begin\{[^}]*\})\s*", r"\n\g<1>\n", processed_str)
+
+    # 2. Replace whitespace around \end{...} with \n...\n
+    # Same logic as for \begin
+    processed_str = re.sub(r"\s*(\\end\{[^}]*\})\s*", r"\n\g<1>\n", processed_str)
+
+    # 3. Add newline after \\ (if not already followed by newline)
+    processed_str = re.sub(r"\\\\(?!\n| )|\\\\ ", r"\\\\\n", processed_str)
+
+    # 4. Cleanup: Collapse multiple consecutive newlines into a single newline.
+    # This handles cases where the replacements above might have created \n\n.
+    processed_str = re.sub(r'\n{2,}', '\n', processed_str)
+
+    # Remove leading/trailing whitespace (including potential single newlines
+    # at the very start/end resulting from the replacements) from the entire result.
+    return processed_str.strip()