fix: post handle for ocr

2026-02-07 21:28:46 +08:00
parent f514f98142
commit 1a4d54ce34
2 changed files with 281 additions and 227 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -41,12 +41,23 @@ _COMMANDS_NEED_SPACE = {
    "log",
    "ln",
    "exp",
+    # set relations (often glued by OCR)
+    "in",
+    "notin",
+    "subset",
+    "supset",
+    "subseteq",
+    "supseteq",
+    "cap",
+    "cup",
    # misc
    "partial",
    "nabla",
 }

 _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
+# Match LaTeX commands: \command (greedy match all letters)
+# The splitting logic in _split_glued_command_token will handle \inX -> \in X
 _COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")

 # stage2: differentials inside math segments
@@ -65,6 +76,7 @@ def _split_glued_command_token(token: str) -> str:
    Examples:
    - \\cdotdS -> \\cdot dS
    - \\intdx  -> \\int dx
+    - \\inX    -> \\in X (longest whitelisted prefix is "in")
    """
    if not token.startswith("\\"):
        return token
@@ -74,8 +86,8 @@ def _split_glued_command_token(token: str) -> str:
        return token

    best = None
-    # longest prefix that is in whitelist
-    for i in range(1, len(body)):
+    # Find longest prefix that is in whitelist
+    for i in range(1, len(body) + 1):
        prefix = body[:i]
        if prefix in _COMMANDS_NEED_SPACE:
            best = prefix
@@ -117,12 +129,22 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
    # Pattern 2: Spaces inside braces that follow _ or ^
    # _{i 1} -> _{i1}, ^{2 3} -> ^{23}
    # This is safe because spaces inside subscript/superscript braces are usually OCR errors
+    # BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them
+    # must be preserved as they serve as command terminators (\in X != \inX)
    def clean_subscript_superscript_braces(match):
        operator = match.group(1)  # _ or ^
        content = match.group(2)  # content inside braces
-        # Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
-        # Only remove spaces between non-backslash characters
-        cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content)
+        if "\\" not in content:
+            # No LaTeX commands: safe to remove all spaces
+            cleaned = re.sub(r"\s+", "", content)
+        else:
+            # Contains LaTeX commands: remove spaces carefully
+            # Keep spaces that follow a LaTeX command (e.g., \in X must keep the space)
+            # Remove spaces directly before a backslash (e.g., x \in -> x\in is fine)
+            # Strategy: first strip whitespace that precedes a \, then strip whitespace
+            # that does not follow a letter or a \ (so a space after any letter is kept)
+            cleaned = re.sub(r"\s+(?=\\)", "", content)       # remove space before \cmd
+            cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned)  # remove space after non-letter non-\
        return f"{operator}{{{cleaned}}}"

    # Match _{ ... } or ^{ ... }
@@ -156,7 +178,7 @@ def _postprocess_math(expr: str) -> str:

    Processing stages:
    0. Fix OCR number errors (spaces in numbers)
-    1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
+    1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS, \\inX -> \\in X)
    2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
    3. Normalize differentials (DISABLED by default to avoid breaking variables)

@@ -169,7 +191,7 @@ def _postprocess_math(expr: str) -> str:
    # stage0: fix OCR number errors (digits with spaces)
    expr = _fix_ocr_number_errors(expr)

-    # stage1: split glued command tokens (e.g. \cdotdS)
+    # stage1: split glued command tokens (e.g. \cdotdS, \inX)
    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)

    # stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)