fix: post handle for OCR

This commit is contained in:
liuyuanchuang
2026-02-07 21:28:46 +08:00
parent f514f98142
commit 1a4d54ce34
2 changed files with 281 additions and 227 deletions

View File

@@ -41,12 +41,23 @@ _COMMANDS_NEED_SPACE = {
"log",
"ln",
"exp",
# set relations (often glued by OCR)
"in",
"notin",
"subset",
"supset",
"subseteq",
"supseteq",
"cap",
"cup",
# misc
"partial",
"nabla",
}
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
# Match LaTeX commands: \command (greedy match all letters)
# The splitting logic in _split_glued_command_token will handle \inX -> \in X
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
# stage2: differentials inside math segments
@@ -65,6 +76,7 @@ def _split_glued_command_token(token: str) -> str:
Examples:
- \\cdotdS -> \\cdot dS
- \\intdx -> \\int dx
- \\inX -> \\in X (stop at uppercase letter)
"""
if not token.startswith("\\"):
return token
@@ -74,8 +86,8 @@ def _split_glued_command_token(token: str) -> str:
return token
best = None
# longest prefix that is in whitelist
for i in range(1, len(body)):
# Find longest prefix that is in whitelist
for i in range(1, len(body) + 1):
prefix = body[:i]
if prefix in _COMMANDS_NEED_SPACE:
best = prefix
@@ -117,12 +129,22 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
# Pattern 2: Spaces inside braces that follow _ or ^
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
# BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them
# must be preserved as they serve as command terminators (\in X != \inX)
def clean_subscript_superscript_braces(match):
operator = match.group(1) # _ or ^
content = match.group(2) # content inside braces
# Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
# Only remove spaces between non-backslash characters
cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content)
if "\\" not in content:
# No LaTeX commands: safe to remove all spaces
cleaned = re.sub(r"\s+", "", content)
else:
# Contains LaTeX commands: remove spaces carefully
# Keep spaces that follow a LaTeX command (e.g., \in X must keep the space)
# Remove spaces everywhere else (e.g., x \in -> x\in is fine)
# Strategy: remove spaces before \ and between non-command chars,
# but preserve the space after \command when followed by a non-\ char
cleaned = re.sub(r"\s+(?=\\)", "", content) # remove space before \cmd
cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned) # remove space after non-letter non-\
return f"{operator}{{{cleaned}}}"
# Match _{ ... } or ^{ ... }
@@ -156,7 +178,7 @@ def _postprocess_math(expr: str) -> str:
Processing stages:
0. Fix OCR number errors (spaces in numbers)
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS, \\inX -> \\in X)
2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
3. Normalize differentials (DISABLED by default to avoid breaking variables)
@@ -169,7 +191,7 @@ def _postprocess_math(expr: str) -> str:
# stage0: fix OCR number errors (digits with spaces)
expr = _fix_ocr_number_errors(expr)
# stage1: split glued command tokens (e.g. \cdotdS)
# stage1: split glued command tokens (e.g. \cdotdS, \inX)
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
# stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)