fix: post-handling for OCR
This commit is contained in:
@@ -41,12 +41,23 @@ _COMMANDS_NEED_SPACE = {
|
||||
"log",
|
||||
"ln",
|
||||
"exp",
|
||||
# set relations (often glued by OCR)
|
||||
"in",
|
||||
"notin",
|
||||
"subset",
|
||||
"supset",
|
||||
"subseteq",
|
||||
"supseteq",
|
||||
"cap",
|
||||
"cup",
|
||||
# misc
|
||||
"partial",
|
||||
"nabla",
|
||||
}
|
||||
|
||||
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
||||
# Match LaTeX commands: \command (greedy match all letters)
|
||||
# The splitting logic in _split_glued_command_token will handle \inX -> \in X
|
||||
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
|
||||
|
||||
# stage2: differentials inside math segments
|
||||
@@ -65,6 +76,7 @@ def _split_glued_command_token(token: str) -> str:
|
||||
Examples:
|
||||
- \\cdotdS -> \\cdot dS
|
||||
- \\intdx -> \\int dx
|
||||
- \\inX -> \\in X (stop at uppercase letter)
|
||||
"""
|
||||
if not token.startswith("\\"):
|
||||
return token
|
||||
@@ -74,8 +86,8 @@ def _split_glued_command_token(token: str) -> str:
|
||||
return token
|
||||
|
||||
best = None
|
||||
# longest prefix that is in whitelist
|
||||
for i in range(1, len(body)):
|
||||
# Find longest prefix that is in whitelist
|
||||
for i in range(1, len(body) + 1):
|
||||
prefix = body[:i]
|
||||
if prefix in _COMMANDS_NEED_SPACE:
|
||||
best = prefix
|
||||
@@ -117,12 +129,22 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
|
||||
# Pattern 2: Spaces inside braces that follow _ or ^
|
||||
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
|
||||
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
|
||||
# BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them
|
||||
# must be preserved as they serve as command terminators (\in X != \inX)
|
||||
def clean_subscript_superscript_braces(match):
|
||||
operator = match.group(1) # _ or ^
|
||||
content = match.group(2) # content inside braces
|
||||
# Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
|
||||
# Only remove spaces between non-backslash characters
|
||||
cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content)
|
||||
if "\\" not in content:
|
||||
# No LaTeX commands: safe to remove all spaces
|
||||
cleaned = re.sub(r"\s+", "", content)
|
||||
else:
|
||||
# Contains LaTeX commands: remove spaces carefully
|
||||
# Keep spaces that follow a LaTeX command (e.g., \in X must keep the space)
|
||||
# Remove spaces everywhere else (e.g., x \in -> x\in is fine)
|
||||
# Strategy: remove spaces before \ and between non-command chars,
|
||||
# but preserve the space after \command when followed by a non-\ char
|
||||
cleaned = re.sub(r"\s+(?=\\)", "", content) # remove space before \cmd
|
||||
cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned) # remove space after non-letter non-\
|
||||
return f"{operator}{{{cleaned}}}"
|
||||
|
||||
# Match _{ ... } or ^{ ... }
|
||||
@@ -156,7 +178,7 @@ def _postprocess_math(expr: str) -> str:
|
||||
|
||||
Processing stages:
|
||||
0. Fix OCR number errors (spaces in numbers)
|
||||
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
|
||||
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS, \\inX -> \\in X)
|
||||
2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
|
||||
3. Normalize differentials (DISABLED by default to avoid breaking variables)
|
||||
|
||||
@@ -169,7 +191,7 @@ def _postprocess_math(expr: str) -> str:
|
||||
# stage0: fix OCR number errors (digits with spaces)
|
||||
expr = _fix_ocr_number_errors(expr)
|
||||
|
||||
# stage1: split glued command tokens (e.g. \cdotdS)
|
||||
# stage1: split glued command tokens (e.g. \cdotdS, \inX)
|
||||
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
|
||||
|
||||
# stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)
|
||||
|
||||
Reference in New Issue
Block a user