feat: rm space in markdown

This commit is contained in:
liuyuanchuang
2026-02-05 13:32:13 +08:00
parent 280a8cdaeb
commit cee93ab616
3 changed files with 518 additions and 3 deletions

View File

@@ -88,12 +88,75 @@ def _split_glued_command_token(token: str) -> str:
return f"\\{best} {suffix}"
def _clean_latex_syntax_spaces(expr: str) -> str:
"""Clean unwanted spaces in LaTeX syntax (common OCR errors).
OCR often adds spaces in LaTeX syntax structures where they shouldn't be:
- Subscripts: a _ {i 1} -> a_{i1}
- Superscripts: x ^ {2 3} -> x^{23}
- Fractions: \\frac { a } { b } -> \\frac{a}{b}
- Commands: \\ alpha -> \\alpha
- Braces: { a b } -> {ab} (within subscripts/superscripts)
This is safe because these spaces are always OCR errors - LaTeX doesn't
need or want spaces in these positions.
Args:
expr: LaTeX math expression.
Returns:
Expression with LaTeX syntax spaces cleaned.
"""
# Pattern 1: Spaces around _ and ^ (subscript/superscript operators)
# a _ {i} -> a_{i}, x ^ {2} -> x^{2}
expr = re.sub(r'\s*_\s*', '_', expr)
expr = re.sub(r'\s*\^\s*', '^', expr)
# Pattern 2: Spaces inside braces that follow _ or ^
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
def clean_subscript_superscript_braces(match):
operator = match.group(1) # _ or ^
content = match.group(2) # content inside braces
# Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
# Only remove spaces between non-backslash characters
cleaned = re.sub(r'(?<!\\)\s+(?!\\)', '', content)
return f"{operator}{{{cleaned}}}"
# Match _{ ... } or ^{ ... }
expr = re.sub(r'([_^])\{([^}]+)\}', clean_subscript_superscript_braces, expr)
# Pattern 3: Spaces inside \frac arguments
# \frac { a } { b } -> \frac{a}{b}
# \frac{ a + b }{ c } -> \frac{a+b}{c}
def clean_frac_braces(match):
numerator = match.group(1).strip()
denominator = match.group(2).strip()
return f"\\frac{{{numerator}}}{{{denominator}}}"
expr = re.sub(r'\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}',
clean_frac_braces, expr)
# Pattern 4: Spaces after backslash in LaTeX commands
# \ alpha -> \alpha, \ beta -> \beta
expr = re.sub(r'\\\s+([a-zA-Z]+)', r'\\\1', expr)
# Pattern 5: Spaces before/after braces in general contexts (conservative)
# Only remove if the space is clearly wrong (e.g., after operators)
# { x } in standalone context is kept as-is to avoid breaking valid spacing
# But after operators like \sqrt{ x } -> \sqrt{x}
expr = re.sub(r'(\\[a-zA-Z]+)\s*\{\s*', r'\1{', expr) # \sqrt { -> \sqrt{
return expr
def _postprocess_math(expr: str) -> str:
"""Postprocess a *math* expression (already inside $...$ or $$...$$).
Processing stages:
1. Fix OCR number errors (spaces in numbers)
2. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
0. Fix OCR number errors (spaces in numbers)
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
3. Normalize differentials (DISABLED by default to avoid breaking variables)
Args:
@@ -108,7 +171,10 @@ def _postprocess_math(expr: str) -> str:
# stage1: split glued command tokens (e.g. \cdotdS)
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
# stage2: normalize differentials - DISABLED
# stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)
expr = _clean_latex_syntax_spaces(expr)
# stage3: normalize differentials - DISABLED
# This feature is disabled because it's too aggressive and can break:
# - LaTeX commands containing 'd': \vdots, \lambda (via subscripts), \delta, etc.
# - Variable names: dx, dy, dz might be variable names, not differentials