fix: markdown post handle
This commit is contained in:
@@ -48,8 +48,13 @@ _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
||||
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
|
||||
|
||||
# stage2: differentials inside math segments
|
||||
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")
|
||||
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")
|
||||
# IMPORTANT: Very conservative pattern to avoid breaking LaTeX commands and variables
|
||||
# Only match differentials in specific contexts (after integrals, in fractions)
|
||||
# (?<!\\) - not preceded by backslash (not a LaTeX command)
|
||||
# (?<![a-zA-Z]) - not preceded by any letter (not inside a word/command)
|
||||
# (?![a-zA-Z]) - not followed by another letter (avoid matching "dx" in "dxyz")
|
||||
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([A-Z])(?![a-zA-Z])")
|
||||
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([a-z])(?![a-zA-Z])")
|
||||
|
||||
|
||||
def _split_glued_command_token(token: str) -> str:
|
||||
@@ -84,14 +89,71 @@ def _split_glued_command_token(token: str) -> str:
|
||||
|
||||
|
||||
def _postprocess_math(expr: str) -> str:
    """Postprocess a *math* expression (already inside $...$ or $$...$$).

    Processing stages:
        stage0: Fix OCR number errors (spaces in numbers).
        stage1: Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS).
        stage2: Normalize differentials (DISABLED; see the comment in the body).

    Args:
        expr: LaTeX math expression without delimiters.

    Returns:
        Processed LaTeX expression.
    """
    # stage0: fix OCR number errors (digits with spaces)
    expr = _fix_ocr_number_errors(expr)

    # stage1: split glued command tokens (e.g. \cdotdS); every backslash+letters
    # token is handed to _split_glued_command_token and replaced by its result.
    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)

    # stage2: normalize differentials - DISABLED
    # Blanket rewriting of "d<letter>" is too aggressive; it cannot tell a
    # differential apart from:
    #   - LaTeX commands containing "d" (\vdots, \delta, ...) unless the
    #     pattern is anchored against them
    #   - variable names: dx, dy, dz may be variables, not differentials
    #   - subscripts: x_{dx}, y_{dy}
    #   - function names or custom notation
    #
    # The risk of false positives (breaking valid LaTeX) outweighs the benefit
    # of normalizing differentials for OCR output.
    #
    # If differential normalization is needed, apply the context-aware variant
    # at this point: _normalize_differentials_contextaware(expr).

    return expr
|
||||
|
||||
|
||||
def _normalize_differentials_contextaware(expr: str) -> str:
|
||||
"""Context-aware differential normalization (optional, not used by default).
|
||||
|
||||
Only normalizes differentials in specific mathematical contexts:
|
||||
1. After integral symbols: \\int dx, \\iint dA, \\oint dr
|
||||
2. In fraction denominators: \\frac{dy}{dx}
|
||||
3. In explicit differential notation: f(x)dx (function followed by differential)
|
||||
|
||||
This avoids false positives like variable names, subscripts, or LaTeX commands.
|
||||
|
||||
Args:
|
||||
expr: LaTeX math expression.
|
||||
|
||||
Returns:
|
||||
Expression with differentials normalized in safe contexts only.
|
||||
"""
|
||||
# Pattern 1: After integral commands
|
||||
# \int dx -> \int d x
|
||||
integral_pattern = re.compile(
|
||||
r'(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])'
|
||||
)
|
||||
expr = integral_pattern.sub(r'\1 \2 d \3', expr)
|
||||
|
||||
# Pattern 2: In fraction denominators
|
||||
# \frac{...}{dx} -> \frac{...}{d x}
|
||||
frac_pattern = re.compile(
|
||||
r'(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})'
|
||||
)
|
||||
expr = frac_pattern.sub(r'\1d \2\3', expr)
|
||||
|
||||
return expr
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user