fix: markdown post handling

2026-02-05 13:18:55 +08:00
parent 808d29bd45
commit 280a8cdaeb
9 changed files with 2108 additions and 24 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -48,8 +48,13 @@ _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
 _COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")

 # stage2: differentials inside math segments
-_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")
-_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")
+# IMPORTANT: Very conservative pattern to avoid breaking LaTeX commands and variables
+# Matches only a standalone "d" followed by a single letter; the lookarounds are:
+# (?<!\\) - not preceded by backslash (not a LaTeX command)
+# (?<![a-zA-Z]) - not preceded by any letter (not inside a word/command)
+# (?![a-zA-Z]) - not followed by another letter (avoid matching "dx" in "dxyz")
+_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([A-Z])(?![a-zA-Z])")
+_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([a-z])(?![a-zA-Z])")


 def _split_glued_command_token(token: str) -> str:
@@ -84,14 +89,71 @@ def _split_glued_command_token(token: str) -> str:


 def _postprocess_math(expr: str) -> str:
-    """Postprocess a *math* expression (already inside $...$ or $$...$$)."""
+    """Postprocess a *math* expression (already inside $...$ or $$...$$).
+    
+    Processing stages:
+    0. Fix OCR number errors (spaces in numbers)
+    1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
+    2. Normalize differentials (currently disabled to avoid breaking variables)
+    
+    Args:
+        expr: LaTeX math expression without delimiters.
+        
+    Returns:
+        Processed LaTeX expression.
+    """
    # stage0: fix OCR number errors (digits with spaces)
    expr = _fix_ocr_number_errors(expr)
+    
    # stage1: split glued command tokens (e.g. \cdotdS)
    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
-    # stage2: normalize differentials (keep conservative)
-    expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr)
-    expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr)
+    
+    # stage2: normalize differentials - DISABLED
+    # This feature is disabled because it's too aggressive and can break:
+    # - LaTeX commands containing a non-leading 'd': \vdots, \ldots, \lambda, etc.
+    # - Variable names: dx, dy, dz might be variable names, not differentials
+    # - Subscripts: x_{dx}, y_{dy}
+    # - Function names or custom notation
+    #
+    # The risk of false positives (breaking valid LaTeX) outweighs the benefit
+    # of normalizing differentials for OCR output.
+    #
+    # If differential normalization is needed, implement a context-aware version:
+    # expr = _normalize_differentials_contextaware(expr)
+    
+    return expr
+
+
+def _normalize_differentials_contextaware(expr: str) -> str:
+    """Context-aware differential normalization (optional, not used by default).
+    
+    Only normalizes differentials in specific mathematical contexts:
+    1. After integral symbols: \\int dx, \\iint dA, \\oint dr
+    2. In fraction denominators: \\frac{dy}{dx}
+    3. (Not implemented yet) explicit differential notation: f(x)dx
+    
+    This avoids false positives like variable names, subscripts, or LaTeX commands.
+    
+    Args:
+        expr: LaTeX math expression.
+        
+    Returns:
+        Expression with differentials normalized in safe contexts only.
+    """
+    # Pattern 1: After integral commands
+    # \int dx -> \int  d x  (the doubled space is harmless in LaTeX math mode)
+    integral_pattern = re.compile(
+        r'(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])'
+    )
+    expr = integral_pattern.sub(r'\1 \2 d \3', expr)
+    
+    # Pattern 2: In fraction denominators
+    # \frac{...}{dx} -> \frac{...}{d x}
+    frac_pattern = re.compile(
+        r'(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})'
+    )
+    expr = frac_pattern.sub(r'\1d \2\3', expr)
+    
    return expr