fix: markdown post handling

2026-02-05 13:18:55 +08:00
parent 808d29bd45
commit 280a8cdaeb
9 changed files with 2108 additions and 24 deletions
--- a/app/services/converter.py
+++ b/app/services/converter.py
@@ -419,6 +419,7 @@ class Converter:
        
        # Step 7: Decode common Unicode entities to actual characters (Word prefers this)
        unicode_map = {
+            # Basic operators
            '&#x0002B;': '+',
            '&#x0002D;': '-',
            '&#x0002A;': '*',
@@ -431,30 +432,177 @@ class Converter:
            '&#x0002C;': ',',
            '&#x0002E;': '.',
            '&#x0007C;': '|',
-            '&#x02026;': '⋯',
-            '&#x022EE;': '⋮',
-            '&#x022EF;': '⋯',
            '&#x00B0;': '°',
-            '&#x03B3;': 'γ',
-            '&#x03C6;': 'φ',
-            '&#x03D5;': 'ϕ',
-            '&#x03B1;': 'α',
-            '&#x03B2;': 'β',
-            '&#x03B4;': 'δ',
-            '&#x03B5;': 'ε',
-            '&#x03B8;': 'θ',
-            '&#x03BB;': 'λ',
-            '&#x03BC;': 'μ',
-            '&#x03C0;': 'π',
-            '&#x03C1;': 'ρ',
-            '&#x03C3;': 'σ',
-            '&#x03C4;': 'τ',
-            '&#x03C9;': 'ω',
+            '&#x00D7;': '×',  # times
+            '&#x00F7;': '÷',  # div
+            '&#x00B1;': '±',  # pm
+            '&#x2213;': '∓',  # mp
+            
+            # Ellipsis symbols
+            '&#x02026;': '…',  # ldots (horizontal)
+            '&#x022EE;': '⋮',  # vdots (vertical)
+            '&#x022EF;': '⋯',  # cdots (centered)
+            '&#x022F0;': '⋰',  # iddots (diagonal up)
+            '&#x022F1;': '⋱',  # ddots (diagonal down)
+            
+            # Greek letters (lowercase)
+            '&#x03B1;': 'α',  # alpha
+            '&#x03B2;': 'β',  # beta
+            '&#x03B3;': 'γ',  # gamma
+            '&#x03B4;': 'δ',  # delta
+            '&#x03B5;': 'ε',  # epsilon
+            '&#x03B6;': 'ζ',  # zeta
+            '&#x03B7;': 'η',  # eta
+            '&#x03B8;': 'θ',  # theta
+            '&#x03B9;': 'ι',  # iota
+            '&#x03BA;': 'κ',  # kappa
+            '&#x03BB;': 'λ',  # lambda
+            '&#x03BC;': 'μ',  # mu
+            '&#x03BD;': 'ν',  # nu
+            '&#x03BE;': 'ξ',  # xi
+            '&#x03BF;': 'ο',  # omicron
+            '&#x03C0;': 'π',  # pi
+            '&#x03C1;': 'ρ',  # rho
+            '&#x03C2;': 'ς',  # final sigma
+            '&#x03C3;': 'σ',  # sigma
+            '&#x03C4;': 'τ',  # tau
+            '&#x03C5;': 'υ',  # upsilon
+            '&#x03C6;': 'φ',  # phi
+            '&#x03C7;': 'χ',  # chi
+            '&#x03C8;': 'ψ',  # psi
+            '&#x03C9;': 'ω',  # omega
+            '&#x03D5;': 'ϕ',  # phi variant
+            
+            # Greek letters (uppercase)
+            '&#x0391;': 'Α',  # Alpha
+            '&#x0392;': 'Β',  # Beta
+            '&#x0393;': 'Γ',  # Gamma
+            '&#x0394;': 'Δ',  # Delta
+            '&#x0395;': 'Ε',  # Epsilon
+            '&#x0396;': 'Ζ',  # Zeta
+            '&#x0397;': 'Η',  # Eta
+            '&#x0398;': 'Θ',  # Theta
+            '&#x0399;': 'Ι',  # Iota
+            '&#x039A;': 'Κ',  # Kappa
+            '&#x039B;': 'Λ',  # Lambda
+            '&#x039C;': 'Μ',  # Mu
+            '&#x039D;': 'Ν',  # Nu
+            '&#x039E;': 'Ξ',  # Xi
+            '&#x039F;': 'Ο',  # Omicron
+            '&#x03A0;': 'Π',  # Pi
+            '&#x03A1;': 'Ρ',  # Rho
+            '&#x03A3;': 'Σ',  # Sigma
+            '&#x03A4;': 'Τ',  # Tau
+            '&#x03A5;': 'Υ',  # Upsilon
+            '&#x03A6;': 'Φ',  # Phi
+            '&#x03A7;': 'Χ',  # Chi
+            '&#x03A8;': 'Ψ',  # Psi
+            '&#x03A9;': 'Ω',  # Omega
+            
+            # Math symbols
+            '&#x2205;': '∅',  # emptyset
+            '&#x2208;': '∈',  # in
+            '&#x2209;': '∉',  # notin
+            '&#x220B;': '∋',  # ni
+            '&#x220C;': '∌',  # nni
+            '&#x2211;': '∑',  # sum
+            '&#x220F;': '∏',  # prod
+            '&#x221A;': '√',  # sqrt
+            '&#x221B;': '∛',  # cbrt
+            '&#x221C;': '∜',  # fourthroot
+            '&#x221E;': '∞',  # infty
+            '&#x2229;': '∩',  # cap
+            '&#x222A;': '∪',  # cup
+            '&#x222B;': '∫',  # int
+            '&#x222C;': '∬',  # iint
+            '&#x222D;': '∭',  # iiint
+            '&#x222E;': '∮',  # oint
+            '&#x2282;': '⊂',  # subset
+            '&#x2283;': '⊃',  # supset
+            '&#x2284;': '⊄',  # nsubset
+            '&#x2285;': '⊅',  # nsupset
+            '&#x2286;': '⊆',  # subseteq
+            '&#x2287;': '⊇',  # supseteq
+            '&#x2288;': '⊈',  # nsubseteq
+            '&#x2289;': '⊉',  # nsupseteq
+            '&#x2264;': '≤',  # leq
+            '&#x2265;': '≥',  # geq
+            '&#x2260;': '≠',  # neq
+            '&#x2261;': '≡',  # equiv
+            '&#x2248;': '≈',  # approx
+            '&#x2243;': '≃',  # simeq
+            '&#x2245;': '≅',  # cong
+            '&#x2202;': '∂',  # partial
+            '&#x2207;': '∇',  # nabla
+            '&#x2200;': '∀',  # forall
+            '&#x2203;': '∃',  # exists
+            '&#x2204;': '∄',  # nexists
+            '&#x00AC;': '¬',  # neg/lnot
+            '&#x2227;': '∧',  # wedge/land
+            '&#x2228;': '∨',  # vee/lor
+            '&#x2192;': '→',  # to/rightarrow
+            '&#x2190;': '←',  # leftarrow
+            '&#x2194;': '↔',  # leftrightarrow
+            '&#x21D2;': '⇒',  # Rightarrow
+            '&#x21D0;': '⇐',  # Leftarrow
+            '&#x21D4;': '⇔',  # Leftrightarrow
+            '&#x2191;': '↑',  # uparrow
+            '&#x2193;': '↓',  # downarrow
+            '&#x21D1;': '⇑',  # Uparrow
+            '&#x21D3;': '⇓',  # Downarrow
+            '&#x2195;': '↕',  # updownarrow
+            '&#x21D5;': '⇕',  # Updownarrow
+            '&#x2260;': '≠',  # ne
+            '&#x226A;': '≪',  # ll
+            '&#x226B;': '≫',  # gg
+            '&#x2A7D;': '⩽',  # leqslant
+            '&#x2A7E;': '⩾',  # geqslant
+            '&#x22A5;': '⊥',  # perp
+            '&#x2225;': '∥',  # parallel
+            '&#x2220;': '∠',  # angle
+            '&#x25B3;': '△',  # triangle
+            '&#x25A1;': '□',  # square
+            '&#x25CA;': '◊',  # diamond
+            '&#x2660;': '♠',  # spadesuit
+            '&#x2661;': '♡',  # heartsuit
+            '&#x2662;': '♢',  # diamondsuit
+            '&#x2663;': '♣',  # clubsuit
+            '&#x2113;': 'ℓ',  # ell
+            '&#x2118;': '℘',  # wp (Weierstrass p)
+            '&#x211C;': 'ℜ',  # Re (real part)
+            '&#x2111;': 'ℑ',  # Im (imaginary part)
+            '&#x2135;': 'ℵ',  # aleph
+            '&#x2136;': 'ℶ',  # beth
        }
        
        for entity, char in unicode_map.items():
            mathml = mathml.replace(entity, char)
        
+        # Also handle the decimal entity format (&#NNNN;) for common characters.
+        # Each listed decimal entity is replaced literally via str.replace (no regex, no hex lookup).
+        decimal_patterns = [
+            (r'&#955;', 'λ'),    # lambda (decimal 955 = hex 03BB)
+            (r'&#8942;', '⋮'),   # vdots (decimal 8942 = hex 22EE)
+            (r'&#8943;', '⋯'),   # cdots (decimal 8943 = hex 22EF)
+            (r'&#8230;', '…'),   # ldots (decimal 8230 = hex 2026)
+            (r'&#8734;', '∞'),   # infty (decimal 8734 = hex 221E)
+            (r'&#8721;', '∑'),   # sum (decimal 8721 = hex 2211)
+            (r'&#8719;', '∏'),   # prod (decimal 8719 = hex 220F)
+            (r'&#8730;', '√'),   # sqrt (decimal 8730 = hex 221A)
+            (r'&#8712;', '∈'),   # in (decimal 8712 = hex 2208)
+            (r'&#8713;', '∉'),   # notin (decimal 8713 = hex 2209)
+            (r'&#8745;', '∩'),   # cap (decimal 8745 = hex 2229)
+            (r'&#8746;', '∪'),   # cup (decimal 8746 = hex 222A)
+            (r'&#8804;', '≤'),   # leq (decimal 8804 = hex 2264)
+            (r'&#8805;', '≥'),   # geq (decimal 8805 = hex 2265)
+            (r'&#8800;', '≠'),   # neq (decimal 8800 = hex 2260)
+            (r'&#8776;', '≈'),   # approx (decimal 8776 = hex 2248)
+            (r'&#8801;', '≡'),   # equiv (decimal 8801 = hex 2261)
+        ]
+        
+        for pattern, char in decimal_patterns:
+            mathml = mathml.replace(pattern, char)
+        
        # Step 8: Clean up extra whitespace
        mathml = re.sub(r'>\s+<', '><', mathml)
        
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -48,8 +48,13 @@ _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
 _COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")

 # stage2: differentials inside math segments
-_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")
-_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")
+# IMPORTANT: Very conservative patterns to avoid breaking LaTeX commands and variables.
+# They match only a standalone "d<letter>" token; the surrounding context (integral, fraction) is NOT checked.
+# (?<!\\) - not preceded by a backslash (not the start of a LaTeX command)
+# (?<![a-zA-Z]) - not preceded by any letter (not inside a word/command)
+# (?![a-zA-Z]) - not followed by another letter (avoid matching "dx" in "dxyz")
+_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([A-Z])(?![a-zA-Z])")
+_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([a-z])(?![a-zA-Z])")


 def _split_glued_command_token(token: str) -> str:
@@ -84,14 +89,71 @@ def _split_glued_command_token(token: str) -> str:


 def _postprocess_math(expr: str) -> str:
-    """Postprocess a *math* expression (already inside $...$ or $$...$$)."""
+    """Postprocess a *math* expression (already inside $...$ or $$...$$).
+    
+    Processing stages (numbered as in the inline comments below):
+    0. Fix OCR number errors (spaces in numbers)
+    1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
+    2. Normalize differentials (currently disabled; this stage does nothing)
+    
+    Args:
+        expr: LaTeX math expression without delimiters.
+        
+    Returns:
+        Processed LaTeX expression.
+    """
     # stage0: fix OCR number errors (digits with spaces)
     expr = _fix_ocr_number_errors(expr)
+    
     # stage1: split glued command tokens (e.g. \cdotdS)
     expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
-    # stage2: normalize differentials (keep conservative)
-    expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr)
-    expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr)
+    
+    # stage2: normalize differentials - DISABLED
+    # This feature is disabled because it's too aggressive and can break:
+    # - LaTeX commands containing 'd' after another letter: \vdots, \lambda, etc.
+    # - Variable names: dx, dy, dz might be variable names, not differentials
+    # - Subscripts: x_{dx}, y_{dy}
+    # - Function names or custom notation
+    #
+    # The risk of false positives (breaking valid LaTeX) outweighs the benefit
+    # of normalizing differentials for OCR output.
+    #
+    # If differential normalization is needed, call the context-aware helper instead:
+    # expr = _normalize_differentials_contextaware(expr)
+    
+    return expr
+
+
+def _normalize_differentials_contextaware(expr: str) -> str:
+    """Context-aware differential normalization (optional, not used by default).
+    
+    Inserts a space between ``d`` and the following letter in two contexts only:
+    1. After integral commands: \\int dx, \\iint dA, \\oint dr
+    2. In fraction denominators: \\frac{dy}{dx} (only the denominator is rewritten)
+    Other notation, e.g. f(x)dx outside an integral, is not handled here.
+    
+    Intended to reduce false positives; a subscript in an integrand (x_{dy}) can still match.
+    
+    Args:
+        expr: LaTeX math expression.
+        
+    Returns:
+        Expression with differentials rewritten as "d <letter>" in the matched contexts.
+    """
+    # Pattern 1: after \int, \iint, ... or \oint; no backslash may occur before the "d"
+    # \int dx -> \int  d x  (the empty middle group leaves a double space)
+    integral_pattern = re.compile(
+        r'(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])'
+    )
+    expr = integral_pattern.sub(r'\1 \2 d \3', expr)
+    
+    # Pattern 2: in fraction denominators (no nested braces: [^}] stops at the first "}")
+    # \frac{...}{dx} -> \frac{...}{d x}
+    frac_pattern = re.compile(
+        r'(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})'
+    )
+    expr = frac_pattern.sub(r'\1d \2\3', expr)
+    
     return expr