fix: markdown post handle

This commit is contained in:
liuyuanchuang
2026-02-05 13:18:55 +08:00
parent 808d29bd45
commit 280a8cdaeb
9 changed files with 2108 additions and 24 deletions

View File

@@ -419,6 +419,7 @@ class Converter:
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
unicode_map = {
# Basic operators
'+': '+',
'-': '-',
'*': '*',
@@ -431,30 +432,177 @@ class Converter:
',': ',',
'.': '.',
'|': '|',
'…': '',
'⋮': '',
'⋯': '',
'°': '°',
'γ': 'γ',
'φ': 'φ',
'ϕ': 'ϕ',
'α': 'α',
'β': 'β',
'δ': 'δ',
'ε': 'ε',
'θ': 'θ',
'λ': 'λ',
'μ': 'μ',
'π': 'π',
'ρ': 'ρ',
'σ': 'σ',
'τ': 'τ',
'ω': 'ω',
'×': '×', # times
'÷': '÷', # div
'±': '±', # pm
'∓': '', # mp
# Ellipsis symbols
'…': '', # ldots (horizontal)
'⋮': '', # vdots (vertical)
'⋯': '', # cdots (centered)
'⋰': '', # iddots (diagonal up)
'⋱': '', # ddots (diagonal down)
# Greek letters (lowercase)
'α': 'α', # alpha
'β': 'β', # beta
'γ': 'γ', # gamma
'δ': 'δ', # delta
'ε': 'ε', # epsilon
'ζ': 'ζ', # zeta
'η': 'η', # eta
'θ': 'θ', # theta
'ι': 'ι', # iota
'κ': 'κ', # kappa
'λ': 'λ', # lambda
'μ': 'μ', # mu
'ν': 'ν', # nu
'ξ': 'ξ', # xi
'ο': 'ο', # omicron
'π': 'π', # pi
'ρ': 'ρ', # rho
'ς': 'ς', # final sigma
'σ': 'σ', # sigma
'τ': 'τ', # tau
'υ': 'υ', # upsilon
'φ': 'φ', # phi
'χ': 'χ', # chi
'ψ': 'ψ', # psi
'ω': 'ω', # omega
'ϕ': 'ϕ', # phi variant
# Greek letters (uppercase)
'Α': 'Α', # Alpha
'Β': 'Β', # Beta
'Γ': 'Γ', # Gamma
'Δ': 'Δ', # Delta
'Ε': 'Ε', # Epsilon
'Ζ': 'Ζ', # Zeta
'Η': 'Η', # Eta
'Θ': 'Θ', # Theta
'Ι': 'Ι', # Iota
'Κ': 'Κ', # Kappa
'Λ': 'Λ', # Lambda
'Μ': 'Μ', # Mu
'Ν': 'Ν', # Nu
'Ξ': 'Ξ', # Xi
'Ο': 'Ο', # Omicron
'Π': 'Π', # Pi
'Ρ': 'Ρ', # Rho
'Σ': 'Σ', # Sigma
'Τ': 'Τ', # Tau
'Υ': 'Υ', # Upsilon
'Φ': 'Φ', # Phi
'Χ': 'Χ', # Chi
'Ψ': 'Ψ', # Psi
'Ω': 'Ω', # Omega
# Math symbols
'∅': '', # emptyset
'∈': '', # in
'∉': '', # notin
'∋': '', # ni
'∌': '', # nni
'∑': '', # sum
'∏': '', # prod
'√': '', # sqrt
'∛': '', # cbrt
'∜': '', # fourthroot
'∞': '', # infty
'∩': '', # cap
'∪': '', # cup
'∫': '', # int
'∬': '', # iint
'∭': '', # iiint
'∮': '', # oint
'⊂': '', # subset
'⊃': '', # supset
'⊄': '', # nsubset
'⊅': '', # nsupset
'⊆': '', # subseteq
'⊇': '', # supseteq
'⊈': '', # nsubseteq
'⊉': '', # nsupseteq
'≤': '', # leq
'≥': '', # geq
'≠': '', # neq
'≡': '', # equiv
'≈': '', # approx
'≃': '', # simeq
'≅': '', # cong
'∂': '', # partial
'∇': '', # nabla
'∀': '', # forall
'∃': '', # exists
'∄': '', # nexists
'¬': '¬', # neg/lnot
'∧': '', # wedge/land
'∨': '', # vee/lor
'→': '', # to/rightarrow
'←': '', # leftarrow
'↔': '', # leftrightarrow
'⇒': '', # Rightarrow
'⇐': '', # Leftarrow
'⇔': '', # Leftrightarrow
'↑': '', # uparrow
'↓': '', # downarrow
'⇑': '', # Uparrow
'⇓': '', # Downarrow
'↕': '', # updownarrow
'⇕': '', # Updownarrow
'≠': '', # ne
'≪': '', # ll
'≫': '', # gg
'⩽': '', # leqslant
'⩾': '', # geqslant
'⊥': '', # perp
'∥': '', # parallel
'∠': '', # angle
'△': '', # triangle
'□': '', # square
'◊': '', # diamond
'♠': '', # spadesuit
'♡': '', # heartsuit
'♢': '', # diamondsuit
'♣': '', # clubsuit
'ℓ': '', # ell
'℘': '', # wp (Weierstrass p)
'ℜ': '', # Re (real part)
'ℑ': '', # Im (imaginary part)
'ℵ': '', # aleph
'ℶ': '', # beth
}
for entity, char in unicode_map.items():
mathml = mathml.replace(entity, char)
# Also handle decimal entity format (&#NNNN;) for common characters
# Convert decimal to hex-based lookup
decimal_patterns = [
(r'λ', 'λ'), # lambda (decimal 955 = hex 03BB)
(r'⋮', ''), # vdots (decimal 8942 = hex 22EE)
(r'⋯', ''), # cdots (decimal 8943 = hex 22EF)
(r'…', ''), # ldots (decimal 8230 = hex 2026)
(r'∞', ''), # infty (decimal 8734 = hex 221E)
(r'∑', ''), # sum (decimal 8721 = hex 2211)
(r'∏', ''), # prod (decimal 8719 = hex 220F)
(r'√', ''), # sqrt (decimal 8730 = hex 221A)
(r'∈', ''), # in (decimal 8712 = hex 2208)
(r'∉', ''), # notin (decimal 8713 = hex 2209)
(r'∩', ''), # cap (decimal 8745 = hex 2229)
(r'∪', ''), # cup (decimal 8746 = hex 222A)
(r'≤', ''), # leq (decimal 8804 = hex 2264)
(r'≥', ''), # geq (decimal 8805 = hex 2265)
(r'≠', ''), # neq (decimal 8800 = hex 2260)
(r'≈', ''), # approx (decimal 8776 = hex 2248)
(r'≡', ''), # equiv (decimal 8801 = hex 2261)
]
for pattern, char in decimal_patterns:
mathml = mathml.replace(pattern, char)
# Step 8: Clean up extra whitespace
mathml = re.sub(r'>\s+<', '><', mathml)

View File

@@ -48,8 +48,13 @@ _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
# A LaTeX command token: a backslash followed by one or more ASCII letters.
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")

# stage2: differentials inside math segments
# IMPORTANT: deliberately conservative patterns, to avoid breaking LaTeX
# commands and multi-letter variables. They match a standalone "d<letter>"
# token only; they do NOT inspect the surrounding context (integral, fraction):
#   (?<!\\)        - not preceded by a backslash (not the start of a LaTeX command)
#   (?<![a-zA-Z])  - not preceded by any letter (not inside a word/command)
#   (?![a-zA-Z])   - the captured letter is not followed by another letter
#                    (so "dx" inside "dxyz" is not matched)
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([A-Z])(?![a-zA-Z])")
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([a-z])(?![a-zA-Z])")
def _split_glued_command_token(token: str) -> str:
@@ -84,14 +89,71 @@ def _split_glued_command_token(token: str) -> str:
def _postprocess_math(expr: str) -> str:
    """Postprocess a *math* expression (already inside $...$ or $$...$$).

    Processing stages:
        0. Fix OCR number errors (spaces in numbers).
        1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS).
        2. Normalize differentials -- DISABLED, see the note in the body.

    Args:
        expr: LaTeX math expression without delimiters.

    Returns:
        Processed LaTeX expression.
    """
    # stage0: fix OCR number errors (digits with spaces)
    expr = _fix_ocr_number_errors(expr)

    # stage1: split glued command tokens (e.g. \cdotdS)
    expr = _COMMAND_TOKEN_PATTERN.sub(
        lambda match: _split_glued_command_token(match.group(0)), expr
    )

    # stage2: normalize differentials - DISABLED
    # The blanket "d<letter>" rewrite is intentionally not applied here because
    # it is too aggressive and can break:
    #   - LaTeX commands containing 'd': \vdots, \delta, etc.
    #   - Variable names: dx, dy, dz might be variable names, not differentials
    #   - Subscripts: x_{dx}, y_{dy}
    #   - Function names or custom notation
    #
    # The risk of false positives (breaking valid LaTeX) outweighs the benefit
    # of normalizing differentials for OCR output.
    #
    # If differential normalization is needed, use a context-aware version:
    #   expr = _normalize_differentials_contextaware(expr)
    return expr
# An integral command (\int, \iint, \iiint, \oint) followed by an integrand that
# contains no backslash, ending in a differential "d<letter>".
# (?![a-zA-Z]) after the command keeps longer commands such as \intercal or
# \intop from being mistaken for \int.
_INTEGRAL_DIFFERENTIAL_PATTERN = re.compile(
    r"(\\(?:i+nt|oint)(?![a-zA-Z]))([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])"
)

# A differential "d<letter>" inside the second brace group of \frac{..}{..}.
# (?<!\\) keeps backslash commands such as \dd or \dv intact.
_FRAC_DENOMINATOR_DIFFERENTIAL_PATTERN = re.compile(
    r"(\\frac\{[^}]*\}\{[^}]*?)(?<!\\)d([a-zA-Z])(?![a-zA-Z])([^}]*\})"
)


def _normalize_differentials_contextaware(expr: str) -> str:
    """Context-aware differential normalization (optional, not used by default).

    Only normalizes differentials in two specific mathematical contexts:
        1. After integral commands: \\int dx, \\iint dA, \\oint dr
           (including an integrand in between, e.g. \\int f(x)dx).
        2. In fraction denominators: \\frac{dy}{dx}

    Known limitations (kept on purpose, to stay conservative):
        - Only the first differential after each integral command and the
          first one in each denominator is rewritten.
        - An integrand containing a backslash before the differential
          (e.g. \\int \\sin x dx) is left untouched.
        - Nested braces inside \\frac arguments are not followed.

    Args:
        expr: LaTeX math expression.

    Returns:
        Expression with differentials normalized in safe contexts only.
    """
    # Pattern 1: after integral commands
    #   \int dx          -> \int d x
    #   \int_0^1 f(x)dx  -> \int_0^1 f(x) d x
    # The integrand (group 2) is re-emitted verbatim; only the whitespace
    # directly in front of the differential is collapsed to a single space.
    expr = _INTEGRAL_DIFFERENTIAL_PATTERN.sub(r"\1\2 d \3", expr)

    # Pattern 2: in fraction denominators
    #   \frac{...}{dx} -> \frac{...}{d x}
    expr = _FRAC_DENOMINATOR_DIFFERENTIAL_PATTERN.sub(r"\1d \2\3", expr)

    return expr