fix: markdown post handle
This commit is contained in:
@@ -419,6 +419,7 @@ class Converter:
|
||||
|
||||
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
|
||||
unicode_map = {
|
||||
# Basic operators
|
||||
'+': '+',
|
||||
'-': '-',
|
||||
'*': '*',
|
||||
@@ -431,30 +432,177 @@ class Converter:
|
||||
',': ',',
|
||||
'.': '.',
|
||||
'|': '|',
|
||||
'…': '⋯',
|
||||
'⋮': '⋮',
|
||||
'⋯': '⋯',
|
||||
'°': '°',
|
||||
'γ': 'γ',
|
||||
'φ': 'φ',
|
||||
'ϕ': 'ϕ',
|
||||
'α': 'α',
|
||||
'β': 'β',
|
||||
'δ': 'δ',
|
||||
'ε': 'ε',
|
||||
'θ': 'θ',
|
||||
'λ': 'λ',
|
||||
'μ': 'μ',
|
||||
'π': 'π',
|
||||
'ρ': 'ρ',
|
||||
'σ': 'σ',
|
||||
'τ': 'τ',
|
||||
'ω': 'ω',
|
||||
'×': '×', # times
|
||||
'÷': '÷', # div
|
||||
'±': '±', # pm
|
||||
'∓': '∓', # mp
|
||||
|
||||
# Ellipsis symbols
|
||||
'…': '…', # ldots (horizontal)
|
||||
'⋮': '⋮', # vdots (vertical)
|
||||
'⋯': '⋯', # cdots (centered)
|
||||
'⋰': '⋰', # iddots (diagonal up)
|
||||
'⋱': '⋱', # ddots (diagonal down)
|
||||
|
||||
# Greek letters (lowercase)
|
||||
'α': 'α', # alpha
|
||||
'β': 'β', # beta
|
||||
'γ': 'γ', # gamma
|
||||
'δ': 'δ', # delta
|
||||
'ε': 'ε', # epsilon
|
||||
'ζ': 'ζ', # zeta
|
||||
'η': 'η', # eta
|
||||
'θ': 'θ', # theta
|
||||
'ι': 'ι', # iota
|
||||
'κ': 'κ', # kappa
|
||||
'λ': 'λ', # lambda
|
||||
'μ': 'μ', # mu
|
||||
'ν': 'ν', # nu
|
||||
'ξ': 'ξ', # xi
|
||||
'ο': 'ο', # omicron
|
||||
'π': 'π', # pi
|
||||
'ρ': 'ρ', # rho
|
||||
'ς': 'ς', # final sigma
|
||||
'σ': 'σ', # sigma
|
||||
'τ': 'τ', # tau
|
||||
'υ': 'υ', # upsilon
|
||||
'φ': 'φ', # phi
|
||||
'χ': 'χ', # chi
|
||||
'ψ': 'ψ', # psi
|
||||
'ω': 'ω', # omega
|
||||
'ϕ': 'ϕ', # phi variant
|
||||
|
||||
# Greek letters (uppercase)
|
||||
'Α': 'Α', # Alpha
|
||||
'Β': 'Β', # Beta
|
||||
'Γ': 'Γ', # Gamma
|
||||
'Δ': 'Δ', # Delta
|
||||
'Ε': 'Ε', # Epsilon
|
||||
'Ζ': 'Ζ', # Zeta
|
||||
'Η': 'Η', # Eta
|
||||
'Θ': 'Θ', # Theta
|
||||
'Ι': 'Ι', # Iota
|
||||
'Κ': 'Κ', # Kappa
|
||||
'Λ': 'Λ', # Lambda
|
||||
'Μ': 'Μ', # Mu
|
||||
'Ν': 'Ν', # Nu
|
||||
'Ξ': 'Ξ', # Xi
|
||||
'Ο': 'Ο', # Omicron
|
||||
'Π': 'Π', # Pi
|
||||
'Ρ': 'Ρ', # Rho
|
||||
'Σ': 'Σ', # Sigma
|
||||
'Τ': 'Τ', # Tau
|
||||
'Υ': 'Υ', # Upsilon
|
||||
'Φ': 'Φ', # Phi
|
||||
'Χ': 'Χ', # Chi
|
||||
'Ψ': 'Ψ', # Psi
|
||||
'Ω': 'Ω', # Omega
|
||||
|
||||
# Math symbols
|
||||
'∅': '∅', # emptyset
|
||||
'∈': '∈', # in
|
||||
'∉': '∉', # notin
|
||||
'∋': '∋', # ni
|
||||
'∌': '∌', # nni
|
||||
'∑': '∑', # sum
|
||||
'∏': '∏', # prod
|
||||
'√': '√', # sqrt
|
||||
'∛': '∛', # cbrt
|
||||
'∜': '∜', # fourthroot
|
||||
'∞': '∞', # infty
|
||||
'∩': '∩', # cap
|
||||
'∪': '∪', # cup
|
||||
'∫': '∫', # int
|
||||
'∬': '∬', # iint
|
||||
'∭': '∭', # iiint
|
||||
'∮': '∮', # oint
|
||||
'⊂': '⊂', # subset
|
||||
'⊃': '⊃', # supset
|
||||
'⊄': '⊄', # nsubset
|
||||
'⊅': '⊅', # nsupset
|
||||
'⊆': '⊆', # subseteq
|
||||
'⊇': '⊇', # supseteq
|
||||
'⊈': '⊈', # nsubseteq
|
||||
'⊉': '⊉', # nsupseteq
|
||||
'≤': '≤', # leq
|
||||
'≥': '≥', # geq
|
||||
'≠': '≠', # neq
|
||||
'≡': '≡', # equiv
|
||||
'≈': '≈', # approx
|
||||
'≃': '≃', # simeq
|
||||
'≅': '≅', # cong
|
||||
'∂': '∂', # partial
|
||||
'∇': '∇', # nabla
|
||||
'∀': '∀', # forall
|
||||
'∃': '∃', # exists
|
||||
'∄': '∄', # nexists
|
||||
'¬': '¬', # neg/lnot
|
||||
'∧': '∧', # wedge/land
|
||||
'∨': '∨', # vee/lor
|
||||
'→': '→', # to/rightarrow
|
||||
'←': '←', # leftarrow
|
||||
'↔': '↔', # leftrightarrow
|
||||
'⇒': '⇒', # Rightarrow
|
||||
'⇐': '⇐', # Leftarrow
|
||||
'⇔': '⇔', # Leftrightarrow
|
||||
'↑': '↑', # uparrow
|
||||
'↓': '↓', # downarrow
|
||||
'⇑': '⇑', # Uparrow
|
||||
'⇓': '⇓', # Downarrow
|
||||
'↕': '↕', # updownarrow
|
||||
'⇕': '⇕', # Updownarrow
|
||||
'≠': '≠', # ne
|
||||
'≪': '≪', # ll
|
||||
'≫': '≫', # gg
|
||||
'⩽': '⩽', # leqslant
|
||||
'⩾': '⩾', # geqslant
|
||||
'⊥': '⊥', # perp
|
||||
'∥': '∥', # parallel
|
||||
'∠': '∠', # angle
|
||||
'△': '△', # triangle
|
||||
'□': '□', # square
|
||||
'◊': '◊', # diamond
|
||||
'♠': '♠', # spadesuit
|
||||
'♡': '♡', # heartsuit
|
||||
'♢': '♢', # diamondsuit
|
||||
'♣': '♣', # clubsuit
|
||||
'ℓ': 'ℓ', # ell
|
||||
'℘': '℘', # wp (Weierstrass p)
|
||||
'ℜ': 'ℜ', # Re (real part)
|
||||
'ℑ': 'ℑ', # Im (imaginary part)
|
||||
'ℵ': 'ℵ', # aleph
|
||||
'ℶ': 'ℶ', # beth
|
||||
}
|
||||
|
||||
for entity, char in unicode_map.items():
|
||||
mathml = mathml.replace(entity, char)
|
||||
|
||||
# Also handle decimal entity format (&#NNNN;) for common characters
|
||||
# Convert decimal to hex-based lookup
|
||||
decimal_patterns = [
|
||||
(r'λ', 'λ'), # lambda (decimal 955 = hex 03BB)
|
||||
(r'⋮', '⋮'), # vdots (decimal 8942 = hex 22EE)
|
||||
(r'⋯', '⋯'), # cdots (decimal 8943 = hex 22EF)
|
||||
(r'…', '…'), # ldots (decimal 8230 = hex 2026)
|
||||
(r'∞', '∞'), # infty (decimal 8734 = hex 221E)
|
||||
(r'∑', '∑'), # sum (decimal 8721 = hex 2211)
|
||||
(r'∏', '∏'), # prod (decimal 8719 = hex 220F)
|
||||
(r'√', '√'), # sqrt (decimal 8730 = hex 221A)
|
||||
(r'∈', '∈'), # in (decimal 8712 = hex 2208)
|
||||
(r'∉', '∉'), # notin (decimal 8713 = hex 2209)
|
||||
(r'∩', '∩'), # cap (decimal 8745 = hex 2229)
|
||||
(r'∪', '∪'), # cup (decimal 8746 = hex 222A)
|
||||
(r'≤', '≤'), # leq (decimal 8804 = hex 2264)
|
||||
(r'≥', '≥'), # geq (decimal 8805 = hex 2265)
|
||||
(r'≠', '≠'), # neq (decimal 8800 = hex 2260)
|
||||
(r'≈', '≈'), # approx (decimal 8776 = hex 2248)
|
||||
(r'≡', '≡'), # equiv (decimal 8801 = hex 2261)
|
||||
]
|
||||
|
||||
for pattern, char in decimal_patterns:
|
||||
mathml = mathml.replace(pattern, char)
|
||||
|
||||
# Step 8: Clean up extra whitespace
|
||||
mathml = re.sub(r'>\s+<', '><', mathml)
|
||||
|
||||
|
||||
@@ -48,8 +48,13 @@ _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
||||
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
|
||||
|
||||
# stage2: differentials inside math segments
# IMPORTANT: deliberately conservative so that LaTeX commands and ordinary
# identifiers are left untouched. A match requires a bare "d" + one letter:
#   (?<!\\)        - not preceded by a backslash (not the start of a command)
#   (?<![a-zA-Z])  - not preceded by any letter (not inside a word/command)
#   (?![a-zA-Z])   - not followed by another letter (skip "dx" in "dxyz")
# NOTE: these patterns only inspect the adjacent characters; they do not check
# for a surrounding integral or fraction.
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([A-Z])(?![a-zA-Z])")
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)(?<![a-zA-Z])d([a-z])(?![a-zA-Z])")
|
||||
|
||||
|
||||
def _split_glued_command_token(token: str) -> str:
|
||||
@@ -84,14 +89,71 @@ def _split_glued_command_token(token: str) -> str:
|
||||
|
||||
|
||||
def _postprocess_math(expr: str) -> str:
    """Postprocess a *math* expression (already inside $...$ or $$...$$).

    Processing stages:
        0. Fix OCR number errors (spaces inside numbers).
        1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS).
        2. Normalize differentials -- DISABLED, see the comment in the body.

    Args:
        expr: LaTeX math expression without delimiters.

    Returns:
        Processed LaTeX expression.
    """
    # stage0: fix OCR number errors (digits with spaces)
    expr = _fix_ocr_number_errors(expr)

    # stage1: split glued command tokens (e.g. \cdotdS)
    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)

    # stage2: normalize differentials - DISABLED
    # This feature is disabled because it is too aggressive and can break:
    # - LaTeX commands containing 'd': \vdots, \lambda (via subscripts), \delta, etc.
    # - Variable names: dx, dy, dz might be variable names, not differentials
    # - Subscripts: x_{dx}, y_{dy}
    # - Function names or custom notation
    #
    # The risk of false positives (breaking valid LaTeX) outweighs the benefit
    # of normalizing differentials for OCR output.
    #
    # If differential normalization is needed, use the context-aware version:
    # expr = _normalize_differentials_contextaware(expr)

    return expr
|
||||
|
||||
|
||||
# Integral command (\int, \iint, \iiint, ..., \oint) followed by a
# backslash-free integrand and a trailing "d<letter>" differential.
# (?![a-zA-Z]) after the command keeps longer commands such as \intertext
# from being mistaken for \int.
_INTEGRAL_DIFFERENTIAL_PATTERN = re.compile(
    r'((?:\\i+nt|\\oint)(?![a-zA-Z])[^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])'
)

# First "d<letter>" inside the second brace group of \frac{...}{...}.
# Nested braces are not supported: [^}]* stops at the first closing brace.
_FRAC_DENOMINATOR_DIFFERENTIAL_PATTERN = re.compile(
    r'(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})'
)


def _normalize_differentials_contextaware(expr: str) -> str:
    r"""Context-aware differential normalization (optional, not used by default).

    Only normalizes differentials in two mathematical contexts:
        1. After an integral command, when the integrand contains no other
           LaTeX command: ``\int dx`` -> ``\int d x``,
           ``\int f(x)dx`` -> ``\int f(x) d x``.
        2. In a fraction denominator: ``\frac{dy}{dx}`` -> ``\frac{dy}{d x}``
           (the numerator is left unchanged, and only the first differential
           in the denominator is split).

    Everything else (variable names, subscripts such as ``x_{dx}``, other
    LaTeX commands) is returned untouched.

    Args:
        expr: LaTeX math expression.

    Returns:
        Expression with differentials normalized in the contexts above only.
    """
    # Pattern 1: after integral commands.
    # Group 1 keeps the command and integrand exactly as written, so no space
    # is injected between \int and its limits; whitespace directly before the
    # differential collapses to a single space.
    expr = _INTEGRAL_DIFFERENTIAL_PATTERN.sub(r'\1 d \2', expr)

    # Pattern 2: in fraction denominators.
    # \frac{...}{dx} -> \frac{...}{d x}
    expr = _FRAC_DENOMINATOR_DIFFERENTIAL_PATTERN.sub(r'\1d \2\3', expr)

    return expr
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user