fix: add post markdown
This commit is contained in:
@@ -85,6 +85,8 @@ def _split_glued_command_token(token: str) -> str:
|
||||
|
||||
def _postprocess_math(expr: str) -> str:
|
||||
"""Postprocess a *math* expression (already inside $...$ or $$...$$)."""
|
||||
# stage0: fix OCR number errors (digits with spaces)
|
||||
expr = _fix_ocr_number_errors(expr)
|
||||
# stage1: split glued command tokens (e.g. \cdotdS)
|
||||
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
|
||||
# stage2: normalize differentials (keep conservative)
|
||||
@@ -93,6 +95,42 @@ def _postprocess_math(expr: str) -> str:
|
||||
return expr
|
||||
|
||||
|
||||
def _fix_ocr_number_errors(expr: str) -> str:
|
||||
"""Fix common OCR errors in LaTeX math expressions.
|
||||
|
||||
OCR often splits numbers incorrectly, especially decimals:
|
||||
- "2 2. 2" should be "22.2"
|
||||
- "3 0. 4" should be "30.4"
|
||||
- "1 5 0" should be "150"
|
||||
|
||||
This function merges digit sequences that are separated by spaces.
|
||||
|
||||
Args:
|
||||
expr: LaTeX math expression.
|
||||
|
||||
Returns:
|
||||
LaTeX expression with number errors fixed.
|
||||
"""
|
||||
# Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)"
|
||||
# Example: "2 2. 2" → "22.2"
|
||||
expr = re.sub(r'(\d)\s+(\d+)\.\s*(\d+)', r'\1\2.\3', expr)
|
||||
|
||||
# Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)"
|
||||
# Example: "22. 2" → "22.2"
|
||||
expr = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', expr)
|
||||
|
||||
# Fix pattern 3: "digit space digit" (no decimal point, within same number context)
|
||||
# Be careful: only merge if followed by decimal point or comma/end
|
||||
# Example: "1 5 0" → "150" when followed by comma or end
|
||||
expr = re.sub(r'(\d)\s+(\d)(?=\s*[,\)]|$)', r'\1\2', expr)
|
||||
|
||||
# Fix pattern 4: Multiple spaces in decimal numbers
|
||||
# Example: "2 2 . 2" → "22.2"
|
||||
expr = re.sub(r'(\d)\s+(\d)(?=\s*\.)', r'\1\2', expr)
|
||||
|
||||
return expr
|
||||
|
||||
|
||||
def _postprocess_markdown(markdown_content: str) -> str:
|
||||
"""Apply LaTeX postprocessing only within $...$ / $$...$$ segments."""
|
||||
if not markdown_content:
|
||||
|
||||
Reference in New Issue
Block a user