fix: add post markdown
This commit is contained in:
@@ -248,17 +248,19 @@ class Converter:
|
||||
consistency across all conversion paths. This fixes common issues that
|
||||
cause Pandoc conversion to fail.
|
||||
|
||||
Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py),
|
||||
so we don't need to handle them here.
|
||||
|
||||
Args:
|
||||
latex_formula: Pure LaTeX formula.
|
||||
|
||||
Returns:
|
||||
Preprocessed LaTeX formula.
|
||||
"""
|
||||
# Use the same preprocessing methods as export
|
||||
# 1. Convert matrix environments
|
||||
latex_formula = self._convert_matrix_environments(latex_formula)
|
||||
|
||||
# 2. Fix array column specifiers (remove spaces) - THIS IS THE KEY FIX
|
||||
# 2. Fix array column specifiers (remove spaces)
|
||||
latex_formula = self._fix_array_column_specifiers(latex_formula)
|
||||
|
||||
# 3. Fix brace spacing
|
||||
|
||||
@@ -85,6 +85,8 @@ def _split_glued_command_token(token: str) -> str:
|
||||
|
||||
def _postprocess_math(expr: str) -> str:
|
||||
"""Postprocess a *math* expression (already inside $...$ or $$...$$)."""
|
||||
# stage0: fix OCR number errors (digits with spaces)
|
||||
expr = _fix_ocr_number_errors(expr)
|
||||
# stage1: split glued command tokens (e.g. \cdotdS)
|
||||
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
|
||||
# stage2: normalize differentials (keep conservative)
|
||||
@@ -93,6 +95,42 @@ def _postprocess_math(expr: str) -> str:
|
||||
return expr
|
||||
|
||||
|
||||
def _fix_ocr_number_errors(expr: str) -> str:
|
||||
"""Fix common OCR errors in LaTeX math expressions.
|
||||
|
||||
OCR often splits numbers incorrectly, especially decimals:
|
||||
- "2 2. 2" should be "22.2"
|
||||
- "3 0. 4" should be "30.4"
|
||||
- "1 5 0" should be "150"
|
||||
|
||||
This function merges digit sequences that are separated by spaces.
|
||||
|
||||
Args:
|
||||
expr: LaTeX math expression.
|
||||
|
||||
Returns:
|
||||
LaTeX expression with number errors fixed.
|
||||
"""
|
||||
# Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)"
|
||||
# Example: "2 2. 2" → "22.2"
|
||||
expr = re.sub(r'(\d)\s+(\d+)\.\s*(\d+)', r'\1\2.\3', expr)
|
||||
|
||||
# Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)"
|
||||
# Example: "22. 2" → "22.2"
|
||||
expr = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', expr)
|
||||
|
||||
# Fix pattern 3: "digit space digit" (no decimal point, within same number context)
|
||||
# Be careful: only merge if followed by decimal point or comma/end
|
||||
# Example: "1 5 0" → "150" when followed by comma or end
|
||||
expr = re.sub(r'(\d)\s+(\d)(?=\s*[,\)]|$)', r'\1\2', expr)
|
||||
|
||||
# Fix pattern 4: Multiple spaces in decimal numbers
|
||||
# Example: "2 2 . 2" → "22.2"
|
||||
expr = re.sub(r'(\d)\s+(\d)(?=\s*\.)', r'\1\2', expr)
|
||||
|
||||
return expr
|
||||
|
||||
|
||||
def _postprocess_markdown(markdown_content: str) -> str:
|
||||
"""Apply LaTeX postprocessing only within $...$ / $$...$$ segments."""
|
||||
if not markdown_content:
|
||||
|
||||
Reference in New Issue
Block a user