fix: add post markdown

2026-02-04 16:04:18 +08:00
parent 720cd05add
commit 61fd5441b7
4 changed files with 601 additions and 2 deletions
--- a/app/services/converter.py
+++ b/app/services/converter.py
@@ -248,17 +248,19 @@ class Converter:
        consistency across all conversion paths. This fixes common issues that 
        cause Pandoc conversion to fail.

+        Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py),
+        so we don't need to handle them here.
+
        Args:
            latex_formula: Pure LaTeX formula.

        Returns:
            Preprocessed LaTeX formula.
        """
-        # Use the same preprocessing methods as export
        # 1. Convert matrix environments
        latex_formula = self._convert_matrix_environments(latex_formula)
        
-        # 2. Fix array column specifiers (remove spaces) - THIS IS THE KEY FIX
+        # 2. Fix array column specifiers (remove spaces)
        latex_formula = self._fix_array_column_specifiers(latex_formula)
        
        # 3. Fix brace spacing
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -85,6 +85,8 @@ def _split_glued_command_token(token: str) -> str:

 def _postprocess_math(expr: str) -> str:
    """Postprocess a *math* expression (already inside $...$ or $$...$$)."""
+    # stage0: fix OCR number errors (digits with spaces)
+    expr = _fix_ocr_number_errors(expr)
    # stage1: split glued command tokens (e.g. \cdotdS)
    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
    # stage2: normalize differentials (keep conservative)
@@ -93,6 +95,42 @@ def _postprocess_math(expr: str) -> str:
    return expr


+def _fix_ocr_number_errors(expr: str) -> str:
+    """Merge digits of one number that OCR split apart with whitespace.
+
+    Examples: "2 2. 2" -> "22.2", "2  2  .  2" -> "22.2", "1 5 0" -> "150".
+
+    Merging is deliberately narrow so that separate operands are not fused:
+    - whitespace around "." is dropped only when digits sit on both sides;
+    - digit groups are joined when the last group runs straight into ".digit";
+    - otherwise only single-digit trailing groups are joined, and only when
+      the run is followed by ",", ")", "." or the end of the expression.
+
+    Args:
+        expr: LaTeX math expression.
+
+    Returns:
+        The expression with split numbers rejoined; other text is unchanged.
+    """
+    def _squeeze(match: "re.Match[str]") -> str:
+        # Drop the whitespace inside a matched run of digit groups.
+        return re.sub(r'\s+', '', match.group(0))
+
+    # 1. Tighten the decimal point first ("22 . 2" -> "22.2") so the next
+    #    step can recognise an integer part by the ".digit" that follows it.
+    expr = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', expr)
+
+    # 2. Integer part of a decimal: "1 5 0.5" -> "150.5". The whole run is
+    #    matched at once; pairwise merging would stop at "1 50.5".
+    expr = re.sub(r'\d+(?:\s+\d+)+(?=\.\d)', _squeeze, expr)
+
+    # 3. Plain integers: "1 5 0" -> "150". Trailing groups must be single
+    #    digits so two ordinary numbers such as "10 20" are left alone.
+    expr = re.sub(r'\d+(?:\s+\d)+(?=\s*[,).]|$)', _squeeze, expr)
+
+    return expr
+
+
 def _postprocess_markdown(markdown_content: str) -> str:
    """Apply LaTeX postprocessing only within $...$ / $$...$$ segments."""
    if not markdown_content: