feat: add paddleocr-vl

2026-02-05 20:32:26 +08:00
parent 767006ee38
commit 4de9aefa68
4 changed files with 351 additions and 308 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -5,6 +5,7 @@ import numpy as np
 import cv2
 import requests
 from io import BytesIO
+import base64
 from app.core.config import get_settings
 from paddleocr import PaddleOCRVL
 from typing import Optional
@@ -12,6 +13,7 @@ from app.services.layout_detector import LayoutDetector
 from app.services.image_processor import ImageProcessor
 from app.services.converter import Converter
 from abc import ABC, abstractmethod
+from openai import OpenAI

 settings = get_settings()

@@ -90,42 +92,42 @@ def _split_glued_command_token(token: str) -> str:

 def _clean_latex_syntax_spaces(expr: str) -> str:
    """Clean unwanted spaces in LaTeX syntax (common OCR errors).
-    
+
    OCR often adds spaces in LaTeX syntax structures where they shouldn't be:
    - Subscripts: a _ {i 1} -> a_{i1}
    - Superscripts: x ^ {2 3} -> x^{23}
    - Fractions: \\frac { a } { b } -> \\frac{a}{b}
    - Commands: \\ alpha -> \\alpha
    - Braces: { a b } -> {ab} (within subscripts/superscripts)
-    
+
    This is safe because these spaces are always OCR errors - LaTeX doesn't
    need or want spaces in these positions.
-    
+
    Args:
        expr: LaTeX math expression.
-        
+
    Returns:
        Expression with LaTeX syntax spaces cleaned.
    """
    # Pattern 1: Spaces around _ and ^ (subscript/superscript operators)
    # a _ {i} -> a_{i}, x ^ {2} -> x^{2}
-    expr = re.sub(r'\s*_\s*', '_', expr)
-    expr = re.sub(r'\s*\^\s*', '^', expr)
-    
+    expr = re.sub(r"\s*_\s*", "_", expr)
+    expr = re.sub(r"\s*\^\s*", "^", expr)
+
    # Pattern 2: Spaces inside braces that follow _ or ^
    # _{i 1} -> _{i1}, ^{2 3} -> ^{23}
    # This is safe because spaces inside subscript/superscript braces are usually OCR errors
    def clean_subscript_superscript_braces(match):
        operator = match.group(1)  # _ or ^
-        content = match.group(2)   # content inside braces
+        content = match.group(2)  # content inside braces
        # Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
        # Only remove spaces between non-backslash characters
-        cleaned = re.sub(r'(?<!\\)\s+(?!\\)', '', content)
+        cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content)
        return f"{operator}{{{cleaned}}}"
-    
+
    # Match _{ ... } or ^{ ... }
-    expr = re.sub(r'([_^])\{([^}]+)\}', clean_subscript_superscript_braces, expr)
-    
+    expr = re.sub(r"([_^])\{([^}]+)\}", clean_subscript_superscript_braces, expr)
+
    # Pattern 3: Spaces inside \frac arguments
    # \frac { a } { b } -> \frac{a}{b}
    # \frac{ a + b }{ c } -> \frac{a+b}{c}
@@ -133,47 +135,46 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
        numerator = match.group(1).strip()
        denominator = match.group(2).strip()
        return f"\\frac{{{numerator}}}{{{denominator}}}"
-    
-    expr = re.sub(r'\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}', 
-                  clean_frac_braces, expr)
-    
+
+    expr = re.sub(r"\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}", clean_frac_braces, expr)
+
    # Pattern 4: Spaces after backslash in LaTeX commands
    # \ alpha -> \alpha, \ beta -> \beta
-    expr = re.sub(r'\\\s+([a-zA-Z]+)', r'\\\1', expr)
-    
+    expr = re.sub(r"\\\s+([a-zA-Z]+)", r"\\\1", expr)
+
    # Pattern 5: Spaces before/after braces in general contexts (conservative)
    # Only remove if the space is clearly wrong (e.g., after operators)
    # { x } in standalone context is kept as-is to avoid breaking valid spacing
    # But after operators like \sqrt{ x } -> \sqrt{x}
-    expr = re.sub(r'(\\[a-zA-Z]+)\s*\{\s*', r'\1{', expr)  # \sqrt { -> \sqrt{
-    
+    expr = re.sub(r"(\\[a-zA-Z]+)\s*\{\s*", r"\1{", expr)  # \sqrt { -> \sqrt{
+
    return expr


 def _postprocess_math(expr: str) -> str:
    """Postprocess a *math* expression (already inside $...$ or $$...$$).
-    
+
    Processing stages:
    0. Fix OCR number errors (spaces in numbers)
    1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
    2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
    3. Normalize differentials (DISABLED by default to avoid breaking variables)
-    
+
    Args:
        expr: LaTeX math expression without delimiters.
-        
+
    Returns:
        Processed LaTeX expression.
    """
    # stage0: fix OCR number errors (digits with spaces)
    expr = _fix_ocr_number_errors(expr)
-    
+
    # stage1: split glued command tokens (e.g. \cdotdS)
    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
-    
+
    # stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)
    expr = _clean_latex_syntax_spaces(expr)
-    
+
    # stage3: normalize differentials - DISABLED
    # This feature is disabled because it's too aggressive and can break:
    # - LaTeX commands containing 'd': \vdots, \lambda (via subscripts), \delta, etc.
@@ -186,40 +187,36 @@ def _postprocess_math(expr: str) -> str:
    #
    # If differential normalization is needed, implement a context-aware version:
    # expr = _normalize_differentials_contextaware(expr)
-    
+
    return expr


 def _normalize_differentials_contextaware(expr: str) -> str:
    """Context-aware differential normalization (optional, not used by default).
-    
+
    Only normalizes differentials in specific mathematical contexts:
    1. After integral symbols: \\int dx, \\iint dA, \\oint dr
    2. In fraction denominators: \\frac{dy}{dx}
    3. In explicit differential notation: f(x)dx (function followed by differential)
-    
+
    This avoids false positives like variable names, subscripts, or LaTeX commands.
-    
+
    Args:
        expr: LaTeX math expression.
-        
+
    Returns:
        Expression with differentials normalized in safe contexts only.
    """
    # Pattern 1: After integral commands
    # \int dx -> \int d x
-    integral_pattern = re.compile(
-        r'(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])'
-    )
-    expr = integral_pattern.sub(r'\1 \2 d \3', expr)
-    
+    integral_pattern = re.compile(r"(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])")
+    expr = integral_pattern.sub(r"\1 \2 d \3", expr)
+
    # Pattern 2: In fraction denominators
    # \frac{...}{dx} -> \frac{...}{d x}
-    frac_pattern = re.compile(
-        r'(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})'
-    )
-    expr = frac_pattern.sub(r'\1d \2\3', expr)
-    
+    frac_pattern = re.compile(r"(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})")
+    expr = frac_pattern.sub(r"\1d \2\3", expr)
+
    return expr


@@ -241,21 +238,21 @@ def _fix_ocr_number_errors(expr: str) -> str:
    """
    # Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)"
    # Example: "2 2. 2" → "22.2"
-    expr = re.sub(r'(\d)\s+(\d+)\.\s*(\d+)', r'\1\2.\3', expr)
-    
+    expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr)
+
    # Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)"
    # Example: "22. 2" → "22.2"
-    expr = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', expr)
-    
+    expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr)
+
    # Fix pattern 3: "digit space digit" (no decimal point, within same number context)
    # Be careful: only merge if followed by decimal point or comma/end
    # Example: "1 5 0" → "150" when followed by comma or end
-    expr = re.sub(r'(\d)\s+(\d)(?=\s*[,\)]|$)', r'\1\2', expr)
-    
+    expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr)
+
    # Fix pattern 4: Multiple spaces in decimal numbers
    # Example: "2  2  .  2" → "22.2"
-    expr = re.sub(r'(\d)\s+(\d)(?=\s*\.)', r'\1\2', expr)
-    
+    expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr)
+
    return expr


@@ -273,76 +270,76 @@ def _postprocess_markdown(markdown_content: str) -> str:
        return seg

    markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
-    
+
    # Apply markdown-level postprocessing (after LaTeX processing)
    markdown_content = _remove_false_heading_from_single_formula(markdown_content)
-    
+
    return markdown_content


 def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
    """Remove false heading markers from single-formula content.
-    
+
    OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix.
    This function detects and removes the heading marker when:
    1. The content contains only one formula (display or inline)
    2. The formula line starts with '#' (heading marker)
    3. No other non-formula text content exists
-    
+
    Examples:
        Input:  "# $$E = mc^2$$"
        Output: "$$E = mc^2$$"
-        
+
        Input:  "# $x = y$"
        Output: "$x = y$"
-        
+
        Input:  "# Introduction\n$$E = mc^2$$"  (has text, keep heading)
        Output: "# Introduction\n$$E = mc^2$$"
-    
+
    Args:
        markdown_content: Markdown text with potential false headings.
-        
+
    Returns:
        Markdown text with false heading markers removed.
    """
    if not markdown_content or not markdown_content.strip():
        return markdown_content
-    
-    lines = markdown_content.split('\n')
-    
+
+    lines = markdown_content.split("\n")
+
    # Count formulas and heading lines
    formula_count = 0
    heading_lines = []
    has_non_formula_text = False
-    
+
    for i, line in enumerate(lines):
        line_stripped = line.strip()
-        
+
        if not line_stripped:
            continue
-        
+
        # Check if line starts with heading marker
-        heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped)
-        
+        heading_match = re.match(r"^(#{1,6})\s+(.+)$", line_stripped)
+
        if heading_match:
            heading_level = heading_match.group(1)
            content = heading_match.group(2)
-            
+
            # Check if the heading content is a formula
-            if re.fullmatch(r'\$\$?.+\$\$?', content):
+            if re.fullmatch(r"\$\$?.+\$\$?", content):
                # This is a heading with a formula
                heading_lines.append((i, heading_level, content))
                formula_count += 1
            else:
                # This is a real heading with text
                has_non_formula_text = True
-        elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped):
+        elif re.fullmatch(r"\$\$?.+\$\$?", line_stripped):
            # Standalone formula line (not in a heading)
            formula_count += 1
-        elif line_stripped and not re.match(r'^#+\s*$', line_stripped):
+        elif line_stripped and not re.match(r"^#+\s*$", line_stripped):
            # Non-empty, non-heading, non-formula line
            has_non_formula_text = True
-    
+
    # Only remove heading markers if:
    # 1. There's exactly one formula
    # 2. That formula is in a heading line
@@ -351,8 +348,8 @@ def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
        # Remove the heading marker from the formula
        line_idx, heading_level, formula_content = heading_lines[0]
        lines[line_idx] = formula_content
-    
-    return '\n'.join(lines)
+
+    return "\n".join(lines)


 class OCRServiceBase(ABC):
@@ -492,16 +489,87 @@ class MineruOCRService(OCRServiceBase):
        api_url: str = "http://127.0.0.1:8000/file_parse",
        image_processor: Optional[ImageProcessor] = None,
        converter: Optional[Converter] = None,
+        paddleocr_vl_url: str = "http://localhost:8000/v1",
    ):
        """Initialize Local API service.

        Args:
            api_url: URL of the local file_parse API endpoint.
            converter: Optional converter instance for format conversion.
+            paddleocr_vl_url: URL of the PaddleOCR-VL vLLM server.
        """
        self.api_url = api_url
        self.image_processor = image_processor
        self.converter = converter
+        self.paddleocr_vl_url = paddleocr_vl_url
+        self.openai_client = OpenAI(api_key="EMPTY", base_url=paddleocr_vl_url, timeout=3600)
+
+    def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str:
+        """Recognize a formula by sending the image to the PaddleOCR-VL server.
+
+        Args:
+            image: Input image as numpy array in BGR format.
+            prompt: Recognition prompt (default: "Formula Recognition:")
+
+        Returns:
+            Recognized formula text (LaTeX format).
+
+        Raises:
+            RuntimeError: If encoding fails, the API call fails, or the reply has no text.
+        """
+        try:
+            # Encode image as a base64 PNG data URL
+            success, encoded_image = cv2.imencode(".png", image)
+            if not success:
+                raise RuntimeError("Failed to encode image")
+            image_url = "data:image/png;base64," + base64.b64encode(encoded_image.tobytes()).decode("utf-8")
+            # Call OpenAI-compatible API
+            messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}]
+            response = self.openai_client.chat.completions.create(
+                model="PaddlePaddle/PaddleOCR-VL",
+                messages=messages,
+                temperature=0.0,
+            )
+            content = response.choices[0].message.content
+            if content is None:  # the API types message content as optional
+                raise RuntimeError("Response contained no text content")
+            return content
+        except Exception as e:
+            raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e
+
+    def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str:
+        """Replace markdown image references with a formula recognized by PaddleOCR-VL.
+
+        Args:
+            markdown_content: Markdown content with potential image references.
+            original_image: Original input image; it is sent whole to the recognizer.
+
+        Returns:
+            Markdown with each image reference replaced by the recognized formula, or
+            the input unchanged if it has no reference or recognition fails.
+        """
+        # Pattern to match image references: ![](images/xxx.png)
+        image_pattern = re.compile(r"!\[\]\(images/[^)]+\)")
+        if not image_pattern.search(markdown_content):
+            return markdown_content
+
+        try:
+            # For now, use the entire image for formula recognition
+            # TODO: Extract specific regions if image paths contain coordinates
+            formula_text = self._recognize_formula_with_paddleocr_vl(original_image).strip()
+
+            # Wrap in display math delimiters if not already wrapped
+            if not formula_text.startswith("$$"):
+                formula_text = f"$${formula_text}$$"
+
+            # Substitute through a callable: a replacement *string* has its backslashes
+            # parsed as escapes, which corrupts or rejects LaTeX such as \frac or \lambda.
+            markdown_content = image_pattern.sub(lambda _match: formula_text, markdown_content)
+        except Exception as e:
+            # If formula recognition fails, keep original content
+            print(f"Warning: Formula recognition failed: {e}")
+
+        return markdown_content

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using local file_parse API.
@@ -554,6 +622,11 @@ class MineruOCRService(OCRServiceBase):
            if "results" in result and "image" in result["results"]:
                markdown_content = result["results"]["image"].get("md_content", "")

+            # Check if markdown contains formula image references
+            if "![](images/" in markdown_content:
+                # Use PaddleOCR-VL to recognize the formula
+                markdown_content = self._extract_and_recognize_formulas(markdown_content, image)
+
            # Apply postprocessing to fix OCR errors
            markdown_content = _postprocess_markdown(markdown_content)

--- a/diagnose_latex_rendering.py
+++ b/diagnose_latex_rendering.py
@@ -0,0 +1,202 @@
+"""Diagnostic tool for LaTeX rendering issues.
+
+Usage:
+    python diagnose_latex_rendering.py "$\\lambda + \\vdots$"
+    python diagnose_latex_rendering.py "$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$"
+"""
+
+import sys
+import re
+from typing import Dict, Any
+
+# Simulate the OCR postprocessing pipeline. Whitelist of command names that may be split off a glued token:
+_COMMANDS_NEED_SPACE = {
+    "cdot",
+    "times",
+    "div",
+    "pm",
+    "mp",
+    "int",
+    "iint",
+    "iiint",
+    "oint",
+    "sum",
+    "prod",
+    "lim",
+    "sin",
+    "cos",
+    "tan",
+    "cot",
+    "sec",
+    "csc",
+    "log",
+    "ln",
+    "exp",
+    "partial",
+    "nabla",
+}
+
+_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")  # a backslash followed by ASCII letters
+_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")  # "d" + uppercase letter, not right after a backslash
+_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")  # "d" + lowercase letter, not right after a backslash
+_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)  # $$...$$ is tried before $...$; may span lines
+
+
+def _split_glued_command_token(token: str) -> str:
+    """Split an OCR-glued LaTeX command token (e.g. \\cdotdS -> \\cdot dS) at its longest whitelisted prefix."""
+    if not token.startswith("\\"):
+        return token
+
+    body = token[1:]
+    if len(body) < 2:
+        return token
+
+    best = None
+    for i in range(1, len(body)):  # proper prefixes only: a token that is exactly a whitelisted command is kept
+        prefix = body[:i]
+        if prefix in _COMMANDS_NEED_SPACE:
+            best = prefix  # a later, longer match overwrites an earlier one
+
+    if not best:
+        return token
+
+    suffix = body[len(best) :]
+    if not suffix:
+        return token
+
+    return f"\\{best} {suffix}"
+
+
+def _fix_ocr_number_errors(expr: str) -> str:
+    """Fix common OCR errors in LaTeX math expressions (stray spaces inside numbers)."""
+    expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr)  # "2 2. 2" -> "22.2"
+    expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr)  # "22. 2" -> "22.2"
+    expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr)  # "1 5," -> "15," (only before "," / ")" or at the end)
+    expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr)  # "2 2 . 5" -> "22 . 5" (digits joined when a "." follows)
+    return expr
+
+
+def _postprocess_math(expr: str) -> Dict[str, str]:
+    """Postprocess a *math* expression (delimiters already stripped) and return the text after each stage."""
+    original = expr
+
+    # Stage 0: fix OCR number errors
+    expr = _fix_ocr_number_errors(expr)
+    stage0 = expr
+
+    # Stage 1: split glued command tokens
+    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
+    stage1 = expr
+
+    # Stage 2: normalize differentials (matches any "d" + letter not after a backslash, e.g. the "da" in \lambda)
+    expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr)
+    expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr)
+    stage2 = expr
+
+    return {"original": original, "after_stage0_numbers": stage0, "after_stage1_commands": stage1, "after_stage2_differentials": stage2, "final": expr}
+
+
+def _postprocess_markdown(markdown_content: str) -> Dict[str, Any]:
+    """Postprocess every $...$ / $$...$$ segment; the result always has original, final, segments and changed."""
+    if not markdown_content:
+        return {"original": markdown_content, "final": markdown_content, "segments": [], "changed": False}
+
+    segments = []
+
+    def _fix_segment(m: re.Match) -> str:
+        seg = m.group(0)
+        # Test "$$" first: a display segment also starts and ends with a single "$".
+
+        if seg.startswith("$$") and seg.endswith("$$"):
+            inner = seg[2:-2]
+            result = _postprocess_math(inner)
+            segments.append({"type": "display", "original": seg, "processing": result})
+            return f"$${result['final']}$$"
+        elif seg.startswith("$") and seg.endswith("$"):
+            inner = seg[1:-1]
+            result = _postprocess_math(inner)
+            segments.append({"type": "inline", "original": seg, "processing": result})
+            return f"${result['final']}$"
+
+        return seg
+
+    final = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
+
+    return {"original": markdown_content, "final": final, "segments": segments, "changed": markdown_content != final}
+
+
+def diagnose(latex_input: str) -> None:
+    """Print a stage-by-stage report of what the simulated postprocessing does to latex_input."""
+    print("=" * 80)
+    print("LaTeX Rendering Diagnostic Tool")
+    print("=" * 80)
+    print(f"\nInput: {latex_input}")
+    print("-" * 80)
+
+    # Check if input contains problematic characters
+    print("\n1. Character Detection:")
+    if "\\lambda" in latex_input:
+        print("   ✅ Found \\lambda")
+    if "\\vdots" in latex_input:
+        print("   ✅ Found \\vdots")
+    if "\\cdots" in latex_input:
+        print("   ℹ️  Found \\cdots (similar to \\vdots)")
+    if "\\ldots" in latex_input:
+        print("   ℹ️  Found \\ldots (similar to \\vdots)")
+
+    # Run postprocessing pipeline
+    print("\n2. Postprocessing Pipeline:")
+    result = _postprocess_markdown(latex_input)
+
+    if result["segments"]:
+        for i, seg in enumerate(result["segments"], 1):
+            print(f"\n   Segment {i} ({seg['type']}):")
+            print(f"     Original: {seg['original']}")
+
+            proc = seg["processing"]
+
+            # Check each stage for changes
+            if proc["original"] != proc["after_stage0_numbers"]:
+                print(f"     ⚠️  Stage 0 (numbers): {proc['after_stage0_numbers']}")
+            else:
+                print("     ✅ Stage 0 (numbers): No change")
+
+            if proc["after_stage0_numbers"] != proc["after_stage1_commands"]:
+                print(f"     ⚠️  Stage 1 (commands): {proc['after_stage1_commands']}")
+            else:
+                print("     ✅ Stage 1 (commands): No change")
+
+            if proc["after_stage1_commands"] != proc["after_stage2_differentials"]:
+                print(f"     ⚠️  Stage 2 (differentials): {proc['after_stage2_differentials']}")
+            else:
+                print("     ✅ Stage 2 (differentials): No change")
+
+            print(f"     Final: {proc['final']}")
+    else:
+        print("   ℹ️  No math segments found (not wrapped in $ or $$)")
+
+    print("\n3. Final Output:")
+    print(f"   {result['final']}")
+
+    if result.get("changed", False):  # the key is missing from the empty-input result
+        print("\n   ⚠️  WARNING: The input was modified during postprocessing!")
+        print("   This could be the cause of rendering issues.")
+    else:
+        print("\n   ✅ No changes made during postprocessing.")
+        print("   If rendering fails, the issue is likely in:")
+        print("      - Pandoc conversion (LaTeX → MathML)")
+        print("      - Frontend rendering (MathJax/KaTeX)")
+
+    print("\n" + "=" * 80)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:  # no formula given: print usage and exit with a failure status
+        print('Usage: python diagnose_latex_rendering.py "<latex_formula>"')
+        print("\nExamples:")
+        print('  python diagnose_latex_rendering.py "$\\lambda + \\vdots$"')
+        print('  python diagnose_latex_rendering.py "$$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$$"')
+        sys.exit(1)
+
+    latex_input = sys.argv[1]  # only the first argument is diagnosed; any others are ignored
+    diagnose(latex_input)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,11 +27,12 @@ dependencies = [
    "paddlepaddle",
    "paddleocr[doc-parser]",
    "safetensors",
-    "lxml>=5.0.0"
+    "lxml>=5.0.0",
+    "openai",
 ]

-[tool.uv.sources]
-paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" }
+# [tool.uv.sources]
+# paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" }

 [project.optional-dependencies]
 dev = [
--- a/test_remove_false_heading.py
+++ b/test_remove_false_heading.py
@@ -1,233 +0,0 @@
-"""Test for removing false heading markers from single-formula content.
-
-OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix.
-This test verifies that the heading marker is correctly removed.
-"""
-
-import re
-
-
-def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
-    """Remove false heading markers from single-formula content."""
-    if not markdown_content or not markdown_content.strip():
-        return markdown_content
-    
-    lines = markdown_content.split('\n')
-    
-    # Count formulas and heading lines
-    formula_count = 0
-    heading_lines = []
-    has_non_formula_text = False
-    
-    for i, line in enumerate(lines):
-        line_stripped = line.strip()
-        
-        if not line_stripped:
-            continue
-        
-        # Check if line starts with heading marker
-        heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped)
-        
-        if heading_match:
-            heading_level = heading_match.group(1)
-            content = heading_match.group(2)
-            
-            # Check if the heading content is a formula
-            if re.fullmatch(r'\$\$?.+\$\$?', content):
-                # This is a heading with a formula
-                heading_lines.append((i, heading_level, content))
-                formula_count += 1
-            else:
-                # This is a real heading with text
-                has_non_formula_text = True
-        elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped):
-            # Standalone formula line (not in a heading)
-            formula_count += 1
-        elif line_stripped and not re.match(r'^#+\s*$', line_stripped):
-            # Non-empty, non-heading, non-formula line
-            has_non_formula_text = True
-    
-    # Only remove heading markers if:
-    # 1. There's exactly one formula
-    # 2. That formula is in a heading line
-    # 3. There's no other text content
-    if formula_count == 1 and len(heading_lines) == 1 and not has_non_formula_text:
-        # Remove the heading marker from the formula
-        line_idx, heading_level, formula_content = heading_lines[0]
-        lines[line_idx] = formula_content
-    
-    return '\n'.join(lines)
-
-
-# Test cases
-test_cases = [
-    # Should remove heading marker (single formula with heading)
-    (
-        "# $$E = mc^2$$",
-        "$$E = mc^2$$",
-        "Single display formula with heading"
-    ),
-    (
-        "# $x = y$",
-        "$x = y$",
-        "Single inline formula with heading"
-    ),
-    (
-        "## $$\\frac{a}{b}$$",
-        "$$\\frac{a}{b}$$",
-        "Single formula with level-2 heading"
-    ),
-    (
-        "### $$\\lambda_{1}$$",
-        "$$\\lambda_{1}$$",
-        "Single formula with level-3 heading"
-    ),
-    
-    # Should NOT remove heading marker (has text content)
-    (
-        "# Introduction\n$$E = mc^2$$",
-        "# Introduction\n$$E = mc^2$$",
-        "Heading with text + formula (keep heading)"
-    ),
-    (
-        "# Title\nSome text\n$$E = mc^2$$",
-        "# Title\nSome text\n$$E = mc^2$$",
-        "Heading + text + formula (keep heading)"
-    ),
-    (
-        "$$E = mc^2$$\n# Summary",
-        "$$E = mc^2$$\n# Summary",
-        "Formula + heading with text (keep heading)"
-    ),
-    
-    # Should NOT remove heading marker (multiple formulas)
-    (
-        "# $$x = y$$\n$$a = b$$",
-        "# $$x = y$$\n$$a = b$$",
-        "Multiple formulas (keep heading)"
-    ),
-    (
-        "$$x = y$$\n# $$a = b$$",
-        "$$x = y$$\n# $$a = b$$",
-        "Two formulas, one with heading (keep heading)"
-    ),
-    
-    # Should NOT remove heading marker (standalone formula without heading)
-    (
-        "$$E = mc^2$$",
-        "$$E = mc^2$$",
-        "Single formula without heading (no change)"
-    ),
-    (
-        "$x = y$",
-        "$x = y$",
-        "Single inline formula without heading (no change)"
-    ),
-    
-    # Edge cases
-    (
-        "",
-        "",
-        "Empty string"
-    ),
-    (
-        "# ",
-        "# ",
-        "Empty heading"
-    ),
-    (
-        "#",
-        "#",
-        "Just hash symbol"
-    ),
-    (
-        "# $$E = mc^2$$\n\n",
-        "$$E = mc^2$$\n\n",
-        "Formula with heading and trailing newlines"
-    ),
-    (
-        "\n\n# $$E = mc^2$$",
-        "\n\n$$E = mc^2$$",
-        "Formula with heading and leading newlines"
-    ),
-    
-    # Complex formulas
-    (
-        "# $$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$",
-        "$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$",
-        "Complex integral formula with heading"
-    ),
-    (
-        "# $$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$",
-        "$$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$",
-        "Matrix formula with heading"
-    ),
-]
-
-print("=" * 80)
-print("Remove False Heading from Single Formula - Test")
-print("=" * 80)
-
-passed = 0
-failed = 0
-
-for i, (input_text, expected, description) in enumerate(test_cases, 1):
-    result = _remove_false_heading_from_single_formula(input_text)
-    
-    if result == expected:
-        status = "✅ PASS"
-        passed += 1
-    else:
-        status = "❌ FAIL"
-        failed += 1
-    
-    print(f"\n{status} Test {i}: {description}")
-    print(f"  Input:    {repr(input_text)}")
-    print(f"  Expected: {repr(expected)}")
-    print(f"  Got:      {repr(result)}")
-    if result != expected:
-        print(f"  >>> MISMATCH!")
-
-print("\n" + "=" * 80)
-print("SUMMARY")
-print("=" * 80)
-print(f"Total tests: {len(test_cases)}")
-print(f"✅ Passed: {passed}")
-print(f"❌ Failed: {failed}")
-
-if failed == 0:
-    print("\n✅ All tests passed!")
-else:
-    print(f"\n⚠️  {failed} test(s) failed")
-
-print("\n" + "=" * 80)
-print("KEY SCENARIOS")
-print("=" * 80)
-
-key_scenarios = [
-    ("# $$E = mc^2$$", "$$E = mc^2$$", "✅ Remove heading"),
-    ("# Introduction\n$$E = mc^2$$", "# Introduction\n$$E = mc^2$$", "❌ Keep heading (has text)"),
-    ("# $$x = y$$\n$$a = b$$", "# $$x = y$$\n$$a = b$$", "❌ Keep heading (multiple formulas)"),
-    ("$$E = mc^2$$", "$$E = mc^2$$", "→ No change (no heading)"),
-]
-
-print("\nBehavior Summary:")
-for input_text, expected, explanation in key_scenarios:
-    result = _remove_false_heading_from_single_formula(input_text)
-    match = "✓" if result == expected else "✗"
-    print(f"  {match} {explanation}")
-    print(f"     {repr(input_text)} → {repr(result)}")
-
-print("\n" + "=" * 80)
-print("DECISION LOGIC")
-print("=" * 80)
-print("""
-Remove heading marker ONLY when ALL conditions are met:
-1. ✅ Exactly ONE formula in the entire content
-2. ✅ That formula is on a line starting with '#' (heading marker)
-3. ✅ No other text content exists (only formula and empty lines)
-
-Otherwise: Keep the heading marker as-is.
-""")
-
-print("=" * 80)