From 4de9aefa689ae7fe0d794af3b9006e5fbb08613a Mon Sep 17 00:00:00 2001 From: liuyuanchuang Date: Thu, 5 Feb 2026 20:32:26 +0800 Subject: [PATCH] feat: add paddleocr-vl --- app/services/ocr_service.py | 217 +++++++++++++++++++++----------- diagnose_latex_rendering.py | 202 ++++++++++++++++++++++++++++++ pyproject.toml | 7 +- test_remove_false_heading.py | 233 ----------------------------------- 4 files changed, 351 insertions(+), 308 deletions(-) create mode 100644 diagnose_latex_rendering.py delete mode 100644 test_remove_false_heading.py diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 3bcc8d3..e4736e0 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -5,6 +5,7 @@ import numpy as np import cv2 import requests from io import BytesIO +import base64 from app.core.config import get_settings from paddleocr import PaddleOCRVL from typing import Optional @@ -12,6 +13,7 @@ from app.services.layout_detector import LayoutDetector from app.services.image_processor import ImageProcessor from app.services.converter import Converter from abc import ABC, abstractmethod +from openai import OpenAI settings = get_settings() @@ -90,42 +92,42 @@ def _split_glued_command_token(token: str) -> str: def _clean_latex_syntax_spaces(expr: str) -> str: """Clean unwanted spaces in LaTeX syntax (common OCR errors). - + OCR often adds spaces in LaTeX syntax structures where they shouldn't be: - Subscripts: a _ {i 1} -> a_{i1} - Superscripts: x ^ {2 3} -> x^{23} - Fractions: \\frac { a } { b } -> \\frac{a}{b} - Commands: \\ alpha -> \\alpha - Braces: { a b } -> {ab} (within subscripts/superscripts) - + This is safe because these spaces are always OCR errors - LaTeX doesn't need or want spaces in these positions. - + Args: expr: LaTeX math expression. - + Returns: Expression with LaTeX syntax spaces cleaned. """ # Pattern 1: Spaces around _ and ^ (subscript/superscript operators) # a _ {i} -> a_{i}, x ^ {2} -> x^{2} - expr = re.sub(r'\s*_\s*', '_', expr) - expr = re.sub(r'\s*\^\s*', '^', expr) - + expr = re.sub(r"\s*_\s*", "_", expr) + expr = re.sub(r"\s*\^\s*", "^", expr) + # Pattern 2: Spaces inside braces that follow _ or ^ # _{i 1} -> _{i1}, ^{2 3} -> ^{23} # This is safe because spaces inside subscript/superscript braces are usually OCR errors def clean_subscript_superscript_braces(match): operator = match.group(1) # _ or ^ - content = match.group(2) # content inside braces + content = match.group(2) # content inside braces # Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta) # Only remove spaces between non-backslash characters - cleaned = re.sub(r'(? \frac{a}{b} # \frac{ a + b }{ c } -> \frac{a+b}{c} @@ -133,47 +135,46 @@ def _clean_latex_syntax_spaces(expr: str) -> str: numerator = match.group(1).strip() denominator = match.group(2).strip() return f"\\frac{{{numerator}}}{{{denominator}}}" - - expr = re.sub(r'\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}', - clean_frac_braces, expr) - + + expr = re.sub(r"\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}", clean_frac_braces, expr) + # Pattern 4: Spaces after backslash in LaTeX commands # \ alpha -> \alpha, \ beta -> \beta - expr = re.sub(r'\\\s+([a-zA-Z]+)', r'\\\1', expr) - + expr = re.sub(r"\\\s+([a-zA-Z]+)", r"\\\1", expr) + # Pattern 5: Spaces before/after braces in general contexts (conservative) # Only remove if the space is clearly wrong (e.g., after operators) # { x } in standalone context is kept as-is to avoid breaking valid spacing # But after operators like \sqrt{ x } -> \sqrt{x} - expr = re.sub(r'(\\[a-zA-Z]+)\s*\{\s*', r'\1{', expr) # \sqrt { -> \sqrt{ - + expr = re.sub(r"(\\[a-zA-Z]+)\s*\{\s*", r"\1{", expr) # \sqrt { -> \sqrt{ + return expr def _postprocess_math(expr: str) -> str: """Postprocess a *math* expression (already inside $...$ or $$...$$). - + Processing stages: 0. Fix OCR number errors (spaces in numbers) 1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS) 2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1}) 3. Normalize differentials (DISABLED by default to avoid breaking variables) - + Args: expr: LaTeX math expression without delimiters. - + Returns: Processed LaTeX expression. """ # stage0: fix OCR number errors (digits with spaces) expr = _fix_ocr_number_errors(expr) - + # stage1: split glued command tokens (e.g. \cdotdS) expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr) - + # stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces) expr = _clean_latex_syntax_spaces(expr) - + # stage3: normalize differentials - DISABLED # This feature is disabled because it's too aggressive and can break: # - LaTeX commands containing 'd': \vdots, \lambda (via subscripts), \delta, etc. @@ -186,40 +187,36 @@ def _postprocess_math(expr: str) -> str: # # If differential normalization is needed, implement a context-aware version: # expr = _normalize_differentials_contextaware(expr) - + return expr def _normalize_differentials_contextaware(expr: str) -> str: """Context-aware differential normalization (optional, not used by default). - + Only normalizes differentials in specific mathematical contexts: 1. After integral symbols: \\int dx, \\iint dA, \\oint dr 2. In fraction denominators: \\frac{dy}{dx} 3. In explicit differential notation: f(x)dx (function followed by differential) - + This avoids false positives like variable names, subscripts, or LaTeX commands. - + Args: expr: LaTeX math expression. - + Returns: Expression with differentials normalized in safe contexts only. """ # Pattern 1: After integral commands # \int dx -> \int d x - integral_pattern = re.compile( - r'(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])' - ) - expr = integral_pattern.sub(r'\1 \2 d \3', expr) - + integral_pattern = re.compile(r"(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])") + expr = integral_pattern.sub(r"\1 \2 d \3", expr) + # Pattern 2: In fraction denominators # \frac{...}{dx} -> \frac{...}{d x} - frac_pattern = re.compile( - r'(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})' - ) - expr = frac_pattern.sub(r'\1d \2\3', expr) - + frac_pattern = re.compile(r"(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})") + expr = frac_pattern.sub(r"\1d \2\3", expr) + return expr @@ -241,21 +238,21 @@ def _fix_ocr_number_errors(expr: str) -> str: """ # Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)" # Example: "2 2. 2" → "22.2" - expr = re.sub(r'(\d)\s+(\d+)\.\s*(\d+)', r'\1\2.\3', expr) - + expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr) + # Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)" # Example: "22. 2" → "22.2" - expr = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', expr) - + expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr) + # Fix pattern 3: "digit space digit" (no decimal point, within same number context) # Be careful: only merge if followed by decimal point or comma/end # Example: "1 5 0" → "150" when followed by comma or end - expr = re.sub(r'(\d)\s+(\d)(?=\s*[,\)]|$)', r'\1\2', expr) - + expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr) + # Fix pattern 4: Multiple spaces in decimal numbers # Example: "2 2 . 2" → "22.2" - expr = re.sub(r'(\d)\s+(\d)(?=\s*\.)', r'\1\2', expr) - + expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr) + return expr @@ -273,76 +270,76 @@ def _postprocess_markdown(markdown_content: str) -> str: return seg markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content) - + # Apply markdown-level postprocessing (after LaTeX processing) markdown_content = _remove_false_heading_from_single_formula(markdown_content) - + return markdown_content def _remove_false_heading_from_single_formula(markdown_content: str) -> str: """Remove false heading markers from single-formula content. - + OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix. This function detects and removes the heading marker when: 1. The content contains only one formula (display or inline) 2. The formula line starts with '#' (heading marker) 3. No other non-formula text content exists - + Examples: Input: "# $$E = mc^2$$" Output: "$$E = mc^2$$" - + Input: "# $x = y$" Output: "$x = y$" - + Input: "# Introduction\n$$E = mc^2$$" (has text, keep heading) Output: "# Introduction\n$$E = mc^2$$" - + Args: markdown_content: Markdown text with potential false headings. - + Returns: Markdown text with false heading markers removed. """ if not markdown_content or not markdown_content.strip(): return markdown_content - - lines = markdown_content.split('\n') - + + lines = markdown_content.split("\n") + # Count formulas and heading lines formula_count = 0 heading_lines = [] has_non_formula_text = False - + for i, line in enumerate(lines): line_stripped = line.strip() - + if not line_stripped: continue - + # Check if line starts with heading marker - heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped) - + heading_match = re.match(r"^(#{1,6})\s+(.+)$", line_stripped) + if heading_match: heading_level = heading_match.group(1) content = heading_match.group(2) - + # Check if the heading content is a formula - if re.fullmatch(r'\$\$?.+\$\$?', content): + if re.fullmatch(r"\$\$?.+\$\$?", content): # This is a heading with a formula heading_lines.append((i, heading_level, content)) formula_count += 1 else: # This is a real heading with text has_non_formula_text = True - elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped): + elif re.fullmatch(r"\$\$?.+\$\$?", line_stripped): # Standalone formula line (not in a heading) formula_count += 1 - elif line_stripped and not re.match(r'^#+\s*$', line_stripped): + elif line_stripped and not re.match(r"^#+\s*$", line_stripped): # Non-empty, non-heading, non-formula line has_non_formula_text = True - + # Only remove heading markers if: # 1. There's exactly one formula # 2. That formula is in a heading line @@ -351,8 +348,8 @@ def _remove_false_heading_from_single_formula(markdown_content: str) -> str: # Remove the heading marker from the formula line_idx, heading_level, formula_content = heading_lines[0] lines[line_idx] = formula_content - - return '\n'.join(lines) + + return "\n".join(lines) class OCRServiceBase(ABC): @@ -492,16 +489,87 @@ class MineruOCRService(OCRServiceBase): api_url: str = "http://127.0.0.1:8000/file_parse", image_processor: Optional[ImageProcessor] = None, converter: Optional[Converter] = None, + paddleocr_vl_url: str = "http://localhost:8000/v1", ): """Initialize Local API service. Args: api_url: URL of the local file_parse API endpoint. converter: Optional converter instance for format conversion. + paddleocr_vl_url: URL of the PaddleOCR-VL vLLM server. """ self.api_url = api_url self.image_processor = image_processor self.converter = converter + self.paddleocr_vl_url = paddleocr_vl_url + self.openai_client = OpenAI(api_key="EMPTY", base_url=paddleocr_vl_url, timeout=3600) + + def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str: + """Recognize formula using PaddleOCR-VL API. + + Args: + image: Input image as numpy array in BGR format. + prompt: Recognition prompt (default: "Formula Recognition:") + + Returns: + Recognized formula text (LaTeX format). + """ + try: + # Encode image to base64 + success, encoded_image = cv2.imencode(".png", image) + if not success: + raise RuntimeError("Failed to encode image") + + image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8") + image_url = f"data:image/png;base64,{image_base64}" + + # Call OpenAI-compatible API + messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}] + + response = self.openai_client.chat.completions.create( + model="PaddlePaddle/PaddleOCR-VL", + messages=messages, + temperature=0.0, + ) + + return response.choices[0].message.content + + except Exception as e: + raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e + + def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str: + """Extract image references from markdown and recognize formulas. + + Args: + markdown_content: Markdown content with potential image references. + original_image: Original input image. + + Returns: + Markdown content with formulas recognized by PaddleOCR-VL. + """ + # Pattern to match image references: ![](images/xxx.png) + image_pattern = re.compile(r"!\[\]\(images/[^)]+\)") + + if not image_pattern.search(markdown_content): + return markdown_content + + try: + # For now, use the entire image for formula recognition + # TODO: Extract specific regions if image paths contain coordinates + formula_text = self._recognize_formula_with_paddleocr_vl(original_image) + + # Replace image references with recognized formulas + # Wrap in display math delimiters if not already wrapped + if not formula_text.startswith("$$"): + formula_text = f"$${formula_text}$$" + + markdown_content = image_pattern.sub(formula_text, markdown_content) + + except Exception as e: + # If formula recognition fails, keep original content + print(f"Warning: Formula recognition failed: {e}") + + return markdown_content def recognize(self, image: np.ndarray) -> dict: """Recognize content using local file_parse API. @@ -554,6 +622,11 @@ class MineruOCRService(OCRServiceBase): if "results" in result and "image" in result["results"]: markdown_content = result["results"]["image"].get("md_content", "") + # Check if markdown contains formula image references + if "![](images/" in markdown_content: + # Use PaddleOCR-VL to recognize the formula + markdown_content = self._extract_and_recognize_formulas(markdown_content, image) + # Apply postprocessing to fix OCR errors markdown_content = _postprocess_markdown(markdown_content) diff --git a/diagnose_latex_rendering.py b/diagnose_latex_rendering.py new file mode 100644 index 0000000..07e7700 --- /dev/null +++ b/diagnose_latex_rendering.py @@ -0,0 +1,202 @@ +"""Diagnostic tool for LaTeX rendering issues. + +Usage: + python diagnose_latex_rendering.py "\\lambda + \\vdots" + python diagnose_latex_rendering.py "$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$" +""" + +import sys +import re +from typing import Dict, Any + +# Simulate the OCR postprocessing pipeline +_COMMANDS_NEED_SPACE = { + "cdot", + "times", + "div", + "pm", + "mp", + "int", + "iint", + "iiint", + "oint", + "sum", + "prod", + "lim", + "sin", + "cos", + "tan", + "cot", + "sec", + "csc", + "log", + "ln", + "exp", + "partial", + "nabla", +} + +_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+") +_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(? str: + """Split OCR-glued LaTeX command token by whitelist longest-prefix.""" + if not token.startswith("\\"): + return token + + body = token[1:] + if len(body) < 2: + return token + + best = None + for i in range(1, len(body)): + prefix = body[:i] + if prefix in _COMMANDS_NEED_SPACE: + best = prefix + + if not best: + return token + + suffix = body[len(best) :] + if not suffix: + return token + + return f"\\{best} {suffix}" + + +def _fix_ocr_number_errors(expr: str) -> str: + """Fix common OCR errors in LaTeX math expressions.""" + expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr) + expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr) + expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr) + expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr) + return expr + + +def _postprocess_math(expr: str) -> str: + """Postprocess a *math* expression (already inside $...$ or $$...$$).""" + original = expr + + # Stage 0: fix OCR number errors + expr = _fix_ocr_number_errors(expr) + stage0 = expr + + # Stage 1: split glued command tokens + expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr) + stage1 = expr + + # Stage 2: normalize differentials + expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr) + expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr) + stage2 = expr + + return {"original": original, "after_stage0_numbers": stage0, "after_stage1_commands": stage1, "after_stage2_differentials": stage2, "final": expr} + + +def _postprocess_markdown(markdown_content: str) -> Dict[str, Any]: + """Apply LaTeX postprocessing to markdown segments.""" + if not markdown_content: + return {"original": markdown_content, "final": markdown_content, "segments": []} + + segments = [] + + def _fix_segment(m: re.Match) -> str: + seg = m.group(0) + inner = None + + if seg.startswith("$$") and seg.endswith("$$"): + inner = seg[2:-2] + result = _postprocess_math(inner) + segments.append({"type": "display", "original": seg, "processing": result}) + return f"$${result['final']}$$" + elif seg.startswith("$") and seg.endswith("$"): + inner = seg[1:-1] + result = _postprocess_math(inner) + segments.append({"type": "inline", "original": seg, "processing": result}) + return f"${result['final']}$" + + return seg + + final = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content) + + return {"original": markdown_content, "final": final, "segments": segments, "changed": markdown_content != final} + + +def diagnose(latex_input: str) -> None: + """Run diagnostic on LaTeX input.""" + print("=" * 80) + print("LaTeX Rendering Diagnostic Tool") + print("=" * 80) + print(f"\nInput: {latex_input}") + print("-" * 80) + + # Check if input contains problematic characters + print("\n1. Character Detection:") + if "\\lambda" in latex_input: + print(" ✅ Found \\lambda") + if "\\vdots" in latex_input: + print(" ✅ Found \\vdots") + if "\\cdots" in latex_input: + print(" ℹ️ Found \\cdots (similar to \\vdots)") + if "\\ldots" in latex_input: + print(" ℹ️ Found \\ldots (similar to \\vdots)") + + # Run postprocessing pipeline + print("\n2. Postprocessing Pipeline:") + result = _postprocess_markdown(latex_input) + + if result["segments"]: + for i, seg in enumerate(result["segments"], 1): + print(f"\n Segment {i} ({seg['type']}):") + print(f" Original: {seg['original']}") + + proc = seg["processing"] + + # Check each stage for changes + if proc["original"] != proc["after_stage0_numbers"]: + print(f" ⚠️ Stage 0 (numbers): {proc['after_stage0_numbers']}") + else: + print(f" ✅ Stage 0 (numbers): No change") + + if proc["after_stage0_numbers"] != proc["after_stage1_commands"]: + print(f" ⚠️ Stage 1 (commands): {proc['after_stage1_commands']}") + else: + print(f" ✅ Stage 1 (commands): No change") + + if proc["after_stage1_commands"] != proc["after_stage2_differentials"]: + print(f" ⚠️ Stage 2 (differentials): {proc['after_stage2_differentials']}") + else: + print(f" ✅ Stage 2 (differentials): No change") + + print(f" Final: {proc['final']}") + else: + print(" ℹ️ No math segments found (not wrapped in $ or $$)") + + print("\n3. Final Output:") + print(f" {result['final']}") + + if result["changed"]: + print("\n ⚠️ WARNING: The input was modified during postprocessing!") + print(" This could be the cause of rendering issues.") + else: + print("\n ✅ No changes made during postprocessing.") + print(" If rendering fails, the issue is likely in:") + print(" - Pandoc conversion (LaTeX → MathML)") + print(" - Frontend rendering (MathJax/KaTeX)") + + print("\n" + "=" * 80) + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print('Usage: python diagnose_latex_rendering.py ""') + print("\nExamples:") + print(' python diagnose_latex_rendering.py "$\\lambda + \\vdots$"') + print(' python diagnose_latex_rendering.py "$$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$$"') + sys.exit(1) + + latex_input = sys.argv[1] + diagnose(latex_input) diff --git a/pyproject.toml b/pyproject.toml index 73defc8..38eb970 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,11 +27,12 @@ dependencies = [ "paddlepaddle", "paddleocr[doc-parser]", "safetensors", - "lxml>=5.0.0" + "lxml>=5.0.0", + "openai", ] -[tool.uv.sources] -paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" } +# [tool.uv.sources] +# paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" } [project.optional-dependencies] dev = [ diff --git a/test_remove_false_heading.py b/test_remove_false_heading.py deleted file mode 100644 index 02af147..0000000 --- a/test_remove_false_heading.py +++ /dev/null @@ -1,233 +0,0 @@ -"""Test for removing false heading markers from single-formula content. - -OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix. -This test verifies that the heading marker is correctly removed. -""" - -import re - - -def _remove_false_heading_from_single_formula(markdown_content: str) -> str: - """Remove false heading markers from single-formula content.""" - if not markdown_content or not markdown_content.strip(): - return markdown_content - - lines = markdown_content.split('\n') - - # Count formulas and heading lines - formula_count = 0 - heading_lines = [] - has_non_formula_text = False - - for i, line in enumerate(lines): - line_stripped = line.strip() - - if not line_stripped: - continue - - # Check if line starts with heading marker - heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped) - - if heading_match: - heading_level = heading_match.group(1) - content = heading_match.group(2) - - # Check if the heading content is a formula - if re.fullmatch(r'\$\$?.+\$\$?', content): - # This is a heading with a formula - heading_lines.append((i, heading_level, content)) - formula_count += 1 - else: - # This is a real heading with text - has_non_formula_text = True - elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped): - # Standalone formula line (not in a heading) - formula_count += 1 - elif line_stripped and not re.match(r'^#+\s*$', line_stripped): - # Non-empty, non-heading, non-formula line - has_non_formula_text = True - - # Only remove heading markers if: - # 1. There's exactly one formula - # 2. That formula is in a heading line - # 3. There's no other text content - if formula_count == 1 and len(heading_lines) == 1 and not has_non_formula_text: - # Remove the heading marker from the formula - line_idx, heading_level, formula_content = heading_lines[0] - lines[line_idx] = formula_content - - return '\n'.join(lines) - - -# Test cases -test_cases = [ - # Should remove heading marker (single formula with heading) - ( - "# $$E = mc^2$$", - "$$E = mc^2$$", - "Single display formula with heading" - ), - ( - "# $x = y$", - "$x = y$", - "Single inline formula with heading" - ), - ( - "## $$\\frac{a}{b}$$", - "$$\\frac{a}{b}$$", - "Single formula with level-2 heading" - ), - ( - "### $$\\lambda_{1}$$", - "$$\\lambda_{1}$$", - "Single formula with level-3 heading" - ), - - # Should NOT remove heading marker (has text content) - ( - "# Introduction\n$$E = mc^2$$", - "# Introduction\n$$E = mc^2$$", - "Heading with text + formula (keep heading)" - ), - ( - "# Title\nSome text\n$$E = mc^2$$", - "# Title\nSome text\n$$E = mc^2$$", - "Heading + text + formula (keep heading)" - ), - ( - "$$E = mc^2$$\n# Summary", - "$$E = mc^2$$\n# Summary", - "Formula + heading with text (keep heading)" - ), - - # Should NOT remove heading marker (multiple formulas) - ( - "# $$x = y$$\n$$a = b$$", - "# $$x = y$$\n$$a = b$$", - "Multiple formulas (keep heading)" - ), - ( - "$$x = y$$\n# $$a = b$$", - "$$x = y$$\n# $$a = b$$", - "Two formulas, one with heading (keep heading)" - ), - - # Should NOT remove heading marker (standalone formula without heading) - ( - "$$E = mc^2$$", - "$$E = mc^2$$", - "Single formula without heading (no change)" - ), - ( - "$x = y$", - "$x = y$", - "Single inline formula without heading (no change)" - ), - - # Edge cases - ( - "", - "", - "Empty string" - ), - ( - "# ", - "# ", - "Empty heading" - ), - ( - "#", - "#", - "Just hash symbol" - ), - ( - "# $$E = mc^2$$\n\n", - "$$E = mc^2$$\n\n", - "Formula with heading and trailing newlines" - ), - ( - "\n\n# $$E = mc^2$$", - "\n\n$$E = mc^2$$", - "Formula with heading and leading newlines" - ), - - # Complex formulas - ( - "# $$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$", - "$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$", - "Complex integral formula with heading" - ), - ( - "# $$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$", - "$$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$", - "Matrix formula with heading" - ), -] - -print("=" * 80) -print("Remove False Heading from Single Formula - Test") -print("=" * 80) - -passed = 0 -failed = 0 - -for i, (input_text, expected, description) in enumerate(test_cases, 1): - result = _remove_false_heading_from_single_formula(input_text) - - if result == expected: - status = "✅ PASS" - passed += 1 - else: - status = "❌ FAIL" - failed += 1 - - print(f"\n{status} Test {i}: {description}") - print(f" Input: {repr(input_text)}") - print(f" Expected: {repr(expected)}") - print(f" Got: {repr(result)}") - if result != expected: - print(f" >>> MISMATCH!") - -print("\n" + "=" * 80) -print("SUMMARY") -print("=" * 80) -print(f"Total tests: {len(test_cases)}") -print(f"✅ Passed: {passed}") -print(f"❌ Failed: {failed}") - -if failed == 0: - print("\n✅ All tests passed!") -else: - print(f"\n⚠️ {failed} test(s) failed") - -print("\n" + "=" * 80) -print("KEY SCENARIOS") -print("=" * 80) - -key_scenarios = [ - ("# $$E = mc^2$$", "$$E = mc^2$$", "✅ Remove heading"), - ("# Introduction\n$$E = mc^2$$", "# Introduction\n$$E = mc^2$$", "❌ Keep heading (has text)"), - ("# $$x = y$$\n$$a = b$$", "# $$x = y$$\n$$a = b$$", "❌ Keep heading (multiple formulas)"), - ("$$E = mc^2$$", "$$E = mc^2$$", "→ No change (no heading)"), -] - -print("\nBehavior Summary:") -for input_text, expected, explanation in key_scenarios: - result = _remove_false_heading_from_single_formula(input_text) - match = "✓" if result == expected else "✗" - print(f" {match} {explanation}") - print(f" {repr(input_text)} → {repr(result)}") - -print("\n" + "=" * 80) -print("DECISION LOGIC") -print("=" * 80) -print(""" -Remove heading marker ONLY when ALL conditions are met: -1. ✅ Exactly ONE formula in the entire content -2. ✅ That formula is on a line starting with '#' (heading marker) -3. ✅ No other text content exists (only formula and empty lines) - -Otherwise: Keep the heading marker as-is. -""") - -print("=" * 80)